autobuild v6.0-5

Resolves: bz#1573077 bz#1694595 bz#1703434 bz#1714536 bz#1714588
Resolves: bz#1715407 bz#1715438
Signed-off-by: Sunil Kumar Acharya <sheggodu@redhat.com>

parent 9dfd1f220c
commit dddf13c543
0170-geo-rep-Convert-gfid-conflict-resolutiong-logs-into-.patch (new file, 119 lines)
@@ -0,0 +1,119 @@
From c4e292379928eaf1ebb47ee1c8e9b1eabbe90574 Mon Sep 17 00:00:00 2001
From: Kotresh HR <khiremat@redhat.com>
Date: Tue, 14 May 2019 11:05:45 +0530
Subject: [PATCH 170/178] geo-rep: Convert gfid conflict resolutiong logs into
 debug

The gfid conflict resolution code path is not supposed
to hit in generic code path. But few of the heavy rename
workload (BUG: 1694820) makes it a generic case. So
logging the entries to be fixed as INFO floods the log
in these particular workloads. Hence convert them to DEBUG.

Backport of:
 > Patch: https://review.gluster.org/22720
 > fixes: bz#1709653
 > Change-Id: I4d5e102b87be5fe5b54f78f329e588882d72b9d9

BUG: 1714536
Change-Id: I4d5e102b87be5fe5b54f78f329e588882d72b9d9
Signed-off-by: Kotresh HR <khiremat@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/172731
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
---
 geo-replication/syncdaemon/master.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/geo-replication/syncdaemon/master.py b/geo-replication/syncdaemon/master.py
index 42c86d7..3f98337 100644
--- a/geo-replication/syncdaemon/master.py
+++ b/geo-replication/syncdaemon/master.py
@@ -811,7 +811,7 @@ class GMasterChangelogMixin(GMasterCommon):
                 st = lstat(os.path.join(pfx, slave_gfid))
                 # Takes care of scenarios with no hardlinks
                 if isinstance(st, int) and st == ENOENT:
-                    logging.info(lf('Entry not present on master. Fixing gfid '
+                    logging.debug(lf('Entry not present on master. Fixing gfid '
                                     'mismatch in slave. Deleting the entry',
                                     retry_count=retry_count,
                                     entry=repr(failure)))
@@ -843,7 +843,7 @@ class GMasterChangelogMixin(GMasterCommon):
                 if matching_disk_gfid(slave_gfid, pbname):
                     # Safe to ignore the failure as master contains same
                     # file with same gfid. Remove entry from entries list
-                    logging.info(lf('Fixing gfid mismatch in slave. '
+                    logging.debug(lf('Fixing gfid mismatch in slave. '
                                     ' Safe to ignore, take out entry',
                                     retry_count=retry_count,
                                     entry=repr(failure)))
@@ -865,14 +865,14 @@ class GMasterChangelogMixin(GMasterCommon):
                         dst_entry = os.path.join(pfx, realpath.split('/')[-2],
                                                  realpath.split('/')[-1])
                         src_entry = pbname
-                        logging.info(lf('Fixing dir name/gfid mismatch in '
+                        logging.debug(lf('Fixing dir name/gfid mismatch in '
                                         'slave', retry_count=retry_count,
                                         entry=repr(failure)))
                         if src_entry == dst_entry:
                             # Safe to ignore the failure as master contains
                             # same directory as in slave with same gfid.
                             # Remove the failure entry from entries list
-                            logging.info(lf('Fixing dir name/gfid mismatch'
+                            logging.debug(lf('Fixing dir name/gfid mismatch'
                                             ' in slave. Safe to ignore, '
                                             'take out entry',
                                             retry_count=retry_count,
@@ -886,7 +886,7 @@ class GMasterChangelogMixin(GMasterCommon):
                                                   entry=src_entry,
                                                   entry1=dst_entry, stat=st,
                                                   link=None)
-                            logging.info(lf('Fixing dir name/gfid mismatch'
+                            logging.debug(lf('Fixing dir name/gfid mismatch'
                                             ' in slave. Renaming',
                                             retry_count=retry_count,
                                             entry=repr(rename_dict)))
@@ -896,7 +896,7 @@ class GMasterChangelogMixin(GMasterCommon):
                         # renamed file exists and we are sure from
                         # matching_disk_gfid check that the entry doesn't
                         # exist with same gfid so we can safely delete on slave
-                        logging.info(lf('Fixing file gfid mismatch in slave. '
+                        logging.debug(lf('Fixing file gfid mismatch in slave. '
                                         'Hardlink/Rename Case. Deleting entry',
                                         retry_count=retry_count,
                                         entry=repr(failure)))
@@ -915,7 +915,7 @@ class GMasterChangelogMixin(GMasterCommon):
             # Safe to ignore the failure as master doesn't contain
             # parent directory.
             if isinstance(st, int):
-                logging.info(lf('Fixing ENOENT error in slave. Parent '
+                logging.debug(lf('Fixing ENOENT error in slave. Parent '
                                 'does not exist on master. Safe to '
                                 'ignore, take out entry',
                                 retry_count=retry_count,
@@ -925,7 +925,7 @@ class GMasterChangelogMixin(GMasterCommon):
                 except ValueError:
                     pass
             else:
-                logging.info(lf('Fixing ENOENT error in slave. Create '
+                logging.debug(lf('Fixing ENOENT error in slave. Create '
                                 'parent directory on slave.',
                                 retry_count=retry_count,
                                 entry=repr(failure)))
@@ -1223,10 +1223,13 @@ class GMasterChangelogMixin(GMasterCommon):

         if gconf.get("gfid-conflict-resolution"):
             count = 0
+            if failures:
+                logging.info(lf('Entry ops failed with gfid mismatch',
+                                count=len(failures)))
             while failures and count < self.MAX_OE_RETRIES:
                 count += 1
                 self.handle_entry_failures(failures, entries)
-                logging.info("Retry original entries. count = %s" % count)
+                logging.info(lf('Retry original entries', count=count))
                 failures = self.slave.server.entry_ops(entries)
                 if not failures:
                     logging.info("Successfully fixed all entry ops with "
--
1.8.3.1

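The change above follows a common pattern for taming noisy hot paths: keep one aggregate INFO line carrying a count, and demote the per-entry lines to DEBUG so they surface only when the log level is raised. A minimal, standalone C sketch of that pattern; the level-filtered logger below is a local stand-in, not the glusterfs gf_msg API:

#include <stdio.h>
#include <stdarg.h>

/* Stand-in log levels; glusterfs uses gf_loglevel_t internally. */
typedef enum { LOG_DEBUG = 0, LOG_INFO } log_level_t;

static log_level_t current_level = LOG_INFO;

static void
logf(log_level_t level, const char *fmt, ...)
{
    va_list ap;
    if (level < current_level)
        return; /* suppressed, just as DEBUG logs are at the default level */
    va_start(ap, fmt);
    vprintf(fmt, ap);
    va_end(ap);
    putchar('\n');
}

int
main(void)
{
    int failures = 4200; /* imagine a rename-heavy geo-rep workload */
    /* One INFO summary with a count ... */
    logf(LOG_INFO, "Entry ops failed with gfid mismatch [count=%d]", failures);
    /* ... and one DEBUG line per entry, invisible at the default level. */
    for (int i = 0; i < failures; i++)
        logf(LOG_DEBUG, "Fixing gfid mismatch in slave [entry=%d]", i);
    return 0;
}

At the default level the sketch prints a single summary line; lowering current_level to LOG_DEBUG restores the full per-entry trace, which is what the patch leaves available for debugging sessions.
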
0171-posix-add-storage.reserve-size-option.patch (new file, 295 lines)
@@ -0,0 +1,295 @@
From 4d82c7879387e6f7963b4d9c84c4ff8a1788055d Mon Sep 17 00:00:00 2001
From: Sheetal Pamecha <sheetal.pamecha08@gmail.com>
Date: Mon, 19 Nov 2018 22:15:25 +0530
Subject: [PATCH 171/178] posix: add storage.reserve-size option

storage.reserve-size option will take size as input
instead of percentage. If set, priority will be given to
storage.reserve-size over storage.reserve. Default value
of this option is 0.

> fixes: bz#1651445
> Change-Id: I7a7342c68e436e8bf65bd39c567512ee04abbcea
> Signed-off-by: Sheetal Pamecha <sheetal.pamecha08@gmail.com>
> Cherry pick from commit 950726dfc8e3171bef625b563c0c6dbba1ec2928
> Reviewed on upstream link https://review.gluster.org/#/c/glusterfs/+/21686/

BUG: 1573077
Change-Id: I7a7342c68e436e8bf65bd39c567512ee04abbcea
Signed-off-by: Mohit Agrawal <moagrawa@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/172709
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
Tested-by: RHGS Build Bot <nigelb@redhat.com>
---
 tests/bugs/posix/bug-1651445.t                  | 58 +++++++++++++++++++++++++
 xlators/mgmt/glusterd/src/glusterd-volume-set.c | 33 ++++++++++++++
 xlators/storage/posix/src/posix-common.c        | 34 ++++++++++++---
 xlators/storage/posix/src/posix-helpers.c       | 13 ++++--
 xlators/storage/posix/src/posix-inode-fd-ops.c  | 10 +++--
 xlators/storage/posix/src/posix.h               |  3 +-
 6 files changed, 138 insertions(+), 13 deletions(-)
 create mode 100644 tests/bugs/posix/bug-1651445.t

diff --git a/tests/bugs/posix/bug-1651445.t b/tests/bugs/posix/bug-1651445.t
new file mode 100644
index 0000000..f6f1833
--- /dev/null
+++ b/tests/bugs/posix/bug-1651445.t
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../snapshot.rc
+
+cleanup
+
+TEST verify_lvm_version
+TEST glusterd
+TEST pidof glusterd
+TEST init_n_bricks 3
+TEST setup_lvm 3
+
+TEST $CLI volume create $V0 replica 3 $H0:$L{1,2,3}
+TEST $CLI volume start $V0
+
+TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0
+
+TEST $CLI volume set $V0 storage.reserve-size 10MB
+
+#No effect as priority to reserve-size
+TEST $CLI volume set $V0 storage.reserve 20
+
+TEST dd if=/dev/zero of=$M0/a bs=100M count=1
+sleep 5
+
+#Below dd confirms posix is giving priority to reserve-size
+TEST dd if=/dev/zero of=$M0/b bs=40M count=1
+
+sleep 5
+TEST ! dd if=/dev/zero of=$M0/c bs=5M count=1
+
+rm -rf $M0/*
+#Size will reserve from the previously set reserve option = 20%
+TEST $CLI volume set $V0 storage.reserve-size 0
+
+#Overwrite reserve option
+TEST $CLI volume set $V0 storage.reserve-size 40MB
+
+#wait 5s to reset disk_space_full flag
+sleep 5
+
+TEST dd if=/dev/zero of=$M0/a bs=100M count=1
+TEST dd if=/dev/zero of=$M0/b bs=10M count=1
+
+# Wait 5s to update disk_space_full flag because thread check disk space
+# after every 5s
+
+sleep 5
+# setup_lvm create lvm partition of 150M and 40M are reserve so after
+# consuming more than 110M next dd should fail
+TEST ! dd if=/dev/zero of=$M0/c bs=5M count=1
+
+TEST $CLI volume stop $V0
+TEST $CLI volume delete $V0
+
+cleanup
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
index 7a83124..3a7ab83 100644
--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c
@@ -1231,6 +1231,30 @@ out:

     return ret;
 }
+static int
+validate_size(glusterd_volinfo_t *volinfo, dict_t *dict, char *key, char *value,
+              char **op_errstr)
+{
+    xlator_t *this = NULL;
+    uint64_t size = 0;
+    int ret = -1;
+
+    this = THIS;
+    GF_VALIDATE_OR_GOTO("glusterd", this, out);
+    ret = gf_string2bytesize_uint64(value, &size);
+    if (ret < 0) {
+        gf_asprintf(op_errstr,
+                    "%s is not a valid size. %s "
+                    "expects a valid value in bytes",
+                    value, key);
+        gf_msg(this->name, GF_LOG_ERROR, 0, GD_MSG_INVALID_ENTRY, "%s",
+               *op_errstr);
+    }
+out:
+    gf_msg_debug("glusterd", 0, "Returning %d", ret);
+
+    return ret;
+}

 /* dispatch table for VOLUME SET
  * -----------------------------
@@ -2830,6 +2854,15 @@ struct volopt_map_entry glusterd_volopt_map[] = {
         .op_version = GD_OP_VERSION_3_13_0,
     },
     {
+        .key = "storage.reserve-size",
+        .voltype = "storage/posix",
+        .value = "0",
+        .validate_fn = validate_size,
+        .description = "If set, priority will be given to "
+                       "storage.reserve-size over storage.reserve",
+        .op_version = GD_OP_VERSION_7_0,
+    },
+    {
         .option = "health-check-timeout",
         .key = "storage.health-check-timeout",
         .type = NO_DOC,
diff --git a/xlators/storage/posix/src/posix-common.c b/xlators/storage/posix/src/posix-common.c
index ed82e35..0f70af5 100644
--- a/xlators/storage/posix/src/posix-common.c
+++ b/xlators/storage/posix/src/posix-common.c
@@ -345,11 +345,18 @@ posix_reconfigure(xlator_t *this, dict_t *options)
               " fallback to <hostname>:<export>");
     }

-    GF_OPTION_RECONF("reserve", priv->disk_reserve, options, uint32, out);
-    if (priv->disk_reserve) {
+    GF_OPTION_RECONF("reserve-size", priv->disk_reserve_size, options, size,
+                     out);
+
+    GF_OPTION_RECONF("reserve", priv->disk_reserve_percent, options, uint32,
+                     out);
+    if (priv->disk_reserve_size || priv->disk_reserve_percent) {
         ret = posix_spawn_disk_space_check_thread(this);
-        if (ret)
+        if (ret) {
+            gf_msg(this->name, GF_LOG_INFO, 0, P_MSG_DISK_SPACE_CHECK_FAILED,
+                   "Getting disk space check from thread failed");
             goto out;
+        }
     }

     GF_OPTION_RECONF("health-check-interval", priv->health_check_interval,
@@ -968,11 +975,17 @@ posix_init(xlator_t *this)

     _private->disk_space_check_active = _gf_false;
     _private->disk_space_full = 0;
-    GF_OPTION_INIT("reserve", _private->disk_reserve, uint32, out);
-    if (_private->disk_reserve) {
+    GF_OPTION_INIT("reserve-size", _private->disk_reserve_size, size, out);
+
+    GF_OPTION_INIT("reserve", _private->disk_reserve_percent, uint32, out);
+
+    if (_private->disk_reserve_size || _private->disk_reserve_percent) {
         ret = posix_spawn_disk_space_check_thread(this);
-        if (ret)
+        if (ret) {
+            gf_msg(this->name, GF_LOG_INFO, 0, P_MSG_DISK_SPACE_CHECK_FAILED,
+                   "Getting disk space check from thread failed ");
             goto out;
+        }
     }

     _private->health_check_active = _gf_false;
@@ -1216,6 +1229,15 @@ struct volume_options posix_options[] = {
                     " Set to 0 to disable",
      .op_version = {GD_OP_VERSION_3_13_0},
      .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
+    {.key = {"reserve-size"},
+     .type = GF_OPTION_TYPE_SIZET,
+     .min = 0,
+     .default_value = "0",
+     .validate = GF_OPT_VALIDATE_MIN,
+     .description = "size in megabytes to be reserved for disk space."
+                    " Set to 0 to disable",
+     .op_version = {GD_OP_VERSION_7_0},
+     .flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
     {.key = {"batch-fsync-mode"},
      .type = GF_OPTION_TYPE_STR,
      .default_value = "reverse-fsync",
diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c
index d0fd45a..aecf4f8 100644
--- a/xlators/storage/posix/src/posix-helpers.c
+++ b/xlators/storage/posix/src/posix-helpers.c
@@ -2246,6 +2246,7 @@ posix_disk_space_check(xlator_t *this)
     struct posix_private *priv = NULL;
     char *subvol_path = NULL;
     int op_ret = 0;
+    uint64_t size = 0;
     int percent = 0;
     struct statvfs buf = {0};
     uint64_t totsz = 0;
@@ -2256,7 +2257,14 @@ posix_disk_space_check(xlator_t *this)
     GF_VALIDATE_OR_GOTO(this->name, priv, out);

     subvol_path = priv->base_path;
-    percent = priv->disk_reserve;
+
+    if (priv->disk_reserve_size) {
+        size = priv->disk_reserve_size;
+    } else {
+        percent = priv->disk_reserve_percent;
+        totsz = (buf.f_blocks * buf.f_bsize);
+        size = ((totsz * percent) / 100);
+    }

     op_ret = sys_statvfs(subvol_path, &buf);

@@ -2265,10 +2273,9 @@ posix_disk_space_check(xlator_t *this)
                "statvfs failed on %s", subvol_path);
         goto out;
     }
-    totsz = (buf.f_blocks * buf.f_bsize);
     freesz = (buf.f_bfree * buf.f_bsize);

-    if (freesz <= ((totsz * percent) / 100)) {
+    if (freesz <= size) {
         priv->disk_space_full = 1;
     } else {
         priv->disk_space_full = 0;
diff --git a/xlators/storage/posix/src/posix-inode-fd-ops.c b/xlators/storage/posix/src/posix-inode-fd-ops.c
index 2c19ce1..7ca4d26 100644
--- a/xlators/storage/posix/src/posix-inode-fd-ops.c
+++ b/xlators/storage/posix/src/posix-inode-fd-ops.c
@@ -720,7 +720,7 @@ posix_do_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
        thread after every 5 sec sleep to working correctly storage.reserve
        option behaviour
     */
-    if (priv->disk_reserve)
+    if (priv->disk_reserve_size || priv->disk_reserve_percent)
         posix_disk_space_check(this);

     DISK_SPACE_CHECK_AND_GOTO(frame, priv, xdata, ret, ret, out);
@@ -2331,8 +2331,12 @@ posix_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
         goto out;
     }

-    percent = priv->disk_reserve;
-    reserved_blocks = (buf.f_blocks * percent) / 100;
+    if (priv->disk_reserve_size) {
+        reserved_blocks = priv->disk_reserve_size / buf.f_bsize;
+    } else {
+        percent = priv->disk_reserve_percent;
+        reserved_blocks = (buf.f_blocks * percent) / 100;
+    }

     if (buf.f_bfree > reserved_blocks) {
         buf.f_bfree = (buf.f_bfree - reserved_blocks);
diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h
index 1da4d01..4364b96 100644
--- a/xlators/storage/posix/src/posix.h
+++ b/xlators/storage/posix/src/posix.h
@@ -225,7 +225,8 @@ struct posix_private {
     pthread_t health_check;
     gf_boolean_t health_check_active;

-    uint32_t disk_reserve;
+    uint32_t disk_reserve_percent;
+    uint64_t disk_reserve_size;
     uint32_t disk_space_full;
     pthread_t disk_space_check;
     gf_boolean_t disk_space_check_active;
--
1.8.3.1

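The decision the patch encodes in posix_disk_space_check() and posix_statfs() reduces to one rule: a non-zero absolute reserve-size wins over the percentage-based reserve. A standalone C sketch of that computation follows; reserve_size and reserve_percent are local stand-ins for priv->disk_reserve_size and priv->disk_reserve_percent, and the threshold is derived from statvfs output, so that call has to come first:

#include <stdio.h>
#include <stdint.h>
#include <sys/statvfs.h>

/* Compute the reserve threshold in bytes: an absolute reserve-size,
 * when set, takes priority over a percentage of the filesystem. */
static uint64_t
reserve_threshold(const struct statvfs *buf, uint64_t reserve_size,
                  uint32_t reserve_percent)
{
    if (reserve_size)
        return reserve_size; /* absolute size wins */
    /* fall back to a percentage of the total filesystem size */
    return ((uint64_t)buf->f_blocks * buf->f_bsize * reserve_percent) / 100;
}

int
main(void)
{
    struct statvfs buf;
    if (statvfs("/", &buf) != 0) {
        perror("statvfs");
        return 1;
    }
    uint64_t freesz = (uint64_t)buf.f_bfree * buf.f_bsize;
    /* e.g. reserve-size 10MB set, so a 20% reserve is ignored */
    uint64_t limit = reserve_threshold(&buf, 10 * 1024 * 1024, 20);
    printf("disk_space_full = %d\n", freesz <= limit);
    return 0;
}

The test script above exercises exactly this priority: once storage.reserve-size 10MB is set, the later storage.reserve 20 has no effect until reserve-size is reset to 0.
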
0172-ec-fini-Fix-race-with-ec_fini-and-ec_notify.patch (new file, 141 lines)
@@ -0,0 +1,141 @@
From 998d9b8b5e271f407e1c654c34f45f0db36abc71 Mon Sep 17 00:00:00 2001
From: Mohammed Rafi KC <rkavunga@redhat.com>
Date: Tue, 21 May 2019 17:15:07 +0530
Subject: [PATCH 172/178] ec/fini: Fix race with ec_fini and ec_notify

During a graph cleanup, we first sent a PARENT_DOWN and wait for
a child down to ultimately free the xlator and the graph.

In the ec xlator, we cleanup the threads when we get a PARENT_DOWN event.
But a racing event like CHILD_UP or event xl_op may trigger healing threads
after threads cleanup.

So there is a chance that the threads might access a freed private variabe

Upstream patch: https://review.gluster.org/#/c/glusterfs/+/22758/

>Change-Id: I252d10181bb67b95900c903d479de707a8489532
>fixes: bz#1703948
>Signed-off-by: Mohammed Rafi KC <rkavunga@redhat.com>

Change-Id: I84a10352d9fb3e68d4147b3791e3af45ab79050e
BUG: 1703434
Signed-off-by: Mohammed Rafi KC <rkavunga@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/172285
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
---
 libglusterfs/src/glusterfs/xlator.h |  3 +++
 libglusterfs/src/libglusterfs.sym   |  1 +
 libglusterfs/src/xlator.c           | 21 +++++++++++++++++++++
 xlators/cluster/ec/src/ec-heal.c    |  4 ++++
 xlators/cluster/ec/src/ec-heald.c   |  6 ++++++
 xlators/cluster/ec/src/ec.c         |  3 +++
 6 files changed, 38 insertions(+)

diff --git a/libglusterfs/src/glusterfs/xlator.h b/libglusterfs/src/glusterfs/xlator.h
index 8998976..09e463e 100644
--- a/libglusterfs/src/glusterfs/xlator.h
+++ b/libglusterfs/src/glusterfs/xlator.h
@@ -1092,4 +1092,7 @@ gluster_graph_take_reference(xlator_t *tree);

 gf_boolean_t
 mgmt_is_multiplexed_daemon(char *name);
+
+gf_boolean_t
+xlator_is_cleanup_starting(xlator_t *this);
 #endif /* _XLATOR_H */
diff --git a/libglusterfs/src/libglusterfs.sym b/libglusterfs/src/libglusterfs.sym
index ec474e7..7a2edef 100644
--- a/libglusterfs/src/libglusterfs.sym
+++ b/libglusterfs/src/libglusterfs.sym
@@ -1161,3 +1161,4 @@ glusterfs_process_svc_attach_volfp
 glusterfs_mux_volfile_reconfigure
 glusterfs_process_svc_detach
 mgmt_is_multiplexed_daemon
+xlator_is_cleanup_starting
diff --git a/libglusterfs/src/xlator.c b/libglusterfs/src/xlator.c
index 022c3ed..fbfbbe2 100644
--- a/libglusterfs/src/xlator.c
+++ b/libglusterfs/src/xlator.c
@@ -1486,3 +1486,24 @@ mgmt_is_multiplexed_daemon(char *name)
     }
     return _gf_false;
 }
+
+gf_boolean_t
+xlator_is_cleanup_starting(xlator_t *this)
+{
+    gf_boolean_t cleanup = _gf_false;
+    glusterfs_graph_t *graph = NULL;
+    xlator_t *xl = NULL;
+
+    if (!this)
+        goto out;
+    graph = this->graph;
+
+    if (!graph)
+        goto out;
+
+    xl = graph->first;
+    if (xl && xl->cleanup_starting)
+        cleanup = _gf_true;
+out:
+    return cleanup;
+}
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
index 2fa1f11..8844c29 100644
--- a/xlators/cluster/ec/src/ec-heal.c
+++ b/xlators/cluster/ec/src/ec-heal.c
@@ -2855,6 +2855,10 @@ ec_replace_brick_heal_wrap(void *opaque)
         itable = ec->xl->itable;
     else
         goto out;
+
+    if (xlator_is_cleanup_starting(ec->xl))
+        goto out;
+
     ret = ec_replace_heal(ec, itable->root);
 out:
     return ret;
diff --git a/xlators/cluster/ec/src/ec-heald.c b/xlators/cluster/ec/src/ec-heald.c
index edf5e11..91512d7 100644
--- a/xlators/cluster/ec/src/ec-heald.c
+++ b/xlators/cluster/ec/src/ec-heald.c
@@ -444,6 +444,9 @@ unlock:
 int
 ec_shd_full_healer_spawn(xlator_t *this, int subvol)
 {
+    if (xlator_is_cleanup_starting(this))
+        return -1;
+
     return ec_shd_healer_spawn(this, NTH_FULL_HEALER(this, subvol),
                                ec_shd_full_healer);
 }
@@ -451,6 +454,9 @@ ec_shd_full_healer_spawn(xlator_t *this, int subvol)
 int
 ec_shd_index_healer_spawn(xlator_t *this, int subvol)
 {
+    if (xlator_is_cleanup_starting(this))
+        return -1;
+
     return ec_shd_healer_spawn(this, NTH_INDEX_HEALER(this, subvol),
                                ec_shd_index_healer);
 }
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
index 264582a..df5912c 100644
--- a/xlators/cluster/ec/src/ec.c
+++ b/xlators/cluster/ec/src/ec.c
@@ -486,6 +486,9 @@ ec_set_up_state(ec_t *ec, uintptr_t index_mask, uintptr_t new_state)
 {
     uintptr_t current_state = 0;

+    if (xlator_is_cleanup_starting(ec->xl))
+        return _gf_false;
+
     if ((ec->xl_notify & index_mask) == 0) {
         ec->xl_notify |= index_mask;
         ec->xl_notify_count++;
--
1.8.3.1

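The guard added here is a flag-check-before-spawn pattern: once the first xlator in the graph marks cleanup_starting, every path that would launch new healer threads must refuse. A standalone C sketch of the idea, with an atomic flag and pthreads standing in for xl->cleanup_starting and the ec healer machinery:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the graph-wide cleanup_starting flag. */
static atomic_bool cleanup_starting = false;

static void *
healer(void *arg)
{
    (void)arg;
    puts("healer running against live private data");
    return NULL;
}

static int
healer_spawn(pthread_t *tid)
{
    if (atomic_load(&cleanup_starting))
        return -1; /* refuse work that could outlive the xlator */
    return pthread_create(tid, NULL, healer, NULL);
}

int
main(void)
{
    pthread_t tid;
    if (healer_spawn(&tid) == 0)
        pthread_join(tid, NULL);

    atomic_store(&cleanup_starting, true); /* PARENT_DOWN received */
    if (healer_spawn(&tid) != 0)
        puts("spawn refused: cleanup already starting");
    return 0;
}

Note that the check alone still leaves a window between the test and the spawn; patch 0175 below closes that window by counting in-flight async fops.
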
@@ -0,0 +1,52 @@
From c001b60047c73e07f42ee858dd8ae19136ecd61b Mon Sep 17 00:00:00 2001
From: Ravishankar N <ravishankar@redhat.com>
Date: Thu, 6 Jun 2019 13:19:29 +0530
Subject: [PATCH 173/178] glusterd: store fips-mode-rchecksum option in the
 info file

commit 146e4b45d0ce906ae50fd6941a1efafd133897ea enabled
storage.fips-mode-rchecksum option for all new volumes with op-version
>=GD_OP_VERSION_7_0 but `gluster vol get $volname
storage.fips-mode-rchecksum` was displaying it as 'off'. This patch fixes it.

>upstream patch link : https://review.gluster.org/#/c/glusterfs/+/22830/

>fixes: bz#1717782
>Change-Id: Ie09f89838893c5776a3f60569dfe8d409d1494dd
>Signed-off-by: Ravishankar N <ravishankar@redhat.com>

BUG: 1715407
Change-Id: Ie09f89838893c5776a3f60569dfe8d409d1494dd
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/172799
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
---
 xlators/mgmt/glusterd/src/glusterd-utils.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
index 2bc4836..7768b8e 100644
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
@@ -13124,6 +13124,17 @@ glusterd_enable_default_options(glusterd_volinfo_t *volinfo, char *option)
             }
         }
     }
+    if (conf->op_version >= GD_OP_VERSION_7_0) {
+        ret = dict_set_dynstr_with_alloc(volinfo->dict,
+                                         "storage.fips-mode-rchecksum", "on");
+        if (ret) {
+            gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_DICT_SET_FAILED,
+                   "Failed to set option 'storage.fips-mode-rchecksum' "
+                   "on volume %s",
+                   volinfo->volname);
+            goto out;
+        }
+    }
 out:
     return ret;
 }
--
1.8.3.1

@@ -0,0 +1,53 @@
From 9b94397a5a735910fab2a29670146a1feb6d890e Mon Sep 17 00:00:00 2001
From: Mohammed Rafi KC <rkavunga@redhat.com>
Date: Tue, 4 Jun 2019 11:13:50 +0530
Subject: [PATCH 174/178] xlator/log: Add more logging in
 xlator_is_cleanup_starting

This patch will add two extra logs for invalid argument

> upstream patch : https://review.gluster.org/#/c/glusterfs/+/22810/

>Change-Id: I3950b4f4b9d88b1f1e788ef93d8f09d4bd8d4d8b
>updates: bz#1703948
>Signed-off-by: Mohammed Rafi KC <rkavunga@redhat.com>

Change-Id: I3950b4f4b9d88b1f1e788ef93d8f09d4bd8d4d8b
BUG: 1703434
Signed-off-by: Mohammed Rafi KC <rkavunga@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/172800
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
---
 libglusterfs/src/xlator.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/libglusterfs/src/xlator.c b/libglusterfs/src/xlator.c
index fbfbbe2..71e1ed4 100644
--- a/libglusterfs/src/xlator.c
+++ b/libglusterfs/src/xlator.c
@@ -1494,12 +1494,18 @@ xlator_is_cleanup_starting(xlator_t *this)
     glusterfs_graph_t *graph = NULL;
     xlator_t *xl = NULL;

-    if (!this)
+    if (!this) {
+        gf_msg("xlator", GF_LOG_WARNING, EINVAL, LG_MSG_INVALID_ARG,
+               "xlator object is null, returning false");
         goto out;
-    graph = this->graph;
+    }

-    if (!graph)
+    graph = this->graph;
+    if (!graph) {
+        gf_msg("xlator", GF_LOG_WARNING, EINVAL, LG_MSG_INVALID_ARG,
+               "Graph is not set for xlator %s", this->name);
         goto out;
+    }

     xl = graph->first;
     if (xl && xl->cleanup_starting)
--
1.8.3.1

0175-ec-fini-Fix-race-between-xlator-cleanup-and-on-going.patch (new file, 241 lines)
@@ -0,0 +1,241 @@
From 9fd966aa6879ac9867381629f82eca24b950d731 Mon Sep 17 00:00:00 2001
From: Mohammed Rafi KC <rkavunga@redhat.com>
Date: Sun, 2 Jun 2019 01:36:33 +0530
Subject: [PATCH 175/178] ec/fini: Fix race between xlator cleanup and on going
 async fop

Problem:
While we process a cleanup, there is a chance for a race between
async operations, for example ec_launch_replace_heal. So this can
lead to invalid mem access.

Solution:
Just like we track on going heal fops, we can also track fops like
ec_launch_replace_heal, so that we can decide when to send a
PARENT_DOWN request.

> upstream patch : https://review.gluster.org/#/c/glusterfs/+/22798/

>Change-Id: I055391c5c6c34d58aef7336847f3b570cb831298
>fixes: bz#1703948
>Signed-off-by: Mohammed Rafi KC <rkavunga@redhat.com>

Change-Id: I055391c5c6c34d58aef7336847f3b570cb831298
BUG: 1714588
Signed-off-by: Mohammed Rafi KC <rkavunga@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/172801
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
---
 xlators/cluster/ec/src/ec-common.c | 10 ++++++++++
 xlators/cluster/ec/src/ec-common.h |  2 ++
 xlators/cluster/ec/src/ec-data.c   |  4 +++-
 xlators/cluster/ec/src/ec-heal.c   | 17 +++++++++++++++--
 xlators/cluster/ec/src/ec-types.h  |  1 +
 xlators/cluster/ec/src/ec.c        | 37 +++++++++++++++++++++++++------------
 6 files changed, 56 insertions(+), 15 deletions(-)

diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
index e85aa8b..9cc6395 100644
--- a/xlators/cluster/ec/src/ec-common.c
+++ b/xlators/cluster/ec/src/ec-common.c
@@ -2955,3 +2955,13 @@ ec_manager(ec_fop_data_t *fop, int32_t error)

     __ec_manager(fop, error);
 }
+
+gf_boolean_t
+__ec_is_last_fop(ec_t *ec)
+{
+    if ((list_empty(&ec->pending_fops)) &&
+        (GF_ATOMIC_GET(ec->async_fop_count) == 0)) {
+        return _gf_true;
+    }
+    return _gf_false;
+}
diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h
index e948342..bf6c97d 100644
--- a/xlators/cluster/ec/src/ec-common.h
+++ b/xlators/cluster/ec/src/ec-common.h
@@ -204,4 +204,6 @@ void
 ec_reset_entry_healing(ec_fop_data_t *fop);
 char *
 ec_msg_str(ec_fop_data_t *fop);
+gf_boolean_t
+__ec_is_last_fop(ec_t *ec);
 #endif /* __EC_COMMON_H__ */
diff --git a/xlators/cluster/ec/src/ec-data.c b/xlators/cluster/ec/src/ec-data.c
index 6ef9340..8d2d9a1 100644
--- a/xlators/cluster/ec/src/ec-data.c
+++ b/xlators/cluster/ec/src/ec-data.c
@@ -202,11 +202,13 @@ ec_handle_last_pending_fop_completion(ec_fop_data_t *fop, gf_boolean_t *notify)
 {
     ec_t *ec = fop->xl->private;

+    *notify = _gf_false;
+
     if (!list_empty(&fop->pending_list)) {
         LOCK(&ec->lock);
         {
             list_del_init(&fop->pending_list);
-            *notify = list_empty(&ec->pending_fops);
+            *notify = __ec_is_last_fop(ec);
         }
         UNLOCK(&ec->lock);
     }
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
index 8844c29..237fea2 100644
--- a/xlators/cluster/ec/src/ec-heal.c
+++ b/xlators/cluster/ec/src/ec-heal.c
@@ -2814,8 +2814,20 @@ int
 ec_replace_heal_done(int ret, call_frame_t *heal, void *opaque)
 {
     ec_t *ec = opaque;
+    gf_boolean_t last_fop = _gf_false;

+    if (GF_ATOMIC_DEC(ec->async_fop_count) == 0) {
+        LOCK(&ec->lock);
+        {
+            last_fop = __ec_is_last_fop(ec);
+        }
+        UNLOCK(&ec->lock);
+    }
     gf_msg_debug(ec->xl->name, 0, "getxattr on bricks is done ret %d", ret);
+
+    if (last_fop)
+        ec_pending_fops_completed(ec);
+
     return 0;
 }

@@ -2869,14 +2881,15 @@ ec_launch_replace_heal(ec_t *ec)
 {
     int ret = -1;

-    if (!ec)
-        return ret;
     ret = synctask_new(ec->xl->ctx->env, ec_replace_brick_heal_wrap,
                        ec_replace_heal_done, NULL, ec);
+
     if (ret < 0) {
         gf_msg_debug(ec->xl->name, 0, "Heal failed for replace brick ret = %d",
                      ret);
+        ec_replace_heal_done(-1, NULL, ec);
     }
+
     return ret;
 }

diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
index 1c295c0..4dbf4a3 100644
--- a/xlators/cluster/ec/src/ec-types.h
+++ b/xlators/cluster/ec/src/ec-types.h
@@ -643,6 +643,7 @@ struct _ec {
     uintptr_t xl_notify;  /* Bit flag representing
                              notification for bricks. */
     uintptr_t node_mask;
+    gf_atomic_t async_fop_count; /* Number of on going asynchronous fops. */
     xlator_t **xl_list;
     gf_lock_t lock;
     gf_timer_t *timer;
diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c
index df5912c..f0d58c0 100644
--- a/xlators/cluster/ec/src/ec.c
+++ b/xlators/cluster/ec/src/ec.c
@@ -355,6 +355,7 @@ ec_notify_cbk(void *data)
     ec_t *ec = data;
     glusterfs_event_t event = GF_EVENT_MAXVAL;
     gf_boolean_t propagate = _gf_false;
+    gf_boolean_t launch_heal = _gf_false;

     LOCK(&ec->lock);
     {
@@ -384,6 +385,11 @@ ec_notify_cbk(void *data)
              * still bricks DOWN, they will be healed when they
              * come up. */
             ec_up(ec->xl, ec);
+
+            if (ec->shd.iamshd && !ec->shutdown) {
+                launch_heal = _gf_true;
+                GF_ATOMIC_INC(ec->async_fop_count);
+            }
         }

         propagate = _gf_true;
@@ -391,13 +397,12 @@ ec_notify_cbk(void *data)
 unlock:
     UNLOCK(&ec->lock);

+    if (launch_heal) {
+        /* We have just brought the volume UP, so we trigger
+         * a self-heal check on the root directory. */
+        ec_launch_replace_heal(ec);
+    }
     if (propagate) {
-        if ((event == GF_EVENT_CHILD_UP) && ec->shd.iamshd) {
-            /* We have just brought the volume UP, so we trigger
-             * a self-heal check on the root directory. */
-            ec_launch_replace_heal(ec);
-        }
-
         default_notify(ec->xl, event, NULL);
     }
 }
@@ -425,7 +430,7 @@ ec_disable_delays(ec_t *ec)
 {
     ec->shutdown = _gf_true;

-    return list_empty(&ec->pending_fops);
+    return __ec_is_last_fop(ec);
 }

 void
@@ -603,7 +608,10 @@ ec_notify(xlator_t *this, int32_t event, void *data, void *data2)
         if (event == GF_EVENT_CHILD_UP) {
             /* We need to trigger a selfheal if a brick changes
              * to UP state. */
-            needs_shd_check = ec_set_up_state(ec, mask, mask);
+            if (ec_set_up_state(ec, mask, mask) && ec->shd.iamshd &&
+                !ec->shutdown) {
+                needs_shd_check = _gf_true;
+            }
         } else if (event == GF_EVENT_CHILD_DOWN) {
             ec_set_up_state(ec, mask, 0);
         }
@@ -633,17 +641,21 @@ ec_notify(xlator_t *this, int32_t event, void *data, void *data2)
             }
         } else {
             propagate = _gf_false;
+            needs_shd_check = _gf_false;
+        }
+
+        if (needs_shd_check) {
+            GF_ATOMIC_INC(ec->async_fop_count);
         }
     }
 unlock:
     UNLOCK(&ec->lock);

 done:
+    if (needs_shd_check) {
+        ec_launch_replace_heal(ec);
+    }
     if (propagate) {
-        if (needs_shd_check && ec->shd.iamshd) {
-            ec_launch_replace_heal(ec);
-        }
-
         error = default_notify(this, event, data);
     }

@@ -705,6 +717,7 @@ init(xlator_t *this)
     ec->xl = this;
     LOCK_INIT(&ec->lock);

+    GF_ATOMIC_INIT(ec->async_fop_count, 0);
     INIT_LIST_HEAD(&ec->pending_fops);
     INIT_LIST_HEAD(&ec->heal_waiting);
     INIT_LIST_HEAD(&ec->healing);
--
1.8.3.1

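The accounting this patch adds reads as a small protocol: increment an atomic counter before an async fop is launched, decrement it in the fop's done-callback (including on failure paths), and let teardown proceed only when both the pending-fop list and the counter have drained. A standalone C sketch under those assumptions, where atomic_int and a condition variable stand in for ec->async_fop_count and the PARENT_DOWN notification:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int async_fop_count;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;

static void
fop_done(void)
{
    if (atomic_fetch_sub(&async_fop_count, 1) == 1) {
        /* last async fop: safe to notify the teardown path now */
        pthread_mutex_lock(&lock);
        pthread_cond_signal(&drained);
        pthread_mutex_unlock(&lock);
    }
}

static void *
replace_heal(void *arg)
{
    (void)arg;
    puts("replace-brick heal running");
    fop_done(); /* also invoked on the failure path, as in the patch */
    return NULL;
}

int
main(void)
{
    pthread_t tid;
    atomic_fetch_add(&async_fop_count, 1); /* count BEFORE launching */
    pthread_create(&tid, NULL, replace_heal, NULL);

    pthread_mutex_lock(&lock);
    while (atomic_load(&async_fop_count) != 0)
        pthread_cond_wait(&drained, &lock);
    pthread_mutex_unlock(&lock);
    puts("all async fops drained; cleanup may proceed");
    return pthread_join(tid, NULL);
}

Counting before the launch rather than inside the worker is the essential ordering: it closes the window in which cleanup could see a zero count while a just-launched fop has not yet started.
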
0176-features-shard-Fix-crash-during-background-shard-del.patch (new file, 289 lines)
@@ -0,0 +1,289 @@
From 40ac42501d6bbff7206e753e8e988beefe74f5f4 Mon Sep 17 00:00:00 2001
From: Krutika Dhananjay <kdhananj@redhat.com>
Date: Fri, 5 Apr 2019 10:30:23 +0530
Subject: [PATCH 176/178] features/shard: Fix crash during background shard
 deletion in a specific case

Consider the following case -
1. A file gets FALLOCATE'd such that > "shard-lru-limit" number of
   shards are created.
2. And then it is deleted after that.

The unique thing about FALLOCATE is that unlike WRITE, all of the
participant shards are resolved and created and fallocated in a single
batch. This means, in this case, after the first "shard-lru-limit"
number of shards are resolved and added to lru list, as part of
resolution of the remaining shards, some of the existing shards in lru
list will need to be evicted. So these evicted shards will be
inode_unlink()d as part of eviction. Now once the fop gets to the actual
FALLOCATE stage, the lru'd-out shards get added to fsync list.

2 things to note at this point:
i. the lru'd out shards are only part of fsync list, so each holds 1 ref
   on base shard
ii. and the more recently used shards are part of both fsync and lru list.
    So each of these shards holds 2 refs on base inode - one for being
    part of fsync list, and the other for being part of lru list.

FALLOCATE completes successfully and then this very file is deleted, and
background shard deletion launched. Here's where the ref counts get mismatched.
First as part of inode_resolve()s during the deletion, the lru'd-out inodes
return NULL, because they are inode_unlink()'d by now. So these inodes need to
be freshly looked up. But as part of linking them in lookup_cbk (precisely in
shard_link_block_inode()), inode_link() returns the lru'd-out inode object.
And its inode ctx is still valid and ctx->base_inode valid from the last
time it was added to list.

But shard_common_lookup_shards_cbk() passes NULL in the place of base_pointer
to __shard_update_shards_inode_list(). This means, as part of adding the lru'd out
inode back to lru list, base inode is not ref'd since its NULL.

Whereas post unlinking this shard, during shard_unlink_block_inode(),
ctx->base_inode is accessible and is unref'd because the shard was found to be part
of LRU list, although the matching ref didn't occur. This at some point leads to
base_inode refcount becoming 0 and it getting destroyed and released back while some
of its associated shards are continuing to be unlinked in parallel and the client crashes
whenever it is accessed next.

Fix is to pass base shard correctly, if available, in shard_link_block_inode().

Also, the patch fixes the ret value check in tests/bugs/shard/shard-fallocate.c

>Change-Id: Ibd0bc4c6952367608e10701473cbad3947d7559f
>Updates: bz#1696136
>Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>

Upstream Patch: https://review.gluster.org/#/c/glusterfs/+/22507/

BUG: 1694595
Change-Id: Ibd0bc4c6952367608e10701473cbad3947d7559f
Signed-off-by: Sunil Kumar Acharya <sheggodu@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/172856
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
---
 tests/bugs/shard/bug-1696136.c     | 121 +++++++++++++++++++++++++++++++++++++
 tests/bugs/shard/bug-1696136.t     |  33 ++++++++++
 tests/bugs/shard/shard-fallocate.c |   2 +-
 xlators/features/shard/src/shard.c |  12 +++-
 4 files changed, 164 insertions(+), 4 deletions(-)
 create mode 100644 tests/bugs/shard/bug-1696136.c
 create mode 100644 tests/bugs/shard/bug-1696136.t

diff --git a/tests/bugs/shard/bug-1696136.c b/tests/bugs/shard/bug-1696136.c
new file mode 100644
index 0000000..b9e8d13
--- /dev/null
+++ b/tests/bugs/shard/bug-1696136.c
@@ -0,0 +1,121 @@
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <glusterfs/api/glfs.h>
+#include <glusterfs/api/glfs-handles.h>
+
+enum fallocate_flag {
+    TEST_FALLOCATE_NONE,
+    TEST_FALLOCATE_KEEP_SIZE,
+    TEST_FALLOCATE_ZERO_RANGE,
+    TEST_FALLOCATE_PUNCH_HOLE,
+    TEST_FALLOCATE_MAX,
+};
+
+int
+get_fallocate_flag(int opcode)
+{
+    int ret = 0;
+
+    switch (opcode) {
+        case TEST_FALLOCATE_NONE:
+            ret = 0;
+            break;
+        case TEST_FALLOCATE_KEEP_SIZE:
+            ret = FALLOC_FL_KEEP_SIZE;
+            break;
+        case TEST_FALLOCATE_ZERO_RANGE:
+            ret = FALLOC_FL_ZERO_RANGE;
+            break;
+        case TEST_FALLOCATE_PUNCH_HOLE:
+            ret = FALLOC_FL_PUNCH_HOLE;
+            break;
+        default:
+            ret = -1;
+            break;
+    }
+    return ret;
+}
+
+int
+main(int argc, char *argv[])
+{
+    int ret = 1;
+    int opcode = -1;
+    off_t offset = 0;
+    size_t len = 0;
+    glfs_t *fs = NULL;
+    glfs_fd_t *fd = NULL;
+
+    if (argc != 8) {
+        fprintf(stderr,
+                "Syntax: %s <host> <volname> <opcode> <offset> <len> "
+                "<file-path> <log-file>\n",
+                argv[0]);
+        return 1;
+    }
+
+    fs = glfs_new(argv[2]);
+    if (!fs) {
+        fprintf(stderr, "glfs_new: returned NULL\n");
+        return 1;
+    }
+
+    ret = glfs_set_volfile_server(fs, "tcp", argv[1], 24007);
+    if (ret != 0) {
+        fprintf(stderr, "glfs_set_volfile_server: returned %d\n", ret);
+        goto out;
+    }
+
+    ret = glfs_set_logging(fs, argv[7], 7);
+    if (ret != 0) {
+        fprintf(stderr, "glfs_set_logging: returned %d\n", ret);
+        goto out;
+    }
+
+    ret = glfs_init(fs);
+    if (ret != 0) {
+        fprintf(stderr, "glfs_init: returned %d\n", ret);
+        goto out;
+    }
+
+    opcode = atoi(argv[3]);
+    opcode = get_fallocate_flag(opcode);
+    if (opcode < 0) {
+        fprintf(stderr, "get_fallocate_flag: invalid flag \n");
+        goto out;
+    }
+
+    offset = atoi(argv[4]);
+    len = atoi(argv[5]);
+
+    fd = glfs_open(fs, argv[6], O_RDWR);
+    if (fd == NULL) {
+        fprintf(stderr, "glfs_open: returned NULL\n");
+        goto out;
+    }
+
+    ret = glfs_fallocate(fd, opcode, offset, len);
+    if (ret < 0) {
+        fprintf(stderr, "glfs_fallocate: returned %d\n", ret);
+        goto out;
+    }
+
+    ret = glfs_unlink(fs, argv[6]);
+    if (ret < 0) {
+        fprintf(stderr, "glfs_unlink: returned %d\n", ret);
+        goto out;
+    }
+    /* Sleep for 3s to give enough time for background deletion to complete
+     * during which if the bug exists, the process will crash.
+     */
+    sleep(3);
+    ret = 0;
+
+out:
+    if (fd)
+        glfs_close(fd);
+    glfs_fini(fs);
+    return ret;
+}
diff --git a/tests/bugs/shard/bug-1696136.t b/tests/bugs/shard/bug-1696136.t
new file mode 100644
index 0000000..b6dc858
--- /dev/null
+++ b/tests/bugs/shard/bug-1696136.t
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../fallocate.rc
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 features.shard on
+TEST $CLI volume set $V0 features.shard-block-size 4MB
+TEST $CLI volume set $V0 features.shard-lru-limit 120
+TEST $CLI volume set $V0 performance.write-behind off
+TEST $CLI volume start $V0
+
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
+
+TEST build_tester $(dirname $0)/bug-1696136.c -lgfapi -Wall -O2
+
+# Create a file
+TEST touch $M0/file1
+
+# Fallocate a 500M file. This will make sure number of participant shards are > lru-limit
+TEST $(dirname $0)/bug-1696136 $H0 $V0 "0" "0" "536870912" /file1 `gluster --print-logdir`/glfs-$V0.log
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+TEST $CLI volume stop $V0
+TEST $CLI volume delete $V0
+rm -f $(dirname $0)/bug-1696136
+
+cleanup
diff --git a/tests/bugs/shard/shard-fallocate.c b/tests/bugs/shard/shard-fallocate.c
index 3a784d3..45b9ce0 100644
--- a/tests/bugs/shard/shard-fallocate.c
+++ b/tests/bugs/shard/shard-fallocate.c
@@ -97,7 +97,7 @@ main(int argc, char *argv[])
     }

     ret = glfs_fallocate(fd, opcode, offset, len);
-    if (ret <= 0) {
+    if (ret < 0) {
         fprintf(stderr, "glfs_fallocate: returned %d\n", ret);
         goto out;
     }
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
index fa3564a..3c4bcdc 100644
--- a/xlators/features/shard/src/shard.c
+++ b/xlators/features/shard/src/shard.c
@@ -2213,13 +2213,19 @@ shard_link_block_inode(shard_local_t *local, int block_num, inode_t *inode,
     xlator_t *this = NULL;
     inode_t *fsync_inode = NULL;
     shard_priv_t *priv = NULL;
+    inode_t *base_inode = NULL;

     this = THIS;
     priv = this->private;
-    if (local->loc.inode)
+    if (local->loc.inode) {
         gf_uuid_copy(gfid, local->loc.inode->gfid);
-    else
+        base_inode = local->loc.inode;
+    } else if (local->resolver_base_inode) {
+        gf_uuid_copy(gfid, local->resolver_base_inode->gfid);
+        base_inode = local->resolver_base_inode;
+    } else {
         gf_uuid_copy(gfid, local->base_gfid);
+    }

     shard_make_block_bname(block_num, gfid, block_bname, sizeof(block_bname));

@@ -2232,7 +2238,7 @@ shard_link_block_inode(shard_local_t *local, int block_num, inode_t *inode,
     LOCK(&priv->lock);
     {
         fsync_inode = __shard_update_shards_inode_list(
-            linked_inode, this, local->loc.inode, block_num, gfid);
+            linked_inode, this, base_inode, block_num, gfid);
     }
     UNLOCK(&priv->lock);
     if (fsync_inode)
--
1.8.3.1

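The invariant the commit message walks through is that every list membership (lru list, fsync list) must hold exactly one reference on the base inode, and every removal must drop exactly one. A toy refcount in standalone C makes the arithmetic from the description concrete; inode_t here is a local stand-in, not the glusterfs inode table:

#include <assert.h>
#include <stdio.h>

typedef struct { int refs; } inode_t;

static inode_t *
inode_ref(inode_t *in)
{
    if (in)
        in->refs++;
    return in;
}

static void
inode_unref(inode_t *in)
{
    if (!in)
        return;
    assert(in->refs > 0); /* the crash described above tripped exactly this */
    if (--in->refs == 0)
        puts("base inode destroyed");
}

int
main(void)
{
    inode_t base = {1};  /* one ref held by the ongoing fop itself */
    inode_ref(&base);    /* a shard joins the lru list: +1 */
    inode_ref(&base);    /* the same shard joins the fsync list: +1 */

    inode_unref(&base);  /* removed from fsync list: -1 */
    inode_unref(&base);  /* evicted from lru list: -1 */
    inode_unref(&base);  /* fop completes: refs balanced back to 0 */
    return 0;
}

The bug was a membership (lru list) whose matching ref was skipped because the base pointer arrived as NULL, so the eventual unref on removal pushed the count past zero while other shards were still being unlinked.
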
0177-features-shard-Fix-extra-unref-when-inode-object-is-.patch (new file, 161 lines)
@@ -0,0 +1,161 @@
From 4f0aa008ed393d7ce222c4ea4bd0fa6ed52b48f6 Mon Sep 17 00:00:00 2001
From: Krutika Dhananjay <kdhananj@redhat.com>
Date: Fri, 5 Apr 2019 12:29:23 +0530
Subject: [PATCH 177/178] features/shard: Fix extra unref when inode object is
 lru'd out and added back

Long tale of double unref! But do read...

In cases where a shard base inode is evicted from lru list while still
being part of fsync list but added back soon before its unlink, there
could be an extra inode_unref() leading to premature inode destruction
leading to crash.

One such specific case is the following -

Consider features.shard-deletion-rate = features.shard-lru-limit = 2.
This is an oversimplified example but explains the problem clearly.

First, a file is FALLOCATE'd to a size so that number of shards under
/.shard = 3 > lru-limit.
Shards 1, 2 and 3 need to be resolved. 1 and 2 are resolved first.
Resultant lru list:
        1 -----> 2
refs on base inode - (1) + (1) = 2
3 needs to be resolved. So 1 is lru'd out. Resultant lru list -
        2 -----> 3
refs on base inode - (1) + (1) = 2

Note that 1 is inode_unlink()d but not destroyed because there are
non-zero refs on it since it is still participating in this ongoing
FALLOCATE operation.

FALLOCATE is sent on all participant shards. In the cbk, all of them are
added to fync_list.
Resulting fsync list -
        1 -----> 2 -----> 3 (order doesn't matter)
refs on base inode - (1) + (1) + (1) = 3
Total refs = 3 + 2 = 5

Now an attempt is made to unlink this file. Background deletion is triggered.
The first $shard-deletion-rate shards need to be unlinked in the first batch.
So shards 1 and 2 need to be resolved. inode_resolve fails on 1 but succeeds
on 2 and so it's moved to tail of list.
lru list now -
        3 -----> 2
No change in refs.

shard 1 is looked up. In lookup_cbk, it's linked and added back to lru list
at the cost of evicting shard 3.
lru list now -
        2 -----> 1
refs on base inode: (1) + (1) = 2
fsync list now -
        1 -----> 2 (again order doesn't matter)
refs on base inode - (1) + (1) = 2
Total refs = 2 + 2 = 4
After eviction, it is found 3 needs fsync. So fsync is wound, yet to be ack'd.
So it is still inode_link()d.

Now deletion of shards 1 and 2 completes. lru list is empty. Base inode unref'd and
destroyed.
In the next batched deletion, 3 needs to be deleted. It is inode_resolve()able.
It is added back to lru list but base inode passed to __shard_update_shards_inode_list()
is NULL since the inode is destroyed. But its ctx->inode still contains base inode ptr
from first addition to lru list for no additional ref on it.
lru list now -
        3
refs on base inode - (0)
Total refs on base inode = 0
Unlink is sent on 3. It completes. Now since the ctx contains ptr to base_inode and the
shard is part of lru list, base shard is unref'd leading to a crash.

FIX:
When shard is readded back to lru list, copy the base inode pointer as is into its inode ctx,
even if it is NULL. This is needed to prevent double unrefs at the time of deleting it.

Upstream patch:
> BUG: 1696136
> Upstream patch link: https://review.gluster.org/c/glusterfs/+/22517
> Change-Id: I99a44039da2e10a1aad183e84f644d63ca552462
> Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>

Change-Id: I99a44039da2e10a1aad183e84f644d63ca552462
Updates: bz#1694595
Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/172803
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
---
 .../bug-1696136-lru-limit-equals-deletion-rate.t   | 34 ++++++++++++++++++++++
 xlators/features/shard/src/shard.c                 |  6 ++--
 2 files changed, 36 insertions(+), 4 deletions(-)
 create mode 100644 tests/bugs/shard/bug-1696136-lru-limit-equals-deletion-rate.t

diff --git a/tests/bugs/shard/bug-1696136-lru-limit-equals-deletion-rate.t b/tests/bugs/shard/bug-1696136-lru-limit-equals-deletion-rate.t
new file mode 100644
index 0000000..3e4a65a
--- /dev/null
+++ b/tests/bugs/shard/bug-1696136-lru-limit-equals-deletion-rate.t
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../fallocate.rc
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 features.shard on
+TEST $CLI volume set $V0 features.shard-block-size 4MB
+TEST $CLI volume set $V0 features.shard-lru-limit 120
+TEST $CLI volume set $V0 features.shard-deletion-rate 120
+TEST $CLI volume set $V0 performance.write-behind off
+TEST $CLI volume start $V0
+
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
+
+TEST build_tester $(dirname $0)/bug-1696136.c -lgfapi -Wall -O2
+
+# Create a file
+TEST touch $M0/file1
+
+# Fallocate a 500M file. This will make sure number of participant shards are > lru-limit
+TEST $(dirname $0)/bug-1696136 $H0 $V0 "0" "0" "536870912" /file1 `gluster --print-logdir`/glfs-$V0.log
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+TEST $CLI volume stop $V0
+TEST $CLI volume delete $V0
+rm -f $(dirname $0)/bug-1696136
+
+cleanup
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
index 3c4bcdc..c1799ad 100644
--- a/xlators/features/shard/src/shard.c
+++ b/xlators/features/shard/src/shard.c
@@ -689,8 +689,7 @@ __shard_update_shards_inode_list(inode_t *linked_inode, xlator_t *this,
             ctx->block_num = block_num;
             list_add_tail(&ctx->ilist, &priv->ilist_head);
             priv->inode_count++;
-            if (base_inode)
-                ctx->base_inode = inode_ref(base_inode);
+            ctx->base_inode = inode_ref(base_inode);
         } else {
             /*If on the other hand there is no available slot for this inode
              * in the list, delete the lru inode from the head of the list,
@@ -765,8 +764,7 @@ __shard_update_shards_inode_list(inode_t *linked_inode, xlator_t *this,
             else
                 gf_uuid_copy(ctx->base_gfid, gfid);
             ctx->block_num = block_num;
-            if (base_inode)
-                ctx->base_inode = inode_ref(base_inode);
+            ctx->base_inode = inode_ref(base_inode);
             list_add_tail(&ctx->ilist, &priv->ilist_head);
         }
     } else {
--
1.8.3.1

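The one-line fix (dropping the `if (base_inode)` guard) keeps ref/unref pairing symmetric even when the base inode pointer is NULL: assuming inode_ref() tolerates NULL, which the patch relies on, storing whatever was passed, NULL included, guarantees the later unconditional unref always has a matching ref. A standalone sketch contrasting the broken conditional store with the fixed unconditional one; the types are local stand-ins:

#include <stdio.h>

typedef struct { int refs; } inode_t;
typedef struct { inode_t *base_inode; } shard_ctx_t;

static inode_t *
inode_ref(inode_t *in)
{
    if (in)
        in->refs++;
    return in; /* NULL-safe, so callers may ref unconditionally */
}

static void
inode_unref(inode_t *in)
{
    if (in && --in->refs == 0)
        puts("base inode destroyed");
}

/* Broken: skips the ref when base is NULL but leaves the stale pointer
 * from an earlier addition in place, so the later unref is unmatched. */
static void
readd_to_lru_broken(shard_ctx_t *ctx, inode_t *base)
{
    if (base)
        ctx->base_inode = inode_ref(base);
}

/* Fixed (what the patch does): store the pointer as-is, even NULL. */
static void
readd_to_lru_fixed(shard_ctx_t *ctx, inode_t *base)
{
    ctx->base_inode = inode_ref(base);
}

int
main(void)
{
    inode_t base = {1};        /* one legitimate ref held elsewhere */
    shard_ctx_t ctx = {&base}; /* stale pointer from the first lru add */

    readd_to_lru_broken(&ctx, NULL);
    inode_unref(ctx.base_inode); /* unmatched unref: destroys too early */

    base.refs = 1;
    ctx.base_inode = &base;
    readd_to_lru_fixed(&ctx, NULL);
    inode_unref(ctx.base_inode); /* NULL: no-op, refcount stays intact */
    return 0;
}
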
0178-Cluster-afr-Don-t-treat-all-bricks-having-metadata-p.patch (new file, 287 lines)
@ -0,0 +1,287 @@
|
||||
From 307074330db6e9f14941dfbabbe6f299cf841533 Mon Sep 17 00:00:00 2001
|
||||
From: karthik-us <ksubrahm@redhat.com>
|
||||
Date: Mon, 10 Jun 2019 23:58:16 +0530
|
||||
Subject: [PATCH 178/178] Cluster/afr: Don't treat all bricks having metadata
|
||||
pending as split-brain
|
||||
|
||||
Backport of: https://review.gluster.org/#/c/glusterfs/+/22831/
|
||||
|
||||
Problem:
|
||||
We currently don't have a roll-back/undoing of post-ops if quorum is not met.
|
||||
Though the FOP is still unwound with failure, the xattrs remain on the disk.
|
||||
Due to these partial post-ops and partial heals (healing only when 2 bricks
|
||||
are up), we can end up in metadata split-brain purely from the afr xattrs
|
||||
point of view i.e each brick is blamed by atleast one of the others for
|
||||
metadata. These scenarios are hit when there is frequent connect/disconnect
|
||||
of the client/shd to the bricks.
|
||||
|
||||
Fix:
|
||||
Pick a source based on the xattr values. If 2 bricks blame one, the blamed
|
||||
one must be treated as sink. If there is no majority, all are sources. Once
|
||||
we pick a source, self-heal will then do the heal instead of erroring out
|
||||
due to split-brain.
|
||||
This patch also adds restriction of all the bricks to be up to perform
|
||||
metadata heal to avoid any metadata loss.
|
||||
|
||||
Removed the test case tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t
|
||||
as it was doing metadata heal even when only 2 of 3 bricks were up.
|
||||
|
||||
Change-Id: I02064ecb7d68d498f75a353af64f75249a633508
|
||||
fixes: bz#1715438
|
||||
Signed-off-by: karthik-us <ksubrahm@redhat.com>
|
||||
Reviewed-on: https://code.engineering.redhat.com/gerrit/172935
|
||||
Tested-by: RHGS Build Bot <nigelb@redhat.com>
|
||||
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
|
||||
---
|
||||
.../bug-1468279-source-not-blaming-sinks.t | 64 ----------
|
||||
.../bug-1717819-metadata-split-brain-detection.t | 130 +++++++++++++++++++++
|
||||
xlators/cluster/afr/src/afr-self-heal-common.c | 4 +-
|
||||
xlators/cluster/afr/src/afr-self-heal-metadata.c | 2 +-
|
||||
4 files changed, 133 insertions(+), 67 deletions(-)
|
||||
delete mode 100644 tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t
|
||||
create mode 100644 tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t
|
||||
|
||||
diff --git a/tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t b/tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t
|
||||
deleted file mode 100644
|
||||
index 054a4ad..0000000
|
||||
--- a/tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t
|
||||
+++ /dev/null
|
||||
@@ -1,64 +0,0 @@
|
||||
-#!/bin/bash
-. $(dirname $0)/../../include.rc
-. $(dirname $0)/../../volume.rc
-cleanup;
-
-TEST glusterd
-TEST pidof glusterd
-TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
-TEST $CLI volume start $V0
-TEST $CLI volume set $V0 cluster.self-heal-daemon off
-TEST $CLI volume set $V0 cluster.metadata-self-heal off
-TEST $GFS --volfile-id=$V0 --volfile-server=$H0 --attribute-timeout=0 --entry-timeout=0 $M0;
-TEST touch $M0/file
-
-# Kill B1, create a pending metadata heal.
-TEST kill_brick $V0 $H0 $B0/${V0}0
-TEST setfattr -n user.xattr -v value1 $M0/file
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}1/file
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}2/file
-
-# Kill B2, heal from B3 to B1.
-TEST $CLI volume start $V0 force
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
-TEST kill_brick $V0 $H0 $B0/${V0}1
-TEST $CLI volume set $V0 cluster.self-heal-daemon on
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
-$CLI volume heal $V0
-EXPECT_WITHIN $HEAL_TIMEOUT "00000000" afr_get_specific_changelog_xattr $B0/${V0}2/file trusted.afr.$V0-client-0 "metadata"
-TEST $CLI volume set $V0 cluster.self-heal-daemon off
-
-# Create another pending metadata heal.
-TEST setfattr -n user.xattr -v value2 $M0/file
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}0/file
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}2/file
-
-# Kill B1, heal from B3 to B2
-TEST $CLI volume start $V0 force
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1
-TEST kill_brick $V0 $H0 $B0/${V0}0
-TEST $CLI volume set $V0 cluster.self-heal-daemon on
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
-$CLI volume heal $V0
-EXPECT_WITHIN $HEAL_TIMEOUT "00000000" afr_get_specific_changelog_xattr $B0/${V0}2/file trusted.afr.$V0-client-1 "metadata"
-TEST $CLI volume set $V0 cluster.self-heal-daemon off
-
-# ALL bricks up again.
-TEST $CLI volume start $V0 force
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1
-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
-# B1 and B2 blame each other, B3 doesn't blame anyone.
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}0/file
-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}1/file
-EXPECT "0000000000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}2/file
-EXPECT "0000000000000000000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}2/file
-TEST $CLI volume set $V0 cluster.self-heal-daemon on
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
-TEST $CLI volume heal $V0
-EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
-
-cleanup;
diff --git a/tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t b/tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t
new file mode 100644
index 0000000..94b8bf3
--- /dev/null
+++ b/tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t
@@ -0,0 +1,130 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../afr.rc
+
+cleanup;
+
+## Start and create a volume
+TEST glusterd;
+TEST pidof glusterd;
+TEST $CLI volume info;
+
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2};
+TEST $CLI volume start $V0;
+EXPECT 'Started' volinfo_field $V0 'Status';
+TEST $CLI volume heal $V0 disable
+TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0
+
+###############################################################################
+# Case of 2 bricks blaming the third and the third blaming the other two.
+
+TEST mkdir $M0/dir
+
+# B0 and B2 must blame B1
+TEST kill_brick $V0 $H0 $B0/$V0"1"
+TEST setfattr -n user.metadata -v 1 $M0/dir
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}0/dir trusted.afr.$V0-client-1 metadata
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}2/dir trusted.afr.$V0-client-1 metadata
+CLIENT_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $M0/dir)
+
+# B1 must blame B0 and B2
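+# (Assumed encoding: the trusted.afr.* value packs three network-order
+# 32-bit counters - data, metadata, entry - so 0x000000000000000100000000
+# below records one pending *metadata* operation, i.e. B1 blaming the
+# named client for metadata.)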
+setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000100000000 $B0/$V0"1"/dir
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"1"/dir
+
+# Launch heal
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}1
+TEST $CLI volume heal $V0 enable
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^Y$" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 2
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+B0_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}0/dir)
+B1_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}1/dir)
+B2_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}2/dir)
+
+TEST [ "$CLIENT_XATTR" == "$B0_XATTR" ]
+TEST [ "$CLIENT_XATTR" == "$B1_XATTR" ]
+TEST [ "$CLIENT_XATTR" == "$B2_XATTR" ]
+TEST setfattr -x user.metadata $M0/dir
+
+###############################################################################
+# Case of each brick blaming the next one in a cyclic manner
+
+TEST $CLI volume heal $V0 disable
+TEST `echo "hello" >> $M0/dir/file`
+# Mark cyclic xattrs and modify metadata directly on the bricks.
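+# With cyclic blame no brick is accused by a majority, so all bricks are
+# treated as sources and heal proceeds instead of flagging split-brain.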
+setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000100000000 $B0/$V0"0"/dir/file
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"1"/dir/file
+setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000100000000 $B0/$V0"2"/dir/file
+
+setfattr -n user.metadata -v 1 $B0/$V0"0"/dir/file
+setfattr -n user.metadata -v 2 $B0/$V0"1"/dir/file
+setfattr -n user.metadata -v 3 $B0/$V0"2"/dir/file
+
+# Add entry to xattrop dir to trigger index heal.
+xattrop_dir0=$(afr_get_index_path $B0/$V0"0")
+base_entry_b0=`ls $xattrop_dir0`
+gfid_str=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/$V0"0"/dir/file))
+ln $xattrop_dir0/$base_entry_b0 $xattrop_dir0/$gfid_str
+EXPECT_WITHIN $HEAL_TIMEOUT "^1$" get_pending_heal_count $V0
+
+# Launch heal
+TEST $CLI volume heal $V0 enable
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+
+B0_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}0/dir/file)
+B1_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}1/dir/file)
+B2_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}2/dir/file)
+
+TEST [ "$B0_XATTR" == "$B1_XATTR" ]
+TEST [ "$B0_XATTR" == "$B2_XATTR" ]
+TEST rm -f $M0/dir/file
+
+###############################################################################
+# Case of 2 bricks having quorum blaming and the other having only one blaming.
+
+TEST $CLI volume heal $V0 disable
+TEST `echo "hello" >> $M0/dir/file`
+# B0 and B2 must blame B1
+TEST kill_brick $V0 $H0 $B0/$V0"1"
+TEST setfattr -n user.metadata -v 1 $M0/dir/file
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}0/dir/file trusted.afr.$V0-client-1 metadata
+EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}2/dir/file trusted.afr.$V0-client-1 metadata
+
+# B1 must blame B0 and B2
+setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000100000000 $B0/$V0"1"/dir/file
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"1"/dir/file
+
+# B0 must blame B2
+setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"0"/dir/file
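+# B1 and B2 are now each blamed by the other two bricks, so both become
+# sinks; B0, blamed only by B1, is picked as the lone source.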
+
+# Modify the metadata directly on the bricks B1 & B2.
+setfattr -n user.metadata -v 2 $B0/$V0"1"/dir/file
+setfattr -n user.metadata -v 3 $B0/$V0"2"/dir/file
+
+# Launch heal
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}1
+TEST $CLI volume heal $V0 enable
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^Y$" glustershd_up_status
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 1
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 2
+
+B0_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}0/dir/file)
+B1_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}1/dir/file)
+B2_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}2/dir/file)
+
+TEST [ "$B0_XATTR" == "$B1_XATTR" ]
+TEST [ "$B0_XATTR" == "$B2_XATTR" ]
+
+###############################################################################
+
+cleanup
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 595bed4..5157e7d 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -1590,7 +1590,7 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
         }
     }
 
-    if (type == AFR_DATA_TRANSACTION) {
+    if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION) {
         min_participants = priv->child_count;
     } else {
         min_participants = AFR_SH_MIN_PARTICIPANTS;
@@ -1656,7 +1656,7 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
         }
     }
 
-    if (type == AFR_DATA_TRANSACTION)
+    if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION)
         afr_selfheal_post_op_failure_accounting(priv, accused, sources,
                                                 locked_on);
 
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index ba43341..ecfa791 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -398,7 +398,7 @@ afr_selfheal_metadata(call_frame_t *frame, xlator_t *this, inode_t *inode)
     ret = afr_selfheal_inodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0,
                                data_lock);
     {
-        if (ret < AFR_SH_MIN_PARTICIPANTS) {
+        if (ret < priv->child_count) {
             ret = -ENOTCONN;
             goto unlock;
         }
--
1.8.3.1

@@ -231,7 +231,7 @@ Release: 0.1%{?prereltag:.%{prereltag}}%{?dist}
 %else
 Name: glusterfs
 Version: 6.0
-Release: 4%{?dist}
+Release: 5%{?dist}
 ExcludeArch: i686
 %endif
 License: GPLv2 or LGPLv3+
@@ -475,6 +475,15 @@ Patch0166: 0166-glusterd-svc-glusterd_svcs_stop-should-call-individu.patch
 Patch0167: 0167-glusterd-shd-Optimize-the-glustershd-manager-to-send.patch
 Patch0168: 0168-cluster-dht-Fix-directory-perms-during-selfheal.patch
 Patch0169: 0169-Build-Fix-spec-to-enable-rhel8-client-build.patch
+Patch0170: 0170-geo-rep-Convert-gfid-conflict-resolutiong-logs-into-.patch
+Patch0171: 0171-posix-add-storage.reserve-size-option.patch
+Patch0172: 0172-ec-fini-Fix-race-with-ec_fini-and-ec_notify.patch
+Patch0173: 0173-glusterd-store-fips-mode-rchecksum-option-in-the-inf.patch
+Patch0174: 0174-xlator-log-Add-more-logging-in-xlator_is_cleanup_sta.patch
+Patch0175: 0175-ec-fini-Fix-race-between-xlator-cleanup-and-on-going.patch
+Patch0176: 0176-features-shard-Fix-crash-during-background-shard-del.patch
+Patch0177: 0177-features-shard-Fix-extra-unref-when-inode-object-is-.patch
+Patch0178: 0178-Cluster-afr-Don-t-treat-all-bricks-having-metadata-p.patch
 
 %description
 GlusterFS is a distributed file-system capable of scaling to several
@@ -2176,9 +2185,13 @@ fi
 %endif
 
 %changelog
+* Tue Jun 11 2019 Sunil Kumar Acharya <sheggodu@redhat.com> - 6.0-5
+- fixes bugs bz#1573077 bz#1694595 bz#1703434 bz#1714536 bz#1714588
+  bz#1715407 bz#1715438 bz#1705018
+
 * Fri Jun 07 2019 Rinku Kothiya <rkothiya@redhat.com> - 6.0-4
-- fixes bugs bz#1480907 bz#1702298 bz#1703455 bz#1704181 bz#1704562
-  bz#1707246 bz#1708067 bz#1708116 bz#1708121 bz#1709087 bz#1711249 bz#1711296
+- fixes bugs bz#1480907 bz#1702298 bz#1703455 bz#1704181 bz#1707246
+  bz#1708067 bz#1708116 bz#1708121 bz#1709087 bz#1711249 bz#1711296
+  bz#1714078 bz#1714124 bz#1716385 bz#1716626 bz#1716821 bz#1716865 bz#1717927
 
 * Tue May 14 2019 Rinku Kothiya <rkothiya@redhat.com> - 6.0-3