autobuild v3.12.2-15
Resolves: bz#1589279 bz#1598384 bz#1599362 bz#1599998 bz#1600790 Resolves: bz#1601331 bz#1603103 Signed-off-by: Milind Changire <mchangir@redhat.com>
This commit is contained in:
parent
0820681560
commit
299302574e
138
0326-glusterd-memory-leak-in-get-state.patch
Normal file
138
0326-glusterd-memory-leak-in-get-state.patch
Normal file
@ -0,0 +1,138 @@
|
||||
From eadd7e7168349705b29bc6ae9f99ba3e6ae58060 Mon Sep 17 00:00:00 2001
|
||||
From: Sanju Rakonde <srakonde@redhat.com>
|
||||
Date: Mon, 16 Jul 2018 15:59:36 +0530
|
||||
Subject: [PATCH 326/333] glusterd: memory leak in get-state
|
||||
|
||||
Problem: The gluster get-state command leaks memory when a
|
||||
geo-replication session is configured.
|
||||
|
||||
Cause: In glusterd_print_gsync_status(), we are trying to get
|
||||
reference to the keys of gsync_dict. The references to keys of
|
||||
gsync_dict are stored in status_vals[i]. status_vals[i] are
|
||||
allocated with a memory of size of gf_gsync_status_t.
|
||||
|
||||
Solution: There is no need to use an array of pointers (status_vals); using
|
||||
a pointer to hold the reference to a key of gsync_dict is sufficient.
|
||||
|
||||
Followed the below steps for testing:
|
||||
1. Configured geo-rep session
|
||||
2. Ran gluster get-state command for 1000 times.
|
||||
|
||||
Without this patch, glusterd's memory was increasing significantly
|
||||
(around 22000KB per 1000 times), with this patch it reduced (1500KB
|
||||
per 1000 times)
|
||||
|
||||
>fixes: bz#1601423
|
||||
>Change-Id: I361f5525d71f821bb345419ccfdc20ca288ca292
|
||||
>Signed-off-by: Sanju Rakonde <srakonde@redhat.com>
|
||||
|
||||
upstream patch: https://review.gluster.org/#/c/20521/
|
||||
|
||||
Change-Id: I361f5525d71f821bb345419ccfdc20ca288ca292
|
||||
BUG: 1599362
|
||||
Signed-off-by: Sanju Rakonde <srakonde@redhat.com>
|
||||
Reviewed-on: https://code.engineering.redhat.com/gerrit/144325
|
||||
Tested-by: RHGS Build Bot <nigelb@redhat.com>
|
||||
Reviewed-by: Mohit Agrawal <moagrawa@redhat.com>
|
||||
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
|
||||
---
|
||||
xlators/mgmt/glusterd/src/glusterd-handler.c | 53 ++++++++++------------------
|
||||
1 file changed, 19 insertions(+), 34 deletions(-)
|
||||
|
||||
diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c
|
||||
index 395b342..861ff17 100644
|
||||
--- a/xlators/mgmt/glusterd/src/glusterd-handler.c
|
||||
+++ b/xlators/mgmt/glusterd/src/glusterd-handler.c
|
||||
@@ -5082,7 +5082,7 @@ glusterd_print_gsync_status (FILE *fp, dict_t *gsync_dict)
|
||||
int ret = -1;
|
||||
int gsync_count = 0;
|
||||
int i = 0;
|
||||
- gf_gsync_status_t **status_vals = NULL;
|
||||
+ gf_gsync_status_t *status_vals = NULL;
|
||||
char status_val_name[PATH_MAX] = {0,};
|
||||
|
||||
GF_VALIDATE_OR_GOTO (THIS->name, fp, out);
|
||||
@@ -5097,62 +5097,47 @@ glusterd_print_gsync_status (FILE *fp, dict_t *gsync_dict)
|
||||
goto out;
|
||||
}
|
||||
|
||||
- status_vals = GF_CALLOC (gsync_count, sizeof (gf_gsync_status_t *),
|
||||
- gf_common_mt_char);
|
||||
- if (!status_vals) {
|
||||
- ret = -1;
|
||||
- goto out;
|
||||
- }
|
||||
- for (i = 0; i < gsync_count; i++) {
|
||||
- status_vals[i] = GF_CALLOC (1, sizeof (gf_gsync_status_t),
|
||||
- gf_common_mt_char);
|
||||
- if (!status_vals[i]) {
|
||||
- ret = -1;
|
||||
- goto out;
|
||||
- }
|
||||
- }
|
||||
-
|
||||
for (i = 0; i < gsync_count; i++) {
|
||||
snprintf (status_val_name, sizeof(status_val_name), "status_value%d", i);
|
||||
|
||||
- ret = dict_get_bin (gsync_dict, status_val_name, (void **)&(status_vals[i]));
|
||||
+ ret = dict_get_bin (gsync_dict, status_val_name, (void **)&(status_vals));
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
fprintf (fp, "Volume%d.pair%d.session_slave: %s\n", volcount, i+1,
|
||||
- get_struct_variable(21, status_vals[i]));
|
||||
+ get_struct_variable(21, status_vals));
|
||||
fprintf (fp, "Volume%d.pair%d.master_node: %s\n", volcount, i+1,
|
||||
- get_struct_variable(0, status_vals[i]));
|
||||
+ get_struct_variable(0, status_vals));
|
||||
fprintf (fp, "Volume%d.pair%d.master_volume: %s\n", volcount, i+1,
|
||||
- get_struct_variable(1, status_vals[i]));
|
||||
+ get_struct_variable(1, status_vals));
|
||||
fprintf (fp, "Volume%d.pair%d.master_brick: %s\n", volcount, i+1,
|
||||
- get_struct_variable(2, status_vals[i]));
|
||||
+ get_struct_variable(2, status_vals));
|
||||
fprintf (fp, "Volume%d.pair%d.slave_user: %s\n", volcount, i+1,
|
||||
- get_struct_variable(3, status_vals[i]));
|
||||
+ get_struct_variable(3, status_vals));
|
||||
fprintf (fp, "Volume%d.pair%d.slave: %s\n", volcount, i+1,
|
||||
- get_struct_variable(4, status_vals[i]));
|
||||
+ get_struct_variable(4, status_vals));
|
||||
fprintf (fp, "Volume%d.pair%d.slave_node: %s\n", volcount, i+1,
|
||||
- get_struct_variable(5, status_vals[i]));
|
||||
+ get_struct_variable(5, status_vals));
|
||||
fprintf (fp, "Volume%d.pair%d.status: %s\n", volcount, i+1,
|
||||
- get_struct_variable(6, status_vals[i]));
|
||||
+ get_struct_variable(6, status_vals));
|
||||
fprintf (fp, "Volume%d.pair%d.crawl_status: %s\n", volcount, i+1,
|
||||
- get_struct_variable(7, status_vals[i]));
|
||||
+ get_struct_variable(7, status_vals));
|
||||
fprintf (fp, "Volume%d.pair%d.last_synced: %s\n", volcount, i+1,
|
||||
- get_struct_variable(8, status_vals[i]));
|
||||
+ get_struct_variable(8, status_vals));
|
||||
fprintf (fp, "Volume%d.pair%d.entry: %s\n", volcount, i+1,
|
||||
- get_struct_variable(9, status_vals[i]));
|
||||
+ get_struct_variable(9, status_vals));
|
||||
fprintf (fp, "Volume%d.pair%d.data: %s\n", volcount, i+1,
|
||||
- get_struct_variable(10, status_vals[i]));
|
||||
+ get_struct_variable(10, status_vals));
|
||||
fprintf (fp, "Volume%d.pair%d.meta: %s\n", volcount, i+1,
|
||||
- get_struct_variable(11, status_vals[i]));
|
||||
+ get_struct_variable(11, status_vals));
|
||||
fprintf (fp, "Volume%d.pair%d.failures: %s\n", volcount, i+1,
|
||||
- get_struct_variable(12, status_vals[i]));
|
||||
+ get_struct_variable(12, status_vals));
|
||||
fprintf (fp, "Volume%d.pair%d.checkpoint_time: %s\n", volcount,
|
||||
- i+1, get_struct_variable(13, status_vals[i]));
|
||||
+ i+1, get_struct_variable(13, status_vals));
|
||||
fprintf (fp, "Volume%d.pair%d.checkpoint_completed: %s\n",
|
||||
- volcount, i+1, get_struct_variable(14, status_vals[i]));
|
||||
+ volcount, i+1, get_struct_variable(14, status_vals));
|
||||
fprintf (fp, "Volume%d.pair%d.checkpoint_completion_time: %s\n",
|
||||
- volcount, i+1, get_struct_variable(15, status_vals[i]));
|
||||
+ volcount, i+1, get_struct_variable(15, status_vals));
|
||||
}
|
||||
out:
|
||||
return ret;
|
||||
--
|
||||
1.8.3.1
|
||||
|
102
0327-afr-switch-lk_owner-only-when-pre-op-succeeds.patch
Normal file
102
0327-afr-switch-lk_owner-only-when-pre-op-succeeds.patch
Normal file
@ -0,0 +1,102 @@
|
||||
From fef5fb73545bed5a4040db1f8e4e855286c1981d Mon Sep 17 00:00:00 2001
|
||||
From: Ravishankar N <ravishankar@redhat.com>
|
||||
Date: Wed, 18 Jul 2018 14:16:46 +0530
|
||||
Subject: [PATCH 327/333] afr: switch lk_owner only when pre-op succeeds
|
||||
|
||||
Backport of https://review.gluster.org/#/c/20527/
|
||||
|
||||
Problem:
|
||||
In a disk full scenario, we take a failure path in afr_transaction_perform_fop()
|
||||
and go to unlock phase. But we change the lk-owner before that, causing unlock
|
||||
to fail. When mount issues another fop that takes locks on that file, it hangs.
|
||||
|
||||
Fix:
|
||||
Change lk-owner only when we are about to perform the fop phase.
|
||||
Also fix the same issue for arbiters when afr_txn_arbitrate_fop() fails the fop.
|
||||
|
||||
Also removed the DISK_SPACE_CHECK_AND_GOTO in posix_xattrop. Otherwise truncate
|
||||
to zero will fail pre-op phase with ENOSPC when the user is actually trying to
|
||||
free up space.
|
||||
|
||||
Change-Id: I8663003fa7d472e93fe61cc1e39c78084d3de81f
|
||||
BUG: 1599998
|
||||
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
|
||||
Reviewed-on: https://code.engineering.redhat.com/gerrit/144275
|
||||
Tested-by: RHGS Build Bot <nigelb@redhat.com>
|
||||
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
|
||||
---
|
||||
xlators/cluster/afr/src/afr-transaction.c | 20 ++++++++++----------
|
||||
xlators/storage/posix/src/posix.c | 5 -----
|
||||
2 files changed, 10 insertions(+), 15 deletions(-)
|
||||
|
||||
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
|
||||
index 321b6f1..3f55070 100644
|
||||
--- a/xlators/cluster/afr/src/afr-transaction.c
|
||||
+++ b/xlators/cluster/afr/src/afr-transaction.c
|
||||
@@ -495,11 +495,10 @@ afr_txn_arbitrate_fop (call_frame_t *frame, xlator_t *this)
|
||||
local->op_errno = ENOTCONN;
|
||||
for (i = 0; i < priv->child_count; i++)
|
||||
local->transaction.failed_subvols[i] = 1;
|
||||
- afr_changelog_post_op (frame, this);/*uninherit should happen*/
|
||||
- } else {
|
||||
- afr_transaction_fop (frame, this);
|
||||
}
|
||||
|
||||
+ afr_transaction_fop (frame, this);
|
||||
+
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -529,13 +528,6 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)
|
||||
local->transaction.failed_subvols[i] = 1;
|
||||
}
|
||||
}
|
||||
- /* Perform fops with the lk-owner from top xlator.
|
||||
- * Eg: lk-owner of posix-lk and flush should be same,
|
||||
- * flush cant clear the posix-lks without that lk-owner.
|
||||
- */
|
||||
- afr_save_lk_owner (frame);
|
||||
- frame->root->lk_owner =
|
||||
- local->transaction.main_frame->root->lk_owner;
|
||||
|
||||
if (local->pre_op_compat)
|
||||
/* old mode, pre-op was done as afr_changelog_do()
|
||||
@@ -561,6 +553,14 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)
|
||||
}
|
||||
|
||||
fop:
|
||||
+ /* Perform fops with the lk-owner from top xlator.
|
||||
+ * Eg: lk-owner of posix-lk and flush should be same,
|
||||
+ * flush cant clear the posix-lks without that lk-owner.
|
||||
+ */
|
||||
+ afr_save_lk_owner (frame);
|
||||
+ frame->root->lk_owner =
|
||||
+ local->transaction.main_frame->root->lk_owner;
|
||||
+
|
||||
if (priv->arbiter_count == 1) {
|
||||
afr_txn_arbitrate_fop (frame, this);
|
||||
} else {
|
||||
diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c
|
||||
index 01f472b..ddb875c 100644
|
||||
--- a/xlators/storage/posix/src/posix.c
|
||||
+++ b/xlators/storage/posix/src/posix.c
|
||||
@@ -6147,16 +6147,11 @@ do_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
|
||||
dict_t *xattr_rsp = NULL;
|
||||
dict_t *xdata_rsp = NULL;
|
||||
struct iatt stbuf = {0};
|
||||
- struct posix_private *priv = NULL;
|
||||
-
|
||||
|
||||
VALIDATE_OR_GOTO (frame, out);
|
||||
VALIDATE_OR_GOTO (xattr, out);
|
||||
VALIDATE_OR_GOTO (this, out);
|
||||
|
||||
- priv = this->private;
|
||||
- DISK_SPACE_CHECK_AND_GOTO (frame, priv, xdata, op_ret, op_errno, out);
|
||||
-
|
||||
if (fd) {
|
||||
op_ret = posix_fd_ctx_get (fd, this, &pfd, &op_errno);
|
||||
if (op_ret < 0) {
|
||||
--
|
||||
1.8.3.1
|
||||
|
455
0328-geo-rep-Fix-issues-with-gfid-conflict-handling.patch
Normal file
455
0328-geo-rep-Fix-issues-with-gfid-conflict-handling.patch
Normal file
@ -0,0 +1,455 @@
|
||||
From a9db68fc1f05639cb79defef6ed7da58572113ea Mon Sep 17 00:00:00 2001
|
||||
From: Kotresh HR <khiremat@redhat.com>
|
||||
Date: Thu, 5 Jul 2018 07:07:38 -0400
|
||||
Subject: [PATCH 328/333] geo-rep: Fix issues with gfid conflict handling
|
||||
|
||||
1. MKDIR/RMDIR is recorded on all bricks. So if
|
||||
one brick succeeds creating it, other bricks
|
||||
should ignore it. But this was not happening.
|
||||
The fix for rename of directories in hybrid crawl,
|
||||
was trying to rename the directory to itself
|
||||
and in the process crashing with ENOENT if the
|
||||
directory is removed.
|
||||
|
||||
2. If file is created, deleted and a directory is
|
||||
created with same name, it was failing to sync.
|
||||
Again the issue is around the fix for rename
|
||||
of directories in hybrid crawl. Fixed the same.
|
||||
|
||||
If the same case was done with hardlink present
|
||||
for the file, it was failing. This patch fixes
|
||||
that too.
|
||||
|
||||
Backport of
|
||||
> Patch: https://review.gluster.org/#/c/20473/
|
||||
> fixes: bz#1598884
|
||||
> Change-Id: I6f3bca44e194e415a3d4de3b9d03cc8976439284
|
||||
> Signed-off-by: Kotresh HR <khiremat@redhat.com>
|
||||
|
||||
BUG: 1598384
|
||||
Change-Id: I6f3bca44e194e415a3d4de3b9d03cc8976439284
|
||||
Signed-off-by: Kotresh HR <khiremat@redhat.com>
|
||||
Reviewed-on: https://code.engineering.redhat.com/gerrit/143400
|
||||
Tested-by: RHGS Build Bot <nigelb@redhat.com>
|
||||
Reviewed-by: Aravinda Vishwanathapura Krishna Murthy <avishwan@redhat.com>
|
||||
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
|
||||
---
|
||||
geo-replication/syncdaemon/master.py | 157 ++++++++++++++++++++++---------
|
||||
geo-replication/syncdaemon/resource.py | 57 ++++++-----
|
||||
geo-replication/syncdaemon/syncdutils.py | 35 +++++++
|
||||
3 files changed, 180 insertions(+), 69 deletions(-)
|
||||
|
||||
diff --git a/geo-replication/syncdaemon/master.py b/geo-replication/syncdaemon/master.py
|
||||
index 64e9836..1399378 100644
|
||||
--- a/geo-replication/syncdaemon/master.py
|
||||
+++ b/geo-replication/syncdaemon/master.py
|
||||
@@ -692,7 +692,8 @@ class GMasterChangelogMixin(GMasterCommon):
|
||||
TYPE_GFID = "D "
|
||||
TYPE_ENTRY = "E "
|
||||
|
||||
- MAX_EF_RETRIES = 15
|
||||
+ MAX_EF_RETRIES = 10
|
||||
+ MAX_OE_RETRIES = 5
|
||||
|
||||
# flat directory hierarchy for gfid based access
|
||||
FLAT_DIR_HIERARCHY = '.'
|
||||
@@ -788,38 +789,53 @@ class GMasterChangelogMixin(GMasterCommon):
|
||||
|
||||
self.status.inc_value("failures", num_failures)
|
||||
|
||||
- def fix_possible_entry_failures(self, failures, retry_count):
|
||||
+ def fix_possible_entry_failures(self, failures, retry_count, entries):
|
||||
pfx = gauxpfx()
|
||||
fix_entry_ops = []
|
||||
failures1 = []
|
||||
for failure in failures:
|
||||
- if failure[2]['dst']:
|
||||
+ if failure[2]['name_mismatch']:
|
||||
+ pbname = failure[2]['slave_entry']
|
||||
+ elif failure[2]['dst']:
|
||||
pbname = failure[0]['entry1']
|
||||
else:
|
||||
pbname = failure[0]['entry']
|
||||
- if failure[2]['gfid_mismatch']:
|
||||
+
|
||||
+ op = failure[0]['op']
|
||||
+ # name exists but gfid is different
|
||||
+ if failure[2]['gfid_mismatch'] or failure[2]['name_mismatch']:
|
||||
slave_gfid = failure[2]['slave_gfid']
|
||||
st = lstat(os.path.join(pfx, slave_gfid))
|
||||
+ # Takes care of scenarios with no hardlinks
|
||||
if isinstance(st, int) and st == ENOENT:
|
||||
- logging.info(lf('Fixing gfid mismatch in slave. Deleting'
|
||||
- ' the entry', retry_count=retry_count,
|
||||
+ logging.info(lf('Entry not present on master. Fixing gfid '
|
||||
+ 'mismatch in slave. Deleting the entry',
|
||||
+ retry_count=retry_count,
|
||||
entry=repr(failure)))
|
||||
- #Add deletion to fix_entry_ops list
|
||||
+ # Add deletion to fix_entry_ops list
|
||||
if failure[2]['slave_isdir']:
|
||||
- fix_entry_ops.append(edct('RMDIR',
|
||||
- gfid=failure[2]['slave_gfid'],
|
||||
- entry=pbname))
|
||||
+ fix_entry_ops.append(
|
||||
+ edct('RMDIR',
|
||||
+ gfid=failure[2]['slave_gfid'],
|
||||
+ entry=pbname))
|
||||
else:
|
||||
- fix_entry_ops.append(edct('UNLINK',
|
||||
- gfid=failure[2]['slave_gfid'],
|
||||
- entry=pbname))
|
||||
+ fix_entry_ops.append(
|
||||
+ edct('UNLINK',
|
||||
+ gfid=failure[2]['slave_gfid'],
|
||||
+ entry=pbname))
|
||||
+ # Takes care of scenarios of hardlinks/renames on master
|
||||
elif not isinstance(st, int):
|
||||
- #The file exists on master but with different name.
|
||||
- #Probabaly renamed and got missed during xsync crawl.
|
||||
- if failure[2]['slave_isdir']:
|
||||
- logging.info(lf('Fixing gfid mismatch in slave',
|
||||
+ if matching_disk_gfid(slave_gfid, pbname):
|
||||
+ # Safe to ignore the failure as master contains same
|
||||
+ # file with same gfid. Remove entry from entries list
|
||||
+ logging.info(lf('Fixing gfid mismatch in slave. '
|
||||
+ ' Safe to ignore, take out entry',
|
||||
retry_count=retry_count,
|
||||
entry=repr(failure)))
|
||||
+ entries.remove(failure[0])
|
||||
+ # The file exists on master but with different name.
|
||||
+ # Probably renamed and got missed during xsync crawl.
|
||||
+ elif failure[2]['slave_isdir']:
|
||||
realpath = os.readlink(os.path.join(gconf.local_path,
|
||||
".glusterfs",
|
||||
slave_gfid[0:2],
|
||||
@@ -827,64 +843,99 @@ class GMasterChangelogMixin(GMasterCommon):
|
||||
slave_gfid))
|
||||
dst_entry = os.path.join(pfx, realpath.split('/')[-2],
|
||||
realpath.split('/')[-1])
|
||||
- rename_dict = edct('RENAME', gfid=slave_gfid,
|
||||
- entry=failure[0]['entry'],
|
||||
- entry1=dst_entry, stat=st,
|
||||
- link=None)
|
||||
- logging.info(lf('Fixing gfid mismatch in slave. '
|
||||
- 'Renaming', retry_count=retry_count,
|
||||
- entry=repr(rename_dict)))
|
||||
- fix_entry_ops.append(rename_dict)
|
||||
+ src_entry = pbname
|
||||
+ logging.info(lf('Fixing dir name/gfid mismatch in '
|
||||
+ 'slave', retry_count=retry_count,
|
||||
+ entry=repr(failure)))
|
||||
+ if src_entry == dst_entry:
|
||||
+ # Safe to ignore the failure as master contains
|
||||
+ # same directory as in slave with same gfid.
|
||||
+ # Remove the failure entry from entries list
|
||||
+ logging.info(lf('Fixing dir name/gfid mismatch'
|
||||
+ ' in slave. Safe to ignore, '
|
||||
+ 'take out entry',
|
||||
+ retry_count=retry_count,
|
||||
+ entry=repr(failure)))
|
||||
+ entries.remove(failure[0])
|
||||
+ else:
|
||||
+ rename_dict = edct('RENAME', gfid=slave_gfid,
|
||||
+ entry=src_entry,
|
||||
+ entry1=dst_entry, stat=st,
|
||||
+ link=None)
|
||||
+ logging.info(lf('Fixing dir name/gfid mismatch'
|
||||
+ ' in slave. Renaming',
|
||||
+ retry_count=retry_count,
|
||||
+ entry=repr(rename_dict)))
|
||||
+ fix_entry_ops.append(rename_dict)
|
||||
else:
|
||||
- logging.info(lf('Fixing gfid mismatch in slave. '
|
||||
- ' Deleting the entry',
|
||||
+ # A hardlink file exists with different name or
|
||||
+ # renamed file exists and we are sure from
|
||||
+ # matching_disk_gfid check that the entry doesn't
|
||||
+ # exist with same gfid so we can safely delete on slave
|
||||
+ logging.info(lf('Fixing file gfid mismatch in slave. '
|
||||
+ 'Hardlink/Rename Case. Deleting entry',
|
||||
+ retry_count=retry_count,
|
||||
+ entry=repr(failure)))
|
||||
+ fix_entry_ops.append(
|
||||
+ edct('UNLINK',
|
||||
+ gfid=failure[2]['slave_gfid'],
|
||||
+ entry=pbname))
|
||||
+ elif failure[1] == ENOENT:
|
||||
+ # Ignore ENOENT error for fix_entry_ops aka retry_count > 1
|
||||
+ if retry_count > 1:
|
||||
+ logging.info(lf('ENOENT error while fixing entry ops. '
|
||||
+ 'Safe to ignore, take out entry',
|
||||
+ retry_count=retry_count,
|
||||
+ entry=repr(failure)))
|
||||
+ entries.remove(failure[0])
|
||||
+ elif op in ('MKNOD', 'CREATE', 'MKDIR'):
|
||||
+ pargfid = pbname.split('/')[1]
|
||||
+ st = lstat(os.path.join(pfx, pargfid))
|
||||
+ # Safe to ignore the failure as master doesn't contain
|
||||
+ # parent directory.
|
||||
+ if isinstance(st, int):
|
||||
+ logging.info(lf('Fixing ENOENT error in slave. Parent '
|
||||
+ 'does not exist on master. Safe to '
|
||||
+ 'ignore, take out entry',
|
||||
retry_count=retry_count,
|
||||
entry=repr(failure)))
|
||||
- fix_entry_ops.append(edct('UNLINK',
|
||||
- gfid=failure[2]['slave_gfid'],
|
||||
- entry=pbname))
|
||||
- logging.error(lf('Entry cannot be fixed in slave due '
|
||||
- 'to GFID mismatch, find respective '
|
||||
- 'path for the GFID and trigger sync',
|
||||
- gfid=slave_gfid))
|
||||
+ entries.remove(failure[0])
|
||||
|
||||
if fix_entry_ops:
|
||||
- #Process deletions of entries whose gfids are mismatched
|
||||
+ # Process deletions of entries whose gfids are mismatched
|
||||
failures1 = self.slave.server.entry_ops(fix_entry_ops)
|
||||
- if not failures1:
|
||||
- logging.info ("Sucessfully fixed entry ops with gfid mismatch")
|
||||
|
||||
- return failures1
|
||||
+ return (failures1, fix_entry_ops)
|
||||
|
||||
def handle_entry_failures(self, failures, entries):
|
||||
retries = 0
|
||||
pending_failures = False
|
||||
failures1 = []
|
||||
failures2 = []
|
||||
+ entry_ops1 = []
|
||||
+ entry_ops2 = []
|
||||
|
||||
if failures:
|
||||
pending_failures = True
|
||||
failures1 = failures
|
||||
+ entry_ops1 = entries
|
||||
|
||||
while pending_failures and retries < self.MAX_EF_RETRIES:
|
||||
retries += 1
|
||||
- failures2 = self.fix_possible_entry_failures(failures1,
|
||||
- retries)
|
||||
+ (failures2, entry_ops2) = self.fix_possible_entry_failures(
|
||||
+ failures1, retries, entry_ops1)
|
||||
if not failures2:
|
||||
pending_failures = False
|
||||
+ logging.info(lf('Sucessfully fixed entry ops with gfid '
|
||||
+ 'mismatch', retry_count=retries))
|
||||
else:
|
||||
pending_failures = True
|
||||
failures1 = failures2
|
||||
+ entry_ops1 = entry_ops2
|
||||
|
||||
if pending_failures:
|
||||
for failure in failures1:
|
||||
logging.error("Failed to fix entry ops %s", repr(failure))
|
||||
- else:
|
||||
- #Retry original entry list 5 times
|
||||
- failures = self.slave.server.entry_ops(entries)
|
||||
-
|
||||
- self.log_failures(failures, 'gfid', gauxpfx(), 'ENTRY')
|
||||
-
|
||||
|
||||
def process_change(self, change, done, retry):
|
||||
pfx = gauxpfx()
|
||||
@@ -1112,7 +1163,19 @@ class GMasterChangelogMixin(GMasterCommon):
|
||||
self.status.inc_value("entry", len(entries))
|
||||
|
||||
failures = self.slave.server.entry_ops(entries)
|
||||
- self.handle_entry_failures(failures, entries)
|
||||
+ count = 0
|
||||
+ while failures and count < self.MAX_OE_RETRIES:
|
||||
+ count += 1
|
||||
+ self.handle_entry_failures(failures, entries)
|
||||
+ logging.info("Retry original entries. count = %s" % count)
|
||||
+ failures = self.slave.server.entry_ops(entries)
|
||||
+ if not failures:
|
||||
+ logging.info("Sucessfully fixed all entry ops with gfid "
|
||||
+ "mismatch")
|
||||
+ break
|
||||
+
|
||||
+ self.log_failures(failures, 'gfid', gauxpfx(), 'ENTRY')
|
||||
+
|
||||
self.status.dec_value("entry", len(entries))
|
||||
|
||||
# Update Entry stime in Brick Root only in case of Changelog mode
|
||||
diff --git a/geo-replication/syncdaemon/resource.py b/geo-replication/syncdaemon/resource.py
|
||||
index 0d5462a..eb696f3 100644
|
||||
--- a/geo-replication/syncdaemon/resource.py
|
||||
+++ b/geo-replication/syncdaemon/resource.py
|
||||
@@ -38,9 +38,9 @@ from syncdutils import CHANGELOG_AGENT_CLIENT_VERSION
|
||||
from syncdutils import GX_GFID_CANONICAL_LEN
|
||||
from gsyncdstatus import GeorepStatus
|
||||
from syncdutils import get_master_and_slave_data_from_args
|
||||
-from syncdutils import lf, Popen, sup, Volinfo
|
||||
+from syncdutils import lf, Popen, sup
|
||||
from syncdutils import Xattr, matching_disk_gfid, get_gfid_from_mnt
|
||||
-from syncdutils import unshare_propagation_supported
|
||||
+from syncdutils import unshare_propagation_supported, get_slv_dir_path
|
||||
|
||||
UrlRX = re.compile('\A(\w+)://([^ *?[]*)\Z')
|
||||
HostRX = re.compile('[a-zA-Z\d](?:[a-zA-Z\d.-]*[a-zA-Z\d])?', re.I)
|
||||
@@ -50,7 +50,6 @@ ENOTSUP = getattr(errno, 'ENOTSUP', 'EOPNOTSUPP')
|
||||
|
||||
slv_volume = None
|
||||
slv_host = None
|
||||
-slv_bricks = None
|
||||
|
||||
def desugar(ustr):
|
||||
"""transform sugared url strings to standard <scheme>://<urlbody> form
|
||||
@@ -463,13 +462,23 @@ class Server(object):
|
||||
# to be purged is the GFID gotten from the changelog.
|
||||
# (a stat(changelog_gfid) would also be valid here)
|
||||
# The race here is between the GFID check and the purge.
|
||||
+
|
||||
+ # If the entry or the gfid of the file to be deleted is not present
|
||||
+ # on slave, we can ignore the unlink/rmdir
|
||||
+ if isinstance(lstat(entry), int) or \
|
||||
+ isinstance(lstat(os.path.join(pfx, gfid)), int):
|
||||
+ return
|
||||
+
|
||||
if not matching_disk_gfid(gfid, entry):
|
||||
collect_failure(e, EEXIST)
|
||||
return
|
||||
|
||||
if op == 'UNLINK':
|
||||
er = errno_wrap(os.unlink, [entry], [ENOENT, ESTALE], [EBUSY])
|
||||
- return er
|
||||
+ # EISDIR is safe error, ignore. This can only happen when
|
||||
+ # unlink is sent from master while fixing gfid conflicts.
|
||||
+ if er != EISDIR:
|
||||
+ return er
|
||||
|
||||
elif op == 'RMDIR':
|
||||
er = errno_wrap(os.rmdir, [entry], [ENOENT, ESTALE,
|
||||
@@ -480,7 +489,11 @@ class Server(object):
|
||||
def collect_failure(e, cmd_ret, dst=False):
|
||||
slv_entry_info = {}
|
||||
slv_entry_info['gfid_mismatch'] = False
|
||||
+ slv_entry_info['name_mismatch'] = False
|
||||
slv_entry_info['dst'] = dst
|
||||
+ slv_entry_info['slave_isdir'] = False
|
||||
+ slv_entry_info['slave_name'] = None
|
||||
+ slv_entry_info['slave_gfid'] = None
|
||||
# We do this for failing fops on Slave
|
||||
# Master should be logging this
|
||||
if cmd_ret is None:
|
||||
@@ -498,6 +511,9 @@ class Server(object):
|
||||
if not isinstance(st, int):
|
||||
if st and stat.S_ISDIR(st.st_mode):
|
||||
slv_entry_info['slave_isdir'] = True
|
||||
+ dir_name = get_slv_dir_path(slv_host, slv_volume,
|
||||
+ disk_gfid)
|
||||
+ slv_entry_info['slave_name'] = dir_name
|
||||
else:
|
||||
slv_entry_info['slave_isdir'] = False
|
||||
slv_entry_info['slave_gfid'] = disk_gfid
|
||||
@@ -618,37 +634,34 @@ class Server(object):
|
||||
[ENOENT, EEXIST], [ESTALE])
|
||||
collect_failure(e, cmd_ret)
|
||||
elif op == 'MKDIR':
|
||||
+ en = e['entry']
|
||||
slink = os.path.join(pfx, gfid)
|
||||
st = lstat(slink)
|
||||
# don't create multiple entries with same gfid
|
||||
if isinstance(st, int):
|
||||
blob = entry_pack_mkdir(
|
||||
gfid, bname, e['mode'], e['uid'], e['gid'])
|
||||
- else:
|
||||
+ elif (isinstance(lstat(en), int) or
|
||||
+ not matching_disk_gfid(gfid, en)):
|
||||
# If gfid of a directory exists on slave but path based
|
||||
# create is getting EEXIST. This means the directory is
|
||||
# renamed in master but recorded as MKDIR during hybrid
|
||||
# crawl. Get the directory path by reading the backend
|
||||
# symlink and trying to rename to new name as said by
|
||||
# master.
|
||||
- global slv_bricks
|
||||
- global slv_volume
|
||||
- global slv_host
|
||||
- if not slv_bricks:
|
||||
- slv_info = Volinfo (slv_volume, slv_host)
|
||||
- slv_bricks = slv_info.bricks
|
||||
- # Result of readlink would be of format as below.
|
||||
- # readlink = "../../pgfid[0:2]/pgfid[2:4]/pgfid/basename"
|
||||
- realpath = os.readlink(os.path.join(slv_bricks[0]['dir'],
|
||||
- ".glusterfs", gfid[0:2],
|
||||
- gfid[2:4], gfid))
|
||||
- realpath_parts = realpath.split('/')
|
||||
- src_pargfid = realpath_parts[-2]
|
||||
- src_basename = realpath_parts[-1]
|
||||
- src_entry = os.path.join(pfx, src_pargfid, src_basename)
|
||||
logging.info(lf("Special case: rename on mkdir",
|
||||
- gfid=gfid, entry=repr(entry)))
|
||||
- rename_with_disk_gfid_confirmation(gfid, src_entry, entry)
|
||||
+ gfid=gfid, entry=repr(entry)))
|
||||
+ src_entry = get_slv_dir_path(slv_host, slv_volume, gfid)
|
||||
+ if src_entry is not None and src_entry != entry:
|
||||
+ slv_entry_info = {}
|
||||
+ slv_entry_info['gfid_mismatch'] = False
|
||||
+ slv_entry_info['name_mismatch'] = True
|
||||
+ slv_entry_info['dst'] = False
|
||||
+ slv_entry_info['slave_isdir'] = True
|
||||
+ slv_entry_info['slave_gfid'] = gfid
|
||||
+ slv_entry_info['slave_entry'] = src_entry
|
||||
+
|
||||
+ failures.append((e, EEXIST, slv_entry_info))
|
||||
elif op == 'LINK':
|
||||
slink = os.path.join(pfx, gfid)
|
||||
st = lstat(slink)
|
||||
diff --git a/geo-replication/syncdaemon/syncdutils.py b/geo-replication/syncdaemon/syncdutils.py
|
||||
index 6dafb0a..d798356 100644
|
||||
--- a/geo-replication/syncdaemon/syncdutils.py
|
||||
+++ b/geo-replication/syncdaemon/syncdutils.py
|
||||
@@ -77,6 +77,7 @@ CHANGELOG_AGENT_CLIENT_VERSION = 1.0
|
||||
NodeID = None
|
||||
rsync_version = None
|
||||
unshare_mnt_propagation = None
|
||||
+slv_bricks = None
|
||||
SPACE_ESCAPE_CHAR = "%20"
|
||||
NEWLINE_ESCAPE_CHAR = "%0A"
|
||||
PERCENTAGE_ESCAPE_CHAR = "%25"
|
||||
@@ -671,6 +672,40 @@ def get_rsync_version(rsync_cmd):
|
||||
return rsync_version
|
||||
|
||||
|
||||
+def get_slv_dir_path(slv_host, slv_volume, gfid):
|
||||
+ global slv_bricks
|
||||
+
|
||||
+ dir_path = ENOENT
|
||||
+
|
||||
+ if not slv_bricks:
|
||||
+ slv_info = Volinfo(slv_volume, slv_host)
|
||||
+ slv_bricks = slv_info.bricks
|
||||
+ # Result of readlink would be of format as below.
|
||||
+ # readlink = "../../pgfid[0:2]/pgfid[2:4]/pgfid/basename"
|
||||
+ for brick in slv_bricks:
|
||||
+ dir_path = errno_wrap(os.path.join,
|
||||
+ [brick['dir'],
|
||||
+ ".glusterfs", gfid[0:2],
|
||||
+ gfid[2:4],
|
||||
+ gfid], [ENOENT], [ESTALE])
|
||||
+ if dir_path != ENOENT:
|
||||
+ break
|
||||
+
|
||||
+ if not isinstance(dir_path, int):
|
||||
+ realpath = errno_wrap(os.readlink, [dir_path],
|
||||
+ [ENOENT], [ESTALE])
|
||||
+
|
||||
+ if not isinstance(realpath, int):
|
||||
+ realpath_parts = realpath.split('/')
|
||||
+ pargfid = realpath_parts[-2]
|
||||
+ basename = realpath_parts[-1]
|
||||
+ pfx = gauxpfx()
|
||||
+ dir_entry = os.path.join(pfx, pargfid, basename)
|
||||
+ return dir_entry
|
||||
+
|
||||
+ return None
|
||||
+
|
||||
+
|
||||
def lf(event, **kwargs):
|
||||
"""
|
||||
Log Format helper function, log messages can be
|
||||
--
|
||||
1.8.3.1
|
||||
|
52
0329-cluster-dht-Set-loc-gfid-before-healing-attr.patch
Normal file
52
0329-cluster-dht-Set-loc-gfid-before-healing-attr.patch
Normal file
@ -0,0 +1,52 @@
|
||||
From c2b215a14a38d3587a5a3ea4efab384033019ed5 Mon Sep 17 00:00:00 2001
|
||||
From: N Balachandran <nbalacha@redhat.com>
|
||||
Date: Wed, 18 Jul 2018 22:09:19 +0530
|
||||
Subject: [PATCH 329/333] cluster/dht: Set loc->gfid before healing attr
|
||||
|
||||
AFR takes inodelks when setting attrs. The loc->gfid
|
||||
and loc->inode->gfid were both null when dht_dir_attr_heal
|
||||
was called during a fresh lookup of an existing directory.
|
||||
As the gfid is null, client_pre_inodelk asserts in the gfid
|
||||
check.
|
||||
We now set the loc->gfid before calling dht_dir_attr_heal.
|
||||
|
||||
upstream patch: https://review.gluster.org/#/c/20530/
|
||||
|
||||
> Change-Id: I457f5a73fd301d97a03ca032587e73d4803298ac
|
||||
> fixes: bz#1602866
|
||||
> Signed-off-by: N Balachandran <nbalacha@redhat.com>
|
||||
|
||||
Change-Id: Ie5e30d4ab3b824eaad333da22465d6672c75a2f6
|
||||
BUG: 1601331
|
||||
Signed-off-by: N Balachandran <nbalacha@redhat.com>
|
||||
Reviewed-on: https://code.engineering.redhat.com/gerrit/144386
|
||||
Reviewed-by: Mohit Agrawal <moagrawa@redhat.com>
|
||||
Tested-by: RHGS Build Bot <nigelb@redhat.com>
|
||||
---
|
||||
xlators/cluster/dht/src/dht-common.c | 3 ++-
|
||||
1 file changed, 2 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
|
||||
index 2207708..0984f8f 100644
|
||||
--- a/xlators/cluster/dht/src/dht-common.c
|
||||
+++ b/xlators/cluster/dht/src/dht-common.c
|
||||
@@ -1518,7 +1518,6 @@ unlock:
|
||||
if (local->need_attrheal) {
|
||||
local->need_attrheal = 0;
|
||||
if (!__is_root_gfid (inode->gfid)) {
|
||||
- gf_uuid_copy (local->gfid, local->mds_stbuf.ia_gfid);
|
||||
local->stbuf.ia_gid = local->mds_stbuf.ia_gid;
|
||||
local->stbuf.ia_uid = local->mds_stbuf.ia_uid;
|
||||
local->stbuf.ia_prot = local->mds_stbuf.ia_prot;
|
||||
@@ -1532,6 +1531,8 @@ unlock:
|
||||
goto skip_attr_heal;
|
||||
}
|
||||
copy_local->stbuf = local->stbuf;
|
||||
+ gf_uuid_copy (copy_local->loc.gfid,
|
||||
+ local->stbuf.ia_gfid);
|
||||
copy_local->mds_stbuf = local->mds_stbuf;
|
||||
copy_local->mds_subvol = local->mds_subvol;
|
||||
copy->local = copy_local;
|
||||
--
|
||||
1.8.3.1
|
||||
|
70
0330-posix-check-before-removing-stale-symlink.patch
Normal file
70
0330-posix-check-before-removing-stale-symlink.patch
Normal file
@ -0,0 +1,70 @@
|
||||
From 4742c4766af4b0def0e12a2b0544c30496dfb48e Mon Sep 17 00:00:00 2001
|
||||
From: Ravishankar N <ravishankar@redhat.com>
|
||||
Date: Thu, 19 Jul 2018 12:47:38 +0530
|
||||
Subject: [PATCH 330/333] posix: check before removing stale symlink
|
||||
|
||||
Backport of https://review.gluster.org/#/c/20509/
|
||||
|
||||
BZ 1564071 complains of directories with missing gfid symlinks and
|
||||
corresponding "Found stale gfid handle" messages in the logs. Hence
|
||||
add a check to see if the symlink points to an actual directory before
|
||||
removing it.
|
||||
|
||||
Note: Removing stale symlinks was added via commit
|
||||
3e9a9c029fac359477fb26d9cc7803749ba038b2
|
||||
|
||||
Change-Id: I5d91fab8e5f3a621a9ecad4a1f9c898a3c2d346a
|
||||
BUG: 1603103
|
||||
Signed-off-by: Ravishankar N <ravishankar@redhat.com>
|
||||
Reviewed-on: https://code.engineering.redhat.com/gerrit/144867
|
||||
Reviewed-by: Nithya Balachandran <nbalacha@redhat.com>
|
||||
Tested-by: RHGS Build Bot <nigelb@redhat.com>
|
||||
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
|
||||
---
|
||||
xlators/storage/posix/src/posix.c | 13 +++++++++----
|
||||
1 file changed, 9 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c
|
||||
index ddb875c..c3b7120 100644
|
||||
--- a/xlators/storage/posix/src/posix.c
|
||||
+++ b/xlators/storage/posix/src/posix.c
|
||||
@@ -235,6 +235,7 @@ posix_lookup (call_frame_t *frame, xlator_t *this,
|
||||
int32_t nlink_samepgfid = 0;
|
||||
struct posix_private *priv = NULL;
|
||||
posix_inode_ctx_t *ctx = NULL;
|
||||
+ int ret = 0;
|
||||
|
||||
VALIDATE_OR_GOTO (frame, out);
|
||||
VALIDATE_OR_GOTO (this, out);
|
||||
@@ -284,20 +285,24 @@ posix_lookup (call_frame_t *frame, xlator_t *this,
|
||||
"lstat on %s failed",
|
||||
real_path ? real_path : "null");
|
||||
}
|
||||
+ entry_ret = -1;
|
||||
if (loc_is_nameless(loc)) {
|
||||
if (!op_errno)
|
||||
op_errno = ESTALE;
|
||||
loc_gfid (loc, gfid);
|
||||
MAKE_HANDLE_ABSPATH (gfid_path, this, gfid);
|
||||
- op_ret = sys_lstat(gfid_path, &statbuf);
|
||||
- if (op_ret == 0 && statbuf.st_nlink == 1) {
|
||||
- gf_msg (this->name, GF_LOG_WARNING, ESTALE,
|
||||
+ ret = sys_stat(gfid_path, &statbuf);
|
||||
+ if (ret == 0 && ((statbuf.st_mode & S_IFMT) == S_IFDIR))
|
||||
+ /*Don't unset if it was a symlink to a dir.*/
|
||||
+ goto parent;
|
||||
+ ret = sys_lstat(gfid_path, &statbuf);
|
||||
+ if (ret == 0 && statbuf.st_nlink == 1) {
|
||||
+ gf_msg (this->name, GF_LOG_WARNING, op_errno,
|
||||
P_MSG_HANDLE_DELETE, "Found stale gfid "
|
||||
"handle %s, removing it.", gfid_path);
|
||||
posix_handle_unset (this, gfid, NULL);
|
||||
}
|
||||
}
|
||||
- entry_ret = -1;
|
||||
goto parent;
|
||||
}
|
||||
|
||||
--
|
||||
1.8.3.1
|
||||
|
49
0331-rpc-free-registered-callback-programs.patch
Normal file
49
0331-rpc-free-registered-callback-programs.patch
Normal file
@ -0,0 +1,49 @@
|
||||
From 9c1ddc2e32cbfc8ad313b4f1342fbc20e49af80b Mon Sep 17 00:00:00 2001
|
||||
From: Niels de Vos <ndevos@redhat.com>
|
||||
Date: Mon, 9 Oct 2017 18:58:09 +0200
|
||||
Subject: [PATCH 331/333] rpc: free registered callback programs
|
||||
|
||||
> Change-Id: I8c6f6b642f025d1faf74015b8f7aaecd7ebfd4d5
|
||||
> BUG: 1443145
|
||||
> Signed-off-by: Niels de Vos <ndevos@redhat.com>
|
||||
> (cherry picked from commit ec39ca32d942d49fd701156174abbba0b73bce2f)
|
||||
> (Reviewed on upstream link https://review.gluster.org/#/c/18478)
|
||||
|
||||
Change-Id: I23e44507d12326bf63c96c56eae83d5424f8ee63
|
||||
BUG: 1600790
|
||||
Signed-off-by: Mohit Agrawal <moagrawa@redhat.com>
|
||||
Reviewed-on: https://code.engineering.redhat.com/gerrit/145358
|
||||
Reviewed-by: Niels de Vos <ndevos@redhat.com>
|
||||
Tested-by: RHGS Build Bot <nigelb@redhat.com>
|
||||
---
|
||||
rpc/rpc-lib/src/rpc-clnt.c | 7 +++++++
|
||||
1 file changed, 7 insertions(+)
|
||||
|
||||
diff --git a/rpc/rpc-lib/src/rpc-clnt.c b/rpc/rpc-lib/src/rpc-clnt.c
|
||||
index e34d2ca..1ea8099 100644
|
||||
--- a/rpc/rpc-lib/src/rpc-clnt.c
|
||||
+++ b/rpc/rpc-lib/src/rpc-clnt.c
|
||||
@@ -1771,6 +1771,9 @@ rpc_clnt_trigger_destroy (struct rpc_clnt *rpc)
|
||||
static void
|
||||
rpc_clnt_destroy (struct rpc_clnt *rpc)
|
||||
{
|
||||
+ rpcclnt_cb_program_t *program = NULL;
|
||||
+ rpcclnt_cb_program_t *tmp = NULL;
|
||||
+
|
||||
if (!rpc)
|
||||
return;
|
||||
|
||||
@@ -1783,6 +1786,10 @@ rpc_clnt_destroy (struct rpc_clnt *rpc)
|
||||
mem_pool_destroy (rpc->reqpool);
|
||||
mem_pool_destroy (rpc->saved_frames_pool);
|
||||
|
||||
+ list_for_each_entry_safe (program, tmp, &rpc->programs, program) {
|
||||
+ GF_FREE (program);
|
||||
+ }
|
||||
+
|
||||
GF_FREE (rpc);
|
||||
return;
|
||||
}
|
||||
--
|
||||
1.8.3.1
|
||||
|
@ -0,0 +1,67 @@
|
||||
From 76823d120518528c4edad4af6f4c1cdd50f5b398 Mon Sep 17 00:00:00 2001
|
||||
From: Mohit Agrawal <moagrawal@redhat.com>
|
||||
Date: Tue, 24 Jul 2018 14:48:35 +0530
|
||||
Subject: [PATCH 332/333] rpc: rpc_clnt_connection_cleanup is crashed due to
|
||||
double free
|
||||
|
||||
Problem: gfapi client crashes in rpc_clnt_connection_cleanup
|
||||
at the time of destroying saved_frames
|
||||
|
||||
Solution: gfapi client crashes because the saved_frames ptr is
|
||||
already freed in rpc_clnt_destroy. To avoid this, update the
|
||||
code in rpc_clnt_destroy
|
||||
|
||||
> Change-Id: Id8cce102b49f26cfd86ef88257032ed98f43192b
|
||||
> fixes: bz#1607783
|
||||
> (cherry picked from commit abd7b1393294d29eef6913e7f93ab76040c90428)
|
||||
> (Reviewed on upstream link https://review.gluster.org/#/c/20557/)
|
||||
|
||||
Change-Id: Id3200e36acc1c49a8f5d39a1cc5053864899754c
|
||||
BUG: 1600790
|
||||
Signed-off-by: Mohit Agrawal <moagrawal@redhat.com>
|
||||
Reviewed-on: https://code.engineering.redhat.com/gerrit/145377
|
||||
Tested-by: Mohit Agrawal <moagrawa@redhat.com>
|
||||
Reviewed-by: Niels de Vos <ndevos@redhat.com>
|
||||
Tested-by: RHGS Build Bot <nigelb@redhat.com>
|
||||
---
|
||||
rpc/rpc-lib/src/rpc-clnt.c | 20 +++++++++++++++++---
|
||||
1 file changed, 17 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/rpc/rpc-lib/src/rpc-clnt.c b/rpc/rpc-lib/src/rpc-clnt.c
|
||||
index 1ea8099..fd7e3ec 100644
|
||||
--- a/rpc/rpc-lib/src/rpc-clnt.c
|
||||
+++ b/rpc/rpc-lib/src/rpc-clnt.c
|
||||
@@ -1771,13 +1771,27 @@ rpc_clnt_trigger_destroy (struct rpc_clnt *rpc)
|
||||
static void
|
||||
rpc_clnt_destroy (struct rpc_clnt *rpc)
|
||||
{
|
||||
- rpcclnt_cb_program_t *program = NULL;
|
||||
- rpcclnt_cb_program_t *tmp = NULL;
|
||||
+ rpcclnt_cb_program_t *program = NULL;
|
||||
+ rpcclnt_cb_program_t *tmp = NULL;
|
||||
+ struct saved_frames *saved_frames = NULL;
|
||||
+ rpc_clnt_connection_t *conn = NULL;
|
||||
|
||||
if (!rpc)
|
||||
return;
|
||||
|
||||
- saved_frames_destroy (rpc->conn.saved_frames);
|
||||
+ conn = &rpc->conn;
|
||||
+ /* Access saved_frames in critical-section to avoid
|
||||
+ crash in rpc_clnt_connection_cleanup at the time
|
||||
+ of destroying saved frames
|
||||
+ */
|
||||
+ pthread_mutex_lock (&conn->lock);
|
||||
+ {
|
||||
+ saved_frames = conn->saved_frames;
|
||||
+ conn->saved_frames = NULL;
|
||||
+ }
|
||||
+ pthread_mutex_unlock (&conn->lock);
|
||||
+
|
||||
+ saved_frames_destroy (saved_frames);
|
||||
pthread_mutex_destroy (&rpc->lock);
|
||||
pthread_mutex_destroy (&rpc->conn.lock);
|
||||
|
||||
--
|
||||
1.8.3.1
|
||||
|
726
0333-glusterd-Add-multiple-checks-before-attach-start-a-b.patch
Normal file
726
0333-glusterd-Add-multiple-checks-before-attach-start-a-b.patch
Normal file
@ -0,0 +1,726 @@
|
||||
From 53ecd916d5ef56e164228ba123b078d4b30bfa81 Mon Sep 17 00:00:00 2001
|
||||
From: Mohit Agrawal <moagrawal@redhat.com>
|
||||
Date: Thu, 12 Jul 2018 13:29:48 +0530
|
||||
Subject: [PATCH 333/333] glusterd: Add multiple checks before attach/start a
|
||||
brick
|
||||
|
||||
Problem: In a brick mux scenario, glusterd is sometimes not able
|
||||
to start/attach a brick and gluster v status shows
|
||||
brick is already running
|
||||
|
||||
Solution:
|
||||
1) To make sure brick is running check brick_path in
|
||||
/proc/<pid>/fd , if a brick is consumed by the brick
|
||||
process it means the brick stack has come up, otherwise not
|
||||
2) Before start/attach a brick check if a brick is mounted
|
||||
or not
|
||||
3) At the time of printing volume status check brick is
|
||||
consumed by any brick process
|
||||
|
||||
Test: The following procedure was used to test this change
|
||||
1) Setup brick mux environment on a vm
|
||||
2) Put a breakpoint in gdb in function posix_health_check_thread_proc
|
||||
at the time of notify GF_EVENT_CHILD_DOWN event
|
||||
3) unmount any one brick path forcefully
|
||||
4) check gluster v status it will show N/A for the brick
|
||||
5) Try to start volume with force option, glusterd throws
|
||||
message "No device available for mount brick"
|
||||
6) Mount the brick_root path
|
||||
7) Try to start volume with force option
|
||||
8) down brick is started successfully
|
||||
|
||||
> Change-Id: I91898dad21d082ebddd12aa0d1f7f0ed012bdf69
|
||||
> fixes: bz#1595320
|
||||
> (cherry picked from commit 9400b6f2c8aa219a493961e0ab9770b7f12e80d2)
|
||||
> (Reviewed on upstream link https://review.gluster.org/#/c/20202/)
|
||||
|
||||
Change-Id: I62459910272754e4e062b2725fea2a1e68d743f1
|
||||
BUG: 1589279
|
||||
Signed-off-by: Mohit Agrawal <moagrawa@redhat.com>
|
||||
Reviewed-on: https://code.engineering.redhat.com/gerrit/145269
|
||||
Tested-by: RHGS Build Bot <nigelb@redhat.com>
|
||||
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
|
||||
---
|
||||
glusterfsd/src/glusterfsd-mgmt.c | 3 +
|
||||
tests/basic/bug-1595320.t | 92 +++++++++
|
||||
tests/basic/posix/shared-statfs.t | 2 +
|
||||
tests/bitrot/bug-1373520.t | 1 +
|
||||
tests/bugs/distribute/bug-1368012.t | 2 +
|
||||
tests/bugs/distribute/bug-853258.t | 1 +
|
||||
tests/bugs/quota/bug-1293601.t | 3 +-
|
||||
xlators/mgmt/glusterd/src/glusterd-snapshot.c | 2 +-
|
||||
xlators/mgmt/glusterd/src/glusterd-utils.c | 261 ++++++++++++++++++++----
|
||||
xlators/mgmt/glusterd/src/glusterd-utils.h | 6 +-
|
||||
xlators/mgmt/glusterd/src/glusterd-volume-ops.c | 7 +-
|
||||
11 files changed, 329 insertions(+), 51 deletions(-)
|
||||
create mode 100644 tests/basic/bug-1595320.t
|
||||
|
||||
diff --git a/glusterfsd/src/glusterfsd-mgmt.c b/glusterfsd/src/glusterfsd-mgmt.c
|
||||
index 30a717f..cbd436a 100644
|
||||
--- a/glusterfsd/src/glusterfsd-mgmt.c
|
||||
+++ b/glusterfsd/src/glusterfsd-mgmt.c
|
||||
@@ -1010,6 +1010,9 @@ glusterfs_handle_attach (rpcsvc_request_t *req)
|
||||
"got attach for %s but no active graph",
|
||||
xlator_req.name);
|
||||
}
|
||||
+ if (ret) {
|
||||
+ ret = -1;
|
||||
+ }
|
||||
|
||||
glusterfs_translator_info_response_send (req, ret, NULL, NULL);
|
||||
|
||||
diff --git a/tests/basic/bug-1595320.t b/tests/basic/bug-1595320.t
|
||||
new file mode 100644
|
||||
index 0000000..9d856ee
|
||||
--- /dev/null
|
||||
+++ b/tests/basic/bug-1595320.t
|
||||
@@ -0,0 +1,92 @@
|
||||
+#!/bin/bash
|
||||
+
|
||||
+. $(dirname $0)/../include.rc
|
||||
+. $(dirname $0)/../volume.rc
|
||||
+. $(dirname $0)/../snapshot.rc
|
||||
+
|
||||
+cleanup
|
||||
+
|
||||
+function count_up_bricks {
|
||||
+ $CLI --xml volume status $V0 | grep '<status>1' | wc -l
|
||||
+}
|
||||
+
|
||||
+function count_brick_processes {
|
||||
+ pgrep glusterfsd | wc -l
|
||||
+}
|
||||
+
|
||||
+# Setup 3 LVMS
|
||||
+LVM_PREFIX="test"
|
||||
+TEST init_n_bricks 3
|
||||
+TEST setup_lvm 3
|
||||
+
|
||||
+# Start glusterd
|
||||
+TEST glusterd
|
||||
+TEST pidof glusterd
|
||||
+
|
||||
+# Create volume and enable brick multiplexing
|
||||
+TEST $CLI volume create $V0 $H0:$L1 $H0:$L2 $H0:$L3
|
||||
+gluster v set all cluster.brick-multiplex on
|
||||
+
|
||||
+# Start the volume
|
||||
+TEST $CLI volume start $V0
|
||||
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT 3 count_up_bricks
|
||||
+EXPECT 1 count_brick_processes
|
||||
+
|
||||
+# Kill volume ungracefully
|
||||
+brick_pid=`pgrep glusterfsd`
|
||||
+
|
||||
+# Make sure every brick root should be consumed by a brick process
|
||||
+n=`ls -lrth /proc/$brick_pid/fd | grep -iw $L1 | grep -v ".glusterfs" | wc -l`
|
||||
+TEST [ $n -eq 1 ]
|
||||
+n=`ls -lrth /proc/$brick_pid/fd | grep -iw $L2 | grep -v ".glusterfs" | wc -l`
|
||||
+TEST [ $n -eq 1 ]
|
||||
+n=`ls -lrth /proc/$brick_pid/fd | grep -iw $L3 | grep -v ".glusterfs" | wc -l`
|
||||
+TEST [ $n -eq 1 ]
|
||||
+
|
||||
+b1_pid_file=$(ls $GLUSTERD_PIDFILEDIR/vols/$V0/*d-backends-1*.pid)
|
||||
+b2_pid_file=$(ls $GLUSTERD_PIDFILEDIR/vols/$V0/*d-backends-2*.pid)
|
||||
+b3_pid_file=$(ls $GLUSTERD_PIDFILEDIR/vols/$V0/*d-backends-3*.pid)
|
||||
+
|
||||
+kill -9 $brick_pid
|
||||
+EXPECT 0 count_brick_processes
|
||||
+
|
||||
+# Unmount 3rd brick root from node
|
||||
+brick_root=$L3
|
||||
+TEST umount -l $brick_root 2>/dev/null
|
||||
+
|
||||
+# Start the volume only 2 brick should be start
|
||||
+TEST $CLI volume start $V0 force
|
||||
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT 2 count_up_bricks
|
||||
+EXPECT 1 count_brick_processes
|
||||
+
|
||||
+brick_pid=`pgrep glusterfsd`
|
||||
+
|
||||
+# Make sure only two brick root should be consumed by a brick process
|
||||
+n=`ls -lrth /proc/$brick_pid/fd | grep -iw $L1 | grep -v ".glusterfs" | wc -l`
|
||||
+TEST [ $n -eq 1 ]
|
||||
+n=`ls -lrth /proc/$brick_pid/fd | grep -iw $L2 | grep -v ".glusterfs" | wc -l`
|
||||
+TEST [ $n -eq 1 ]
|
||||
+n=`ls -lrth /proc/$brick_pid/fd | grep -iw $L3 | grep -v ".glusterfs" | wc -l`
|
||||
+TEST [ $n -eq 0 ]
|
||||
+
|
||||
+# Mount the brick root
|
||||
+TEST mount -t xfs -o nouuid /dev/test_vg_3/brick_lvm $brick_root
|
||||
+
|
||||
+# Replace brick_pid file to test brick_attach code
|
||||
+TEST cp $b1_pid_file $b3_pid_file
|
||||
+
|
||||
+# Start the volume all brick should be up
|
||||
+TEST $CLI volume start $V0 force
|
||||
+
|
||||
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT 3 count_up_bricks
|
||||
+EXPECT 1 count_brick_processes
|
||||
+
|
||||
+# Make sure every brick root should be consumed by a brick process
|
||||
+n=`ls -lrth /proc/$brick_pid/fd | grep -iw $L1 | grep -v ".glusterfs" | wc -l`
|
||||
+TEST [ $n -eq 1 ]
|
||||
+n=`ls -lrth /proc/$brick_pid/fd | grep -iw $L2 | grep -v ".glusterfs" | wc -l`
|
||||
+TEST [ $n -eq 1 ]
|
||||
+n=`ls -lrth /proc/$brick_pid/fd | grep -iw $L3 | grep -v ".glusterfs" | wc -l`
|
||||
+TEST [ $n -eq 1 ]
|
||||
+
|
||||
+cleanup
|
||||
diff --git a/tests/basic/posix/shared-statfs.t b/tests/basic/posix/shared-statfs.t
|
||||
index 8caa9fa..3343956 100644
|
||||
--- a/tests/basic/posix/shared-statfs.t
|
||||
+++ b/tests/basic/posix/shared-statfs.t
|
||||
@@ -23,6 +23,7 @@ TEST MOUNT_LOOP $LO2 $B0/${V0}2
|
||||
# Create a subdir in mountpoint and use that for volume.
|
||||
TEST $CLI volume create $V0 $H0:$B0/${V0}1/1 $H0:$B0/${V0}2/1;
|
||||
TEST $CLI volume start $V0
|
||||
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "2" online_brick_count
|
||||
TEST $GFS --volfile-server=$H0 --volfile-id=$V0 $M0
|
||||
total_space=$(df -P $M0 | tail -1 | awk '{ print $2}')
|
||||
# Keeping the size less than 200M mainly because XFS will use
|
||||
@@ -38,6 +39,7 @@ EXPECT 'Stopped' volinfo_field $V0 'Status';
|
||||
TEST $CLI volume add-brick $V0 $H0:$B0/${V0}1/2 $H0:$B0/${V0}2/2 $H0:$B0/${V0}1/3 $H0:$B0/${V0}2/3
|
||||
|
||||
TEST $CLI volume start $V0
|
||||
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "6" online_brick_count
|
||||
TEST $GFS --volfile-server=$H0 --volfile-id=$V0 $M0
|
||||
total_space=$(df -P $M0 | tail -1 | awk '{ print $2}')
|
||||
TEST [ $total_space -gt 194000 -a $total_space -lt 200000 ]
|
||||
diff --git a/tests/bitrot/bug-1373520.t b/tests/bitrot/bug-1373520.t
|
||||
index 225d3b1..c09d424 100644
|
||||
--- a/tests/bitrot/bug-1373520.t
|
||||
+++ b/tests/bitrot/bug-1373520.t
|
||||
@@ -11,6 +11,7 @@ TEST pidof glusterd
|
||||
#Create a disperse volume
|
||||
TEST $CLI volume create $V0 disperse 6 redundancy 2 $H0:$B0/${V0}{0..5}
|
||||
TEST $CLI volume start $V0
|
||||
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "6" online_brick_count
|
||||
EXPECT_WITHIN $PROCESS_UP_TIMEOUT 'Started' volinfo_field $V0 'Status'
|
||||
|
||||
#Disable md-cache
|
||||
diff --git a/tests/bugs/distribute/bug-1368012.t b/tests/bugs/distribute/bug-1368012.t
|
||||
index f89314b..b861554 100644
|
||||
--- a/tests/bugs/distribute/bug-1368012.t
|
||||
+++ b/tests/bugs/distribute/bug-1368012.t
|
||||
@@ -22,6 +22,7 @@ EXPECT "$V0" volinfo_field $V0 'Volume Name';
|
||||
EXPECT 'Created' volinfo_field $V0 'Status';
|
||||
## Start volume and verify
|
||||
TEST $CLI volume start $V0;
|
||||
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "2" online_brick_count
|
||||
TEST $CLI volume set $V0 performance.stat-prefetch off
|
||||
EXPECT 'Started' volinfo_field $V0 'Status';
|
||||
TEST glusterfs -s $H0 --volfile-id=$V0 $M0
|
||||
@@ -36,6 +37,7 @@ TEST permission_root=`stat -c "%A" $M0`
|
||||
TEST echo $permission_root
|
||||
#Add-brick
|
||||
TEST $CLI volume add-brick $V0 $H0:/${V0}3
|
||||
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "3" online_brick_count
|
||||
|
||||
#Allow one lookup to happen
|
||||
TEST pushd $M0
|
||||
diff --git a/tests/bugs/distribute/bug-853258.t b/tests/bugs/distribute/bug-853258.t
|
||||
index e39f507..6817d9e 100755
|
||||
--- a/tests/bugs/distribute/bug-853258.t
|
||||
+++ b/tests/bugs/distribute/bug-853258.t
|
||||
@@ -31,6 +31,7 @@ done
|
||||
|
||||
# Expand the volume and force assignment of new ranges.
|
||||
TEST $CLI volume add-brick $V0 $H0:$B0/${V0}3
|
||||
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "4" online_brick_count
|
||||
# Force assignment of initial ranges.
|
||||
TEST $CLI volume rebalance $V0 fix-layout start
|
||||
EXPECT_WITHIN $REBALANCE_TIMEOUT "fix-layout completed" fix-layout_status_field $V0
|
||||
diff --git a/tests/bugs/quota/bug-1293601.t b/tests/bugs/quota/bug-1293601.t
|
||||
index def4ef9..741758b 100644
|
||||
--- a/tests/bugs/quota/bug-1293601.t
|
||||
+++ b/tests/bugs/quota/bug-1293601.t
|
||||
@@ -9,6 +9,7 @@ TEST glusterd
|
||||
|
||||
TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{1,2,3,4}
|
||||
TEST $CLI volume start $V0
|
||||
+EXPECT_WITHIN $PROCESS_UP_TIMEOUT "4" online_brick_count
|
||||
TEST $CLI volume quota $V0 enable
|
||||
|
||||
TEST glusterfs --volfile-server=$H0 --volfile-id=$V0 $M0;
|
||||
@@ -27,6 +28,6 @@ EXPECT_WITHIN $MARKER_UPDATE_TIMEOUT "1.0MB" quotausage "/"
|
||||
TEST $CLI volume quota $V0 disable
|
||||
TEST $CLI volume quota $V0 enable
|
||||
|
||||
-EXPECT_WITHIN 40 "1.0MB" quotausage "/"
|
||||
+EXPECT_WITHIN 60 "1.0MB" quotausage "/"
|
||||
|
||||
cleanup;
|
||||
diff --git a/xlators/mgmt/glusterd/src/glusterd-snapshot.c b/xlators/mgmt/glusterd/src/glusterd-snapshot.c
|
||||
index 304cef6..09e10bf 100644
|
||||
--- a/xlators/mgmt/glusterd/src/glusterd-snapshot.c
|
||||
+++ b/xlators/mgmt/glusterd/src/glusterd-snapshot.c
|
||||
@@ -2844,7 +2844,7 @@ glusterd_do_lvm_snapshot_remove (glusterd_volinfo_t *snap_vol,
|
||||
GLUSTERD_GET_BRICK_PIDFILE (pidfile, snap_vol, brickinfo, priv);
|
||||
if (gf_is_service_running (pidfile, &pid)) {
|
||||
(void) send_attach_req (this, brickinfo->rpc,
|
||||
- brickinfo->path, NULL,
|
||||
+ brickinfo->path, NULL, NULL,
|
||||
GLUSTERD_BRICK_TERMINATE);
|
||||
brickinfo->status = GF_BRICK_STOPPED;
|
||||
}
|
||||
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c
|
||||
index 95df889..fe9cc75 100644
|
||||
--- a/xlators/mgmt/glusterd/src/glusterd-utils.c
|
||||
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.c
|
||||
@@ -2186,7 +2186,7 @@ retry:
|
||||
goto out;
|
||||
}
|
||||
|
||||
- ret = glusterd_brick_process_add_brick (brickinfo, volinfo);
|
||||
+ ret = glusterd_brick_process_add_brick (brickinfo);
|
||||
if (ret) {
|
||||
gf_msg (this->name, GF_LOG_ERROR, 0,
|
||||
GD_MSG_BRICKPROC_ADD_BRICK_FAILED, "Adding brick %s:%s "
|
||||
@@ -2372,8 +2372,7 @@ out:
|
||||
}
|
||||
|
||||
int
|
||||
-glusterd_brick_process_add_brick (glusterd_brickinfo_t *brickinfo,
|
||||
- glusterd_volinfo_t *volinfo)
|
||||
+glusterd_brick_process_add_brick (glusterd_brickinfo_t *brickinfo)
|
||||
{
|
||||
int ret = -1;
|
||||
xlator_t *this = NULL;
|
||||
@@ -2500,7 +2499,7 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t *volinfo,
|
||||
brickinfo->hostname, brickinfo->path);
|
||||
|
||||
(void) send_attach_req (this, brickinfo->rpc,
|
||||
- brickinfo->path, NULL,
|
||||
+ brickinfo->path, NULL, NULL,
|
||||
GLUSTERD_BRICK_TERMINATE);
|
||||
} else {
|
||||
gf_msg_debug (this->name, 0, "About to stop glusterfsd"
|
||||
@@ -5426,23 +5425,92 @@ static int32_t
|
||||
attach_brick_callback (struct rpc_req *req, struct iovec *iov, int count,
|
||||
void *v_frame)
|
||||
{
|
||||
- call_frame_t *frame = v_frame;
|
||||
- glusterd_conf_t *conf = frame->this->private;
|
||||
- glusterd_brickinfo_t *brickinfo = frame->local;
|
||||
+ call_frame_t *frame = v_frame;
|
||||
+ glusterd_conf_t *conf = frame->this->private;
|
||||
+ glusterd_brickinfo_t *brickinfo = frame->local;
|
||||
+ glusterd_brickinfo_t *other_brick = frame->cookie;
|
||||
+ glusterd_volinfo_t *volinfo = NULL;
|
||||
+ xlator_t *this = THIS;
|
||||
+ int ret = -1;
|
||||
+ char pidfile1[PATH_MAX] = {0};
|
||||
+ char pidfile2[PATH_MAX] = {0};
|
||||
+ gf_getspec_rsp rsp = {0,};
|
||||
|
||||
frame->local = NULL;
|
||||
- brickinfo->port_registered = _gf_true;
|
||||
+ frame->cookie = NULL;
|
||||
+
|
||||
+ ret = xdr_to_generic (*iov, &rsp, (xdrproc_t)xdr_gf_getspec_rsp);
|
||||
+ if (ret < 0) {
|
||||
+ gf_log (frame->this->name, GF_LOG_ERROR, "XDR decoding error");
|
||||
+ ret = -1;
|
||||
+ goto out;
|
||||
+ }
|
||||
+
|
||||
+ ret = glusterd_get_volinfo_from_brick (other_brick->path,
|
||||
+ &volinfo);
|
||||
+ if (ret) {
|
||||
+ gf_msg (THIS->name, GF_LOG_ERROR, 0,
|
||||
+ GD_MSG_VOLINFO_GET_FAIL, "Failed to get volinfo"
|
||||
+ " from brick(%s) so pidfile copying/unlink will fail",
|
||||
+ other_brick->path);
|
||||
+ goto out;
|
||||
+ }
|
||||
+ GLUSTERD_GET_BRICK_PIDFILE (pidfile1, volinfo, other_brick, conf);
|
||||
+ volinfo = NULL;
|
||||
+
|
||||
+ ret = glusterd_get_volinfo_from_brick (brickinfo->path,
|
||||
+ &volinfo);
|
||||
+ if (ret) {
|
||||
+ gf_msg (THIS->name, GF_LOG_ERROR, 0,
|
||||
+ GD_MSG_VOLINFO_GET_FAIL, "Failed to get volinfo"
|
||||
+ " from brick(%s) so pidfile copying/unlink will fail",
|
||||
+ brickinfo->path);
|
||||
+ goto out;
|
||||
+ }
|
||||
+ GLUSTERD_GET_BRICK_PIDFILE (pidfile2, volinfo, brickinfo, conf);
|
||||
+
|
||||
+ if (rsp.op_ret == 0) {
|
||||
+ brickinfo->port_registered = _gf_true;
|
||||
+
|
||||
+ /* PID file is copied once brick has attached
|
||||
+ successfully
|
||||
+ */
|
||||
+ glusterd_copy_file (pidfile1, pidfile2);
|
||||
+ brickinfo->status = GF_BRICK_STARTED;
|
||||
+ brickinfo->rpc = rpc_clnt_ref (other_brick->rpc);
|
||||
+ gf_log (THIS->name, GF_LOG_INFO, "brick %s is attached successfully",
|
||||
+ brickinfo->path);
|
||||
+ } else {
|
||||
+ gf_log (THIS->name, GF_LOG_INFO, "attach_brick failed pidfile"
|
||||
+ " is %s for brick_path %s", pidfile2, brickinfo->path);
|
||||
+ brickinfo->port = 0;
|
||||
+ brickinfo->status = GF_BRICK_STOPPED;
|
||||
+ ret = glusterd_brick_process_remove_brick (brickinfo);
|
||||
+ if (ret)
|
||||
+ gf_msg_debug (this->name, 0, "Couldn't remove brick from"
|
||||
+ " brick process");
|
||||
+ LOCK (&volinfo->lock);
|
||||
+ ret = glusterd_store_volinfo (volinfo, GLUSTERD_VOLINFO_VER_AC_NONE);
|
||||
+ UNLOCK (&volinfo->lock);
|
||||
+ if (ret) {
|
||||
+ gf_msg (this->name, GF_LOG_ERROR, 0,
|
||||
+ GD_MSG_VOLINFO_SET_FAIL,
|
||||
+ "Failed to store volinfo of "
|
||||
+ "%s volume", volinfo->volname);
|
||||
+ goto out;
|
||||
+ }
|
||||
+ }
|
||||
+out:
|
||||
synclock_lock (&conf->big_lock);
|
||||
--(conf->blockers);
|
||||
synclock_unlock (&conf->big_lock);
|
||||
-
|
||||
STACK_DESTROY (frame->root);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int
|
||||
send_attach_req (xlator_t *this, struct rpc_clnt *rpc, char *path,
|
||||
- glusterd_brickinfo_t *brickinfo, int op)
|
||||
+ glusterd_brickinfo_t *brickinfo, glusterd_brickinfo_t *other_brick, int op)
|
||||
{
|
||||
int ret = -1;
|
||||
struct iobuf *iobuf = NULL;
|
||||
@@ -5516,6 +5584,7 @@ send_attach_req (xlator_t *this, struct rpc_clnt *rpc, char *path,
|
||||
|
||||
if (op == GLUSTERD_BRICK_ATTACH) {
|
||||
frame->local = brickinfo;
|
||||
+ frame->cookie = other_brick;
|
||||
cbkfn = attach_brick_callback;
|
||||
}
|
||||
/* Send the msg */
|
||||
@@ -5582,27 +5651,19 @@ attach_brick (xlator_t *this,
|
||||
rpc = rpc_clnt_ref (other_brick->rpc);
|
||||
if (rpc) {
|
||||
ret = send_attach_req (this, rpc, path, brickinfo,
|
||||
+ other_brick,
|
||||
GLUSTERD_BRICK_ATTACH);
|
||||
rpc_clnt_unref (rpc);
|
||||
if (!ret) {
|
||||
ret = pmap_registry_extend (this, other_brick->port,
|
||||
- brickinfo->path);
|
||||
+ brickinfo->path);
|
||||
if (ret != 0) {
|
||||
gf_log (this->name, GF_LOG_ERROR,
|
||||
"adding brick to process failed");
|
||||
- return ret;
|
||||
+ goto out;
|
||||
}
|
||||
-
|
||||
- /* PID file is copied once brick has attached
|
||||
- successfully
|
||||
- */
|
||||
- glusterd_copy_file (pidfile1, pidfile2);
|
||||
brickinfo->port = other_brick->port;
|
||||
- brickinfo->status = GF_BRICK_STARTED;
|
||||
- brickinfo->rpc =
|
||||
- rpc_clnt_ref (other_brick->rpc);
|
||||
- ret = glusterd_brick_process_add_brick (brickinfo,
|
||||
- volinfo);
|
||||
+ ret = glusterd_brick_process_add_brick (brickinfo);
|
||||
if (ret) {
|
||||
gf_msg (this->name, GF_LOG_ERROR, 0,
|
||||
GD_MSG_BRICKPROC_ADD_BRICK_FAILED,
|
||||
@@ -5611,29 +5672,23 @@ attach_brick (xlator_t *this,
|
||||
brickinfo->path);
|
||||
return ret;
|
||||
}
|
||||
-
|
||||
- if (ret) {
|
||||
- gf_msg_debug (this->name, 0, "Add brick"
|
||||
- " to brick process failed");
|
||||
- return ret;
|
||||
- }
|
||||
-
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
/*
|
||||
- * It might not actually be safe to manipulate the lock like
|
||||
- * this, but if we don't then the connection can never actually
|
||||
- * complete and retries are useless. Unfortunately, all of the
|
||||
- * alternatives (e.g. doing all of this in a separate thread)
|
||||
- * are much more complicated and risky. TBD: see if there's a
|
||||
- * better way
|
||||
+ * It might not actually be safe to manipulate the lock
|
||||
+ * like this, but if we don't then the connection can
|
||||
+ * never actually complete and retries are useless.
|
||||
+ * Unfortunately, all of the alternatives (e.g. doing
|
||||
+ * all of this in a separate thread) are much more
|
||||
+ * complicated and risky.
|
||||
+ * TBD: see if there's a better way
|
||||
*/
|
||||
synclock_unlock (&conf->big_lock);
|
||||
sleep (1);
|
||||
synclock_lock (&conf->big_lock);
|
||||
}
|
||||
-
|
||||
+out:
|
||||
gf_log (this->name, GF_LOG_WARNING,
|
||||
"attach failed for %s", brickinfo->path);
|
||||
return ret;
|
||||
@@ -5855,6 +5910,7 @@ find_compatible_brick (glusterd_conf_t *conf,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
+
|
||||
/* Below function is use to populate sockpath based on passed pid
|
||||
value as a argument after check the value from proc and also
|
||||
check if passed pid is match with running glusterfs process
|
||||
@@ -5941,6 +5997,62 @@ glusterd_get_sock_from_brick_pid (int pid, char *sockpath, size_t len)
|
||||
}
|
||||
|
||||
|
||||
+char *
|
||||
+search_brick_path_from_proc (pid_t brick_pid, char *brickpath)
|
||||
+{
|
||||
+ struct dirent *dp = NULL;
|
||||
+ DIR *dirp = NULL;
|
||||
+ size_t len = 0;
|
||||
+ int fd = -1;
|
||||
+ char path[PATH_MAX] = {0,};
|
||||
+ char sym[PATH_MAX] = {0,};
|
||||
+ struct dirent scratch[2] = {{0,},};
|
||||
+ char *brick_path = NULL;
|
||||
+
|
||||
+ if (!brickpath)
|
||||
+ goto out;
|
||||
+
|
||||
+ sprintf(path, "/proc/%d/fd/", brick_pid);
|
||||
+ dirp = sys_opendir (path);
|
||||
+ if (!dirp)
|
||||
+ goto out;
|
||||
+
|
||||
+ len = strlen (path);
|
||||
+ if (len >= (sizeof(path) - 2))
|
||||
+ goto out;
|
||||
+
|
||||
+ fd = dirfd (dirp);
|
||||
+ if (fd < 0)
|
||||
+ goto out;
|
||||
+
|
||||
+ memset(path, 0, sizeof(path));
|
||||
+ memset(sym, 0, sizeof(sym));
|
||||
+
|
||||
+ while ((dp = sys_readdir(dirp, scratch))) {
|
||||
+ if (!strcmp(dp->d_name, ".") ||
|
||||
+ !strcmp(dp->d_name, ".."))
|
||||
+ continue;
|
||||
+
|
||||
+ /* check for non numerical descriptors */
|
||||
+ if (!strtol(dp->d_name, (char **)NULL, 10))
|
||||
+ continue;
|
||||
+
|
||||
+ len = readlinkat (fd, dp->d_name, sym, sizeof(sym) - 1);
|
||||
+ if (len > 1) {
|
||||
+ sym[len] = '\0';
|
||||
+ if (!strcmp (sym, brickpath)) {
|
||||
+ brick_path = gf_strdup(sym);
|
||||
+ break;
|
||||
+ }
|
||||
+ memset (sym, 0, sizeof (sym));
|
||||
+ }
|
||||
+ }
|
||||
+out:
|
||||
+ sys_closedir(dirp);
|
||||
+ return brick_path;
|
||||
+}
|
||||
+
|
||||
+
|
||||
int
|
||||
glusterd_brick_start (glusterd_volinfo_t *volinfo,
|
||||
glusterd_brickinfo_t *brickinfo,
|
||||
@@ -5954,7 +6066,9 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo,
|
||||
int32_t pid = -1;
|
||||
char pidfile[PATH_MAX] = {0};
|
||||
char socketpath[PATH_MAX] = {0};
|
||||
+ char *brickpath = NULL;
|
||||
glusterd_volinfo_t *other_vol;
|
||||
+ struct statvfs brickstat = {0,};
|
||||
|
||||
this = THIS;
|
||||
GF_ASSERT (this);
|
||||
@@ -6000,6 +6114,28 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo,
|
||||
brickinfo->start_triggered = _gf_true;
|
||||
|
||||
GLUSTERD_GET_BRICK_PIDFILE (pidfile, volinfo, brickinfo, conf);
|
||||
+
|
||||
+ ret = sys_statvfs (brickinfo->path, &brickstat);
|
||||
+ if (ret) {
|
||||
+ gf_msg (this->name, GF_LOG_ERROR,
|
||||
+ errno, GD_MSG_BRICKINFO_CREATE_FAIL,
|
||||
+ "failed to get statfs() call on brick %s",
|
||||
+ brickinfo->path);
|
||||
+ goto out;
|
||||
+ }
|
||||
+
|
||||
+ /* Compare fsid is helpful to ensure the existence of a brick_root
|
||||
+ path before the start/attach a brick
|
||||
+ */
|
||||
+ if (brickinfo->statfs_fsid &&
|
||||
+ (brickinfo->statfs_fsid != brickstat.f_fsid)) {
|
||||
+ gf_log (this->name, GF_LOG_ERROR,
|
||||
+ "fsid comparison is failed it means Brick root path"
|
||||
+ " %s is not created by glusterd, start/attach will also fail",
|
||||
+ brickinfo->path);
|
||||
+ goto out;
|
||||
+ }
|
||||
+
|
||||
if (gf_is_service_running (pidfile, &pid)) {
|
||||
if (brickinfo->status != GF_BRICK_STARTING &&
|
||||
brickinfo->status != GF_BRICK_STARTED) {
|
||||
@@ -6019,12 +6155,29 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo,
|
||||
* TBD: re-use RPC connection across bricks
|
||||
*/
|
||||
if (is_brick_mx_enabled ()) {
|
||||
+ brickpath = search_brick_path_from_proc (pid, brickinfo->path);
|
||||
+ if (!brickpath) {
|
||||
+ gf_log (this->name, GF_LOG_INFO,
|
||||
+ "Either pid %d is not running or brick"
|
||||
+ " path %s is not consumed so cleanup pidfile",
|
||||
+ pid, brickinfo->path);
|
||||
+ /* search brick is failed so unlink pidfile */
|
||||
+ if (sys_access (pidfile , R_OK) == 0) {
|
||||
+ sys_unlink (pidfile);
|
||||
+ }
|
||||
+ goto run;
|
||||
+ }
|
||||
+ GF_FREE (brickpath);
|
||||
ret = glusterd_get_sock_from_brick_pid (pid, socketpath,
|
||||
sizeof(socketpath));
|
||||
if (ret) {
|
||||
- gf_log (this->name, GF_LOG_DEBUG,
|
||||
+ gf_log (this->name, GF_LOG_INFO,
|
||||
"Either pid %d is not running or is not match"
|
||||
" with any running brick process ", pid);
|
||||
+ /* Fetch unix socket is failed so unlink pidfile */
|
||||
+ if (sys_access (pidfile , R_OK) == 0) {
|
||||
+ sys_unlink (pidfile);
|
||||
+ }
|
||||
goto run;
|
||||
}
|
||||
} else {
|
||||
@@ -6039,7 +6192,7 @@ glusterd_brick_start (glusterd_volinfo_t *volinfo,
|
||||
(void) glusterd_brick_connect (volinfo, brickinfo,
|
||||
socketpath);
|
||||
|
||||
- ret = glusterd_brick_process_add_brick (brickinfo, volinfo);
|
||||
+ ret = glusterd_brick_process_add_brick (brickinfo);
|
||||
if (ret) {
|
||||
gf_msg (this->name, GF_LOG_ERROR, 0,
|
||||
GD_MSG_BRICKPROC_ADD_BRICK_FAILED,
|
||||
@@ -6079,6 +6232,10 @@ run:
|
||||
if (ret == 0) {
|
||||
goto out;
|
||||
}
|
||||
+ /* Attaching the brick failed, so unlink the pidfile */
|
||||
+ if (sys_access (pidfile , R_OK) == 0) {
|
||||
+ sys_unlink (pidfile);
|
||||
+ }
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -7063,14 +7220,15 @@ glusterd_add_brick_to_dict (glusterd_volinfo_t *volinfo,
|
||||
dict_t *dict, int32_t count)
|
||||
{
|
||||
|
||||
- int ret = -1;
|
||||
- int32_t pid = -1;
|
||||
- char key[1024] = {0};
|
||||
- char base_key[1024] = {0};
|
||||
- char pidfile[PATH_MAX] = {0};
|
||||
+ int ret = -1;
|
||||
+ int32_t pid = -1;
|
||||
+ char key[1024] = {0};
|
||||
+ char base_key[1024] = {0};
|
||||
+ char pidfile[PATH_MAX] = {0};
|
||||
xlator_t *this = NULL;
|
||||
glusterd_conf_t *priv = NULL;
|
||||
- gf_boolean_t brick_online = _gf_false;
|
||||
+ gf_boolean_t brick_online = _gf_false;
|
||||
+ char *brickpath = NULL;
|
||||
|
||||
GF_ASSERT (volinfo);
|
||||
GF_ASSERT (brickinfo);
|
||||
@@ -7127,7 +7285,20 @@ glusterd_add_brick_to_dict (glusterd_volinfo_t *volinfo,
|
||||
if (glusterd_is_brick_started (brickinfo)) {
|
||||
if (gf_is_service_running (pidfile, &pid) &&
|
||||
brickinfo->port_registered) {
|
||||
- brick_online = _gf_true;
|
||||
+ if (!is_brick_mx_enabled ()) {
|
||||
+ brick_online = _gf_true;
|
||||
+ } else {
|
||||
+ brickpath = search_brick_path_from_proc (pid, brickinfo->path);
|
||||
+ if (!brickpath) {
|
||||
+ gf_log (this->name, GF_LOG_INFO,
|
||||
+ "brick path %s is not consumed",
|
||||
+ brickinfo->path);
|
||||
+ brick_online = _gf_false;
|
||||
+ } else {
|
||||
+ brick_online = _gf_true;
|
||||
+ GF_FREE (brickpath);
|
||||
+ }
|
||||
+ }
|
||||
} else {
|
||||
pid = -1;
|
||||
}
|
||||
diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.h b/xlators/mgmt/glusterd/src/glusterd-utils.h
|
||||
index 4c9561e..4835728 100644
|
||||
--- a/xlators/mgmt/glusterd/src/glusterd-utils.h
|
||||
+++ b/xlators/mgmt/glusterd/src/glusterd-utils.h
|
||||
@@ -179,8 +179,7 @@ int32_t
|
||||
glusterd_resolve_brick (glusterd_brickinfo_t *brickinfo);
|
||||
|
||||
int
|
||||
-glusterd_brick_process_add_brick (glusterd_brickinfo_t *brickinfo,
|
||||
- glusterd_volinfo_t *volinfo);
|
||||
+glusterd_brick_process_add_brick (glusterd_brickinfo_t *brickinfo);
|
||||
|
||||
int
|
||||
glusterd_brick_process_remove_brick (glusterd_brickinfo_t *brickinfo);
|
||||
@@ -200,7 +199,8 @@ glusterd_volume_stop_glusterfs (glusterd_volinfo_t *volinfo,
|
||||
|
||||
int
|
||||
send_attach_req (xlator_t *this, struct rpc_clnt *rpc, char *path,
|
||||
- glusterd_brickinfo_t *brick, int op);
|
||||
+ glusterd_brickinfo_t *brick,
|
||||
+ glusterd_brickinfo_t *other_brick, int op);
|
||||
|
||||
glusterd_volinfo_t *
|
||||
glusterd_volinfo_ref (glusterd_volinfo_t *volinfo);
|
||||
diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
|
||||
index e34d58a..8bb0b6d 100644
|
||||
--- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
|
||||
+++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c
|
||||
@@ -2585,8 +2585,13 @@ glusterd_start_volume (glusterd_volinfo_t *volinfo, int flags,
|
||||
}
|
||||
|
||||
glusterd_set_volume_status (volinfo, GLUSTERD_STATUS_STARTED);
|
||||
-
|
||||
+ /* Update volinfo on disk in critical section because
|
||||
+ attach_brick_callback can also call store_volinfo for same
|
||||
+ volume to update volinfo on disk
|
||||
+ */
|
||||
+ LOCK (&volinfo->lock);
|
||||
ret = glusterd_store_volinfo (volinfo, verincrement);
|
||||
+ UNLOCK (&volinfo->lock);
|
||||
if (ret) {
|
||||
gf_msg (this->name, GF_LOG_ERROR, 0,
|
||||
GD_MSG_VOLINFO_SET_FAIL,
|
||||
--
|
||||
1.8.3.1
|
||||
|
@ -192,7 +192,7 @@ Release: 0.1%{?prereltag:.%{prereltag}}%{?dist}
|
||||
%else
|
||||
Name: glusterfs
|
||||
Version: 3.12.2
|
||||
Release: 14%{?dist}
|
||||
Release: 15%{?dist}
|
||||
%endif
|
||||
License: GPLv2 or LGPLv3+
|
||||
Group: System Environment/Base
|
||||
@ -590,6 +590,14 @@ Patch0322: 0322-geo-rep-Cleanup-stale-unprocessed-xsync-changelogs.patch
|
||||
Patch0323: 0323-cluster-afr-Mark-dirty-for-entry-transactions-for-qu.patch
|
||||
Patch0324: 0324-dht-delete-tier-related-internal-xattr-in-dht_getxat.patch
|
||||
Patch0325: 0325-core-dereference-check-on-the-variables-in-glusterfs.patch
|
||||
Patch0326: 0326-glusterd-memory-leak-in-get-state.patch
|
||||
Patch0327: 0327-afr-switch-lk_owner-only-when-pre-op-succeeds.patch
|
||||
Patch0328: 0328-geo-rep-Fix-issues-with-gfid-conflict-handling.patch
|
||||
Patch0329: 0329-cluster-dht-Set-loc-gfid-before-healing-attr.patch
|
||||
Patch0330: 0330-posix-check-before-removing-stale-symlink.patch
|
||||
Patch0331: 0331-rpc-free-registered-callback-programs.patch
|
||||
Patch0332: 0332-rpc-rpc_clnt_connection_cleanup-is-crashed-due-to-do.patch
|
||||
Patch0333: 0333-glusterd-Add-multiple-checks-before-attach-start-a-b.patch
|
||||
|
||||
%description
|
||||
GlusterFS is a distributed file-system capable of scaling to several
|
||||
@ -2538,6 +2546,10 @@ fi
|
||||
%endif
|
||||
|
||||
%changelog
|
||||
* Fri Jul 27 2018 Milind Changire <mchangir@redhat.com> - 3.12.2-15
|
||||
- fixes bugs bz#1589279 bz#1598384 bz#1599362 bz#1599998 bz#1600790
|
||||
bz#1601331 bz#1603103
|
||||
|
||||
* Wed Jul 18 2018 Milind Changire <mchangir@redhat.com> - 3.12.2-14
|
||||
- fixes bugs bz#1547903 bz#1566336 bz#1568896 bz#1578716 bz#1581047
|
||||
bz#1581231 bz#1582066 bz#1593865 bz#1597506 bz#1597511 bz#1597654 bz#1597768
|
||||
|
Loading…
Reference in New Issue
Block a user