256 lines
8.0 KiB
Diff
256 lines
8.0 KiB
Diff
|
From 29ec87484b1ee3ad6417c37726db8aa9296f3a83 Mon Sep 17 00:00:00 2001
|
|||
|
From: Kotresh HR <khiremat@redhat.com>
|
|||
|
Date: Wed, 8 May 2019 11:26:06 +0530
|
|||
|
Subject: [PATCH 163/169] geo-rep: Fix sync hang with tarssh
|
|||
|
|
|||
|
Problem:
|
|||
|
Geo-rep sync hangs when tarssh is used as sync
|
|||
|
engine at heavy workload.
|
|||
|
|
|||
|
Analysis and Root cause:
|
|||
|
It's found out that the tar process was hung.
|
|||
|
When debugged further, it's found out that stderr
|
|||
|
buffer of tar process on master was full i.e., 64k.
|
|||
|
When the buffer was copied to a file from /proc/pid/fd/2,
|
|||
|
the hang is resolved.
|
|||
|
|
|||
|
This can happen when files picked by tar process
|
|||
|
to sync doesn't exist on master anymore. If this count
|
|||
|
increases around 1k, the stderr buffer is filled up.
|
|||
|
|
|||
|
Fix:
|
|||
|
The tar process is executed using Popen with stderr as PIPE.
|
|||
|
The final execution is something like below.
|
|||
|
|
|||
|
tar | ssh <args> root@slave tar --overwrite -xf - -C <path>
|
|||
|
|
|||
|
It was waiting on ssh process first using communicate() and then tar.
|
|||
|
Note that communicate() reads stdout and stderr. So when stderr of tar
|
|||
|
process is filled up, there is no one to read until untar via ssh is
|
|||
|
completed. This can't happen and leads to deadlock.
|
|||
|
Hence we should be waiting on both process parallely, so that stderr is
|
|||
|
read on both processes.
|
|||
|
|
|||
|
Backport of:
|
|||
|
> Patch: https://review.gluster.org/22684
|
|||
|
> Change-Id: I609c7cc5c07e210c504771115b4d551a2e891adf
|
|||
|
> BUG: 1707728
|
|||
|
> Signed-off-by: Kotresh HR <khiremat@redhat.com>
|
|||
|
|
|||
|
Change-Id: I609c7cc5c07e210c504771115b4d551a2e891adf
|
|||
|
fixes: bz#1708116
|
|||
|
Signed-off-by: Kotresh HR <khiremat@redhat.com>
|
|||
|
Reviewed-on: https://code.engineering.redhat.com/gerrit/172395
|
|||
|
Tested-by: RHGS Build Bot <nigelb@redhat.com>
|
|||
|
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
|
|||
|
---
|
|||
|
geo-replication/syncdaemon/resource.py | 22 ++++--
|
|||
|
tests/00-geo-rep/georep-stderr-hang.t | 128 +++++++++++++++++++++++++++++++++
|
|||
|
tests/geo-rep.rc | 17 +++++
|
|||
|
3 files changed, 163 insertions(+), 4 deletions(-)
|
|||
|
create mode 100644 tests/00-geo-rep/georep-stderr-hang.t
|
|||
|
|
|||
|
diff --git a/geo-replication/syncdaemon/resource.py b/geo-replication/syncdaemon/resource.py
|
|||
|
index 522279b..b16db60 100644
|
|||
|
--- a/geo-replication/syncdaemon/resource.py
|
|||
|
+++ b/geo-replication/syncdaemon/resource.py
|
|||
|
@@ -1540,15 +1540,29 @@ class SSH(object):
|
|||
|
|
|||
|
p0.stdin.close()
|
|||
|
p0.stdout.close() # Allow p0 to receive a SIGPIPE if p1 exits.
|
|||
|
- # wait for tar to terminate, collecting any errors, further
|
|||
|
- # waiting for transfer to complete
|
|||
|
- _, stderr1 = p1.communicate()
|
|||
|
|
|||
|
# stdin and stdout of p0 is already closed, Reset to None and
|
|||
|
# wait for child process to complete
|
|||
|
p0.stdin = None
|
|||
|
p0.stdout = None
|
|||
|
- p0.communicate()
|
|||
|
+
|
|||
|
+ def wait_for_tar(p0):
|
|||
|
+ _, stderr = p0.communicate()
|
|||
|
+ if log_err:
|
|||
|
+ for errline in stderr.strip().split("\n")[:-1]:
|
|||
|
+ if "No such file or directory" not in errline:
|
|||
|
+ logging.error(lf("SYNC Error",
|
|||
|
+ sync_engine="Tarssh",
|
|||
|
+ error=errline))
|
|||
|
+
|
|||
|
+ t = syncdutils.Thread(target=wait_for_tar, args=(p0, ))
|
|||
|
+ # wait for tar to terminate, collecting any errors, further
|
|||
|
+ # waiting for transfer to complete
|
|||
|
+ t.start()
|
|||
|
+
|
|||
|
+ # wait for ssh process
|
|||
|
+ _, stderr1 = p1.communicate()
|
|||
|
+ t.join()
|
|||
|
|
|||
|
if log_err:
|
|||
|
for errline in stderr1.strip().split("\n")[:-1]:
|
|||
|
diff --git a/tests/00-geo-rep/georep-stderr-hang.t b/tests/00-geo-rep/georep-stderr-hang.t
|
|||
|
new file mode 100644
|
|||
|
index 0000000..496f0e6
|
|||
|
--- /dev/null
|
|||
|
+++ b/tests/00-geo-rep/georep-stderr-hang.t
|
|||
|
@@ -0,0 +1,128 @@
|
|||
|
+#!/bin/bash
|
|||
|
+
|
|||
|
+. $(dirname $0)/../include.rc
|
|||
|
+. $(dirname $0)/../volume.rc
|
|||
|
+. $(dirname $0)/../geo-rep.rc
|
|||
|
+. $(dirname $0)/../env.rc
|
|||
|
+
|
|||
|
+SCRIPT_TIMEOUT=500
|
|||
|
+
|
|||
|
+AREQUAL_PATH=$(dirname $0)/../utils
|
|||
|
+test "`uname -s`" != "Linux" && {
|
|||
|
+ CFLAGS="$CFLAGS -lintl";
|
|||
|
+}
|
|||
|
+build_tester $AREQUAL_PATH/arequal-checksum.c $CFLAGS
|
|||
|
+
|
|||
|
+### Basic Tests with Distribute Replicate volumes
|
|||
|
+
|
|||
|
+##Cleanup and start glusterd
|
|||
|
+cleanup;
|
|||
|
+TEST glusterd;
|
|||
|
+TEST pidof glusterd
|
|||
|
+
|
|||
|
+
|
|||
|
+##Variables
|
|||
|
+GEOREP_CLI="$CLI volume geo-replication"
|
|||
|
+master=$GMV0
|
|||
|
+SH0="127.0.0.1"
|
|||
|
+slave=${SH0}::${GSV0}
|
|||
|
+num_active=2
|
|||
|
+num_passive=2
|
|||
|
+master_mnt=$M0
|
|||
|
+slave_mnt=$M1
|
|||
|
+
|
|||
|
+############################################################
|
|||
|
+#SETUP VOLUMES AND GEO-REPLICATION
|
|||
|
+############################################################
|
|||
|
+
|
|||
|
+##create_and_start_master_volume
|
|||
|
+TEST $CLI volume create $GMV0 $H0:$B0/${GMV0}1;
|
|||
|
+TEST $CLI volume start $GMV0
|
|||
|
+
|
|||
|
+##create_and_start_slave_volume
|
|||
|
+TEST $CLI volume create $GSV0 $H0:$B0/${GSV0}1;
|
|||
|
+TEST $CLI volume start $GSV0
|
|||
|
+TEST $CLI volume set $GSV0 performance.stat-prefetch off
|
|||
|
+TEST $CLI volume set $GSV0 performance.quick-read off
|
|||
|
+TEST $CLI volume set $GSV0 performance.readdir-ahead off
|
|||
|
+TEST $CLI volume set $GSV0 performance.read-ahead off
|
|||
|
+
|
|||
|
+##Mount master
|
|||
|
+TEST glusterfs -s $H0 --volfile-id $GMV0 $M0
|
|||
|
+
|
|||
|
+##Mount slave
|
|||
|
+TEST glusterfs -s $H0 --volfile-id $GSV0 $M1
|
|||
|
+
|
|||
|
+############################################################
|
|||
|
+#BASIC GEO-REPLICATION TESTS
|
|||
|
+############################################################
|
|||
|
+
|
|||
|
+TEST create_georep_session $master $slave
|
|||
|
+EXPECT_WITHIN $GEO_REP_TIMEOUT 1 check_status_num_rows "Created"
|
|||
|
+
|
|||
|
+#Config gluster-command-dir
|
|||
|
+TEST $GEOREP_CLI $master $slave config gluster-command-dir ${GLUSTER_CMD_DIR}
|
|||
|
+
|
|||
|
+#Config gluster-command-dir
|
|||
|
+TEST $GEOREP_CLI $master $slave config slave-gluster-command-dir ${GLUSTER_CMD_DIR}
|
|||
|
+
|
|||
|
+#Set changelog roll-over time to 45 secs
|
|||
|
+TEST $CLI volume set $GMV0 changelog.rollover-time 45
|
|||
|
+
|
|||
|
+#Wait for common secret pem file to be created
|
|||
|
+EXPECT_WITHIN $GEO_REP_TIMEOUT 0 check_common_secret_file
|
|||
|
+
|
|||
|
+#Verify the keys are distributed
|
|||
|
+EXPECT_WITHIN $GEO_REP_TIMEOUT 0 check_keys_distributed
|
|||
|
+
|
|||
|
+#Set sync-jobs to 1
|
|||
|
+TEST $GEOREP_CLI $master $slave config sync-jobs 1
|
|||
|
+
|
|||
|
+#Start_georep
|
|||
|
+TEST $GEOREP_CLI $master $slave start
|
|||
|
+
|
|||
|
+touch $M0
|
|||
|
+EXPECT_WITHIN $GEO_REP_TIMEOUT 1 check_status_num_rows "Active"
|
|||
|
+EXPECT_WITHIN $GEO_REP_TIMEOUT 1 check_status_num_rows "Changelog Crawl"
|
|||
|
+
|
|||
|
+#Check History Crawl.
|
|||
|
+TEST $GEOREP_CLI $master $slave stop
|
|||
|
+TEST create_data_hang "rsync_hang"
|
|||
|
+TEST create_data "history_rsync"
|
|||
|
+TEST $GEOREP_CLI $master $slave start
|
|||
|
+EXPECT_WITHIN $GEO_REP_TIMEOUT 1 check_status_num_rows "Active"
|
|||
|
+
|
|||
|
+#Verify arequal for whole volume
|
|||
|
+EXPECT_WITHIN $GEO_REP_TIMEOUT "x0" arequal_checksum ${master_mnt} ${slave_mnt}
|
|||
|
+
|
|||
|
+#Stop Geo-rep
|
|||
|
+TEST $GEOREP_CLI $master $slave stop
|
|||
|
+
|
|||
|
+#Config tarssh as sync-engine
|
|||
|
+TEST $GEOREP_CLI $master $slave config sync-method tarssh
|
|||
|
+
|
|||
|
+#Create tarssh hang data
|
|||
|
+TEST create_data_hang "tarssh_hang"
|
|||
|
+TEST create_data "history_tar"
|
|||
|
+
|
|||
|
+TEST $GEOREP_CLI $master $slave start
|
|||
|
+EXPECT_WITHIN $GEO_REP_TIMEOUT 1 check_status_num_rows "Active"
|
|||
|
+
|
|||
|
+#Verify arequal for whole volume
|
|||
|
+EXPECT_WITHIN $GEO_REP_TIMEOUT "x0" arequal_checksum ${master_mnt} ${slave_mnt}
|
|||
|
+
|
|||
|
+#Stop Geo-rep
|
|||
|
+TEST $GEOREP_CLI $master $slave stop
|
|||
|
+
|
|||
|
+#Delete Geo-rep
|
|||
|
+TEST $GEOREP_CLI $master $slave delete
|
|||
|
+
|
|||
|
+#Cleanup are-equal binary
|
|||
|
+TEST rm $AREQUAL_PATH/arequal-checksum
|
|||
|
+
|
|||
|
+#Cleanup authorized keys
|
|||
|
+sed -i '/^command=.*SSH_ORIGINAL_COMMAND#.*/d' ~/.ssh/authorized_keys
|
|||
|
+sed -i '/^command=.*gsyncd.*/d' ~/.ssh/authorized_keys
|
|||
|
+
|
|||
|
+cleanup;
|
|||
|
+#G_TESTDEF_TEST_STATUS_NETBSD7=BAD_TEST,BUG=000000
|
|||
|
diff --git a/tests/geo-rep.rc b/tests/geo-rep.rc
|
|||
|
index 2035b9f..e4f014e 100644
|
|||
|
--- a/tests/geo-rep.rc
|
|||
|
+++ b/tests/geo-rep.rc
|
|||
|
@@ -101,6 +101,23 @@ function create_data()
|
|||
|
chown 1000:1000 ${master_mnt}/${prefix}_chown_f1_ಸಂತಸ
|
|||
|
}
|
|||
|
|
|||
|
+function create_data_hang()
|
|||
|
+{
|
|||
|
+ prefix=$1
|
|||
|
+ mkdir ${master_mnt}/${prefix}
|
|||
|
+ cd ${master_mnt}/${prefix}
|
|||
|
+ # ~1k files is required with 1 sync-job and hang happens if
|
|||
|
+ # stderr buffer of tar/ssh executed with Popen is full (i.e., 64k).
|
|||
|
+ # 64k is hit when ~800 files were not found while syncing data
|
|||
|
+ # from master. So around 1k files is required to hit the condition.
|
|||
|
+ for i in {1..1000}
|
|||
|
+ do
|
|||
|
+ echo "test data" > file$i
|
|||
|
+ mv -f file$i file
|
|||
|
+ done
|
|||
|
+ cd -
|
|||
|
+}
|
|||
|
+
|
|||
|
function chown_file_ok()
|
|||
|
{
|
|||
|
local file_owner=$(stat --format "%u:%g" "$1")
|
|||
|
--
|
|||
|
1.8.3.1
|
|||
|
|