256 lines
8.0 KiB
Diff
256 lines
8.0 KiB
Diff
From 29ec87484b1ee3ad6417c37726db8aa9296f3a83 Mon Sep 17 00:00:00 2001
|
||
From: Kotresh HR <khiremat@redhat.com>
|
||
Date: Wed, 8 May 2019 11:26:06 +0530
|
||
Subject: [PATCH 163/169] geo-rep: Fix sync hang with tarssh
|
||
|
||
Problem:
|
||
Geo-rep sync hangs when tarssh is used as sync
|
||
engine at heavy workload.
|
||
|
||
Analysis and Root cause:
|
||
It's found out that the tar process was hung.
|
||
When debugged further, it's found out that stderr
|
||
buffer of tar process on master was full i.e., 64k.
|
||
When the buffer was copied to a file from /proc/pid/fd/2,
|
||
the hang is resolved.
|
||
|
||
This can happen when files picked by tar process
|
||
to sync doesn't exist on master anymore. If this count
|
||
increases around 1k, the stderr buffer is filled up.
|
||
|
||
Fix:
|
||
The tar process is executed using Popen with stderr as PIPE.
|
||
The final execution is something like below.
|
||
|
||
tar | ssh <args> root@slave tar --overwrite -xf - -C <path>
|
||
|
||
It was waiting on ssh process first using communicate() and then tar.
|
||
Note that communicate() reads stdout and stderr. So when stderr of tar
|
||
process is filled up, there is no one to read until untar via ssh is
|
||
completed. This can't happen and leads to deadlock.
|
||
Hence we should be waiting on both process parallely, so that stderr is
|
||
read on both processes.
|
||
|
||
Backport of:
|
||
> Patch: https://review.gluster.org/22684
|
||
> Change-Id: I609c7cc5c07e210c504771115b4d551a2e891adf
|
||
> BUG: 1707728
|
||
> Signed-off-by: Kotresh HR <khiremat@redhat.com>
|
||
|
||
Change-Id: I609c7cc5c07e210c504771115b4d551a2e891adf
|
||
fixes: bz#1708116
|
||
Signed-off-by: Kotresh HR <khiremat@redhat.com>
|
||
Reviewed-on: https://code.engineering.redhat.com/gerrit/172395
|
||
Tested-by: RHGS Build Bot <nigelb@redhat.com>
|
||
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
|
||
---
|
||
geo-replication/syncdaemon/resource.py | 22 ++++--
|
||
tests/00-geo-rep/georep-stderr-hang.t | 128 +++++++++++++++++++++++++++++++++
|
||
tests/geo-rep.rc | 17 +++++
|
||
3 files changed, 163 insertions(+), 4 deletions(-)
|
||
create mode 100644 tests/00-geo-rep/georep-stderr-hang.t
|
||
|
||
diff --git a/geo-replication/syncdaemon/resource.py b/geo-replication/syncdaemon/resource.py
|
||
index 522279b..b16db60 100644
|
||
--- a/geo-replication/syncdaemon/resource.py
|
||
+++ b/geo-replication/syncdaemon/resource.py
|
||
@@ -1540,15 +1540,29 @@ class SSH(object):
|
||
|
||
p0.stdin.close()
|
||
p0.stdout.close() # Allow p0 to receive a SIGPIPE if p1 exits.
|
||
- # wait for tar to terminate, collecting any errors, further
|
||
- # waiting for transfer to complete
|
||
- _, stderr1 = p1.communicate()
|
||
|
||
# stdin and stdout of p0 is already closed, Reset to None and
|
||
# wait for child process to complete
|
||
p0.stdin = None
|
||
p0.stdout = None
|
||
- p0.communicate()
|
||
+
|
||
+ def wait_for_tar(p0):
|
||
+ _, stderr = p0.communicate()
|
||
+ if log_err:
|
||
+ for errline in stderr.strip().split("\n")[:-1]:
|
||
+ if "No such file or directory" not in errline:
|
||
+ logging.error(lf("SYNC Error",
|
||
+ sync_engine="Tarssh",
|
||
+ error=errline))
|
||
+
|
||
+ t = syncdutils.Thread(target=wait_for_tar, args=(p0, ))
|
||
+ # wait for tar to terminate, collecting any errors, further
|
||
+ # waiting for transfer to complete
|
||
+ t.start()
|
||
+
|
||
+ # wait for ssh process
|
||
+ _, stderr1 = p1.communicate()
|
||
+ t.join()
|
||
|
||
if log_err:
|
||
for errline in stderr1.strip().split("\n")[:-1]:
|
||
diff --git a/tests/00-geo-rep/georep-stderr-hang.t b/tests/00-geo-rep/georep-stderr-hang.t
|
||
new file mode 100644
|
||
index 0000000..496f0e6
|
||
--- /dev/null
|
||
+++ b/tests/00-geo-rep/georep-stderr-hang.t
|
||
@@ -0,0 +1,128 @@
|
||
+#!/bin/bash
|
||
+
|
||
+. $(dirname $0)/../include.rc
|
||
+. $(dirname $0)/../volume.rc
|
||
+. $(dirname $0)/../geo-rep.rc
|
||
+. $(dirname $0)/../env.rc
|
||
+
|
||
+SCRIPT_TIMEOUT=500
|
||
+
|
||
+AREQUAL_PATH=$(dirname $0)/../utils
|
||
+test "`uname -s`" != "Linux" && {
|
||
+ CFLAGS="$CFLAGS -lintl";
|
||
+}
|
||
+build_tester $AREQUAL_PATH/arequal-checksum.c $CFLAGS
|
||
+
|
||
+### Basic Tests with Distribute Replicate volumes
|
||
+
|
||
+##Cleanup and start glusterd
|
||
+cleanup;
|
||
+TEST glusterd;
|
||
+TEST pidof glusterd
|
||
+
|
||
+
|
||
+##Variables
|
||
+GEOREP_CLI="$CLI volume geo-replication"
|
||
+master=$GMV0
|
||
+SH0="127.0.0.1"
|
||
+slave=${SH0}::${GSV0}
|
||
+num_active=2
|
||
+num_passive=2
|
||
+master_mnt=$M0
|
||
+slave_mnt=$M1
|
||
+
|
||
+############################################################
|
||
+#SETUP VOLUMES AND GEO-REPLICATION
|
||
+############################################################
|
||
+
|
||
+##create_and_start_master_volume
|
||
+TEST $CLI volume create $GMV0 $H0:$B0/${GMV0}1;
|
||
+TEST $CLI volume start $GMV0
|
||
+
|
||
+##create_and_start_slave_volume
|
||
+TEST $CLI volume create $GSV0 $H0:$B0/${GSV0}1;
|
||
+TEST $CLI volume start $GSV0
|
||
+TEST $CLI volume set $GSV0 performance.stat-prefetch off
|
||
+TEST $CLI volume set $GSV0 performance.quick-read off
|
||
+TEST $CLI volume set $GSV0 performance.readdir-ahead off
|
||
+TEST $CLI volume set $GSV0 performance.read-ahead off
|
||
+
|
||
+##Mount master
|
||
+TEST glusterfs -s $H0 --volfile-id $GMV0 $M0
|
||
+
|
||
+##Mount slave
|
||
+TEST glusterfs -s $H0 --volfile-id $GSV0 $M1
|
||
+
|
||
+############################################################
|
||
+#BASIC GEO-REPLICATION TESTS
|
||
+############################################################
|
||
+
|
||
+TEST create_georep_session $master $slave
|
||
+EXPECT_WITHIN $GEO_REP_TIMEOUT 1 check_status_num_rows "Created"
|
||
+
|
||
+#Config gluster-command-dir
|
||
+TEST $GEOREP_CLI $master $slave config gluster-command-dir ${GLUSTER_CMD_DIR}
|
||
+
|
||
+#Config gluster-command-dir
|
||
+TEST $GEOREP_CLI $master $slave config slave-gluster-command-dir ${GLUSTER_CMD_DIR}
|
||
+
|
||
+#Set changelog roll-over time to 45 secs
|
||
+TEST $CLI volume set $GMV0 changelog.rollover-time 45
|
||
+
|
||
+#Wait for common secret pem file to be created
|
||
+EXPECT_WITHIN $GEO_REP_TIMEOUT 0 check_common_secret_file
|
||
+
|
||
+#Verify the keys are distributed
|
||
+EXPECT_WITHIN $GEO_REP_TIMEOUT 0 check_keys_distributed
|
||
+
|
||
+#Set sync-jobs to 1
|
||
+TEST $GEOREP_CLI $master $slave config sync-jobs 1
|
||
+
|
||
+#Start_georep
|
||
+TEST $GEOREP_CLI $master $slave start
|
||
+
|
||
+touch $M0
|
||
+EXPECT_WITHIN $GEO_REP_TIMEOUT 1 check_status_num_rows "Active"
|
||
+EXPECT_WITHIN $GEO_REP_TIMEOUT 1 check_status_num_rows "Changelog Crawl"
|
||
+
|
||
+#Check History Crawl.
|
||
+TEST $GEOREP_CLI $master $slave stop
|
||
+TEST create_data_hang "rsync_hang"
|
||
+TEST create_data "history_rsync"
|
||
+TEST $GEOREP_CLI $master $slave start
|
||
+EXPECT_WITHIN $GEO_REP_TIMEOUT 1 check_status_num_rows "Active"
|
||
+
|
||
+#Verify arequal for whole volume
|
||
+EXPECT_WITHIN $GEO_REP_TIMEOUT "x0" arequal_checksum ${master_mnt} ${slave_mnt}
|
||
+
|
||
+#Stop Geo-rep
|
||
+TEST $GEOREP_CLI $master $slave stop
|
||
+
|
||
+#Config tarssh as sync-engine
|
||
+TEST $GEOREP_CLI $master $slave config sync-method tarssh
|
||
+
|
||
+#Create tarssh hang data
|
||
+TEST create_data_hang "tarssh_hang"
|
||
+TEST create_data "history_tar"
|
||
+
|
||
+TEST $GEOREP_CLI $master $slave start
|
||
+EXPECT_WITHIN $GEO_REP_TIMEOUT 1 check_status_num_rows "Active"
|
||
+
|
||
+#Verify arequal for whole volume
|
||
+EXPECT_WITHIN $GEO_REP_TIMEOUT "x0" arequal_checksum ${master_mnt} ${slave_mnt}
|
||
+
|
||
+#Stop Geo-rep
|
||
+TEST $GEOREP_CLI $master $slave stop
|
||
+
|
||
+#Delete Geo-rep
|
||
+TEST $GEOREP_CLI $master $slave delete
|
||
+
|
||
+#Cleanup are-equal binary
|
||
+TEST rm $AREQUAL_PATH/arequal-checksum
|
||
+
|
||
+#Cleanup authorized keys
|
||
+sed -i '/^command=.*SSH_ORIGINAL_COMMAND#.*/d' ~/.ssh/authorized_keys
|
||
+sed -i '/^command=.*gsyncd.*/d' ~/.ssh/authorized_keys
|
||
+
|
||
+cleanup;
|
||
+#G_TESTDEF_TEST_STATUS_NETBSD7=BAD_TEST,BUG=000000
|
||
diff --git a/tests/geo-rep.rc b/tests/geo-rep.rc
|
||
index 2035b9f..e4f014e 100644
|
||
--- a/tests/geo-rep.rc
|
||
+++ b/tests/geo-rep.rc
|
||
@@ -101,6 +101,23 @@ function create_data()
|
||
chown 1000:1000 ${master_mnt}/${prefix}_chown_f1_ಸಂತಸ
|
||
}
|
||
|
||
+function create_data_hang()
|
||
+{
|
||
+ prefix=$1
|
||
+ mkdir ${master_mnt}/${prefix}
|
||
+ cd ${master_mnt}/${prefix}
|
||
+ # ~1k files is required with 1 sync-job and hang happens if
|
||
+ # stderr buffer of tar/ssh executed with Popen is full (i.e., 64k).
|
||
+ # 64k is hit when ~800 files were not found while syncing data
|
||
+ # from master. So around 1k files is required to hit the condition.
|
||
+ for i in {1..1000}
|
||
+ do
|
||
+ echo "test data" > file$i
|
||
+ mv -f file$i file
|
||
+ done
|
||
+ cd -
|
||
+}
|
||
+
|
||
function chown_file_ok()
|
||
{
|
||
local file_owner=$(stat --format "%u:%g" "$1")
|
||
--
|
||
1.8.3.1
|
||
|