diff --git a/bz2110038-mysql-common-improve-error-message.patch b/bz2110038-mysql-common-improve-error-message.patch new file mode 100644 index 0000000..4a19fc4 --- /dev/null +++ b/bz2110038-mysql-common-improve-error-message.patch @@ -0,0 +1,68 @@ +From fcceb714085836de9db4493b527e94d85dd72626 Mon Sep 17 00:00:00 2001 +From: ut002970 +Date: Wed, 6 Sep 2023 15:27:05 +0800 +Subject: [PATCH 1/3] modify error message + +--- + heartbeat/mysql-common.sh | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/heartbeat/mysql-common.sh b/heartbeat/mysql-common.sh +index 8104019b03..a93acc4c60 100755 +--- a/heartbeat/mysql-common.sh ++++ b/heartbeat/mysql-common.sh +@@ -254,7 +254,7 @@ mysql_common_start() + while [ $start_wait = 1 ]; do + if ! ps $pid > /dev/null 2>&1; then + wait $pid +- ocf_exit_reason "MySQL server failed to start (pid=$pid) (rc=$?), please check your installation" ++ ocf_exit_reason "MySQL server failed to start (pid=$pid) (rc=$?), please check your installation, log message you can check $OCF_RESKEY_log" + return $OCF_ERR_GENERIC + fi + mysql_common_status info + +From 8f9b344cd5b3cb96ea0f94b7ab0306da2234ac00 Mon Sep 17 00:00:00 2001 +From: ut002970 +Date: Wed, 6 Sep 2023 15:56:24 +0800 +Subject: [PATCH 2/3] modify error message + +--- + heartbeat/mysql-common.sh | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/heartbeat/mysql-common.sh b/heartbeat/mysql-common.sh +index a93acc4c60..d5b2286737 100755 +--- a/heartbeat/mysql-common.sh ++++ b/heartbeat/mysql-common.sh +@@ -254,7 +254,7 @@ mysql_common_start() + while [ $start_wait = 1 ]; do + if ! ps $pid > /dev/null 2>&1; then + wait $pid +- ocf_exit_reason "MySQL server failed to start (pid=$pid) (rc=$?), please check your installation, log message you can check $OCF_RESKEY_log" ++ ocf_exit_reason "MySQL server failed to start (pid=$pid) (rc=$?), Check $OCF_RESKEY_log for details" + return $OCF_ERR_GENERIC + fi + mysql_common_status info + +From a292b3c552bf3f2beea5f73e0d171546c0a1273c Mon Sep 17 00:00:00 2001 +From: ut002970 +Date: Wed, 6 Sep 2023 16:10:48 +0800 +Subject: [PATCH 3/3] modify error message + +--- + heartbeat/mysql-common.sh | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/heartbeat/mysql-common.sh b/heartbeat/mysql-common.sh +index d5b2286737..d6b4e3cdf4 100755 +--- a/heartbeat/mysql-common.sh ++++ b/heartbeat/mysql-common.sh +@@ -254,7 +254,7 @@ mysql_common_start() + while [ $start_wait = 1 ]; do + if ! ps $pid > /dev/null 2>&1; then + wait $pid +- ocf_exit_reason "MySQL server failed to start (pid=$pid) (rc=$?), Check $OCF_RESKEY_log for details" ++ ocf_exit_reason "MySQL server failed to start (pid=$pid) (rc=$?). Check $OCF_RESKEY_log for details" + return $OCF_ERR_GENERIC + fi + mysql_common_status info diff --git a/resource-agents.spec b/resource-agents.spec index 4fab320..a94dc60 100644 --- a/resource-agents.spec +++ b/resource-agents.spec @@ -45,7 +45,7 @@ Name: resource-agents Summary: Open Source HA Reusable Cluster Resource Scripts Version: 4.10.0 -Release: 43%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist} +Release: 44%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist} License: GPLv2+ and LGPLv2+ URL: https://github.com/ClusterLabs/resource-agents Source0: %{upstream_prefix}-%{upstream_version}.tar.gz @@ -109,6 +109,8 @@ Patch56: bz2207567-Filesystem-2-fix-incorrect-parameter-types.patch Patch57: bz2209433-Delay-1-increase-default-timeouts.patch Patch58: bz2209433-Delay-2-remove-incorrect-statement.patch Patch59: bz2207567-Filesystem-3-fix-signal_delay-default-value.patch +Patch60: rhel-979-storage-mon-daemon-mode.patch +Patch61: bz2110038-mysql-common-improve-error-message.patch # bundled ha-cloud-support libs Patch500: ha-cloud-support-aws.patch @@ -121,7 +123,7 @@ Provides: heartbeat-resources = %{version} # Build dependencies BuildRequires: make BuildRequires: automake autoconf pkgconfig gcc -BuildRequires: libxslt glib2-devel +BuildRequires: libxslt glib2-devel libqb-devel BuildRequires: systemd BuildRequires: which @@ -232,71 +234,73 @@ databases to be managed in a cluster environment. exit 1 %endif %setup -q -n %{upstream_prefix}-%{upstream_version} -%patch0 -p1 -F1 -%patch1 -p1 -%patch2 -p1 -%patch3 -p1 -%patch4 -p1 -%patch5 -p1 -%patch6 -p1 -%patch7 -p1 -%patch8 -p1 -%patch9 -p1 -%patch10 -p1 -%patch11 -p1 -%patch12 -p1 -%patch13 -p1 -%patch14 -p1 -%patch15 -p1 -%patch16 -p1 -%patch17 -p1 -%patch18 -p1 -%patch19 -p1 -%patch20 -p1 -%patch21 -p1 -%patch22 -p1 -%patch23 -p1 -%patch24 -p1 -%patch25 -p1 -%patch26 -p1 -%patch27 -p1 -%patch28 -p1 -%patch29 -p1 -%patch30 -p1 -%patch31 -p1 -%patch32 -p1 -%patch33 -p1 -%patch34 -p1 -%patch35 -p1 -%patch36 -p1 -%patch37 -p1 -%patch38 -p1 -%patch39 -p1 -%patch40 -p1 -%patch41 -p1 -%patch42 -p1 -%patch43 -p1 -%patch44 -p1 -%patch45 -p1 -%patch46 -p1 -%patch47 -p1 -%patch48 -p1 -%patch49 -p1 -%patch50 -p1 -%patch51 -p1 -%patch52 -p1 -%patch53 -p1 -%patch54 -p1 -%patch55 -p1 -%patch56 -p1 -%patch57 -p1 -%patch58 -p1 -%patch59 -p1 +%patch -p1 -P 0 -F1 +%patch -p1 -P 1 +%patch -p1 -P 2 +%patch -p1 -P 3 +%patch -p1 -P 4 +%patch -p1 -P 5 +%patch -p1 -P 6 +%patch -p1 -P 7 +%patch -p1 -P 8 +%patch -p1 -P 9 +%patch -p1 -P 10 +%patch -p1 -P 11 +%patch -p1 -P 12 +%patch -p1 -P 13 +%patch -p1 -P 14 +%patch -p1 -P 15 +%patch -p1 -P 16 +%patch -p1 -P 17 +%patch -p1 -P 18 +%patch -p1 -P 19 +%patch -p1 -P 20 +%patch -p1 -P 21 +%patch -p1 -P 22 +%patch -p1 -P 23 +%patch -p1 -P 24 +%patch -p1 -P 25 +%patch -p1 -P 26 +%patch -p1 -P 27 +%patch -p1 -P 28 +%patch -p1 -P 29 +%patch -p1 -P 30 +%patch -p1 -P 31 +%patch -p1 -P 32 +%patch -p1 -P 33 +%patch -p1 -P 34 +%patch -p1 -P 35 +%patch -p1 -P 36 +%patch -p1 -P 37 +%patch -p1 -P 38 +%patch -p1 -P 39 +%patch -p1 -P 40 +%patch -p1 -P 41 +%patch -p1 -P 42 +%patch -p1 -P 43 +%patch -p1 -P 44 +%patch -p1 -P 45 +%patch -p1 -P 46 +%patch -p1 -P 47 +%patch -p1 -P 48 +%patch -p1 -P 49 +%patch -p1 -P 50 +%patch -p1 -P 51 +%patch -p1 -P 52 +%patch -p1 -P 53 +%patch -p1 -P 54 +%patch -p1 -P 55 +%patch -p1 -P 56 +%patch -p1 -P 57 +%patch -p1 -P 58 +%patch -p1 -P 59 +%patch -p1 -P 60 +%patch -p1 -P 61 # bundled ha-cloud-support libs -%patch500 -p1 -%patch501 -p1 -%patch502 -p1 +%patch -p1 -P 500 +%patch -p1 -P 501 +%patch -p1 -P 502 chmod 755 heartbeat/nova-compute-wait chmod 755 heartbeat/NovaEvacuate @@ -613,6 +617,12 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm %changelog +* Wed Sep 6 2023 Oyvind Albrigtsen - 4.10.0-44 +- storage-mon: add daemon-mode to deal with I/O hangs +- mysql-common: improve error message + + Resolves: RHEL-979, rhbz#2110038 + * Thu Jul 20 2023 Oyvind Albrigtsen - 4.10.0-43 - Filesystem: improve stop-action and allow setting term/kill signals and signal_delay for large filesystems diff --git a/rhel-979-storage-mon-daemon-mode.patch b/rhel-979-storage-mon-daemon-mode.patch new file mode 100644 index 0000000..ee7280b --- /dev/null +++ b/rhel-979-storage-mon-daemon-mode.patch @@ -0,0 +1,1566 @@ +From 6045e383f65432084cd07032eb5515cb8231dc04 Mon Sep 17 00:00:00 2001 +From: Hideo Yamauchi +Date: Mon, 24 Jul 2023 06:46:23 +0900 +Subject: [PATCH 1/4] Mid: storage-mon: Functionalization of test_device call + processing. + +--- + tools/storage_mon.c | 141 +++++++++++++++++++++++--------------------- + 1 file changed, 75 insertions(+), 66 deletions(-) + +diff --git a/tools/storage_mon.c b/tools/storage_mon.c +index f829c50814..b0e277cbe0 100644 +--- a/tools/storage_mon.c ++++ b/tools/storage_mon.c +@@ -146,18 +146,87 @@ static void *test_device(const char *device, int verbose, int inject_error_perce + exit(-1); + } + ++static int test_device_main(size_t device_count, char *devices[MAX_DEVICES], int scores[MAX_DEVICES], int verbose, int inject_error_percent, int timeout) ++{ ++ pid_t test_forks[MAX_DEVICES]; ++ size_t i; ++ struct timespec ts; ++ time_t start_time; ++ size_t finished_count = 0; ++ int final_score = 0; ++ ++ memset(test_forks, 0, sizeof(test_forks)); ++ for (i=0; i ts.tv_sec)) { ++ for (i=0; i 0) { ++ w = waitpid(test_forks[i], &wstatus, WUNTRACED | WNOHANG | WCONTINUED); ++ if (w < 0) { ++ fprintf(stderr, "waitpid on %s failed: %s\n", devices[i], strerror(errno)); ++ return -1; ++ } ++ ++ if (w == test_forks[i]) { ++ if (WIFEXITED(wstatus)) { ++ if (WEXITSTATUS(wstatus) != 0) { ++ syslog(LOG_ERR, "Error reading from device %s", devices[i]); ++ final_score += scores[i]; ++ } ++ ++ finished_count++; ++ test_forks[i] = 0; ++ } ++ } ++ } ++ } ++ ++ usleep(100000); ++ ++ clock_gettime(CLOCK_REALTIME, &ts); ++ } ++ ++ /* See which threads have not finished */ ++ for (i=0; i ts.tv_sec)) { +- for (i=0; i 0) { +- w = waitpid(test_forks[i], &wstatus, WUNTRACED | WNOHANG | WCONTINUED); +- if (w < 0) { +- fprintf(stderr, "waitpid on %s failed: %s\n", devices[i], strerror(errno)); +- return -1; +- } +- +- if (w == test_forks[i]) { +- if (WIFEXITED(wstatus)) { +- if (WEXITSTATUS(wstatus) != 0) { +- syslog(LOG_ERR, "Error reading from device %s", devices[i]); +- final_score += scores[i]; +- } +- +- finished_count++; +- test_forks[i] = 0; +- } +- } +- } +- } +- +- usleep(100000); +- +- clock_gettime(CLOCK_REALTIME, &ts); +- } +- +- /* See which threads have not finished */ +- for (i=0; i +Date: Mon, 24 Jul 2023 06:47:20 +0900 +Subject: [PATCH 2/4] Mid: storage-mon: Added daemon/client mode. + +--- + configure.ac | 1 + + heartbeat/storage-mon.in | 222 +++++++++--- + resource-agents.spec.in | 2 +- + tools/Makefile.am | 3 +- + tools/storage_mon.c | 724 +++++++++++++++++++++++++++++++++++---- + 5 files changed, 828 insertions(+), 124 deletions(-) + +diff --git a/configure.ac b/configure.ac +index 7b5faff584..74766899b8 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -620,6 +620,7 @@ fi + PKG_CHECK_MODULES([GLIB], [$GPKGNAME]) + CPPFLAGS="$CPPFLAGS $GLIB_CFLAGS" + LIBS="$LIBS $GLIB_LIBS" ++PKG_CHECK_MODULES([LIBQB], "libqb") + + dnl ======================================================================== + dnl Headers +diff --git a/heartbeat/storage-mon.in b/heartbeat/storage-mon.in +index d764b49d7c..81d8f5bcec 100644 +--- a/heartbeat/storage-mon.in ++++ b/heartbeat/storage-mon.in +@@ -48,20 +48,26 @@ + . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + + # +-STORAGEMON=$HA_BIN/storage_mon +-ATTRDUP=/usr/sbin/attrd_updater ++STORAGEMON=${HA_BIN}/storage_mon ++ATTRDUP=${HA_SBIN_DIR}/attrd_updater ++PIDFILE=${HA_VARRUN}/storage-mon-${OCF_RESOURCE_INSTANCE}.pid ++ATTRNAME="#health-${OCF_RESOURCE_INSTANCE}" + + OCF_RESKEY_CRM_meta_interval_default="0" + OCF_RESKEY_io_timeout_default="10" ++OCF_RESKEY_check_interval_default="30" + OCF_RESKEY_inject_errors_default="" + OCF_RESKEY_state_file_default="${HA_RSCTMP%%/}/storage-mon-${OCF_RESOURCE_INSTANCE}.state" ++OCF_RESKEY_daemonize_default="" + + # Explicitly list all environment variables used, to make static analysis happy + : ${OCF_RESKEY_CRM_meta_interval:=${OCF_RESKEY_CRM_meta_interval_default}} + : ${OCF_RESKEY_drives:=""} + : ${OCF_RESKEY_io_timeout:=${OCF_RESKEY_io_timeout_default}} ++: ${OCF_RESKEY_check_interval:=${OCF_RESKEY_check_interval_default}} + : ${OCF_RESKEY_inject_errors:=${OCF_RESKEY_inject_errors_default}} + : ${OCF_RESKEY_state_file:=${OCF_RESKEY_state_file_default}} ++: ${OCF_RESKEY_daemonize:=${OCF_RESKEY_daemonize_default}} + + ####################################################################### + +@@ -106,6 +112,14 @@ Specify disk I/O timeout in seconds. Minimum 1, recommended 10 (default). + + + ++ ++ ++Specify interval between I/O checks in seconds.(Only supported with the damonize option.) ++ ++I/O check interval ++ ++ ++ + + + Used only for testing! Specify % of I/O errors to simulate drives failures. +@@ -114,6 +128,14 @@ Used only for testing! Specify % of I/O errors to simulate drives failures. + + + ++ ++ ++Specifies to start storage-mon as a daemon and check for devices. ++ ++start storage-mon with daemon ++ ++ ++ + + + +@@ -146,6 +168,11 @@ storage-mon_init() { + exit $OCF_ERR_INSTALLED + fi + ++ if [ ! -x "$ATTRDUP" ] ; then ++ ocf_log err "${ATTRDUP} not installed." ++ exit $OCF_ERR_INSTALLED ++ fi ++ + i=0 + for DRIVE in ${OCF_RESKEY_drives}; do + if [ ! -e "$DRIVE" ] ; then +@@ -161,7 +188,12 @@ storage-mon_init() { + fi + + if [ "${OCF_RESKEY_io_timeout}" -lt "1" ]; then +- ocf_log err "Minimum timeout is 1. Recommended 10 (default)." ++ ocf_log err "Minimum timeout is 1. Recommended ${OCF_RESKEY_io_timeout_default} (default)." ++ exit $OCF_ERR_CONFIGURED ++ fi ++ ++ if [ "${OCF_RESKEY_check_interval}" -lt "1" ]; then ++ ocf_log err "Minimum interval to check is 1. default ${OCF_RESKEY_check_interval_default}." + exit $OCF_ERR_CONFIGURED + fi + +@@ -173,63 +205,147 @@ storage-mon_init() { + fi + } + +-storage-mon_validate() { +- storage-mon_init +- +- # Is the state directory writable? +- state_dir=$(dirname "$OCF_RESKEY_state_file") +- touch "$state_dir/$$" +- if [ $? -ne 0 ]; then +- return $OCF_ERR_CONFIGURED +- fi +- rm "$state_dir/$$" +- +- return $OCF_SUCCESS +-} +- + storage-mon_monitor() { +- storage-mon_init ++ if [ -z "$OCF_RESKEY_daemonize" ]; then ++ storage-mon_init + +- # Monitor _MUST!_ differentiate correctly between running +- # (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING). +- # That is THREE states, not just yes/no. ++ # Monitor _MUST!_ differentiate correctly between running ++ # (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING). ++ # That is THREE states, not just yes/no. + +- if [ ! -f "${OCF_RESKEY_state_file}" ]; then +- return $OCF_NOT_RUNNING +- fi ++ if [ ! -f "${OCF_RESKEY_state_file}" ]; then ++ return $OCF_NOT_RUNNING ++ fi + +- # generate command line +- cmdline="" +- for DRIVE in ${OCF_RESKEY_drives}; do +- cmdline="$cmdline --device $DRIVE --score 1" +- done +- cmdline="$cmdline --timeout ${OCF_RESKEY_io_timeout}" +- if [ -n "${OCF_RESKEY_inject_errors}" ]; then +- cmdline="$cmdline --inject-errors-percent ${OCF_RESKEY_inject_errors}" +- fi +- $STORAGEMON $cmdline +- if [ $? -ne 0 ]; then +- status="red" ++ # generate command line ++ cmdline="" ++ for DRIVE in ${OCF_RESKEY_drives}; do ++ cmdline="$cmdline --device $DRIVE --score 1" ++ done ++ cmdline="$cmdline --timeout ${OCF_RESKEY_io_timeout}" ++ if [ -n "${OCF_RESKEY_inject_errors}" ]; then ++ cmdline="$cmdline --inject-errors-percent ${OCF_RESKEY_inject_errors}" ++ fi ++ $STORAGEMON $cmdline ++ if [ $? -ne 0 ]; then ++ status="red" ++ else ++ status="green" ++ fi ++ ++ "$ATTRDUP" -n ${ATTRNAME} -U "$status" -d "5s" ++ return $OCF_SUCCESS + else +- status="green" +- fi ++ ocf_pidfile_status "${PIDFILE}" > /dev/null 2>&1 ++ case "$?" in ++ 0) rc=$OCF_SUCCESS;; ++ 1|2) rc=$OCF_NOT_RUNNING;; ++ *) rc=$OCF_ERR_GENERIC;; ++ esac ++ ++ if [ $rc -ne $OCF_SUCCESS ]; then ++ return "$rc" ++ fi ++ if [ "$1" = "pid_check_only" ]; then ++ return "$rc" ++ fi + +- "$ATTRDUP" -n "#health-${OCF_RESOURCE_INSTANCE}" -U "$status" -d "5s" +- return $OCF_SUCCESS ++ # generate client command line ++ cmdline="" ++ cmdline="$cmdline --client --attrname ${ATTRNAME}" ++ while : ++ do ++ # 0 : Normal. ++ # greater than 0 : monitoring error. ++ # 255(-1) : communication system error. ++ # 254(-2) : Not all checks completed for first device in daemon mode. ++ $STORAGEMON $cmdline ++ rc=$? ++ case "$rc" in ++ 254|255) ++ # If there is a communication error or the initial check of all devices has not been completed, ++ # it will loop and try to reconnect. ++ # When everything ends with a communication error during monitor, a monitor timeout occurs. ++ ocf_log debug "client monitor error : $rc" ++ ;; ++ 0) ++ status="green" ++ break ++ ;; ++ *) ++ status="red" ++ break ++ ;; ++ esac ++ done ++ ++ "$ATTRDUP" -n ${ATTRNAME} -U "$status" -d "5s" ++ return $OCF_SUCCESS ++ fi + } + + storage-mon_start() { +- storage-mon_monitor +- if [ $? -eq $OCF_SUCCESS ]; then +- return $OCF_SUCCESS ++ if [ -z "$OCF_RESKEY_daemonize" ]; then ++ storage-mon_monitor ++ if [ $? -eq $OCF_SUCCESS ]; then ++ return $OCF_SUCCESS ++ fi ++ touch "${OCF_RESKEY_state_file}" ++ else ++ storage-mon_init ++ # generate command line ++ cmdline="" ++ for DRIVE in ${OCF_RESKEY_drives}; do ++ cmdline="$cmdline --device $DRIVE --score 1" ++ done ++ #cmdline="$cmdline --daemonize --timeout ${OCF_RESKEY_io_timeout} --interval ${OCF_RESKEY_check_interval} --pidfile ${PIDFILE} --attrname ${ATTRNAME} --ha-sbin-dir ${HA_SBIN_DIR}" ++ cmdline="$cmdline --daemonize --timeout ${OCF_RESKEY_io_timeout} --interval ${OCF_RESKEY_check_interval} --pidfile ${PIDFILE} --attrname ${ATTRNAME}" ++ if [ -n "${OCF_RESKEY_inject_errors}" ]; then ++ cmdline="$cmdline --inject-errors-percent ${OCF_RESKEY_inject_errors}" ++ fi ++ $STORAGEMON $cmdline ++ if [ "$?" -ne 0 ]; then ++ return $OCF_ERR_GENERIC ++ fi + fi +- touch "${OCF_RESKEY_state_file}" + } + + storage-mon_stop() { + storage-mon_monitor +- if [ $? -eq $OCF_SUCCESS ]; then +- rm "${OCF_RESKEY_state_file}" ++ rc=$? ++ ++ if [ -z "$OCF_RESKEY_daemonize" ]; then ++ if [ $rc -eq $OCF_SUCCESS ]; then ++ rm "${OCF_RESKEY_state_file}" ++ fi ++ else ++ case "$rc" in ++ $OCF_SUCCESS) ++ ;; ++ $OCF_NOT_RUNNING) ++ return "$OCF_SUCCESS";; ++ *) ++ return "$rc";; ++ esac ++ ++ kill -TERM $(cat "${PIDFILE}") ++ if [ "$?" -ne 0 ]; then ++ return $OCF_ERR_GENERIC ++ fi ++ ++ while true; do ++ storage-mon_monitor pid_check_only ++ rc="$?" ++ case "$rc" in ++ $OCF_SUCCESS) ++ ;; ++ $OCF_NOT_RUNNING) ++ return "$OCF_SUCCESS";; ++ *) ++ return "$rc";; ++ esac ++ sleep 1 ++ done + fi + return $OCF_SUCCESS + } +@@ -237,13 +353,15 @@ storage-mon_stop() { + storage-mon_validate() { + storage-mon_init + +- # Is the state directory writable? +- state_dir=$(dirname "${OCF_RESKEY_state_file}") +- touch "$state_dir/$$" +- if [ $? -ne 0 ]; then +- return $OCF_ERR_CONFIGURED ++ if [ -z "$OCF_RESKEY_daemonize" ]; then ++ # Is the state directory writable? ++ state_dir=$(dirname "${OCF_RESKEY_state_file}") ++ touch "$state_dir/$$" ++ if [ $? -ne 0 ]; then ++ return $OCF_ERR_CONFIGURED ++ fi ++ rm "$state_dir/$$" + fi +- rm "$state_dir/$$" + + return $OCF_SUCCESS + } +diff --git a/resource-agents.spec.in b/resource-agents.spec.in +index 2ffa00d946..1cbf28c033 100644 +--- a/resource-agents.spec.in ++++ b/resource-agents.spec.in +@@ -55,7 +55,7 @@ Provides: heartbeat-resources = %{version} + BuildRequires: make + BuildRequires: automake autoconf pkgconfig gcc + BuildRequires: perl +-BuildRequires: libxslt glib2-devel ++BuildRequires: libxslt glib2-devel libqb-devel + BuildRequires: systemd + BuildRequires: which + +diff --git a/tools/Makefile.am b/tools/Makefile.am +index 08323fee3a..55e292cec5 100644 +--- a/tools/Makefile.am ++++ b/tools/Makefile.am +@@ -74,7 +74,8 @@ sfex_stat_LDADD = $(GLIBLIB) -lplumb -lplumbgpl + findif_SOURCES = findif.c + + storage_mon_SOURCES = storage_mon.c +-storage_mon_CFLAGS = -D_GNU_SOURCE ++storage_mon_CFLAGS = -D_GNU_SOURCE ${LIBQB_CFLAGS} ++storage_mon_LDADD = ${LIBQB_LIBS} + + if BUILD_TICKLE + halib_PROGRAMS += tickle_tcp +diff --git a/tools/storage_mon.c b/tools/storage_mon.c +index b0e277cbe0..1231570c85 100644 +--- a/tools/storage_mon.c ++++ b/tools/storage_mon.c +@@ -16,9 +16,87 @@ + #ifdef __FreeBSD__ + #include + #endif ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include + + #define MAX_DEVICES 25 + #define DEFAULT_TIMEOUT 10 ++#define DEFAULT_INTERVAL 30 ++#define DEFAULT_PIDFILE HA_VARRUNDIR "storage_mon.pid" ++#define DEFAULT_ATTRNAME "#health-storage_mon" ++#define SMON_GET_RESULT_COMMAND "get_check_value" ++#define SMON_RESULT_OK "green" ++#define SMON_RESULT_NG "red" ++#define SMON_RESULT_COMMAND_ERROR "unknown command" ++#define SMON_BUFF_1MEG 1048576 ++#define SMON_MAX_IPCSNAME 256 ++#define SMON_MAX_MSGSIZE 128 ++#define SMON_MAX_RESP_SIZE 100 ++ ++#define PRINT_STORAGE_MON_ERR(fmt, ...) if (!daemonize) { \ ++ fprintf(stderr, fmt"\n", __VA_ARGS__); \ ++ } else { \ ++ syslog(LOG_ERR, fmt, __VA_ARGS__); \ ++ } ++#define PRINT_STORAGE_MON_ERR_NOARGS(str) if (!daemonize) { \ ++ fprintf(stderr, str"\n"); \ ++ } else { \ ++ syslog(LOG_ERR, str); \ ++ } ++ ++#define PRINT_STORAGE_MON_INFO(fmt, ...) if (!daemonize) { \ ++ printf(fmt"\n", __VA_ARGS__); \ ++ } else { \ ++ syslog(LOG_INFO, fmt, __VA_ARGS__); \ ++ } ++ ++struct storage_mon_timer_data { ++ int interval; ++}; ++ ++struct storage_mon_check_value_req { ++ struct qb_ipc_request_header hdr; ++ char message[SMON_MAX_MSGSIZE]; ++}; ++ ++ ++struct storage_mon_check_value_res { ++ struct qb_ipc_response_header hdr; ++ char message[SMON_MAX_MSGSIZE]; ++}; ++ ++ ++char *devices[MAX_DEVICES]; ++int scores[MAX_DEVICES]; ++size_t device_count = 0; ++int timeout = DEFAULT_TIMEOUT; ++int verbose = 0; ++int inject_error_percent = 0; ++const char *attrname = DEFAULT_ATTRNAME; ++gboolean daemonize = FALSE; ++int shutting_down = FALSE; ++static qb_ipcs_service_t *ipcs; ++int final_score = 0; ++int response_final_score = 0; ++pid_t test_forks[MAX_DEVICES]; ++size_t finished_count = 0; ++gboolean daemon_check_first_all_devices = FALSE; ++ ++static qb_loop_t *storage_mon_poll_handle; ++static qb_loop_timer_handle timer_handle; ++static qb_loop_timer_handle expire_handle; ++static struct storage_mon_timer_data timer_d; ++ ++static int test_device_main(gpointer data); ++static void wrap_test_device_main(void *data); + + static void usage(char *name, FILE *f) + { +@@ -27,6 +105,11 @@ static void usage(char *name, FILE *f) + fprintf(f, " --score score if device fails the test. Must match --device count\n"); + fprintf(f, " --timeout max time to wait for a device test to come back. in seconds (default %d)\n", DEFAULT_TIMEOUT); + fprintf(f, " --inject-errors-percent Generate EIO errors %% of the time (for testing only)\n"); ++ fprintf(f, " --daemonize test run in daemons.\n"); ++ fprintf(f, " --client client connection to daemon. requires the attrname option.\n"); ++ fprintf(f, " --interval interval to test. in seconds (default %d)(for daemonize only)\n", DEFAULT_INTERVAL); ++ fprintf(f, " --pidfile file path to record pid (default %s)(for daemonize only)\n", DEFAULT_PIDFILE); ++ fprintf(f, " --attrname attribute name to update test result (default %s)(for daemonize/client only)\n", DEFAULT_ATTRNAME); + fprintf(f, " --verbose emit extra output to stdout\n"); + fprintf(f, " --help print this message\n"); + } +@@ -47,13 +130,13 @@ static void *test_device(const char *device, int verbose, int inject_error_perce + device_fd = open(device, flags); + if (device_fd < 0) { + if (errno != EINVAL) { +- fprintf(stderr, "Failed to open %s: %s\n", device, strerror(errno)); ++ PRINT_STORAGE_MON_ERR("Failed to open %s: %s", device, strerror(errno)); + exit(-1); + } + flags &= ~O_DIRECT; + device_fd = open(device, flags); + if (device_fd < 0) { +- fprintf(stderr, "Failed to open %s: %s\n", device, strerror(errno)); ++ PRINT_STORAGE_MON_ERR("Failed to open %s: %s", device, strerror(errno)); + exit(-1); + } + } +@@ -63,11 +146,11 @@ static void *test_device(const char *device, int verbose, int inject_error_perce + res = ioctl(device_fd, BLKGETSIZE64, &devsize); + #endif + if (res < 0) { +- fprintf(stderr, "Failed to get device size for %s: %s\n", device, strerror(errno)); ++ PRINT_STORAGE_MON_ERR("Failed to get device size for %s: %s", device, strerror(errno)); + goto error; + } + if (verbose) { +- printf("%s: opened %s O_DIRECT, size=%zu\n", device, (flags & O_DIRECT)?"with":"without", devsize); ++ PRINT_STORAGE_MON_INFO("%s: opened %s O_DIRECT, size=%zu", device, (flags & O_DIRECT)?"with":"without", devsize); + } + + /* Don't fret about real randomness */ +@@ -76,11 +159,11 @@ static void *test_device(const char *device, int verbose, int inject_error_perce + seek_spot = (rand() % (devsize-1024)) & 0xFFFFFFFFFFFFFE00; + res = lseek(device_fd, seek_spot, SEEK_SET); + if (res < 0) { +- fprintf(stderr, "Failed to seek %s: %s\n", device, strerror(errno)); ++ PRINT_STORAGE_MON_ERR("Failed to seek %s: %s", device, strerror(errno)); + goto error; + } + if (verbose) { +- printf("%s: reading from pos %ld\n", device, seek_spot); ++ PRINT_STORAGE_MON_INFO("%s: reading from pos %ld", device, seek_spot); + } + + if (flags & O_DIRECT) { +@@ -93,22 +176,22 @@ static void *test_device(const char *device, int verbose, int inject_error_perce + res = ioctl(device_fd, BLKSSZGET, &sec_size); + #endif + if (res < 0) { +- fprintf(stderr, "Failed to get block device sector size for %s: %s\n", device, strerror(errno)); ++ PRINT_STORAGE_MON_ERR("Failed to get block device sector size for %s: %s", device, strerror(errno)); + goto error; + } + + if (posix_memalign(&buffer, sysconf(_SC_PAGESIZE), sec_size) != 0) { +- fprintf(stderr, "Failed to allocate aligned memory: %s\n", strerror(errno)); ++ PRINT_STORAGE_MON_ERR("Failed to allocate aligned memory: %s", strerror(errno)); + goto error; + } + res = read(device_fd, buffer, sec_size); + free(buffer); + if (res < 0) { +- fprintf(stderr, "Failed to read %s: %s\n", device, strerror(errno)); ++ PRINT_STORAGE_MON_ERR("Failed to read %s: %s", device, strerror(errno)); + goto error; + } + if (res < sec_size) { +- fprintf(stderr, "Failed to read %d bytes from %s, got %d\n", sec_size, device, res); ++ PRINT_STORAGE_MON_ERR("Failed to read %d bytes from %s, got %d", sec_size, device, res); + goto error; + } + } else { +@@ -116,28 +199,28 @@ static void *test_device(const char *device, int verbose, int inject_error_perce + + res = read(device_fd, buffer, sizeof(buffer)); + if (res < 0) { +- fprintf(stderr, "Failed to read %s: %s\n", device, strerror(errno)); ++ PRINT_STORAGE_MON_ERR("Failed to read %s: %s", device, strerror(errno)); + goto error; + } + if (res < (int)sizeof(buffer)) { +- fprintf(stderr, "Failed to read %ld bytes from %s, got %d\n", sizeof(buffer), device, res); ++ PRINT_STORAGE_MON_ERR("Failed to read %ld bytes from %s, got %d", sizeof(buffer), device, res); + goto error; + } + } + + /* Fake an error */ + if (inject_error_percent && ((rand() % 100) < inject_error_percent)) { +- fprintf(stderr, "People, please fasten your seatbelts, injecting errors!\n"); ++ PRINT_STORAGE_MON_ERR_NOARGS("People, please fasten your seatbelts, injecting errors!"); + goto error; + } + res = close(device_fd); + if (res != 0) { +- fprintf(stderr, "Failed to close %s: %s\n", device, strerror(errno)); ++ PRINT_STORAGE_MON_ERR("Failed to close %s: %s", device, strerror(errno)); + exit(-1); + } + + if (verbose) { +- printf("%s: done\n", device); ++ PRINT_STORAGE_MON_INFO("%s: done", device); + } + exit(0); + +@@ -146,101 +229,563 @@ static void *test_device(const char *device, int verbose, int inject_error_perce + exit(-1); + } + +-static int test_device_main(size_t device_count, char *devices[MAX_DEVICES], int scores[MAX_DEVICES], int verbose, int inject_error_percent, int timeout) ++static gboolean is_child_runnning(void) + { +- pid_t test_forks[MAX_DEVICES]; + size_t i; +- struct timespec ts; +- time_t start_time; +- size_t finished_count = 0; +- int final_score = 0; + +- memset(test_forks, 0, sizeof(test_forks)); + for (i=0; i ts.tv_sec)) { ++ /* If there is an unfired timer, stop it. */ ++ qb_loop_timer_del(storage_mon_poll_handle, timer_handle); ++ ++ /* Send SIGTERM to non-terminating device monitoring processes. */ ++ if (is_child_runnning()) { ++ /* See if threads have finished */ + for (i=0; i 0 ) { ++ stop_child(test_forks[i], SIGTERM); ++ } ++ } + ++ } ++ ++ /* Set a timer for termination. */ ++ qb_loop_timer_add(storage_mon_poll_handle, QB_LOOP_HIGH, 0, NULL, wrap_test_device_main, &timer_handle); ++ ++ return 0; ++} ++ ++static size_t find_child_pid(int pid) ++{ ++ size_t i; ++ ++ for (i=0; i 0 ) { ++ if (test_forks[i] == pid) { ++ return i; ++ } ++ } ++ } ++ return -1; ++} ++ ++static int32_t sigchld_handler(int32_t sig, void *data) ++{ ++ pid_t pid; ++ size_t index; ++ int status; ++ ++ if (is_child_runnning()) { ++ while(1) { ++ pid = waitpid(-1, &status, WNOHANG); ++ if (pid > 0) { ++ if (WIFEXITED(status)) { ++ index = find_child_pid(pid); ++ if (index >= 0) { ++ /* If the expire timer is running, no timeout has occurred, */ ++ /* so add the final_score from the exit code of the terminated child process. */ ++ if (qb_loop_timer_is_running(storage_mon_poll_handle, expire_handle)) { ++ if (WEXITSTATUS(status) !=0) { ++ final_score += scores[index]; ++ ++ /* Update response values immediately in preparation for inquiries from clients. */ ++ response_final_score = final_score; ++ ++ /* Even in the first demon mode check, if there is an error device, clear */ ++ /* the flag to return the response to the client without waiting for all devices to finish. */ ++ daemon_check_first_all_devices = TRUE; ++ } ++ } ++#if 0 ++ if (shutting_down == FALSE) { ++ finished_count++; ++ test_forks[index] = 0; ++ } ++#endif ++ finished_count++; ++ test_forks[index] = 0; ++ ++ } ++ } ++ } else { ++ break; ++ } ++ } ++ } ++ return 0; ++} ++ ++static void child_shutdown(int nsig) ++{ ++ exit(1); ++} ++ ++static int write_pid_file(const char *pidfile) ++{ ++ char *pid; ++ char *dir, *str = NULL; ++ int fd = -1; ++ int rc = -1; ++ int i, len; ++ ++ if (asprintf(&pid, "%jd", (intmax_t)getpid()) < 0) { ++ syslog(LOG_ERR, "Failed to allocate memory to store PID"); ++ pid = NULL; ++ goto done; ++ } ++ ++ str = strdup(pidfile); ++ if (str == NULL) { ++ syslog(LOG_ERR, "Failed to duplicate string ['%s']", pidfile); ++ goto done; ++ } ++ dir = dirname(str); ++ for (i = 1, len = strlen(dir); i < len; i++) { ++ if (dir[i] == '/') { ++ dir[i] = 0; ++ if ((mkdir(dir, 0640) < 0) && (errno != EEXIST)) { ++ syslog(LOG_ERR, "Failed to create directory %s: %s", dir, strerror(errno)); ++ goto done; ++ } ++ dir[i] = '/'; ++ } ++ } ++ if ((mkdir(dir, 0640) < 0) && (errno != EEXIST)) { ++ syslog(LOG_ERR, "Failed to create directory %s: %s", dir, strerror(errno)); ++ goto done; ++ } ++ ++ fd = open(pidfile, O_CREAT | O_WRONLY, 0640); ++ if (fd < 0) { ++ syslog(LOG_ERR, "Failed to open %s: %s", pidfile, strerror(errno)); ++ goto done; ++ } ++ ++ if (write(fd, pid, strlen(pid)) != strlen(pid)) { ++ syslog(LOG_ERR, "Failed to write '%s' to %s: %s", pid, pidfile, strerror(errno)); ++ goto done; ++ } ++ close(fd); ++ rc = 0; ++done: ++ if (pid != NULL) { ++ free(pid); ++ } ++ if (str != NULL) { ++ free(str); ++ } ++ return rc; ++} ++ ++static void child_timeout_handler(void *data) ++{ ++ size_t i; ++ ++ if (is_child_runnning()) { ++ for (i=0; i 0) { +- w = waitpid(test_forks[i], &wstatus, WUNTRACED | WNOHANG | WCONTINUED); +- if (w < 0) { +- fprintf(stderr, "waitpid on %s failed: %s\n", devices[i], strerror(errno)); +- return -1; ++ /* If timeout occurs before SIGCHLD, add child process failure score to final_score. */ ++ final_score += scores[i]; ++ ++ /* Update response values immediately in preparation for inquiries from clients. */ ++ response_final_score = final_score; ++ ++ /* Even in the first demon mode check, if there is an error device, clear */ ++ /* the flag to return the response to the client without waiting for all devices to finish. */ ++ daemon_check_first_all_devices = TRUE; ++ } ++ } ++ } ++} ++ ++static void wrap_test_device_main(void *data) ++{ ++ struct storage_mon_timer_data *timer_data = (struct storage_mon_timer_data*)data; ++ test_device_main((timer_data != NULL) ? &timer_data->interval : NULL); ++} ++ ++static int test_device_main(gpointer data) ++{ ++ size_t i; ++ struct timespec ts; ++ time_t start_time; ++ gboolean device_check = TRUE; ++ ++ if (daemonize) { ++ if (shutting_down == TRUE) { ++ goto done; ++ } ++ ++ /* In the case of daemon mode, it is avoided that the timer is triggered and the number of */ ++ /* child processes increases while the device monitoring child process is not completed. */ ++ if (is_child_runnning()) { ++ device_check = FALSE; ++ } ++ ++ if (device_count == finished_count && device_check) { ++ /* Update the result value for the client response once all checks have completed. */ ++ response_final_score = final_score; ++ ++ if (!daemon_check_first_all_devices) { ++ daemon_check_first_all_devices = TRUE; ++ } ++ } ++ } ++ ++ if (device_check) { ++ /* Reset final_score, finished_count, test_forks[] */ ++ final_score = 0; ++ finished_count = 0; ++ ++ memset(test_forks, 0, sizeof(test_forks)); ++ for (i=0; i ts.tv_sec)) { ++ for (i=0; i 0) { ++ w = waitpid(test_forks[i], &wstatus, WUNTRACED | WNOHANG | WCONTINUED); ++ if (w < 0) { ++ PRINT_STORAGE_MON_ERR("waitpid on %s failed: %s", devices[i], strerror(errno)); ++ return -1; + } + +- finished_count++; +- test_forks[i] = 0; ++ if (w == test_forks[i]) { ++ if (WIFEXITED(wstatus)) { ++ if (WEXITSTATUS(wstatus) != 0) { ++ syslog(LOG_ERR, "Error reading from device %s", devices[i]); ++ final_score += scores[i]; ++ } ++ ++ finished_count++; ++ test_forks[i] = 0; ++ } ++ } + } + } ++ ++ usleep(100000); ++ ++ clock_gettime(CLOCK_REALTIME, &ts); ++ } ++ ++ /* See which threads have not finished */ ++ for (i=0; ihdr.id, request->hdr.size, request->message); ++ ++ if (strcmp(request->message, SMON_GET_RESULT_COMMAND) != 0) { ++ syslog(LOG_DEBUG, "request command is unknown."); ++ send_score = -1; ++ } else if (!daemon_check_first_all_devices) { ++ send_score = -2; + } + +- /* See which threads have not finished */ +- for (i=0; i 0) { ++ rc = qb_ipcc_recv(conn, &response, sizeof(response), -1); ++ if (rc < 0) { ++ syslog(LOG_ERR, "qb_ipcc_recv error : %d\n", rc); ++ return(-1); + } + } + +- if (verbose) { +- printf("Final score is %d\n", final_score); ++ qb_ipcc_disconnect(conn); ++ ++ /* Set score to result */ ++ /* 0 : Normal. */ ++ /* greater than 0 : monitoring error. */ ++ /* -1 : communication system error. */ ++ /* -2 : Not all checks completed for first device in daemon mode. */ ++ rc = atoi(response.message); ++ ++ syslog(LOG_DEBUG, "daemon response[%d]: %s \n", response.hdr.id, response.message); ++ ++ return(rc); ++} ++ ++static int32_t ++storage_mon_daemon(int interval, const char *pidfile) ++{ ++ int32_t rc; ++ char ipcs_name[SMON_MAX_IPCSNAME]; ++ ++ struct qb_ipcs_service_handlers service_handle = { ++ .connection_accept = storage_mon_ipcs_connection_accept_fn, ++ .connection_created = storage_mon_ipcs_connection_created_fn, ++ .msg_process = storage_mon_ipcs_msg_process_fn, ++ .connection_destroyed = storage_mon_ipcs_connection_destroyed_fn, ++ .connection_closed = storage_mon_ipcs_connection_closed_fn, ++ }; ++ ++ struct qb_ipcs_poll_handlers poll_handle = { ++ .job_add = storage_mon_job_add, ++ .dispatch_add = storage_mon_dispatch_add, ++ .dispatch_mod = storage_mon_dispatch_mod, ++ .dispatch_del = storage_mon_dispatch_del, ++ }; ++ ++ if (daemon(0, 0) < 0) { ++ syslog(LOG_ERR, "Failed to daemonize: %s", strerror(errno)); ++ return -1; + } +- return final_score; ++ ++ umask(S_IWGRP | S_IWOTH | S_IROTH); ++ ++ if (write_pid_file(pidfile) < 0) { ++ return -1; ++ } ++ ++ snprintf(ipcs_name, SMON_MAX_IPCSNAME, "storage_mon_%s", attrname); ++ ipcs = qb_ipcs_create(ipcs_name, 0, QB_IPC_NATIVE, &service_handle); ++ if (ipcs == 0) { ++ syslog(LOG_ERR, "qb_ipcs_create"); ++ return -1; ++ } ++ ++ qb_ipcs_enforce_buffer_size(ipcs, SMON_BUFF_1MEG); ++ ++ storage_mon_poll_handle = qb_loop_create(); ++ ++ qb_ipcs_poll_handlers_set(ipcs, &poll_handle); ++ rc = qb_ipcs_run(ipcs); ++ if (rc != 0) { ++ errno = -rc; ++ syslog(LOG_ERR, "qb_ipcs_run"); ++ return -1; ++ } ++ ++ qb_loop_signal_add(storage_mon_poll_handle, QB_LOOP_HIGH, ++ SIGTERM, NULL, sigterm_handler, NULL); ++ ++ qb_loop_signal_add(storage_mon_poll_handle, QB_LOOP_MED, ++ SIGCHLD, NULL, sigchld_handler, NULL); ++ ++ timer_d.interval = interval; ++ qb_loop_timer_add(storage_mon_poll_handle, QB_LOOP_MED, 0, &timer_d, wrap_test_device_main, &timer_handle); ++ ++ qb_loop_run(storage_mon_poll_handle); ++ qb_loop_destroy(storage_mon_poll_handle); ++ ++ unlink(pidfile); ++ ++ return 0; + } + + int main(int argc, char *argv[]) + { +- char *devices[MAX_DEVICES]; +- int scores[MAX_DEVICES]; +- size_t device_count = 0; + size_t score_count = 0; +- int timeout = DEFAULT_TIMEOUT; +- int final_score = 0; + int opt, option_index; +- int verbose = 0; +- int inject_error_percent = 0; ++ int interval = DEFAULT_INTERVAL; ++ const char *pidfile = DEFAULT_PIDFILE; ++ gboolean client = FALSE; + struct option long_options[] = { + {"timeout", required_argument, 0, 't' }, + {"device", required_argument, 0, 'd' }, + {"score", required_argument, 0, 's' }, + {"inject-errors-percent", required_argument, 0, 0 }, ++ {"daemonize", no_argument, 0, 0 }, ++ {"client", no_argument, 0, 0 }, ++ {"interval", required_argument, 0, 'i' }, ++ {"pidfile", required_argument, 0, 'p' }, ++ {"attrname", required_argument, 0, 'a' }, + {"verbose", no_argument, 0, 'v' }, + {"help", no_argument, 0, 'h' }, + {0, 0, 0, 0 } + }; +- while ( (opt = getopt_long(argc, argv, "hvt:d:s:", ++ ++ while ( (opt = getopt_long(argc, argv, "hvt:d:s:i:p:a:", + long_options, &option_index)) != -1 ) { + switch (opt) { + case 0: /* Long-only options */ +@@ -251,6 +796,16 @@ int main(int argc, char *argv[]) + return -1; + } + } ++ if (strcmp(long_options[option_index].name, "daemonize") == 0) { ++ daemonize = TRUE; ++ } ++ if (strcmp(long_options[option_index].name, "client") == 0) { ++ client = TRUE; ++ } ++ if (daemonize && client) { ++ fprintf(stderr,"The daemonize option and client option cannot be specified at the same time."); ++ return -1; ++ } + break; + case 'd': + if (device_count < MAX_DEVICES) { +@@ -287,6 +842,27 @@ int main(int argc, char *argv[]) + usage(argv[0], stdout); + return 0; + break; ++ case 'i': ++ interval = atoi(optarg); ++ if (interval < 1) { ++ fprintf(stderr, "invalid interval %d. Min 1, default is %d\n", interval, DEFAULT_INTERVAL); ++ return -1; ++ } ++ break; ++ case 'p': ++ pidfile = strdup(optarg); ++ if (pidfile == NULL) { ++ fprintf(stderr, "Failed to duplicate string ['%s']\n", optarg); ++ return -1; ++ } ++ break; ++ case 'a': ++ attrname = strdup(optarg); ++ if (attrname == NULL) { ++ fprintf(stderr, "Failed to duplicate string ['%s']\n", optarg); ++ return -1; ++ } ++ break; + default: + usage(argv[0], stderr); + return -1; +@@ -294,6 +870,11 @@ int main(int argc, char *argv[]) + } + + } ++ ++ if (client) { ++ return(storage_mon_client()); ++ } ++ + if (device_count == 0) { + fprintf(stderr, "No devices to test, use the -d or --device argument\n"); + return -1; +@@ -306,7 +887,10 @@ int main(int argc, char *argv[]) + + openlog("storage_mon", 0, LOG_DAEMON); + +- +- final_score = test_device_main(device_count, devices, scores, verbose, inject_error_percent, timeout); ++ if (!daemonize) { ++ final_score = test_device_main(NULL); ++ } else { ++ return(storage_mon_daemon(interval, pidfile)); ++ } + return final_score; + } + +From 406ff43a6caeb0add7493892236753acee293f27 Mon Sep 17 00:00:00 2001 +From: Hideo Yamauchi +Date: Mon, 24 Jul 2023 06:47:39 +0900 +Subject: [PATCH 3/4] Mid: storage-mon: Retry failed attrd_updater. + +--- + heartbeat/storage-mon.in | 27 +++++++++++++++++++++++---- + 1 file changed, 23 insertions(+), 4 deletions(-) + +diff --git a/heartbeat/storage-mon.in b/heartbeat/storage-mon.in +index 81d8f5bcec..9662e06dbb 100644 +--- a/heartbeat/storage-mon.in ++++ b/heartbeat/storage-mon.in +@@ -205,6 +205,25 @@ storage-mon_init() { + fi + } + ++storage-mon_update_attribute() { ++ ++ while : ++ do ++ "$ATTRDUP" -n ${ATTRNAME} -U "$1" -d "5s" ++ rc=$? ++ if [ $rc -eq 0 ]; then ++ break ++ fi ++ ++ ocf_log debug "${1} attribute by attrd_updater failed" ++ if [ "$1" = "red" ]; then ++ # If the attrd_updater fails with the red attribute, return an error to let pacemaker handle the failure immediately. ++ return $OCF_ERR_GENERIC ++ fi ++ done ++ return $OCF_SUCCESS ++} ++ + storage-mon_monitor() { + if [ -z "$OCF_RESKEY_daemonize" ]; then + storage-mon_init +@@ -233,8 +252,8 @@ storage-mon_monitor() { + status="green" + fi + +- "$ATTRDUP" -n ${ATTRNAME} -U "$status" -d "5s" +- return $OCF_SUCCESS ++ storage-mon_update_attribute $status ++ return "$?" + else + ocf_pidfile_status "${PIDFILE}" > /dev/null 2>&1 + case "$?" in +@@ -279,8 +298,8 @@ storage-mon_monitor() { + esac + done + +- "$ATTRDUP" -n ${ATTRNAME} -U "$status" -d "5s" +- return $OCF_SUCCESS ++ storage-mon_update_attribute $status ++ return "$?" + fi + } + + +From d1cf0b42f1eb6c41ef5887cb7d9ce055f3bbcb3a Mon Sep 17 00:00:00 2001 +From: Hideo Yamauchi +Date: Thu, 17 Aug 2023 17:18:53 +0900 +Subject: [PATCH 4/4] Mid: storage-mon RA: Changed OCF_RESKEY_daemonize_default + and OCF_RESKEY_daemonize default and judgment part. + +--- + heartbeat/storage-mon.in | 13 ++++++------- + 1 file changed, 6 insertions(+), 7 deletions(-) + +diff --git a/heartbeat/storage-mon.in b/heartbeat/storage-mon.in +index 9662e06dbb..284dec30f2 100644 +--- a/heartbeat/storage-mon.in ++++ b/heartbeat/storage-mon.in +@@ -58,7 +58,7 @@ OCF_RESKEY_io_timeout_default="10" + OCF_RESKEY_check_interval_default="30" + OCF_RESKEY_inject_errors_default="" + OCF_RESKEY_state_file_default="${HA_RSCTMP%%/}/storage-mon-${OCF_RESOURCE_INSTANCE}.state" +-OCF_RESKEY_daemonize_default="" ++OCF_RESKEY_daemonize_default="false" + + # Explicitly list all environment variables used, to make static analysis happy + : ${OCF_RESKEY_CRM_meta_interval:=${OCF_RESKEY_CRM_meta_interval_default}} +@@ -133,7 +133,7 @@ Used only for testing! Specify % of I/O errors to simulate drives failures. + Specifies to start storage-mon as a daemon and check for devices. + + start storage-mon with daemon +- ++ + + + +@@ -225,7 +225,7 @@ storage-mon_update_attribute() { + } + + storage-mon_monitor() { +- if [ -z "$OCF_RESKEY_daemonize" ]; then ++ if ! ocf_is_true "$OCF_RESKEY_daemonize"; then + storage-mon_init + + # Monitor _MUST!_ differentiate correctly between running +@@ -304,7 +304,7 @@ storage-mon_monitor() { + } + + storage-mon_start() { +- if [ -z "$OCF_RESKEY_daemonize" ]; then ++ if ! ocf_is_true "$OCF_RESKEY_daemonize"; then + storage-mon_monitor + if [ $? -eq $OCF_SUCCESS ]; then + return $OCF_SUCCESS +@@ -317,7 +317,6 @@ storage-mon_start() { + for DRIVE in ${OCF_RESKEY_drives}; do + cmdline="$cmdline --device $DRIVE --score 1" + done +- #cmdline="$cmdline --daemonize --timeout ${OCF_RESKEY_io_timeout} --interval ${OCF_RESKEY_check_interval} --pidfile ${PIDFILE} --attrname ${ATTRNAME} --ha-sbin-dir ${HA_SBIN_DIR}" + cmdline="$cmdline --daemonize --timeout ${OCF_RESKEY_io_timeout} --interval ${OCF_RESKEY_check_interval} --pidfile ${PIDFILE} --attrname ${ATTRNAME}" + if [ -n "${OCF_RESKEY_inject_errors}" ]; then + cmdline="$cmdline --inject-errors-percent ${OCF_RESKEY_inject_errors}" +@@ -333,7 +332,7 @@ storage-mon_stop() { + storage-mon_monitor + rc=$? + +- if [ -z "$OCF_RESKEY_daemonize" ]; then ++ if ! ocf_is_true "$OCF_RESKEY_daemonize"; then + if [ $rc -eq $OCF_SUCCESS ]; then + rm "${OCF_RESKEY_state_file}" + fi +@@ -372,7 +371,7 @@ storage-mon_stop() { + storage-mon_validate() { + storage-mon_init + +- if [ -z "$OCF_RESKEY_daemonize" ]; then ++ if ! ocf_is_true "$OCF_RESKEY_daemonize"; then + # Is the state directory writable? + state_dir=$(dirname "${OCF_RESKEY_state_file}") + touch "$state_dir/$$"