resource-agents/rhel-979-storage-mon-1-daem...

1567 lines
46 KiB
Diff

From 6045e383f65432084cd07032eb5515cb8231dc04 Mon Sep 17 00:00:00 2001
From: Hideo Yamauchi <renayama19661014@ybb.ne.jp>
Date: Mon, 24 Jul 2023 06:46:23 +0900
Subject: [PATCH 1/4] Mid: storage-mon: Functionalization of test_device call
processing.
---
tools/storage_mon.c | 141 +++++++++++++++++++++++---------------------
1 file changed, 75 insertions(+), 66 deletions(-)
diff --git a/tools/storage_mon.c b/tools/storage_mon.c
index f829c50814..b0e277cbe0 100644
--- a/tools/storage_mon.c
+++ b/tools/storage_mon.c
@@ -146,18 +146,87 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
exit(-1);
}
+static int test_device_main(size_t device_count, char *devices[MAX_DEVICES], int scores[MAX_DEVICES], int verbose, int inject_error_percent, int timeout)
+{
+ pid_t test_forks[MAX_DEVICES];
+ size_t i;
+ struct timespec ts;
+ time_t start_time;
+ size_t finished_count = 0;
+ int final_score = 0;
+
+ memset(test_forks, 0, sizeof(test_forks));
+ for (i=0; i<device_count; i++) {
+ test_forks[i] = fork();
+ if (test_forks[i] < 0) {
+ fprintf(stderr, "Error spawning fork for %s: %s\n", devices[i], strerror(errno));
+ syslog(LOG_ERR, "Error spawning fork for %s: %s\n", devices[i], strerror(errno));
+ /* Just test the devices we have */
+ break;
+ }
+ /* child */
+ if (test_forks[i] == 0) {
+ test_device(devices[i], verbose, inject_error_percent);
+ }
+ }
+
+ /* See if they have finished */
+ clock_gettime(CLOCK_REALTIME, &ts);
+ start_time = ts.tv_sec;
+
+ while ((finished_count < device_count) && ((start_time + timeout) > ts.tv_sec)) {
+ for (i=0; i<device_count; i++) {
+ int wstatus;
+ pid_t w;
+
+ if (test_forks[i] > 0) {
+ w = waitpid(test_forks[i], &wstatus, WUNTRACED | WNOHANG | WCONTINUED);
+ if (w < 0) {
+ fprintf(stderr, "waitpid on %s failed: %s\n", devices[i], strerror(errno));
+ return -1;
+ }
+
+ if (w == test_forks[i]) {
+ if (WIFEXITED(wstatus)) {
+ if (WEXITSTATUS(wstatus) != 0) {
+ syslog(LOG_ERR, "Error reading from device %s", devices[i]);
+ final_score += scores[i];
+ }
+
+ finished_count++;
+ test_forks[i] = 0;
+ }
+ }
+ }
+ }
+
+ usleep(100000);
+
+ clock_gettime(CLOCK_REALTIME, &ts);
+ }
+
+ /* See which threads have not finished */
+ for (i=0; i<device_count; i++) {
+ if (test_forks[i] != 0) {
+ syslog(LOG_ERR, "Reading from device %s did not complete in %d seconds timeout", devices[i], timeout);
+ fprintf(stderr, "Thread for device %s did not complete in time\n", devices[i]);
+ final_score += scores[i];
+ }
+ }
+
+ if (verbose) {
+ printf("Final score is %d\n", final_score);
+ }
+ return final_score;
+}
+
int main(int argc, char *argv[])
{
char *devices[MAX_DEVICES];
int scores[MAX_DEVICES];
- pid_t test_forks[MAX_DEVICES];
size_t device_count = 0;
size_t score_count = 0;
- size_t finished_count = 0;
int timeout = DEFAULT_TIMEOUT;
- struct timespec ts;
- time_t start_time;
- size_t i;
int final_score = 0;
int opt, option_index;
int verbose = 0;
@@ -237,67 +306,7 @@ int main(int argc, char *argv[])
openlog("storage_mon", 0, LOG_DAEMON);
- memset(test_forks, 0, sizeof(test_forks));
- for (i=0; i<device_count; i++) {
- test_forks[i] = fork();
- if (test_forks[i] < 0) {
- fprintf(stderr, "Error spawning fork for %s: %s\n", devices[i], strerror(errno));
- syslog(LOG_ERR, "Error spawning fork for %s: %s\n", devices[i], strerror(errno));
- /* Just test the devices we have */
- break;
- }
- /* child */
- if (test_forks[i] == 0) {
- test_device(devices[i], verbose, inject_error_percent);
- }
- }
- /* See if they have finished */
- clock_gettime(CLOCK_REALTIME, &ts);
- start_time = ts.tv_sec;
-
- while ((finished_count < device_count) && ((start_time + timeout) > ts.tv_sec)) {
- for (i=0; i<device_count; i++) {
- int wstatus;
- pid_t w;
-
- if (test_forks[i] > 0) {
- w = waitpid(test_forks[i], &wstatus, WUNTRACED | WNOHANG | WCONTINUED);
- if (w < 0) {
- fprintf(stderr, "waitpid on %s failed: %s\n", devices[i], strerror(errno));
- return -1;
- }
-
- if (w == test_forks[i]) {
- if (WIFEXITED(wstatus)) {
- if (WEXITSTATUS(wstatus) != 0) {
- syslog(LOG_ERR, "Error reading from device %s", devices[i]);
- final_score += scores[i];
- }
-
- finished_count++;
- test_forks[i] = 0;
- }
- }
- }
- }
-
- usleep(100000);
-
- clock_gettime(CLOCK_REALTIME, &ts);
- }
-
- /* See which threads have not finished */
- for (i=0; i<device_count; i++) {
- if (test_forks[i] != 0) {
- syslog(LOG_ERR, "Reading from device %s did not complete in %d seconds timeout", devices[i], timeout);
- fprintf(stderr, "Thread for device %s did not complete in time\n", devices[i]);
- final_score += scores[i];
- }
- }
-
- if (verbose) {
- printf("Final score is %d\n", final_score);
- }
+ final_score = test_device_main(device_count, devices, scores, verbose, inject_error_percent, timeout);
return final_score;
}
From 437162be482462047502b4098d7d2c1328d453a4 Mon Sep 17 00:00:00 2001
From: Hideo Yamauchi <renayama19661014@ybb.ne.jp>
Date: Mon, 24 Jul 2023 06:47:20 +0900
Subject: [PATCH 2/4] Mid: storage-mon: Added daemon/client mode.
---
configure.ac | 1 +
heartbeat/storage-mon.in | 222 +++++++++---
resource-agents.spec.in | 2 +-
tools/Makefile.am | 3 +-
tools/storage_mon.c | 724 +++++++++++++++++++++++++++++++++++----
5 files changed, 828 insertions(+), 124 deletions(-)
diff --git a/configure.ac b/configure.ac
index 7b5faff584..74766899b8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -620,6 +620,7 @@ fi
PKG_CHECK_MODULES([GLIB], [$GPKGNAME])
CPPFLAGS="$CPPFLAGS $GLIB_CFLAGS"
LIBS="$LIBS $GLIB_LIBS"
+PKG_CHECK_MODULES([LIBQB], "libqb")
dnl ========================================================================
dnl Headers
diff --git a/heartbeat/storage-mon.in b/heartbeat/storage-mon.in
index d764b49d7c..81d8f5bcec 100644
--- a/heartbeat/storage-mon.in
+++ b/heartbeat/storage-mon.in
@@ -48,20 +48,26 @@
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
#
-STORAGEMON=$HA_BIN/storage_mon
-ATTRDUP=/usr/sbin/attrd_updater
+STORAGEMON=${HA_BIN}/storage_mon
+ATTRDUP=${HA_SBIN_DIR}/attrd_updater
+PIDFILE=${HA_VARRUN}/storage-mon-${OCF_RESOURCE_INSTANCE}.pid
+ATTRNAME="#health-${OCF_RESOURCE_INSTANCE}"
OCF_RESKEY_CRM_meta_interval_default="0"
OCF_RESKEY_io_timeout_default="10"
+OCF_RESKEY_check_interval_default="30"
OCF_RESKEY_inject_errors_default=""
OCF_RESKEY_state_file_default="${HA_RSCTMP%%/}/storage-mon-${OCF_RESOURCE_INSTANCE}.state"
+OCF_RESKEY_daemonize_default=""
# Explicitly list all environment variables used, to make static analysis happy
: ${OCF_RESKEY_CRM_meta_interval:=${OCF_RESKEY_CRM_meta_interval_default}}
: ${OCF_RESKEY_drives:=""}
: ${OCF_RESKEY_io_timeout:=${OCF_RESKEY_io_timeout_default}}
+: ${OCF_RESKEY_check_interval:=${OCF_RESKEY_check_interval_default}}
: ${OCF_RESKEY_inject_errors:=${OCF_RESKEY_inject_errors_default}}
: ${OCF_RESKEY_state_file:=${OCF_RESKEY_state_file_default}}
+: ${OCF_RESKEY_daemonize:=${OCF_RESKEY_daemonize_default}}
#######################################################################
@@ -106,6 +112,14 @@ Specify disk I/O timeout in seconds. Minimum 1, recommended 10 (default).
<content type="integer" default="${OCF_RESKEY_io_timeout_default}" />
</parameter>
+<parameter name="check_interval" unique="0">
+<longdesc lang="en">
+Specify interval between I/O checks in seconds. (Only supported with the daemonize option.)
+</longdesc>
+<shortdesc lang="en">I/O check interval</shortdesc>
+<content type="integer" default="${OCF_RESKEY_check_interval_default}" />
+</parameter>
+
<parameter name="inject_errors" unique="0">
<longdesc lang="en">
Used only for testing! Specify % of I/O errors to simulate drives failures.
@@ -114,6 +128,14 @@ Used only for testing! Specify % of I/O errors to simulate drives failures.
<content type="integer" default="${OCF_RESKEY_inject_errors_default}" />
</parameter>
+<parameter name="daemonize" unique="0">
+<longdesc lang="en">
+Specifies to start storage-mon as a daemon and check for devices.
+</longdesc>
+<shortdesc lang="en">start storage-mon with daemon</shortdesc>
+<content type="string" default="" />
+</parameter>
+
</parameters>
<actions>
@@ -146,6 +168,11 @@ storage-mon_init() {
exit $OCF_ERR_INSTALLED
fi
+ if [ ! -x "$ATTRDUP" ] ; then
+ ocf_log err "${ATTRDUP} not installed."
+ exit $OCF_ERR_INSTALLED
+ fi
+
i=0
for DRIVE in ${OCF_RESKEY_drives}; do
if [ ! -e "$DRIVE" ] ; then
@@ -161,7 +188,12 @@ storage-mon_init() {
fi
if [ "${OCF_RESKEY_io_timeout}" -lt "1" ]; then
- ocf_log err "Minimum timeout is 1. Recommended 10 (default)."
+ ocf_log err "Minimum timeout is 1. Recommended ${OCF_RESKEY_io_timeout_default} (default)."
+ exit $OCF_ERR_CONFIGURED
+ fi
+
+ if [ "${OCF_RESKEY_check_interval}" -lt "1" ]; then
+ ocf_log err "Minimum interval to check is 1. default ${OCF_RESKEY_check_interval_default}."
exit $OCF_ERR_CONFIGURED
fi
@@ -173,63 +205,147 @@ storage-mon_init() {
fi
}
-storage-mon_validate() {
- storage-mon_init
-
- # Is the state directory writable?
- state_dir=$(dirname "$OCF_RESKEY_state_file")
- touch "$state_dir/$$"
- if [ $? -ne 0 ]; then
- return $OCF_ERR_CONFIGURED
- fi
- rm "$state_dir/$$"
-
- return $OCF_SUCCESS
-}
-
storage-mon_monitor() {
- storage-mon_init
+ if [ -z "$OCF_RESKEY_daemonize" ]; then
+ storage-mon_init
- # Monitor _MUST!_ differentiate correctly between running
- # (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING).
- # That is THREE states, not just yes/no.
+ # Monitor _MUST!_ differentiate correctly between running
+ # (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING).
+ # That is THREE states, not just yes/no.
- if [ ! -f "${OCF_RESKEY_state_file}" ]; then
- return $OCF_NOT_RUNNING
- fi
+ if [ ! -f "${OCF_RESKEY_state_file}" ]; then
+ return $OCF_NOT_RUNNING
+ fi
- # generate command line
- cmdline=""
- for DRIVE in ${OCF_RESKEY_drives}; do
- cmdline="$cmdline --device $DRIVE --score 1"
- done
- cmdline="$cmdline --timeout ${OCF_RESKEY_io_timeout}"
- if [ -n "${OCF_RESKEY_inject_errors}" ]; then
- cmdline="$cmdline --inject-errors-percent ${OCF_RESKEY_inject_errors}"
- fi
- $STORAGEMON $cmdline
- if [ $? -ne 0 ]; then
- status="red"
+ # generate command line
+ cmdline=""
+ for DRIVE in ${OCF_RESKEY_drives}; do
+ cmdline="$cmdline --device $DRIVE --score 1"
+ done
+ cmdline="$cmdline --timeout ${OCF_RESKEY_io_timeout}"
+ if [ -n "${OCF_RESKEY_inject_errors}" ]; then
+ cmdline="$cmdline --inject-errors-percent ${OCF_RESKEY_inject_errors}"
+ fi
+ $STORAGEMON $cmdline
+ if [ $? -ne 0 ]; then
+ status="red"
+ else
+ status="green"
+ fi
+
+ "$ATTRDUP" -n ${ATTRNAME} -U "$status" -d "5s"
+ return $OCF_SUCCESS
else
- status="green"
- fi
+ ocf_pidfile_status "${PIDFILE}" > /dev/null 2>&1
+ case "$?" in
+ 0) rc=$OCF_SUCCESS;;
+ 1|2) rc=$OCF_NOT_RUNNING;;
+ *) rc=$OCF_ERR_GENERIC;;
+ esac
+
+ if [ $rc -ne $OCF_SUCCESS ]; then
+ return "$rc"
+ fi
+ if [ "$1" = "pid_check_only" ]; then
+ return "$rc"
+ fi
- "$ATTRDUP" -n "#health-${OCF_RESOURCE_INSTANCE}" -U "$status" -d "5s"
- return $OCF_SUCCESS
+ # generate client command line
+ cmdline=""
+ cmdline="$cmdline --client --attrname ${ATTRNAME}"
+ while :
+ do
+ # 0 : Normal.
+ # greater than 0 : monitoring error.
+ # 255(-1) : communication system error.
+ # 254(-2) : Not all checks completed for first device in daemon mode.
+ $STORAGEMON $cmdline
+ rc=$?
+ case "$rc" in
+ 254|255)
+ # If there is a communication error or the initial check of all devices has not been completed,
+ # it will loop and try to reconnect.
+ # When everything ends with a communication error during monitor, a monitor timeout occurs.
+ ocf_log debug "client monitor error : $rc"
+ ;;
+ 0)
+ status="green"
+ break
+ ;;
+ *)
+ status="red"
+ break
+ ;;
+ esac
+ done
+
+ "$ATTRDUP" -n ${ATTRNAME} -U "$status" -d "5s"
+ return $OCF_SUCCESS
+ fi
}
storage-mon_start() {
- storage-mon_monitor
- if [ $? -eq $OCF_SUCCESS ]; then
- return $OCF_SUCCESS
+ if [ -z "$OCF_RESKEY_daemonize" ]; then
+ storage-mon_monitor
+ if [ $? -eq $OCF_SUCCESS ]; then
+ return $OCF_SUCCESS
+ fi
+ touch "${OCF_RESKEY_state_file}"
+ else
+ storage-mon_init
+ # generate command line
+ cmdline=""
+ for DRIVE in ${OCF_RESKEY_drives}; do
+ cmdline="$cmdline --device $DRIVE --score 1"
+ done
+ #cmdline="$cmdline --daemonize --timeout ${OCF_RESKEY_io_timeout} --interval ${OCF_RESKEY_check_interval} --pidfile ${PIDFILE} --attrname ${ATTRNAME} --ha-sbin-dir ${HA_SBIN_DIR}"
+ cmdline="$cmdline --daemonize --timeout ${OCF_RESKEY_io_timeout} --interval ${OCF_RESKEY_check_interval} --pidfile ${PIDFILE} --attrname ${ATTRNAME}"
+ if [ -n "${OCF_RESKEY_inject_errors}" ]; then
+ cmdline="$cmdline --inject-errors-percent ${OCF_RESKEY_inject_errors}"
+ fi
+ $STORAGEMON $cmdline
+ if [ "$?" -ne 0 ]; then
+ return $OCF_ERR_GENERIC
+ fi
fi
- touch "${OCF_RESKEY_state_file}"
}
storage-mon_stop() {
storage-mon_monitor
- if [ $? -eq $OCF_SUCCESS ]; then
- rm "${OCF_RESKEY_state_file}"
+ rc=$?
+
+ if [ -z "$OCF_RESKEY_daemonize" ]; then
+ if [ $rc -eq $OCF_SUCCESS ]; then
+ rm "${OCF_RESKEY_state_file}"
+ fi
+ else
+ case "$rc" in
+ $OCF_SUCCESS)
+ ;;
+ $OCF_NOT_RUNNING)
+ return "$OCF_SUCCESS";;
+ *)
+ return "$rc";;
+ esac
+
+ kill -TERM $(cat "${PIDFILE}")
+ if [ "$?" -ne 0 ]; then
+ return $OCF_ERR_GENERIC
+ fi
+
+ while true; do
+ storage-mon_monitor pid_check_only
+ rc="$?"
+ case "$rc" in
+ $OCF_SUCCESS)
+ ;;
+ $OCF_NOT_RUNNING)
+ return "$OCF_SUCCESS";;
+ *)
+ return "$rc";;
+ esac
+ sleep 1
+ done
fi
return $OCF_SUCCESS
}
@@ -237,13 +353,15 @@ storage-mon_stop() {
storage-mon_validate() {
storage-mon_init
- # Is the state directory writable?
- state_dir=$(dirname "${OCF_RESKEY_state_file}")
- touch "$state_dir/$$"
- if [ $? -ne 0 ]; then
- return $OCF_ERR_CONFIGURED
+ if [ -z "$OCF_RESKEY_daemonize" ]; then
+ # Is the state directory writable?
+ state_dir=$(dirname "${OCF_RESKEY_state_file}")
+ touch "$state_dir/$$"
+ if [ $? -ne 0 ]; then
+ return $OCF_ERR_CONFIGURED
+ fi
+ rm "$state_dir/$$"
fi
- rm "$state_dir/$$"
return $OCF_SUCCESS
}
diff --git a/resource-agents.spec.in b/resource-agents.spec.in
index 2ffa00d946..1cbf28c033 100644
--- a/resource-agents.spec.in
+++ b/resource-agents.spec.in
@@ -55,7 +55,7 @@ Provides: heartbeat-resources = %{version}
BuildRequires: make
BuildRequires: automake autoconf pkgconfig gcc
BuildRequires: perl
-BuildRequires: libxslt glib2-devel
+BuildRequires: libxslt glib2-devel libqb-devel
BuildRequires: systemd
BuildRequires: which
diff --git a/tools/Makefile.am b/tools/Makefile.am
index 08323fee3a..55e292cec5 100644
--- a/tools/Makefile.am
+++ b/tools/Makefile.am
@@ -74,7 +74,8 @@ sfex_stat_LDADD = $(GLIBLIB) -lplumb -lplumbgpl
findif_SOURCES = findif.c
storage_mon_SOURCES = storage_mon.c
-storage_mon_CFLAGS = -D_GNU_SOURCE
+storage_mon_CFLAGS = -D_GNU_SOURCE ${LIBQB_CFLAGS}
+storage_mon_LDADD = ${LIBQB_LIBS}
if BUILD_TICKLE
halib_PROGRAMS += tickle_tcp
diff --git a/tools/storage_mon.c b/tools/storage_mon.c
index b0e277cbe0..1231570c85 100644
--- a/tools/storage_mon.c
+++ b/tools/storage_mon.c
@@ -16,9 +16,87 @@
#ifdef __FreeBSD__
#include <sys/disk.h>
#endif
+#include <config.h>
+#include <glib.h>
+#include <libgen.h>
+
+#include <qb/qbdefs.h>
+#include <qb/qblog.h>
+#include <qb/qbloop.h>
+#include <qb/qbutil.h>
+#include <qb/qbipcs.h>
+#include <qb/qbipcc.h>
#define MAX_DEVICES 25
#define DEFAULT_TIMEOUT 10
+#define DEFAULT_INTERVAL 30
+#define DEFAULT_PIDFILE HA_VARRUNDIR "storage_mon.pid"
+#define DEFAULT_ATTRNAME "#health-storage_mon"
+#define SMON_GET_RESULT_COMMAND "get_check_value"
+#define SMON_RESULT_OK "green"
+#define SMON_RESULT_NG "red"
+#define SMON_RESULT_COMMAND_ERROR "unknown command"
+#define SMON_BUFF_1MEG 1048576
+#define SMON_MAX_IPCSNAME 256
+#define SMON_MAX_MSGSIZE 128
+#define SMON_MAX_RESP_SIZE 100
+
+/* Log to stderr/stdout when not daemonized, to syslog otherwise; do/while(0) makes each macro one statement. */
+#define PRINT_STORAGE_MON_ERR(fmt, ...) do { if (!daemonize) { \
+		fprintf(stderr, fmt"\n", __VA_ARGS__); \
+	} else { \
+		syslog(LOG_ERR, fmt, __VA_ARGS__); \
+	} } while (0)
+#define PRINT_STORAGE_MON_ERR_NOARGS(str) do { if (!daemonize) { \
+		fprintf(stderr, str"\n"); \
+	} else { \
+		syslog(LOG_ERR, str); \
+	} } while (0)
+#define PRINT_STORAGE_MON_INFO(fmt, ...) do { if (!daemonize) { \
+		printf(fmt"\n", __VA_ARGS__); \
+	} else { \
+		syslog(LOG_INFO, fmt, __VA_ARGS__); \
+	} } while (0)
+
+/* Argument passed to the periodic check timer; interval is in seconds. */
+struct storage_mon_timer_data {
+ int interval;
+};
+
+/* Message received by the daemon's IPC handler: libqb request header plus a fixed-size command string. */
+struct storage_mon_check_value_req {
+ struct qb_ipc_request_header hdr;
+ char message[SMON_MAX_MSGSIZE];
+};
+/* Response counterpart of the request above — presumably read by the client side; its use is outside this view. */
+struct storage_mon_check_value_res {
+ struct qb_ipc_response_header hdr;
+ char message[SMON_MAX_MSGSIZE];
+};
+
+char *devices[MAX_DEVICES]; /* device paths handed to test_device() */
+int scores[MAX_DEVICES]; /* scores[i] is added to final_score when devices[i] fails or times out */
+size_t device_count = 0; /* number of used entries in devices[] / scores[] / test_forks[] */
+int timeout = DEFAULT_TIMEOUT; /* per-round limit for the device checks, in seconds */
+int verbose = 0; /* passed to test_device(); also gates the "Final score" printout */
+int inject_error_percent = 0; /* passed to test_device() */
+const char *attrname = DEFAULT_ATTRNAME; /* see --attrname in usage(); consumers are outside this view */
+gboolean daemonize = FALSE; /* TRUE: log via syslog and run the timer-driven path of test_device_main() */
+int shutting_down = FALSE; /* set by sigterm_handler(); makes test_device_main() stop the main loop */
+static qb_ipcs_service_t *ipcs; /* IPC service handle, used for connection statistics */
+int final_score = 0; /* score accumulated during the current round */
+int response_final_score = 0; /* score reported to clients by the IPC message handler */
+pid_t test_forks[MAX_DEVICES]; /* fork() result per device; reset to 0 once the child is reaped */
+size_t finished_count = 0; /* children reaped in the current round */
+gboolean daemon_check_first_all_devices = FALSE; /* while FALSE, clients are answered with -2 */
+
+static qb_loop_t *storage_mon_poll_handle; /* libqb main loop */
+static qb_loop_timer_handle timer_handle; /* next-round timer; also used for the zero-delay shutdown timer */
+static qb_loop_timer_handle expire_handle; /* per-round child timeout timer */
+static struct storage_mon_timer_data timer_d; /* argument for the periodic timer */
+
+static int test_device_main(gpointer data);
+static void wrap_test_device_main(void *data);
static void usage(char *name, FILE *f)
{
@@ -27,6 +105,11 @@ static void usage(char *name, FILE *f)
fprintf(f, " --score <n> score if device fails the test. Must match --device count\n");
fprintf(f, " --timeout <n> max time to wait for a device test to come back. in seconds (default %d)\n", DEFAULT_TIMEOUT);
fprintf(f, " --inject-errors-percent <n> Generate EIO errors <n>%% of the time (for testing only)\n");
+ fprintf(f, " --daemonize test run in daemons.\n");
+ fprintf(f, " --client client connection to daemon. requires the attrname option.\n");
+ fprintf(f, " --interval <n> interval to test. in seconds (default %d)(for daemonize only)\n", DEFAULT_INTERVAL);
+ fprintf(f, " --pidfile <path> file path to record pid (default %s)(for daemonize only)\n", DEFAULT_PIDFILE);
+ fprintf(f, " --attrname <attr> attribute name to update test result (default %s)(for daemonize/client only)\n", DEFAULT_ATTRNAME);
fprintf(f, " --verbose emit extra output to stdout\n");
fprintf(f, " --help print this message\n");
}
@@ -47,13 +130,13 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
device_fd = open(device, flags);
if (device_fd < 0) {
if (errno != EINVAL) {
- fprintf(stderr, "Failed to open %s: %s\n", device, strerror(errno));
+ PRINT_STORAGE_MON_ERR("Failed to open %s: %s", device, strerror(errno));
exit(-1);
}
flags &= ~O_DIRECT;
device_fd = open(device, flags);
if (device_fd < 0) {
- fprintf(stderr, "Failed to open %s: %s\n", device, strerror(errno));
+ PRINT_STORAGE_MON_ERR("Failed to open %s: %s", device, strerror(errno));
exit(-1);
}
}
@@ -63,11 +146,11 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
res = ioctl(device_fd, BLKGETSIZE64, &devsize);
#endif
if (res < 0) {
- fprintf(stderr, "Failed to get device size for %s: %s\n", device, strerror(errno));
+ PRINT_STORAGE_MON_ERR("Failed to get device size for %s: %s", device, strerror(errno));
goto error;
}
if (verbose) {
- printf("%s: opened %s O_DIRECT, size=%zu\n", device, (flags & O_DIRECT)?"with":"without", devsize);
+ PRINT_STORAGE_MON_INFO("%s: opened %s O_DIRECT, size=%zu", device, (flags & O_DIRECT)?"with":"without", devsize);
}
/* Don't fret about real randomness */
@@ -76,11 +159,11 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
seek_spot = (rand() % (devsize-1024)) & 0xFFFFFFFFFFFFFE00;
res = lseek(device_fd, seek_spot, SEEK_SET);
if (res < 0) {
- fprintf(stderr, "Failed to seek %s: %s\n", device, strerror(errno));
+ PRINT_STORAGE_MON_ERR("Failed to seek %s: %s", device, strerror(errno));
goto error;
}
if (verbose) {
- printf("%s: reading from pos %ld\n", device, seek_spot);
+ PRINT_STORAGE_MON_INFO("%s: reading from pos %ld", device, seek_spot);
}
if (flags & O_DIRECT) {
@@ -93,22 +176,22 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
res = ioctl(device_fd, BLKSSZGET, &sec_size);
#endif
if (res < 0) {
- fprintf(stderr, "Failed to get block device sector size for %s: %s\n", device, strerror(errno));
+ PRINT_STORAGE_MON_ERR("Failed to get block device sector size for %s: %s", device, strerror(errno));
goto error;
}
if (posix_memalign(&buffer, sysconf(_SC_PAGESIZE), sec_size) != 0) {
- fprintf(stderr, "Failed to allocate aligned memory: %s\n", strerror(errno));
+ PRINT_STORAGE_MON_ERR("Failed to allocate aligned memory: %s", strerror(errno));
goto error;
}
res = read(device_fd, buffer, sec_size);
free(buffer);
if (res < 0) {
- fprintf(stderr, "Failed to read %s: %s\n", device, strerror(errno));
+ PRINT_STORAGE_MON_ERR("Failed to read %s: %s", device, strerror(errno));
goto error;
}
if (res < sec_size) {
- fprintf(stderr, "Failed to read %d bytes from %s, got %d\n", sec_size, device, res);
+ PRINT_STORAGE_MON_ERR("Failed to read %d bytes from %s, got %d", sec_size, device, res);
goto error;
}
} else {
@@ -116,28 +199,28 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
res = read(device_fd, buffer, sizeof(buffer));
if (res < 0) {
- fprintf(stderr, "Failed to read %s: %s\n", device, strerror(errno));
+ PRINT_STORAGE_MON_ERR("Failed to read %s: %s", device, strerror(errno));
goto error;
}
if (res < (int)sizeof(buffer)) {
- fprintf(stderr, "Failed to read %ld bytes from %s, got %d\n", sizeof(buffer), device, res);
+ PRINT_STORAGE_MON_ERR("Failed to read %ld bytes from %s, got %d", sizeof(buffer), device, res);
goto error;
}
}
/* Fake an error */
if (inject_error_percent && ((rand() % 100) < inject_error_percent)) {
- fprintf(stderr, "People, please fasten your seatbelts, injecting errors!\n");
+ PRINT_STORAGE_MON_ERR_NOARGS("People, please fasten your seatbelts, injecting errors!");
goto error;
}
res = close(device_fd);
if (res != 0) {
- fprintf(stderr, "Failed to close %s: %s\n", device, strerror(errno));
+ PRINT_STORAGE_MON_ERR("Failed to close %s: %s", device, strerror(errno));
exit(-1);
}
if (verbose) {
- printf("%s: done\n", device);
+ PRINT_STORAGE_MON_INFO("%s: done", device);
}
exit(0);
@@ -146,101 +229,563 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
exit(-1);
}
-static int test_device_main(size_t device_count, char *devices[MAX_DEVICES], int scores[MAX_DEVICES], int verbose, int inject_error_percent, int timeout)
+static gboolean is_child_runnning(void)
{
- pid_t test_forks[MAX_DEVICES];
size_t i;
- struct timespec ts;
- time_t start_time;
- size_t finished_count = 0;
- int final_score = 0;
- memset(test_forks, 0, sizeof(test_forks));
for (i=0; i<device_count; i++) {
- test_forks[i] = fork();
- if (test_forks[i] < 0) {
- fprintf(stderr, "Error spawning fork for %s: %s\n", devices[i], strerror(errno));
- syslog(LOG_ERR, "Error spawning fork for %s: %s\n", devices[i], strerror(errno));
- /* Just test the devices we have */
- break;
- }
- /* child */
- if (test_forks[i] == 0) {
- test_device(devices[i], verbose, inject_error_percent);
+ if (test_forks[i] != 0) {
+ return TRUE;
}
}
+ return FALSE;
+}
+
+/* Send signal to child pid and log the outcome; kill() sets errno itself on failure. */
+static void stop_child(pid_t pid, int signal)
+{
- /* See if they have finished */
- clock_gettime(CLOCK_REALTIME, &ts);
- start_time = ts.tv_sec;
+	if (kill(pid, signal) == 0) {
+		syslog(LOG_DEBUG, "Stopping child sent signal %d to process %lld", signal, (long long) pid);
+	} else {
+		syslog(LOG_ERR, "Could not stop child (process %lld) with signal %d: %s", (long long) pid, signal, strerror(errno));
+	}
+}
+
+/* SIGTERM handler: flag shutdown, cancel the pending check timer, signal running children, then schedule the loop stop. */
+static int32_t sigterm_handler(int num, void *data)
+{
+ size_t i;
+ shutting_down = TRUE;
- while ((finished_count < device_count) && ((start_time + timeout) > ts.tv_sec)) {
+ /* If there is an unfired timer, stop it. */
+ qb_loop_timer_del(storage_mon_poll_handle, timer_handle);
+
+ /* Send SIGTERM to device monitoring processes that are still running. */
+ if (is_child_runnning()) {
for (i=0; i<device_count; i++) {
- int wstatus;
- pid_t w;
+ if (test_forks[i] > 0 ) {
+ stop_child(test_forks[i], SIGTERM);
+ }
+ }
+ }
+
+ /* Fire a zero-delay timer with NULL data: test_device_main() then sees shutting_down and stops the loop. */
+ qb_loop_timer_add(storage_mon_poll_handle, QB_LOOP_HIGH, 0, NULL, wrap_test_device_main, &timer_handle);
+
+ return 0;
+}
+
+/* Return the test_forks[] slot holding pid, or (size_t)-1 when no live child matches. */
+static size_t find_child_pid(int pid)
+{
+	size_t slot;
+
+	for (slot = 0; slot < device_count; slot++) {
+		if (test_forks[slot] > 0 && test_forks[slot] == pid) {
+			return slot;
+		}
+	}
+
+	return -1;
+}
+
+/*
+ * SIGCHLD handler: reap every exited child without blocking and record its result.
+ * A non-zero exit status adds the device's score only while the expire timer is
+ * still running; once it has fired, child_timeout_handler() has already charged it.
+ */
+static int32_t sigchld_handler(int32_t sig, void *data)
+{
+	pid_t pid;
+	size_t index;
+	int status;
+
+	if (is_child_runnning()) {
+		while(1) {
+			pid = waitpid(-1, &status, WNOHANG);
+			if (pid > 0) {
+				if (WIFEXITED(status)) {
+					index = find_child_pid(pid);
+					/* find_child_pid() reports "not found" as (size_t)-1, which an unsigned */
+					/* "index >= 0" test can never reject; bound the index instead. */
+					if (index < device_count) {
+						/* If the expire timer is running, no timeout has occurred, */
+						/* so add the final_score from the exit code of the terminated child process. */
+						if (qb_loop_timer_is_running(storage_mon_poll_handle, expire_handle)) {
+							if (WEXITSTATUS(status) !=0) {
+								final_score += scores[index];
+
+								/* Update response values immediately in preparation for inquiries from clients. */
+								response_final_score = final_score;
+
+								/* Even in the first daemon mode check, an error device sets the flag so */
+								/* the client gets a response without waiting for all devices to finish. */
+								daemon_check_first_all_devices = TRUE;
+							}
+						}
+						finished_count++;
+						test_forks[index] = 0;
+					}
+				}
+			} else {
+				break;
+			}
+		}
+	}
+	return 0;
+}
+
+static void child_shutdown(int nsig)
+{
+	_exit(1);	/* SIGTERM handler of a forked check process: _exit() is async-signal-safe, exit() is not */
+}
+
+/* Write this process's PID to pidfile, creating missing parent directories. Returns 0 on success, -1 on error. */
+static int write_pid_file(const char *pidfile)
+{
+	char *pid = NULL;
+	char *dir, *str = NULL;
+	int fd = -1;
+	int rc = -1;
+	int i, len;
+
+	if (asprintf(&pid, "%jd", (intmax_t)getpid()) < 0) {
+		syslog(LOG_ERR, "Failed to allocate memory to store PID");
+		pid = NULL;
+		goto done;
+	}
+
+	str = strdup(pidfile);
+	if (str == NULL) {
+		syslog(LOG_ERR, "Failed to duplicate string ['%s']", pidfile);
+		goto done;
+	}
+	dir = dirname(str);
+	/* Directories need the search (x) bit, so create them 0755 rather than with a file-style 0640. */
+	for (i = 1, len = strlen(dir); i < len; i++) {
+		if (dir[i] == '/') {
+			dir[i] = 0;
+			if ((mkdir(dir, 0755) < 0) && (errno != EEXIST)) {
+				syslog(LOG_ERR, "Failed to create directory %s: %s", dir, strerror(errno));
+				goto done;
+			}
+			dir[i] = '/';
+		}
+	}
+	if ((mkdir(dir, 0755) < 0) && (errno != EEXIST)) {
+		syslog(LOG_ERR, "Failed to create directory %s: %s", dir, strerror(errno));
+		goto done;
+	}
+
+	/* O_TRUNC: a stale, longer PID in an existing file must not leave trailing digits behind. */
+	fd = open(pidfile, O_CREAT | O_WRONLY | O_TRUNC, 0640);
+	if (fd < 0) {
+		syslog(LOG_ERR, "Failed to open %s: %s", pidfile, strerror(errno));
+		goto done;
+	}
+	if (write(fd, pid, strlen(pid)) != (ssize_t)strlen(pid)) {
+		syslog(LOG_ERR, "Failed to write '%s' to %s: %s", pid, pidfile, strerror(errno));
+		goto done;
+	}
+	rc = 0;
+done:
+	if (fd >= 0) {
+		close(fd);	/* single close point, so the error paths no longer leak the descriptor */
+	}
+	free(pid);
+	free(str);
+	return rc;
+}
+
+/* Expire-timer callback: charge the score of every child still running at the timeout; the children are not signalled here. */
+static void child_timeout_handler(void *data)
+{
+ size_t i;
+
+ if (is_child_runnning()) {
+ for (i=0; i<device_count; i++) {
if (test_forks[i] > 0) {
- w = waitpid(test_forks[i], &wstatus, WUNTRACED | WNOHANG | WCONTINUED);
- if (w < 0) {
- fprintf(stderr, "waitpid on %s failed: %s\n", devices[i], strerror(errno));
- return -1;
+ /* If timeout occurs before SIGCHLD, add child process failure score to final_score. */
+ final_score += scores[i];
+
+ /* Update response values immediately in preparation for inquiries from clients. */
+ response_final_score = final_score;
+
+ /* Even in the first daemon-mode check, a timed-out device sets the flag so clients get a response without waiting for the rest. */
+ daemon_check_first_all_devices = TRUE;
+ }
+ }
+ }
+}
+
+/* Timer-callback adapter for test_device_main(), which only tests its argument against NULL. */
+static void wrap_test_device_main(void *data)
+{
+	test_device_main((data != NULL) ? &((struct storage_mon_timer_data *)data)->interval : NULL);
+}
+
+/* Run one round of device checks. Not daemonized: wait up to timeout seconds and return the score (-1 on waitpid error). Daemonized: timer-driven, returns TRUE, or FALSE after stopping the loop on shutdown. */
+static int test_device_main(gpointer data)
+{
+ size_t i;
+ struct timespec ts;
+ time_t start_time;
+ gboolean device_check = TRUE;
+
+ if (daemonize) {
+ if (shutting_down == TRUE) {
+ goto done;
+ }
+
+ /* Daemon mode: do not fork a new round of checks while children of the previous round are still running. */
+ if (is_child_runnning()) {
+ device_check = FALSE;
+ }
+
+ if (device_count == finished_count && device_check) {
+ /* Update the result value for the client response once all checks have completed. */
+ response_final_score = final_score;
+
+ if (!daemon_check_first_all_devices) {
+ daemon_check_first_all_devices = TRUE;
+ }
+ }
+ }
+
+ if (device_check) {
+ /* Reset final_score, finished_count, test_forks[] */
+ final_score = 0;
+ finished_count = 0;
+
+ memset(test_forks, 0, sizeof(test_forks));
+ for (i=0; i<device_count; i++) {
+ test_forks[i] = fork();
+ if (test_forks[i] < 0) {
+ PRINT_STORAGE_MON_ERR("Error spawning fork for %s: %s\n", devices[i], strerror(errno));
+ /* Just test the devices we have. NOTE(review): test_forks[i] stays negative and is_child_runnning() treats any non-zero slot as running, which looks like it blocks later daemon rounds - confirm. */
+ break;
+ }
+ /* child */
+ if (test_forks[i] == 0) {
+ if (daemonize) {
+ signal(SIGTERM, &child_shutdown);
}
+ test_device(devices[i], verbose, inject_error_percent);
+ }
+ }
- if (w == test_forks[i]) {
- if (WIFEXITED(wstatus)) {
- if (WEXITSTATUS(wstatus) != 0) {
- syslog(LOG_ERR, "Error reading from device %s", devices[i]);
- final_score += scores[i];
+ if (!daemonize) {
+ /* See if they have finished */
+ clock_gettime(CLOCK_REALTIME, &ts);
+ start_time = ts.tv_sec;
+
+ while ((finished_count < device_count) && ((start_time + timeout) > ts.tv_sec)) {
+ for (i=0; i<device_count; i++) {
+ int wstatus;
+ pid_t w;
+
+ if (test_forks[i] > 0) {
+ w = waitpid(test_forks[i], &wstatus, WUNTRACED | WNOHANG | WCONTINUED);
+ if (w < 0) {
+ PRINT_STORAGE_MON_ERR("waitpid on %s failed: %s", devices[i], strerror(errno));
+ return -1;
}
- finished_count++;
- test_forks[i] = 0;
+ if (w == test_forks[i]) {
+ if (WIFEXITED(wstatus)) {
+ if (WEXITSTATUS(wstatus) != 0) {
+ syslog(LOG_ERR, "Error reading from device %s", devices[i]);
+ final_score += scores[i];
+ }
+
+ finished_count++;
+ test_forks[i] = 0;
+ }
+ }
}
}
+
+ usleep(100000);
+
+ clock_gettime(CLOCK_REALTIME, &ts);
+ }
+
+ /* See which child processes have not finished */
+ for (i=0; i<device_count; i++) {
+ if (test_forks[i] != 0) {
+ syslog(LOG_ERR, "Reading from device %s did not complete in %d seconds timeout", devices[i], timeout);
+ fprintf(stderr, "Thread for device %s did not complete in time\n", devices[i]);
+ final_score += scores[i];
+ }
}
+ } else {
+ /* Run the child process timeout watch timer. */
+ qb_loop_timer_add(storage_mon_poll_handle, QB_LOOP_MED, timeout * QB_TIME_NS_IN_SEC, NULL, child_timeout_handler, &expire_handle);
+ }
+ }
+ if (!daemonize) {
+ if (verbose) {
+ printf("Final score is %d\n", final_score);
+ }
+ return final_score;
+ } else {
+ if (data != NULL) {
+ /* Sets the device check to run on the next timer. */
+ qb_loop_timer_add(storage_mon_poll_handle, QB_LOOP_MED, timer_d.interval * QB_TIME_NS_IN_SEC, &timer_d, wrap_test_device_main, &timer_handle);
}
+ return TRUE;
+ }
+done:
+ qb_loop_stop(storage_mon_poll_handle);
+ return FALSE;
+}
+
+/* Forward a job request (priority p, callback fn, argument data) to the storage_mon main loop. */
+static int32_t storage_mon_job_add(enum qb_loop_priority p, void *data, qb_loop_job_dispatch_fn fn)
+{
+	return qb_loop_job_add(storage_mon_poll_handle, p, data, fn);
+}
+
+/* Forward a poll registration for fd/evts (callback fn, argument data) to the storage_mon main loop. */
+static int32_t storage_mon_dispatch_add(enum qb_loop_priority p, int32_t fd, int32_t evts,
+	void *data, qb_ipcs_dispatch_fn_t fn)
+{
+	return qb_loop_poll_add(storage_mon_poll_handle, p, fd, evts, data, fn);
+}
+
+/* Forward a poll modification for fd/evts (callback fn, argument data) to the storage_mon main loop. */
+static int32_t storage_mon_dispatch_mod(enum qb_loop_priority p, int32_t fd, int32_t evts,
+	void *data, qb_ipcs_dispatch_fn_t fn)
+{
+	return qb_loop_poll_mod(storage_mon_poll_handle, p, fd, evts, data, fn);
+}
+
+/* Forward a poll removal for fd to the storage_mon main loop. */
+static int32_t storage_mon_dispatch_del(int32_t fd)
+{
+	return qb_loop_poll_del(storage_mon_poll_handle, fd);
+}
- usleep(100000);
+/* Connection-accept hook: returns 0 unconditionally, so no uid/gid filtering is done here. */
+static int32_t storage_mon_ipcs_connection_accept_fn(qb_ipcs_connection_t * c, uid_t uid, gid_t gid)
+{
+	return 0;
+}
+
+/*
+ * connection_created callback of the IPC service: writes the service-wide
+ * active/closed connection counters to syslog at debug level.
+ */
+static void storage_mon_ipcs_connection_created_fn(qb_ipcs_connection_t *c)
+{
+	struct qb_ipcs_stats service_totals;
+
+	/* QB_FALSE: read the counters without asking libqb to clear them. */
+	qb_ipcs_stats_get(ipcs, &service_totals, QB_FALSE);
+
+	syslog(LOG_DEBUG, "Connection created (active:%d, closed:%d)",
+	       service_totals.active_connections,
+	       service_totals.closed_connections);
+}
+
+/*
+ * connection_destroyed callback of the IPC service: only leaves a debug
+ * trace in syslog.
+ */
+static void storage_mon_ipcs_connection_destroyed_fn(qb_ipcs_connection_t *c)
+{
+	(void)c;
+
+	syslog(LOG_DEBUG, "Connection about to be freed");
+}
+
+/*
+ * connection_closed callback of the IPC service: logs the peer pid of the
+ * closing connection together with the service-wide connection counters.
+ * Always returns 0.
+ */
+static int32_t storage_mon_ipcs_connection_closed_fn(qb_ipcs_connection_t *c)
+{
+	struct qb_ipcs_stats service_totals;
+	struct qb_ipcs_connection_stats peer;
+
+	/* QB_FALSE: read both sets of counters without clearing them. */
+	qb_ipcs_stats_get(ipcs, &service_totals, QB_FALSE);
+	qb_ipcs_connection_stats_get(c, &peer, QB_FALSE);
+
+	syslog(LOG_DEBUG, "Connection to pid:%d destroyed (active:%d, closed:%d)",
+	       peer.client_pid,
+	       service_totals.active_connections,
+	       service_totals.closed_connections);
+
+	return 0;
+}
- clock_gettime(CLOCK_REALTIME, &ts);
+/*
+ * msg_process callback of the IPC service: answers one client request.
+ *
+ * The reply is a qb_ipc_response_header followed by the score as a
+ * NUL-terminated decimal string:
+ *   response_final_score  request was SMON_GET_RESULT_COMMAND
+ *   -1                    request was malformed or the command is unknown
+ *   -2                    daemon_check_first_all_devices is not set yet
+ *
+ * A failure to send the reply is logged only; the callback always returns 0.
+ */
+static int32_t
+storage_mon_ipcs_msg_process_fn(qb_ipcs_connection_t *c, void *data, size_t size)
+{
+	struct storage_mon_check_value_req *request;
+	struct qb_ipc_response_header resps;
+	ssize_t res;
+	struct iovec iov[2];
+	char resp[SMON_MAX_RESP_SIZE];
+	int32_t rc;
+	int send_score = response_final_score;
+	int well_formed;
+
+	request = (struct storage_mon_check_value_req *)data;
+
+	/*
+	 * The payload is supplied by the peer: only treat message as a C string
+	 * once the request is large enough and a terminator is really present.
+	 */
+	well_formed = (request != NULL) && (size >= sizeof(*request)) &&
+		(memchr(request->message, '\0', sizeof(request->message)) != NULL);
+
+	if (well_formed) {
+		syslog(LOG_DEBUG, "msg received (id:%d, size:%d, data:%s)",
+			request->hdr.id, request->hdr.size, request->message);
+	}
+
+	if (!well_formed) {
+		syslog(LOG_DEBUG, "request is malformed (size:%zu).", size);
+		send_score = -1;
+	} else if (strcmp(request->message, SMON_GET_RESULT_COMMAND) != 0) {
+		syslog(LOG_DEBUG, "request command is unknown.");
+		send_score = -1;
+	} else if (!daemon_check_first_all_devices) {
+		send_score = -2;
}
- /* See which threads have not finished */
- for (i=0; i<device_count; i++) {
- if (test_forks[i] != 0) {
- syslog(LOG_ERR, "Reading from device %s did not complete in %d seconds timeout", devices[i], timeout);
- fprintf(stderr, "Thread for device %s did not complete in time\n", devices[i]);
- final_score += scores[i];
+	resps.size = sizeof(struct qb_ipc_response_header);
+	resps.id = 13;
+	resps.error = 0;
+
+	/* +1 so that the terminating NUL travels with the text. */
+	rc = snprintf(resp, SMON_MAX_RESP_SIZE, "%d", send_score) + 1;
+	iov[0].iov_len = sizeof(resps);
+	iov[0].iov_base = &resps;
+	iov[1].iov_len = rc;
+	iov[1].iov_base = resp;
+	resps.size += rc;
+
+	res = qb_ipcs_response_sendv(c, iov, 2);
+	if (res < 0) {
+		errno = -res;
+		syslog(LOG_ERR, "qb_ipcs_response_send : errno = %d", errno);
+	}
+	return 0;
+}
+
+/*
+ * Client mode: ask the daemon registered as "storage_mon_<attrname>" for its
+ * latest result and return it.
+ *
+ * Return value:
+ *   0   Normal.
+ *   >0  Monitoring error.
+ *   -1  Communication system error (connect/send/recv failed, or the reply
+ *       is missing or not a number).
+ *   -2  Not all checks completed for first device in daemon mode.
+ *
+ * The connection is released on every path once it has been opened.
+ */
+static int32_t
+storage_mon_client(void)
+{
+	struct storage_mon_check_value_req request;
+	struct storage_mon_check_value_res response;
+	qb_ipcc_connection_t *conn;
+	char ipcs_name[SMON_MAX_IPCSNAME];
+	char *end = NULL;
+	long parsed;
+	int32_t rc;
+
+	memset(&request, 0, sizeof(request));
+	memset(&response, 0, sizeof(response));
+	/* Without a reply the result must read as a communication error, not 0. */
+	snprintf(response.message, sizeof(response.message), "%d", -1);
+
+	snprintf(ipcs_name, SMON_MAX_IPCSNAME, "storage_mon_%s", attrname);
+	conn = qb_ipcc_connect(ipcs_name, 0);
+	if (conn == NULL) {
+		syslog(LOG_ERR, "qb_ipcc_connect error\n");
+		return(-1);
+	}
+
+	snprintf(request.message, SMON_MAX_MSGSIZE, "%s", SMON_GET_RESULT_COMMAND);
+	request.hdr.id = 0;
+	request.hdr.size = sizeof(struct storage_mon_check_value_req);
+	rc = qb_ipcc_send(conn, &request, request.hdr.size);
+	if (rc < 0) {
+		syslog(LOG_ERR, "qb_ipcc_send error : %d\n", rc);
+		qb_ipcc_disconnect(conn);
+		return(-1);
+	}
+	if (rc > 0) {
+		rc = qb_ipcc_recv(conn, &response, sizeof(response), -1);
+		if (rc < 0) {
+			syslog(LOG_ERR, "qb_ipcc_recv error : %d\n", rc);
+			qb_ipcc_disconnect(conn);
+			return(-1);
}
}
- if (verbose) {
- printf("Final score is %d\n", final_score);
+	qb_ipcc_disconnect(conn);
+
+	/* Never trust the peer to have terminated the text. */
+	response.message[sizeof(response.message) - 1] = '\0';
+
+	syslog(LOG_DEBUG, "daemon response[%d]: %s \n", response.hdr.id, response.message);
+
+	/* Set score to result; anything that is not a plain number is an error. */
+	errno = 0;
+	parsed = strtol(response.message, &end, 10);
+	if (end == response.message || *end != '\0' || errno == ERANGE ||
+	    parsed < INT32_MIN || parsed > INT32_MAX) {
+		syslog(LOG_ERR, "invalid daemon response: %s\n", response.message);
+		return(-1);
+	}
+	rc = (int32_t)parsed;
+
+	return(rc);
+}
+
+/*
+ * Daemon mode: detach from the terminal, write the pid file, publish the IPC
+ * service "storage_mon_<attrname>" and run the main loop until it is stopped.
+ *
+ * interval is stored in timer_d.interval; the first device check is queued
+ * with a zero delay through wrap_test_device_main.
+ * pidfile is created after daemonizing and removed again on every exit path
+ * that follows its creation.
+ *
+ * Returns 0 after the main loop has ended, -1 when any setup step fails.
+ */
+static int32_t
+storage_mon_daemon(int interval, const char *pidfile)
+{
+	int32_t rc;
+	int32_t ret = -1;
+	char ipcs_name[SMON_MAX_IPCSNAME];
+
+	struct qb_ipcs_service_handlers service_handle = {
+		.connection_accept = storage_mon_ipcs_connection_accept_fn,
+		.connection_created = storage_mon_ipcs_connection_created_fn,
+		.msg_process = storage_mon_ipcs_msg_process_fn,
+		.connection_destroyed = storage_mon_ipcs_connection_destroyed_fn,
+		.connection_closed = storage_mon_ipcs_connection_closed_fn,
+	};
+
+	struct qb_ipcs_poll_handlers poll_handle = {
+		.job_add = storage_mon_job_add,
+		.dispatch_add = storage_mon_dispatch_add,
+		.dispatch_mod = storage_mon_dispatch_mod,
+		.dispatch_del = storage_mon_dispatch_del,
+	};
+
+	if (daemon(0, 0) < 0) {
+		syslog(LOG_ERR, "Failed to daemonize: %s", strerror(errno));
+		return -1;
}
- return final_score;
+
+	umask(S_IWGRP | S_IWOTH | S_IROTH);
+
+	if (write_pid_file(pidfile) < 0) {
+		return -1;
+	}
+
+	/* From here on the pid file exists: leave through "out" to remove it. */
+	snprintf(ipcs_name, SMON_MAX_IPCSNAME, "storage_mon_%s", attrname);
+	ipcs = qb_ipcs_create(ipcs_name, 0, QB_IPC_NATIVE, &service_handle);
+	if (ipcs == NULL) {
+		syslog(LOG_ERR, "qb_ipcs_create");
+		goto out;
+	}
+
+	qb_ipcs_enforce_buffer_size(ipcs, SMON_BUFF_1MEG);
+
+	/* Every poll handler above uses this handle, so it must be valid. */
+	storage_mon_poll_handle = qb_loop_create();
+	if (storage_mon_poll_handle == NULL) {
+		syslog(LOG_ERR, "qb_loop_create");
+		goto out;
+	}
+
+	qb_ipcs_poll_handlers_set(ipcs, &poll_handle);
+	rc = qb_ipcs_run(ipcs);
+	if (rc != 0) {
+		errno = -rc;
+		syslog(LOG_ERR, "qb_ipcs_run");
+		goto out;
+	}
+
+	qb_loop_signal_add(storage_mon_poll_handle, QB_LOOP_HIGH,
+		SIGTERM, NULL, sigterm_handler, NULL);
+
+	qb_loop_signal_add(storage_mon_poll_handle, QB_LOOP_MED,
+		SIGCHLD, NULL, sigchld_handler, NULL);
+
+	timer_d.interval = interval;
+	qb_loop_timer_add(storage_mon_poll_handle, QB_LOOP_MED, 0, &timer_d, wrap_test_device_main, &timer_handle);
+
+	qb_loop_run(storage_mon_poll_handle);
+	qb_loop_destroy(storage_mon_poll_handle);
+	ret = 0;
+
+out:
+	unlink(pidfile);
+
+	return ret;
}
int main(int argc, char *argv[])
{
- char *devices[MAX_DEVICES];
- int scores[MAX_DEVICES];
- size_t device_count = 0;
size_t score_count = 0;
- int timeout = DEFAULT_TIMEOUT;
- int final_score = 0;
int opt, option_index;
- int verbose = 0;
- int inject_error_percent = 0;
+ int interval = DEFAULT_INTERVAL;
+ const char *pidfile = DEFAULT_PIDFILE;
+ gboolean client = FALSE;
struct option long_options[] = {
{"timeout", required_argument, 0, 't' },
{"device", required_argument, 0, 'd' },
{"score", required_argument, 0, 's' },
{"inject-errors-percent", required_argument, 0, 0 },
+ {"daemonize", no_argument, 0, 0 },
+ {"client", no_argument, 0, 0 },
+ {"interval", required_argument, 0, 'i' },
+ {"pidfile", required_argument, 0, 'p' },
+ {"attrname", required_argument, 0, 'a' },
{"verbose", no_argument, 0, 'v' },
{"help", no_argument, 0, 'h' },
{0, 0, 0, 0 }
};
- while ( (opt = getopt_long(argc, argv, "hvt:d:s:",
+
+ while ( (opt = getopt_long(argc, argv, "hvt:d:s:i:p:a:",
long_options, &option_index)) != -1 ) {
switch (opt) {
case 0: /* Long-only options */
@@ -251,6 +796,16 @@ int main(int argc, char *argv[])
return -1;
}
}
+ if (strcmp(long_options[option_index].name, "daemonize") == 0) {
+ daemonize = TRUE;
+ }
+ if (strcmp(long_options[option_index].name, "client") == 0) {
+ client = TRUE;
+ }
+ if (daemonize && client) {
+ fprintf(stderr,"The daemonize option and client option cannot be specified at the same time.");
+ return -1;
+ }
break;
case 'd':
if (device_count < MAX_DEVICES) {
@@ -287,6 +842,27 @@ int main(int argc, char *argv[])
usage(argv[0], stdout);
return 0;
break;
+ case 'i':
+ interval = atoi(optarg);
+ if (interval < 1) {
+ fprintf(stderr, "invalid interval %d. Min 1, default is %d\n", interval, DEFAULT_INTERVAL);
+ return -1;
+ }
+ break;
+ case 'p':
+ pidfile = strdup(optarg);
+ if (pidfile == NULL) {
+ fprintf(stderr, "Failed to duplicate string ['%s']\n", optarg);
+ return -1;
+ }
+ break;
+ case 'a':
+ attrname = strdup(optarg);
+ if (attrname == NULL) {
+ fprintf(stderr, "Failed to duplicate string ['%s']\n", optarg);
+ return -1;
+ }
+ break;
default:
usage(argv[0], stderr);
return -1;
@@ -294,6 +870,11 @@ int main(int argc, char *argv[])
}
}
+
+ if (client) {
+ return(storage_mon_client());
+ }
+
if (device_count == 0) {
fprintf(stderr, "No devices to test, use the -d or --device argument\n");
return -1;
@@ -306,7 +887,10 @@ int main(int argc, char *argv[])
openlog("storage_mon", 0, LOG_DAEMON);
-
- final_score = test_device_main(device_count, devices, scores, verbose, inject_error_percent, timeout);
+ if (!daemonize) {
+ final_score = test_device_main(NULL);
+ } else {
+ return(storage_mon_daemon(interval, pidfile));
+ }
return final_score;
}
From 406ff43a6caeb0add7493892236753acee293f27 Mon Sep 17 00:00:00 2001
From: Hideo Yamauchi <renayama19661014@ybb.ne.jp>
Date: Mon, 24 Jul 2023 06:47:39 +0900
Subject: [PATCH 3/4] Mid: storage-mon: Retry failed attrd_updater.
---
heartbeat/storage-mon.in | 27 +++++++++++++++++++++++----
1 file changed, 23 insertions(+), 4 deletions(-)
diff --git a/heartbeat/storage-mon.in b/heartbeat/storage-mon.in
index 81d8f5bcec..9662e06dbb 100644
--- a/heartbeat/storage-mon.in
+++ b/heartbeat/storage-mon.in
@@ -205,6 +205,25 @@ storage-mon_init() {
fi
}
+# Publish the status given as $1 through attrd_updater.
+# The update is repeated until attrd_updater succeeds, except for "red":
+# a failed "red" update returns OCF_ERR_GENERIC right away.
+storage-mon_update_attribute() {
+	while true; do
+		"$ATTRDUP" -n ${ATTRNAME} -U "$1" -d "5s"
+		rc=$?
+		[ $rc -eq 0 ] && return $OCF_SUCCESS
+
+		ocf_log debug "${1} attribute by attrd_updater failed"
+		case "$1" in
+		red)
+			# If the attrd_updater fails with the red attribute, return an error to let pacemaker handle the failure immediately.
+			return $OCF_ERR_GENERIC
+			;;
+		esac
+	done
+}
+
storage-mon_monitor() {
if [ -z "$OCF_RESKEY_daemonize" ]; then
storage-mon_init
@@ -233,8 +252,8 @@ storage-mon_monitor() {
status="green"
fi
- "$ATTRDUP" -n ${ATTRNAME} -U "$status" -d "5s"
- return $OCF_SUCCESS
+ storage-mon_update_attribute $status
+ return "$?"
else
ocf_pidfile_status "${PIDFILE}" > /dev/null 2>&1
case "$?" in
@@ -279,8 +298,8 @@ storage-mon_monitor() {
esac
done
- "$ATTRDUP" -n ${ATTRNAME} -U "$status" -d "5s"
- return $OCF_SUCCESS
+ storage-mon_update_attribute $status
+ return "$?"
fi
}
From d1cf0b42f1eb6c41ef5887cb7d9ce055f3bbcb3a Mon Sep 17 00:00:00 2001
From: Hideo Yamauchi <renayama19661014@ybb.ne.jp>
Date: Thu, 17 Aug 2023 17:18:53 +0900
Subject: [PATCH 4/4] Mid: storage-mon RA: Changed OCF_RESKEY_daemonize_default
and OCF_RESKEY_daemonize default and judgment part.
---
heartbeat/storage-mon.in | 13 ++++++-------
1 file changed, 6 insertions(+), 7 deletions(-)
diff --git a/heartbeat/storage-mon.in b/heartbeat/storage-mon.in
index 9662e06dbb..284dec30f2 100644
--- a/heartbeat/storage-mon.in
+++ b/heartbeat/storage-mon.in
@@ -58,7 +58,7 @@ OCF_RESKEY_io_timeout_default="10"
OCF_RESKEY_check_interval_default="30"
OCF_RESKEY_inject_errors_default=""
OCF_RESKEY_state_file_default="${HA_RSCTMP%%/}/storage-mon-${OCF_RESOURCE_INSTANCE}.state"
-OCF_RESKEY_daemonize_default=""
+OCF_RESKEY_daemonize_default="false"
# Explicitly list all environment variables used, to make static analysis happy
: ${OCF_RESKEY_CRM_meta_interval:=${OCF_RESKEY_CRM_meta_interval_default}}
@@ -133,7 +133,7 @@ Used only for testing! Specify % of I/O errors to simulate drives failures.
Specifies to start storage-mon as a daemon and check for devices.
</longdesc>
<shortdesc lang="en">start storage-mon with daemon</shortdesc>
-<content type="string" default="" />
+<content type="boolean" default="${OCF_RESKEY_daemonize_default}" />
</parameter>
</parameters>
@@ -225,7 +225,7 @@ storage-mon_update_attribute() {
}
storage-mon_monitor() {
- if [ -z "$OCF_RESKEY_daemonize" ]; then
+ if ! ocf_is_true "$OCF_RESKEY_daemonize"; then
storage-mon_init
# Monitor _MUST!_ differentiate correctly between running
@@ -304,7 +304,7 @@ storage-mon_monitor() {
}
storage-mon_start() {
- if [ -z "$OCF_RESKEY_daemonize" ]; then
+ if ! ocf_is_true "$OCF_RESKEY_daemonize"; then
storage-mon_monitor
if [ $? -eq $OCF_SUCCESS ]; then
return $OCF_SUCCESS
@@ -317,7 +317,6 @@ storage-mon_start() {
for DRIVE in ${OCF_RESKEY_drives}; do
cmdline="$cmdline --device $DRIVE --score 1"
done
- #cmdline="$cmdline --daemonize --timeout ${OCF_RESKEY_io_timeout} --interval ${OCF_RESKEY_check_interval} --pidfile ${PIDFILE} --attrname ${ATTRNAME} --ha-sbin-dir ${HA_SBIN_DIR}"
cmdline="$cmdline --daemonize --timeout ${OCF_RESKEY_io_timeout} --interval ${OCF_RESKEY_check_interval} --pidfile ${PIDFILE} --attrname ${ATTRNAME}"
if [ -n "${OCF_RESKEY_inject_errors}" ]; then
cmdline="$cmdline --inject-errors-percent ${OCF_RESKEY_inject_errors}"
@@ -333,7 +332,7 @@ storage-mon_stop() {
storage-mon_monitor
rc=$?
- if [ -z "$OCF_RESKEY_daemonize" ]; then
+ if ! ocf_is_true "$OCF_RESKEY_daemonize"; then
if [ $rc -eq $OCF_SUCCESS ]; then
rm "${OCF_RESKEY_state_file}"
fi
@@ -372,7 +371,7 @@ storage-mon_stop() {
storage-mon_validate() {
storage-mon_init
- if [ -z "$OCF_RESKEY_daemonize" ]; then
+ if ! ocf_is_true "$OCF_RESKEY_daemonize"; then
# Is the state directory writable?
state_dir=$(dirname "${OCF_RESKEY_state_file}")
touch "$state_dir/$$"