715 lines
21 KiB
Diff
715 lines
21 KiB
Diff
From 90b595650d7d8a6f6a69a9f7060c6406aa731c18 Mon Sep 17 00:00:00 2001
|
|
From: "Fabio M. Di Nitto" <fdinitto@redhat.com>
|
|
Date: Wed, 28 Jul 2021 10:08:10 +0200
|
|
Subject: [PATCH] Add storage-mon pacemaker health check
|
|
|
|
Signed-off-by: Fabio M. Di Nitto <fdinitto@redhat.com>
|
|
---
|
|
.gitignore | 41 ++++++
|
|
configure.ac | 1 +
|
|
doc/man/Makefile.am | 3 +-
|
|
heartbeat/Makefile.am | 17 +--
|
|
heartbeat/storage-mon.in | 263 +++++++++++++++++++++++++++++++++++++++
|
|
tools/Makefile.am | 5 +-
|
|
tools/storage_mon.c | 263 +++++++++++++++++++++++++++++++++++++++
|
|
7 files changed, 583 insertions(+), 10 deletions(-)
|
|
create mode 100644 heartbeat/storage-mon.in
|
|
create mode 100644 tools/storage_mon.c
|
|
|
|
diff --git a/.gitignore b/.gitignore
|
|
index 38d3566205..f7277bf04e 100644
|
|
--- a/.gitignore
|
|
+++ b/.gitignore
|
|
@@ -45,6 +45,46 @@ heartbeat/ocf-shellfuncs
|
|
heartbeat/send_ua
|
|
heartbeat/shellfuncs
|
|
heartbeat/*.pyc
|
|
+heartbeat/AoEtarget
|
|
+heartbeat/CTDB
|
|
+heartbeat/ManageRAID
|
|
+heartbeat/ManageVE
|
|
+heartbeat/Squid
|
|
+heartbeat/SysInfo
|
|
+heartbeat/aws-vpc-route53
|
|
+heartbeat/azure-events
|
|
+heartbeat/clvm
|
|
+heartbeat/conntrackd
|
|
+heartbeat/dnsupdate
|
|
+heartbeat/dummypy
|
|
+heartbeat/eDir88
|
|
+heartbeat/fio
|
|
+heartbeat/galera
|
|
+heartbeat/gcp-pd-move
|
|
+heartbeat/gcp-vpc-move-ip
|
|
+heartbeat/gcp-vpc-move-route
|
|
+heartbeat/gcp-vpc-move-vip
|
|
+heartbeat/iSCSILogicalUnit
|
|
+heartbeat/iSCSITarget
|
|
+heartbeat/jira
|
|
+heartbeat/kamailio
|
|
+heartbeat/lxc
|
|
+heartbeat/lxd-info
|
|
+heartbeat/machine-info
|
|
+heartbeat/mariadb
|
|
+heartbeat/mpathpersist
|
|
+heartbeat/nfsnotify
|
|
+heartbeat/openstack-info
|
|
+heartbeat/rabbitmq-cluster
|
|
+heartbeat/redis
|
|
+heartbeat/rsyslog
|
|
+heartbeat/sg_persist
|
|
+heartbeat/slapd
|
|
+heartbeat/smb-share
|
|
+heartbeat/storage-mon
|
|
+heartbeat/sybaseASE
|
|
+heartbeat/syslog-ng
|
|
+heartbeat/vsftpd
|
|
include/agent_config.h
|
|
include/config.h
|
|
include/config.h.in
|
|
@@ -61,6 +101,7 @@ systemd/resource-agents.conf
|
|
tools/findif
|
|
tools/ocf-tester
|
|
tools/send_arp
|
|
+tools/storage_mon
|
|
tools/tickle_tcp
|
|
tools/ocft/README
|
|
tools/ocft/README.zh_CN
|
|
diff --git a/configure.ac b/configure.ac
|
|
index 717fb95432..c125df98f6 100644
|
|
--- a/configure.ac
|
|
+++ b/configure.ac
|
|
@@ -1002,6 +1002,7 @@ AC_CONFIG_FILES([heartbeat/rsyslog], [chmod +x heartbeat/rsyslog])
|
|
AC_CONFIG_FILES([heartbeat/smb-share], [chmod +x heartbeat/smb-share])
|
|
AC_CONFIG_FILES([heartbeat/sg_persist], [chmod +x heartbeat/sg_persist])
|
|
AC_CONFIG_FILES([heartbeat/slapd], [chmod +x heartbeat/slapd])
|
|
+AC_CONFIG_FILES([heartbeat/storage-mon], [chmod +x heartbeat/storage-mon])
|
|
AC_CONFIG_FILES([heartbeat/sybaseASE], [chmod +x heartbeat/sybaseASE])
|
|
AC_CONFIG_FILES([heartbeat/syslog-ng], [chmod +x heartbeat/syslog-ng])
|
|
AC_CONFIG_FILES([heartbeat/vsftpd], [chmod +x heartbeat/vsftpd])
|
|
diff --git a/doc/man/Makefile.am b/doc/man/Makefile.am
|
|
index 947d83cb2b..97904ccb16 100644
|
|
--- a/doc/man/Makefile.am
|
|
+++ b/doc/man/Makefile.am
|
|
@@ -138,6 +138,7 @@ man_MANS = ocf_heartbeat_AoEtarget.7 \
|
|
ocf_heartbeat_mariadb.7 \
|
|
ocf_heartbeat_mdraid.7 \
|
|
ocf_heartbeat_minio.7 \
|
|
+ ocf_heartbeat_mpathpersist.7 \
|
|
ocf_heartbeat_mysql.7 \
|
|
ocf_heartbeat_mysql-proxy.7 \
|
|
ocf_heartbeat_nagios.7 \
|
|
@@ -175,7 +176,7 @@ man_MANS = ocf_heartbeat_AoEtarget.7 \
|
|
ocf_heartbeat_smb-share.7 \
|
|
ocf_heartbeat_sybaseASE.7 \
|
|
ocf_heartbeat_sg_persist.7 \
|
|
- ocf_heartbeat_mpathpersist.7 \
|
|
+ ocf_heartbeat_storage-mon.7 \
|
|
ocf_heartbeat_symlink.7 \
|
|
ocf_heartbeat_syslog-ng.7 \
|
|
ocf_heartbeat_tomcat.7 \
|
|
diff --git a/heartbeat/Makefile.am b/heartbeat/Makefile.am
|
|
index 9af44cc127..5d52d211f2 100644
|
|
--- a/heartbeat/Makefile.am
|
|
+++ b/heartbeat/Makefile.am
|
|
@@ -32,22 +32,22 @@ ocfdir = $(OCF_RA_DIR_PREFIX)/heartbeat
|
|
dtddir = $(datadir)/$(PACKAGE_NAME)
|
|
dtd_DATA = ra-api-1.dtd metadata.rng
|
|
|
|
+ocf_PROGRAMS =
|
|
+
|
|
if USE_IPV6ADDR_AGENT
|
|
-ocf_PROGRAMS = IPv6addr
|
|
-else
|
|
-ocf_PROGRAMS =
|
|
+ocf_PROGRAMS += IPv6addr
|
|
endif
|
|
|
|
+halib_PROGRAMS =
|
|
+
|
|
if IPV6ADDR_COMPATIBLE
|
|
-halib_PROGRAMS = send_ua
|
|
-else
|
|
-halib_PROGRAMS =
|
|
+halib_PROGRAMS += send_ua
|
|
endif
|
|
|
|
IPv6addr_SOURCES = IPv6addr.c IPv6addr_utils.c
|
|
-send_ua_SOURCES = send_ua.c IPv6addr_utils.c
|
|
-
|
|
IPv6addr_LDADD = -lplumb $(LIBNETLIBS)
|
|
+
|
|
+send_ua_SOURCES = send_ua.c IPv6addr_utils.c
|
|
send_ua_LDADD = $(LIBNETLIBS)
|
|
|
|
osp_SCRIPTS = nova-compute-wait \
|
|
@@ -170,6 +170,7 @@ ocf_SCRIPTS = AoEtarget \
|
|
mpathpersist \
|
|
slapd \
|
|
+ storage-mon \
|
|
sybaseASE \
|
|
symlink \
|
|
syslog-ng \
|
|
tomcat \
|
|
diff --git a/heartbeat/storage-mon.in b/heartbeat/storage-mon.in
|
|
new file mode 100644
|
|
index 0000000000..5b289fe554
|
|
--- /dev/null
|
|
+++ b/heartbeat/storage-mon.in
|
|
@@ -0,0 +1,263 @@
|
|
+#!@BASH_SHELL@
|
|
+#
|
|
+# Copyright (C) 2021 Red Hat, Inc. All rights reserved.
|
|
+#
|
|
+# Authors: Christine Caulfield <ccaulfie@redhat.com>
|
|
+# Fabio M. Di Nitto <fdinitto@redhat.com>
|
|
+#
|
|
+# This program is free software; you can redistribute it and/or modify
|
|
+# it under the terms of version 2 of the GNU General Public License as
|
|
+# published by the Free Software Foundation.
|
|
+#
|
|
+# This program is distributed in the hope that it would be useful, but
|
|
+# WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
+#
|
|
+# Further, this software is distributed without any warranty that it is
|
|
+# free of the rightful claim of any third person regarding infringement
|
|
+# or the like. Any license provided herein, whether implied or
|
|
+# otherwise, applies only to this software file. Patent licenses, if
|
|
+# any, provided herein do not apply to combinations of this program with
|
|
+# other software, or any other product whatsoever.
|
|
+#
|
|
+# You should have received a copy of the GNU General Public License
|
|
+# along with this program; if not, write the Free Software Foundation,
|
|
+# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
|
|
+#
|
|
+
|
|
+#
|
|
+# Checks storage I/O status of all given drives and writes the #health-storage
|
|
+# status into the CIB
|
|
+# Implementation is heavily based on ocf:pacemaker:HealthSMART
|
|
+#
|
|
+# It sends a single block of I/O to a random location on the device and reports any errors returned.
|
|
+# If the I/O hangs, that will also be returned. (bear in mind that may also hang the C app in some
|
|
+# instances).
|
|
+#
|
|
+# It's worth making a note in the RA description that the smartmon RA is also recommended (this
|
|
+# does not replace it), and that Pacemaker health checking should be configured.
|
|
+#
|
|
+# https://clusterlabs.org/pacemaker/doc/2.1/Pacemaker_Explained/singlehtml/index.html#tracking-node-health
|
|
+
|
|
+#######################################################################
|
|
+
|
|
+#######################################################################
|
|
+# Initialization:
|
|
+
|
|
+# Locate and source the common OCF shell function library.
+: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
+. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
+
+# External programs used by this agent.
+# (HA_BIN and HA_RSCTMP are not set in this file; presumably they are
+# provided by ocf-shellfuncs - confirm.)
+STORAGEMON="${HA_BIN}/storage_mon"
+ATTRDUP="/usr/sbin/attrd_updater"
+
+# Built-in defaults for the agent parameters.
+OCF_RESKEY_CRM_meta_interval_default=0
+OCF_RESKEY_io_timeout_default=10
+OCF_RESKEY_inject_errors_default=
+OCF_RESKEY_state_file_default="${HA_RSCTMP%%/}/storage-mon-${OCF_RESOURCE_INSTANCE}.state"
+
+# Explicitly list all environment variables used, to make static analysis
+# happy. Each one falls back to its default when unset or empty.
+: "${OCF_RESKEY_CRM_meta_interval:=${OCF_RESKEY_CRM_meta_interval_default}}"
+: "${OCF_RESKEY_drives:=}"
+: "${OCF_RESKEY_io_timeout:=${OCF_RESKEY_io_timeout_default}}"
+: "${OCF_RESKEY_inject_errors:=${OCF_RESKEY_inject_errors_default}}"
+: "${OCF_RESKEY_state_file:=${OCF_RESKEY_state_file_default}}"
|
|
+
|
|
+#######################################################################
|
|
+
|
|
+# Print the resource agent metadata (XML) on stdout.
+# The here-document is unquoted on purpose so that the parameter defaults
+# defined at the top of the script are expanded into the output.
+meta_data() {
+	cat <<END
+<?xml version="1.0"?>
+<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
+<resource-agent name="storage-mon">
+<version>1.0</version>
+
+<longdesc lang="en">
+System health agent that checks the storage I/O status of the given drives and
+updates the #health-[resource instance name] attribute. Usage is highly
+recommended in combination with the ocf:pacemaker:HealthSMART monitoring agent,
+which this agent does not replace. The agent currently supports a maximum of 25
+devices per instance.
+</longdesc>
+<shortdesc lang="en">storage I/O health status</shortdesc>
+
+<parameters>
+
+<parameter name="state_file" unique="1">
+<longdesc lang="en">
+Location to store the resource state in.
+</longdesc>
+<shortdesc lang="en">State file</shortdesc>
+<content type="string" default="${OCF_RESKEY_state_file_default}" />
+</parameter>
+
+<parameter name="drives" unique="1" required="1">
+<longdesc lang="en">
+The drive(s) to check as a SPACE separated list. Enter the full path to the device, e.g. "/dev/sda".
+</longdesc>
+<shortdesc lang="en">Drives to check</shortdesc>
+<content type="string" default="" />
+</parameter>
+
+<parameter name="io_timeout" unique="0">
+<longdesc lang="en">
+Specify disk I/O timeout in seconds. Minimum 1, recommended 10 (default).
+</longdesc>
+<shortdesc lang="en">Disk I/O timeout</shortdesc>
+<content type="integer" default="${OCF_RESKEY_io_timeout_default}" />
+</parameter>
+
+<parameter name="inject_errors" unique="0">
+<longdesc lang="en">
+Used only for testing! Specify % of I/O errors to simulate drive failures.
+</longdesc>
+<shortdesc lang="en">Specify % of I/O errors to simulate drive failures</shortdesc>
+<content type="integer" default="${OCF_RESKEY_inject_errors_default}" />
+</parameter>
+
+</parameters>
+
+<actions>
+<action name="start" timeout="10s" />
+<action name="stop" timeout="120s" />
+<action name="monitor" timeout="120s" interval="30s" start-delay="0s" />
+<action name="meta-data" timeout="5s" />
+<action name="validate-all" timeout="10s" />
+</actions>
+</resource-agent>
+END
+	return $OCF_SUCCESS
+}
|
|
+
|
|
+#######################################################################
|
|
+
|
|
+# Print a short usage summary on stdout.
+# $1: status code to hand back to the caller.
+storage-mon_usage() {
+	local rc=$1
+
+	cat <<END
+usage: $0 {start|stop|monitor|validate-all|meta-data}
+
+Expects to have a fully populated OCF RA-compliant environment set.
+END
+	return $rc
+}
|
|
+
|
|
+# Check the installation and the configured parameters.
+#
+# This function does not return on error: it logs the problem and exits the
+# agent with OCF_ERR_INSTALLED (helper or drive missing) or
+# OCF_ERR_CONFIGURED (invalid parameter value).
+storage-mon_init() {
+	# Test for presence of storage_mon helper
+	if [ ! -x "$STORAGEMON" ] ; then
+		ocf_log err "${STORAGEMON} not installed."
+		exit $OCF_ERR_INSTALLED
+	fi
+
+	# Count the drives while checking that each of them exists.
+	i=0
+	for DRIVE in ${OCF_RESKEY_drives}; do
+		if [ ! -e "$DRIVE" ] ; then
+			ocf_log err "${DRIVE} not found on the system"
+			exit $OCF_ERR_INSTALLED
+		fi
+		i=$((i + 1))
+	done
+
+	# "drives" is a required parameter. Without this check an empty list
+	# passes validation and only fails later, inside the helper.
+	if [ "$i" -eq "0" ]; then
+		ocf_log err "No drives configured. At least one drive is required."
+		exit $OCF_ERR_CONFIGURED
+	fi
+
+	if [ "$i" -gt "25" ]; then
+		ocf_log err "Too many drives ($i) configured for this agent. Max 25."
+		exit $OCF_ERR_CONFIGURED
+	fi
+
+	# Reject non-numeric values up front: "[ abc -lt 1 ]" fails with an
+	# error rather than evaluating to true, so such values would otherwise
+	# be accepted.
+	case "${OCF_RESKEY_io_timeout}" in
+		""|*[!0-9]*)
+			ocf_log err "io_timeout must be an integer. Minimum 1, recommended 10 (default)."
+			exit $OCF_ERR_CONFIGURED
+			;;
+	esac
+	if [ "${OCF_RESKEY_io_timeout}" -lt "1" ]; then
+		ocf_log err "Minimum timeout is 1. Recommended 10 (default)."
+		exit $OCF_ERR_CONFIGURED
+	fi
+
+	# inject_errors is optional; validate it only when it is set.
+	if [ -n "${OCF_RESKEY_inject_errors}" ]; then
+		case "${OCF_RESKEY_inject_errors}" in
+			*[!0-9]*)
+				ocf_log err "Inject errors % has to be a value between 1 and 100."
+				exit $OCF_ERR_CONFIGURED
+				;;
+		esac
+		if [ "${OCF_RESKEY_inject_errors}" -lt "1" ] || [ "${OCF_RESKEY_inject_errors}" -gt "100" ]; then
+			ocf_log err "Inject errors % has to be a value between 1 and 100."
+			exit $OCF_ERR_CONFIGURED
+		fi
+	fi
+}
|
|
+
|
|
+# storage-mon_validate() is defined once, after storage-mon_stop(). A second
+# copy at this position would be overridden by that later definition, so no
+# definition is kept here.
|
|
+
|
|
+# Run one health check over all configured drives and publish the result.
+#
+# Returns OCF_NOT_RUNNING when the state file does not exist, otherwise
+# OCF_SUCCESS. The outcome of the I/O test itself is not reflected in the
+# return code; it is published as "green" or "red" through attrd_updater in
+# the "#health-<resource instance>" node attribute.
+# storage-mon_init may exit the agent on installation/configuration errors.
+storage-mon_monitor() {
+	local rc
+	local -a args=()
+
+	storage-mon_init
+
+	# Monitor _MUST!_ differentiate correctly between running
+	# (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING).
+	# That is THREE states, not just yes/no.
+
+	if [ ! -f "${OCF_RESKEY_state_file}" ]; then
+		return $OCF_NOT_RUNNING
+	fi
+
+	# Build the helper's argument list as an array so that every value is
+	# passed as exactly one argument and is not split or glob-expanded a
+	# second time. Every drive gets a failure score of 1.
+	for DRIVE in ${OCF_RESKEY_drives}; do
+		args+=(--device "$DRIVE" --score 1)
+	done
+	args+=(--timeout "${OCF_RESKEY_io_timeout}")
+	if [ -n "${OCF_RESKEY_inject_errors}" ]; then
+		args+=(--inject-errors-percent "${OCF_RESKEY_inject_errors}")
+	fi
+
+	# Any non-zero exit status of the helper marks the node's storage red.
+	"$STORAGEMON" "${args[@]}"
+	rc=$?
+	if [ $rc -ne 0 ]; then
+		status="red"
+		ocf_log warn "${STORAGEMON} exited with status $rc, setting health to red"
+	else
+		status="green"
+	fi
+
+	# A failed attribute update is logged but does not fail the monitor.
+	if ! "$ATTRDUP" -n "#health-${OCF_RESOURCE_INSTANCE}" -U "$status" -d "5s"; then
+		ocf_log warn "Unable to update #health-${OCF_RESOURCE_INSTANCE} to $status"
+	fi
+	return $OCF_SUCCESS
+}
|
|
+
|
|
+# Start the resource by creating the state file.
+# Returns OCF_SUCCESS when the resource is already running or the state file
+# could be created, OCF_ERR_GENERIC when creating the state file failed.
+storage-mon_start() {
+	# Already running? Then there is nothing to do.
+	storage-mon_monitor
+	if [ $? -eq $OCF_SUCCESS ]; then
+		return $OCF_SUCCESS
+	fi
+
+	if ! touch "${OCF_RESKEY_state_file}"; then
+		ocf_log err "Unable to create state file ${OCF_RESKEY_state_file}"
+		return $OCF_ERR_GENERIC
+	fi
+	return $OCF_SUCCESS
+}
|
|
+
|
|
+# Stop the resource by removing the state file. Always returns OCF_SUCCESS.
+#
+# Stop deliberately does not go through storage-mon_monitor: that would run a
+# full I/O test and, via storage-mon_init, could exit with an error (for
+# example when a configured drive has disappeared), turning a stop into a
+# failure. "rm -f" keeps the operation idempotent when the file is already
+# gone.
+storage-mon_stop() {
+	rm -f "${OCF_RESKEY_state_file}"
+	return $OCF_SUCCESS
+}
|
|
+
|
|
+# Validate the configuration and check that the directory that will hold the
+# state file is writable.
+# Returns OCF_SUCCESS, or OCF_ERR_CONFIGURED when the directory is not
+# writable. storage-mon_init may exit the agent on other errors.
+storage-mon_validate() {
+	storage-mon_init
+
+	# Is the state directory writable? Probe it with a scratch file named
+	# after our PID.
+	state_dir=$(dirname "${OCF_RESKEY_state_file}")
+	if ! touch "$state_dir/$$"; then
+		ocf_log err "State directory ${state_dir} is not writable."
+		return $OCF_ERR_CONFIGURED
+	fi
+	rm -f "$state_dir/$$"
+
+	return $OCF_SUCCESS
+}
|
|
+
|
|
+# Dispatch the requested action, then log and propagate its status.
+case "$__OCF_ACTION" in
+	start)
+		storage-mon_start
+		;;
+	stop)
+		storage-mon_stop
+		;;
+	monitor)
+		storage-mon_monitor
+		;;
+	validate-all)
+		storage-mon_validate
+		;;
+	meta-data)
+		meta_data
+		;;
+	usage|help)
+		storage-mon_usage $OCF_SUCCESS
+		;;
+	*)
+		storage-mon_usage $OCF_ERR_UNIMPLEMENTED
+		;;
+esac
+rc=$?
+ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
+exit $rc
|
|
+# vim: set filetype=sh:
|
|
diff --git a/tools/Makefile.am b/tools/Makefile.am
|
|
index 1186967cfb..83ff43651d 100644
|
|
--- a/tools/Makefile.am
|
|
+++ b/tools/Makefile.am
|
|
@@ -29,7 +29,8 @@ EXTRA_DIST = ocf-tester.8 sfex_init.8
|
|
|
|
sbin_PROGRAMS =
|
|
sbin_SCRIPTS = ocf-tester
|
|
-halib_PROGRAMS = findif
|
|
+halib_PROGRAMS = findif \
|
|
+ storage_mon
|
|
|
|
man8_MANS = ocf-tester.8
|
|
|
|
@@ -67,6 +68,8 @@ sfex_stat_LDADD = $(GLIBLIB) -lplumb -lplumbgpl
|
|
|
|
findif_SOURCES = findif.c
|
|
|
|
+storage_mon_SOURCES = storage_mon.c
|
|
+
|
|
if BUILD_TICKLE
|
|
halib_PROGRAMS += tickle_tcp
|
|
tickle_tcp_SOURCES = tickle_tcp.c
|
|
diff --git a/tools/storage_mon.c b/tools/storage_mon.c
|
|
new file mode 100644
|
|
index 0000000000..7b65bb4191
|
|
--- /dev/null
|
|
+++ b/tools/storage_mon.c
|
|
@@ -0,0 +1,263 @@
|
|
+#include <stdio.h>
|
|
+#include <getopt.h>
|
|
+#include <stdlib.h>
|
|
+#include <stdint.h>
|
|
+#include <syslog.h>
|
|
+#include <unistd.h>
|
|
+#include <errno.h>
|
|
+#include <string.h>
|
|
+#include <fcntl.h>
|
|
+#include <time.h>
|
|
+#include <sys/types.h>
|
|
+#include <sys/wait.h>
|
|
+#include <sys/stat.h>
|
|
+#include <sys/ioctl.h>
|
|
+#include <sys/mount.h>
|
|
+#ifdef __FreeBSD__
|
|
+#include <sys/disk.h>
|
|
+#endif
|
|
+
|
|
+#define MAX_DEVICES 25
|
|
+#define DEFAULT_TIMEOUT 10
|
|
+
|
|
+/*
+ * Print the command line help.
+ *
+ * name  program name shown in the synopsis line
+ * f     stream the help text is written to
+ */
+static void usage(const char *name, FILE *f)
+{
+	fprintf(f, "usage: %s [-hv] [-d <device>]... [-s <score>]... [-t <secs>]\n", name);
+	fprintf(f, "      --device <dev>  device to test, up to %d instances\n", MAX_DEVICES);
+	fprintf(f, "      --score  <n>    score (1-10) if device fails the test. Must match --device count\n");
+	fprintf(f, "      --timeout <n>   max time to wait for a device test to come back. in seconds (default %d)\n", DEFAULT_TIMEOUT);
+	fprintf(f, "      --inject-errors-percent <n> Generate EIO errors <n>%% of the time (for testing only)\n");
+	fprintf(f, "      --verbose       emit extra output to stdout\n");
+	fprintf(f, "      --help          print this message\n");
+}
|
|
+
|
|
+/*
+ * Check one device: open it read-only, read one 512-byte block from a
+ * pseudo-random, sector-aligned offset and report the outcome through the
+ * process exit status.
+ *
+ * This function never returns. It is meant to run in a forked child and
+ * always terminates the calling process: exit(0) when the read succeeded,
+ * exit(-1) on any failure (including an injected one).
+ *
+ * device                path of the device to test
+ * verbose               non-zero to print progress messages to stdout
+ * inject_error_percent  if non-zero, fail deliberately about this
+ *                       percentage of the time (testing aid)
+ */
+static void *test_device(const char *device, int verbose, int inject_error_percent)
+{
+	uint64_t devsize;
+	int device_fd;
+	int res;
+	ssize_t nread;
+	off_t seek_spot;
+	char buffer[512];
+
+	if (verbose) {
+		printf("Testing device %s\n", device);
+	}
+
+	device_fd = open(device, O_RDONLY);
+	if (device_fd < 0) {
+		fprintf(stderr, "Failed to open %s: %s\n", device, strerror(errno));
+		exit(-1);
+	}
+#ifdef __FreeBSD__
+	res = ioctl(device_fd, DIOCGMEDIASIZE, &devsize);
+#else
+	res = ioctl(device_fd, BLKGETSIZE64, &devsize);
+#endif
+	if (res != 0) {
+		fprintf(stderr, "Failed to stat %s: %s\n", device, strerror(errno));
+		close(device_fd);
+		exit(-1);
+	}
+	if (verbose) {
+		/* uint64_t has no portable printf length modifier without
+		 * <inttypes.h>; print it through unsigned long long. */
+		printf("%s: size=%llu\n", device, (unsigned long long)devsize);
+	}
+	if (devsize < sizeof(buffer)) {
+		fprintf(stderr, "%s is too small (%llu bytes) to read a %zu byte block\n",
+			device, (unsigned long long)devsize, sizeof(buffer));
+		close(device_fd);
+		exit(-1);
+	}
+
+	/* Don't fret about real randomness */
+	srand(time(NULL) + getpid());
+	/*
+	 * Pick a random place on the device - sector aligned.
+	 * Devices of 1024 bytes or less are read from offset 0: for them
+	 * devsize-1024 would be zero (division by zero) or wrap around.
+	 * Note that rand() never exceeds RAND_MAX, so larger offsets are
+	 * never probed.
+	 */
+	if (devsize > 1024) {
+		seek_spot = (rand() % (devsize-1024)) & 0xFFFFFFFFFFFFFE00;
+	} else {
+		seek_spot = 0;
+	}
+	/* Compare the off_t result directly; do not squeeze it into an int. */
+	if (lseek(device_fd, seek_spot, SEEK_SET) < 0) {
+		fprintf(stderr, "Failed to seek %s: %s\n", device, strerror(errno));
+		close(device_fd);
+		exit(-1);
+	}
+
+	if (verbose) {
+		printf("%s: reading from pos %lld\n", device, (long long)seek_spot);
+	}
+
+	nread = read(device_fd, buffer, sizeof(buffer));
+	if (nread < 0) {
+		fprintf(stderr, "Failed to read %s: %s\n", device, strerror(errno));
+		close(device_fd);
+		exit(-1);
+	}
+	if ((size_t)nread < sizeof(buffer)) {
+		fprintf(stderr, "Failed to read %zu bytes from %s, got %zd\n", sizeof(buffer), device, nread);
+		close(device_fd);
+		exit(-1);
+	}
+
+	/* Fake an error */
+	if (inject_error_percent && ((rand() % 100) < inject_error_percent)) {
+		fprintf(stderr, "People, please fasten your seatbelts, injecting errors!\n");
+		close(device_fd);
+		exit(-1);
+	}
+	res = close(device_fd);
+	if (res != 0) {
+		/* Do not close a second time: the descriptor is already
+		 * in an unspecified state after a failed close(). */
+		fprintf(stderr, "Failed to close %s: %s\n", device, strerror(errno));
+		exit(-1);
+	}
+
+	if (verbose) {
+		printf("%s: done\n", device);
+	}
+	exit(0);
+}
|
|
+
|
|
+/*
+ * Test every device given on the command line in its own child process and
+ * wait up to <timeout> seconds for the children to finish.
+ *
+ * Returns 0 when all devices passed; otherwise the sum of the scores of the
+ * devices that failed or did not finish in time. Usage and internal errors
+ * return -1; --help returns 0.
+ */
+int main(int argc, char *argv[])
+{
+	char *devices[MAX_DEVICES];
+	int scores[MAX_DEVICES];
+	pid_t test_forks[MAX_DEVICES];
+	size_t device_count = 0;
+	size_t score_count = 0;
+	size_t spawned_count;
+	size_t finished_count = 0;
+	int timeout = DEFAULT_TIMEOUT;
+	struct timespec ts;
+	time_t start_time;
+	size_t i;
+	int final_score = 0;
+	int opt, option_index;
+	int verbose = 0;
+	int inject_error_percent = 0;
+	struct option long_options[] = {
+		{"timeout", required_argument, 0, 't' },
+		{"device",  required_argument, 0, 'd' },
+		{"score",   required_argument, 0, 's' },
+		{"inject-errors-percent",   required_argument, 0, 0 },
+		{"verbose", no_argument, 0, 'v' },
+		{"help",    no_argument, 0,       'h' },
+		{0,         0,           0,        0  }
+	};
+
+	while ( (opt = getopt_long(argc, argv, "hvt:d:s:",
+				   long_options, &option_index)) != -1 ) {
+		switch (opt) {
+			case 0: /* Long-only options */
+				if (strcmp(long_options[option_index].name, "inject-errors-percent") == 0) {
+					inject_error_percent = atoi(optarg);
+					if (inject_error_percent < 1 || inject_error_percent > 100) {
+						fprintf(stderr, "inject_error_percent should be between 1 and 100\n");
+						return -1;
+					}
+				}
+				break;
+			case 'd':
+				if (device_count < MAX_DEVICES) {
+					/* optarg points into argv, which stays
+					 * valid for the whole run; no copy needed */
+					devices[device_count++] = optarg;
+				} else {
+					fprintf(stderr, "too many devices, max is %d\n", MAX_DEVICES);
+					return -1;
+				}
+				break;
+			case 's':
+				/* Bound by score_count, the index actually
+				 * used for scores[] */
+				if (score_count < MAX_DEVICES) {
+					int score = atoi(optarg);
+					if (score < 1 || score > 10) {
+						fprintf(stderr, "Score must be between 1 and 10 inclusive\n");
+						return -1;
+					}
+					scores[score_count++] = score;
+				} else {
+					fprintf(stderr, "too many scores, max is %d\n", MAX_DEVICES);
+					return -1;
+				}
+				break;
+			case 'v':
+				verbose++;
+				break;
+			case 't':
+				timeout = atoi(optarg);
+				if (timeout < 1) {
+					fprintf(stderr, "invalid timeout %d. Min 1, recommended %d (default)\n", timeout, DEFAULT_TIMEOUT);
+					return -1;
+				}
+				break;
+			case 'h':
+				usage(argv[0], stdout);
+				return 0;
+			default:
+				usage(argv[0], stderr);
+				return -1;
+		}
+
+	}
+	if (device_count == 0) {
+		fprintf(stderr, "No devices to test, use the -d  or --device argument\n");
+		return -1;
+	}
+
+	if (device_count != score_count) {
+		fprintf(stderr, "There must be the same number of devices and scores\n");
+		return -1;
+	}
+
+	openlog("storage_mon", 0, LOG_DAEMON);
+
+	/* One child per device; a slot value of 0 means "nothing to wait for" */
+	memset(test_forks, 0, sizeof(test_forks));
+	spawned_count = device_count;
+	for (i=0; i<device_count; i++) {
+		test_forks[i] = fork();
+		if (test_forks[i] < 0) {
+			fprintf(stderr, "Error spawning fork for %s: %s\n", devices[i], strerror(errno));
+			syslog(LOG_ERR, "Error spawning fork for %s: %s\n", devices[i], strerror(errno));
+			/* Count this device as failed and just test the
+			 * devices we have: only the children started so far
+			 * are waited for. */
+			final_score += scores[i];
+			test_forks[i] = 0;
+			spawned_count = i;
+			break;
+		}
+		/* child */
+		if (test_forks[i] == 0) {
+			/* does not return, the child exits in there */
+			test_device(devices[i], verbose, inject_error_percent);
+		}
+	}
+
+	/* See if they have finished. The deadline is measured on the
+	 * monotonic clock so that wall-clock adjustments cannot shorten or
+	 * stretch it. */
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+	start_time = ts.tv_sec;
+
+	while ((finished_count < spawned_count) && ((start_time + timeout) > ts.tv_sec)) {
+		for (i=0; i<spawned_count; i++) {
+			int wstatus;
+			pid_t w;
+
+			if (test_forks[i] <= 0) {
+				continue; /* already collected */
+			}
+
+			w = waitpid(test_forks[i], &wstatus, WNOHANG);
+			if (w < 0) {
+				fprintf(stderr, "waitpid on %s failed: %s\n", devices[i], strerror(errno));
+				return -1;
+			}
+			if (w != test_forks[i]) {
+				continue; /* still running */
+			}
+
+			/* The child has terminated. Anything but a clean
+			 * exit(0), including death by signal, is a failure. */
+			if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus) != 0) {
+				syslog(LOG_ERR, "Error reading from device %s", devices[i]);
+				final_score += scores[i];
+			}
+			/* Either way the child is gone: mark the slot done
+			 * so it is neither waited for nor scored again. */
+			finished_count++;
+			test_forks[i] = 0;
+		}
+
+		if (finished_count >= spawned_count) {
+			break;
+		}
+
+		usleep(100000);
+
+		clock_gettime(CLOCK_MONOTONIC, &ts);
+	}
+
+	/* See which children have not finished. They are only scored here;
+	 * they are neither killed nor reaped. */
+	for (i=0; i<spawned_count; i++) {
+		if (test_forks[i] != 0) {
+			syslog(LOG_ERR, "Reading from device %s did not complete in %d seconds timeout", devices[i], timeout);
+			fprintf(stderr, "Thread for device %s did not complete in time\n", devices[i]);
+			final_score += scores[i];
+		}
+	}
+
+	if (verbose) {
+		printf("Final score is %d\n", final_score);
+	}
+	return final_score;
+}
|