resource-agents/SOURCES/bz1509319-storage-mon-new-r...

715 lines
21 KiB
Diff

From 90b595650d7d8a6f6a69a9f7060c6406aa731c18 Mon Sep 17 00:00:00 2001
From: "Fabio M. Di Nitto" <fdinitto@redhat.com>
Date: Wed, 28 Jul 2021 10:08:10 +0200
Subject: [PATCH] Add storage-mon pacemaker health check
Signed-off-by: Fabio M. Di Nitto <fdinitto@redhat.com>
---
.gitignore | 41 ++++++
configure.ac | 1 +
doc/man/Makefile.am | 3 +-
heartbeat/Makefile.am | 17 +--
heartbeat/storage-mon.in | 263 +++++++++++++++++++++++++++++++++++++++
tools/Makefile.am | 5 +-
tools/storage_mon.c | 263 +++++++++++++++++++++++++++++++++++++++
7 files changed, 583 insertions(+), 10 deletions(-)
create mode 100644 heartbeat/storage-mon.in
create mode 100644 tools/storage_mon.c
diff --git a/.gitignore b/.gitignore
index 38d3566205..f7277bf04e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -45,6 +45,46 @@ heartbeat/ocf-shellfuncs
heartbeat/send_ua
heartbeat/shellfuncs
heartbeat/*.pyc
+heartbeat/AoEtarget
+heartbeat/CTDB
+heartbeat/ManageRAID
+heartbeat/ManageVE
+heartbeat/Squid
+heartbeat/SysInfo
+heartbeat/aws-vpc-route53
+heartbeat/azure-events
+heartbeat/clvm
+heartbeat/conntrackd
+heartbeat/dnsupdate
+heartbeat/dummypy
+heartbeat/eDir88
+heartbeat/fio
+heartbeat/galera
+heartbeat/gcp-pd-move
+heartbeat/gcp-vpc-move-ip
+heartbeat/gcp-vpc-move-route
+heartbeat/gcp-vpc-move-vip
+heartbeat/iSCSILogicalUnit
+heartbeat/iSCSITarget
+heartbeat/jira
+heartbeat/kamailio
+heartbeat/lxc
+heartbeat/lxd-info
+heartbeat/machine-info
+heartbeat/mariadb
+heartbeat/mpathpersist
+heartbeat/nfsnotify
+heartbeat/openstack-info
+heartbeat/rabbitmq-cluster
+heartbeat/redis
+heartbeat/rsyslog
+heartbeat/sg_persist
+heartbeat/slapd
+heartbeat/smb-share
+heartbeat/storage-mon
+heartbeat/sybaseASE
+heartbeat/syslog-ng
+heartbeat/vsftpd
include/agent_config.h
include/config.h
include/config.h.in
@@ -61,6 +101,7 @@ systemd/resource-agents.conf
tools/findif
tools/ocf-tester
tools/send_arp
+tools/storage_mon
tools/tickle_tcp
tools/ocft/README
tools/ocft/README.zh_CN
diff --git a/configure.ac b/configure.ac
index 717fb95432..c125df98f6 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1002,6 +1002,7 @@ AC_CONFIG_FILES([heartbeat/rsyslog], [chmod +x heartbeat/rsyslog])
AC_CONFIG_FILES([heartbeat/smb-share], [chmod +x heartbeat/smb-share])
AC_CONFIG_FILES([heartbeat/sg_persist], [chmod +x heartbeat/sg_persist])
AC_CONFIG_FILES([heartbeat/slapd], [chmod +x heartbeat/slapd])
+AC_CONFIG_FILES([heartbeat/storage-mon], [chmod +x heartbeat/storage-mon])
AC_CONFIG_FILES([heartbeat/sybaseASE], [chmod +x heartbeat/sybaseASE])
AC_CONFIG_FILES([heartbeat/syslog-ng], [chmod +x heartbeat/syslog-ng])
AC_CONFIG_FILES([heartbeat/vsftpd], [chmod +x heartbeat/vsftpd])
diff --git a/doc/man/Makefile.am b/doc/man/Makefile.am
index 947d83cb2b..97904ccb16 100644
--- a/doc/man/Makefile.am
+++ b/doc/man/Makefile.am
@@ -138,6 +138,7 @@ man_MANS = ocf_heartbeat_AoEtarget.7 \
ocf_heartbeat_mariadb.7 \
ocf_heartbeat_mdraid.7 \
ocf_heartbeat_minio.7 \
+ ocf_heartbeat_mpathpersist.7 \
ocf_heartbeat_mysql.7 \
ocf_heartbeat_mysql-proxy.7 \
ocf_heartbeat_nagios.7 \
@@ -175,7 +176,7 @@ man_MANS = ocf_heartbeat_AoEtarget.7 \
ocf_heartbeat_smb-share.7 \
ocf_heartbeat_sybaseASE.7 \
ocf_heartbeat_sg_persist.7 \
- ocf_heartbeat_mpathpersist.7 \
+ ocf_heartbeat_storage-mon.7 \
ocf_heartbeat_symlink.7 \
ocf_heartbeat_syslog-ng.7 \
ocf_heartbeat_tomcat.7 \
diff --git a/heartbeat/Makefile.am b/heartbeat/Makefile.am
index 9af44cc127..5d52d211f2 100644
--- a/heartbeat/Makefile.am
+++ b/heartbeat/Makefile.am
@@ -32,22 +32,22 @@ ocfdir = $(OCF_RA_DIR_PREFIX)/heartbeat
dtddir = $(datadir)/$(PACKAGE_NAME)
dtd_DATA = ra-api-1.dtd metadata.rng
+ocf_PROGRAMS =
+
if USE_IPV6ADDR_AGENT
-ocf_PROGRAMS = IPv6addr
-else
-ocf_PROGRAMS =
+ocf_PROGRAMS += IPv6addr
endif
+halib_PROGRAMS =
+
if IPV6ADDR_COMPATIBLE
-halib_PROGRAMS = send_ua
-else
-halib_PROGRAMS =
+halib_PROGRAMS += send_ua
endif
IPv6addr_SOURCES = IPv6addr.c IPv6addr_utils.c
-send_ua_SOURCES = send_ua.c IPv6addr_utils.c
-
IPv6addr_LDADD = -lplumb $(LIBNETLIBS)
+
+send_ua_SOURCES = send_ua.c IPv6addr_utils.c
send_ua_LDADD = $(LIBNETLIBS)
osp_SCRIPTS = nova-compute-wait \
@@ -170,6 +170,7 @@ ocf_SCRIPTS = AoEtarget \
mpathpersist \
slapd \
+ storage-mon \
sybaseASE \
symlink \
syslog-ng \
tomcat \
diff --git a/heartbeat/storage-mon.in b/heartbeat/storage-mon.in
new file mode 100644
index 0000000000..5b289fe554
--- /dev/null
+++ b/heartbeat/storage-mon.in
@@ -0,0 +1,263 @@
+#!@BASH_SHELL@
+#
+# Copyright (C) 2021 Red Hat, Inc. All rights reserved.
+#
+# Authors: Christine Caulfield <ccaulfie@redhat.com>
+# Fabio M. Di Nitto <fdinitto@redhat.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# Further, this software is distributed without any warranty that it is
+# free of the rightful claim of any third person regarding infringement
+# or the like. Any license provided herein, whether implied or
+# otherwise, applies only to this software file. Patent licenses, if
+# any, provided herein do not apply to combinations of this program with
+# other software, or any other product whatsoever.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
+#
+
+#
+# Checks storage I/O status of all given drives and writes the #health-storage
+# status into the CIB
+# Implementation is heavily based on ocf:pacemaker:HealthSMART
+#
+# It issues a single block of I/O at a random location on the device and reports any errors returned.
+# If the I/O hangs, that will also be reported (bear in mind that it may also hang the C app in some
+# instances).
+#
+# It's worth making a note in the RA description that the smartmon RA is also recommended (this
+# does not replace it), and that Pacemaker health checking should be configured.
+#
+# https://clusterlabs.org/pacemaker/doc/2.1/Pacemaker_Explained/singlehtml/index.html#tracking-node-health
+
+#######################################################################
+
+#######################################################################
+# Initialization:
+
+# Load the OCF shell function library. OCF_FUNCTIONS_DIR is only assigned
+# here when it is not already set in the environment.
+: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
+. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
+
+# External commands used by this agent: the storage_mon helper that tests the
+# drives, and the command used to publish the resulting health status.
+STORAGEMON=$HA_BIN/storage_mon
+ATTRDUP=/usr/sbin/attrd_updater
+
+# Default values for the agent parameters. The default state file is placed
+# under HA_RSCTMP (with a trailing slash stripped) and is named after the
+# resource instance.
+OCF_RESKEY_CRM_meta_interval_default="0"
+OCF_RESKEY_io_timeout_default="10"
+OCF_RESKEY_inject_errors_default=""
+OCF_RESKEY_state_file_default="${HA_RSCTMP%%/}/storage-mon-${OCF_RESOURCE_INSTANCE}.state"
+
+# Explicitly list all environment variables used, to make static analysis happy.
+# Each ":=" expansion fills in the default when the variable is unset or empty.
+: ${OCF_RESKEY_CRM_meta_interval:=${OCF_RESKEY_CRM_meta_interval_default}}
+: ${OCF_RESKEY_drives:=""}
+: ${OCF_RESKEY_io_timeout:=${OCF_RESKEY_io_timeout_default}}
+: ${OCF_RESKEY_inject_errors:=${OCF_RESKEY_inject_errors_default}}
+: ${OCF_RESKEY_state_file:=${OCF_RESKEY_state_file_default}}
+
+#######################################################################
+
+# Print the resource agent metadata (OCF RA API XML) on stdout.
+#
+# The parameter defaults shown in the XML are expanded from the
+# OCF_RESKEY_*_default variables set during initialization.
+# Always returns $OCF_SUCCESS.
+meta_data() {
+	cat <<END
+<?xml version="1.0"?>
+<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
+<resource-agent name="storage-mon">
+<version>1.0</version>
+
+<longdesc lang="en">
+System health agent that checks the storage I/O status of the given drives and
+updates the #health-storage attribute. Usage is highly recommended in combination
+with the HealthSMART monitoring agent. The agent currently supports a maximum of 25
+devices per instance.
+</longdesc>
+<shortdesc lang="en">storage I/O health status</shortdesc>
+
+<parameters>
+
+<parameter name="state_file" unique="1">
+<longdesc lang="en">
+Location to store the resource state in.
+</longdesc>
+<shortdesc lang="en">State file</shortdesc>
+<content type="string" default="${OCF_RESKEY_state_file_default}" />
+</parameter>
+
+<parameter name="drives" unique="1" required="1">
+<longdesc lang="en">
+The drive(s) to check as a SPACE separated list. Enter the full path to the device, e.g. "/dev/sda".
+</longdesc>
+<shortdesc lang="en">Drives to check</shortdesc>
+<content type="string" default="" />
+</parameter>
+
+<parameter name="io_timeout" unique="0">
+<longdesc lang="en">
+Specify disk I/O timeout in seconds. Minimum 1, recommended 10 (default).
+</longdesc>
+<shortdesc lang="en">Disk I/O timeout</shortdesc>
+<content type="integer" default="${OCF_RESKEY_io_timeout_default}" />
+</parameter>
+
+<parameter name="inject_errors" unique="0">
+<longdesc lang="en">
+Used only for testing! Specify % of I/O errors to simulate drives failures.
+</longdesc>
+<shortdesc lang="en">Specify % of I/O errors to simulate drives failures</shortdesc>
+<content type="integer" default="${OCF_RESKEY_inject_errors_default}" />
+</parameter>
+
+</parameters>
+
+<actions>
+<action name="start" timeout="10s" />
+<action name="stop" timeout="120s" />
+<action name="monitor" timeout="120s" interval="30s" start-delay="0s" />
+<action name="meta-data" timeout="5s" />
+<action name="validate-all" timeout="10s" />
+</actions>
+</resource-agent>
+END
+	return $OCF_SUCCESS
+}
+
+#######################################################################
+
+# Print a short usage message on stdout and return the status code passed
+# as the first argument.
+storage-mon_usage() {
+	printf 'usage: %s {start|stop|monitor|validate-all|meta-data}\n\n' "$0"
+	printf 'Expects to have a fully populated OCF RA-compliant environment set.\n'
+	return $1
+}
+
+# Sanity-check the environment and the configured parameters.
+#
+# Exits the agent (does not return) with:
+#   $OCF_ERR_INSTALLED   when the storage_mon helper is not executable or a
+#                        configured drive does not exist;
+#   $OCF_ERR_CONFIGURED  when more than 25 drives are configured, io_timeout
+#                        is below 1, or inject_errors is set outside 1..100.
+storage-mon_init() {
+	# The storage_mon helper binary must be present and executable.
+	[ -x "$STORAGEMON" ] || {
+		ocf_log err "${STORAGEMON} not installed."
+		exit $OCF_ERR_INSTALLED
+	}
+
+	# Every configured drive must exist; count them while walking the list.
+	i=0
+	for DRIVE in ${OCF_RESKEY_drives}; do
+		[ -e "$DRIVE" ] || {
+			ocf_log err "${DRIVE} not found on the system"
+			exit $OCF_ERR_INSTALLED
+		}
+		i=$((i + 1))
+	done
+
+	# The helper accepts at most 25 devices per invocation.
+	[ "$i" -gt "25" ] && {
+		ocf_log err "Too many drives ($i) configured for this agent. Max 25."
+		exit $OCF_ERR_CONFIGURED
+	}
+
+	[ "${OCF_RESKEY_io_timeout}" -lt "1" ] && {
+		ocf_log err "Minimum timeout is 1. Recommended 10 (default)."
+		exit $OCF_ERR_CONFIGURED
+	}
+
+	# inject_errors is optional; when given it must be a percentage in 1..100.
+	if [ -n "${OCF_RESKEY_inject_errors}" ] &&
+	   { [ "${OCF_RESKEY_inject_errors}" -lt "1" ] || [ "${OCF_RESKEY_inject_errors}" -gt "100" ]; }; then
+		ocf_log err "Inject errors % has to be a value between 1 and 100."
+		exit $OCF_ERR_CONFIGURED
+	fi
+}
+
+# Validate the configuration and check that the state directory is writable.
+#
+# NOTE(review): storage-mon_validate is defined a second time further down in
+# this script, before the action dispatch. In shell the later definition
+# replaces this one, so this copy is never the one that runs; one of the two
+# should be removed.
+#
+# Returns $OCF_SUCCESS, or $OCF_ERR_CONFIGURED when a scratch file cannot be
+# created next to the state file. storage-mon_init may exit the agent itself.
+storage-mon_validate() {
+ storage-mon_init
+
+ # Is the state directory writable?
+ # Probe by creating and removing a scratch file named after this shell's PID.
+ state_dir=$(dirname "$OCF_RESKEY_state_file")
+ touch "$state_dir/$$"
+ if [ $? -ne 0 ]; then
+ return $OCF_ERR_CONFIGURED
+ fi
+ rm "$state_dir/$$"
+
+ return $OCF_SUCCESS
+}
+
+# Monitor action: test all configured drives and publish the health status.
+#
+# Returns $OCF_NOT_RUNNING when the state file does not exist (resource is
+# cleanly stopped). Otherwise runs the storage_mon helper against every
+# drive, hands "green" (helper exited 0) or "red" (any other exit status) to
+# attrd_updater under the name "#health-<resource instance>", and returns
+# $OCF_SUCCESS. storage-mon_init may exit the agent on configuration errors.
+storage-mon_monitor() {
+	storage-mon_init
+
+	# Monitor _MUST!_ tell apart running (SUCCESS), failed (ERROR) and
+	# _cleanly_ stopped (NOT RUNNING): three states, not just yes/no.
+	[ -f "${OCF_RESKEY_state_file}" ] || return $OCF_NOT_RUNNING
+
+	# Assemble the helper arguments: one --device/--score pair per drive,
+	# every drive weighted with a score of 1.
+	cmdline=""
+	for DRIVE in ${OCF_RESKEY_drives}; do
+		cmdline="$cmdline --device $DRIVE --score 1"
+	done
+	cmdline="$cmdline --timeout ${OCF_RESKEY_io_timeout}"
+	[ -n "${OCF_RESKEY_inject_errors}" ] &&
+		cmdline="$cmdline --inject-errors-percent ${OCF_RESKEY_inject_errors}"
+
+	# $cmdline is deliberately unquoted so it splits into separate arguments.
+	if $STORAGEMON $cmdline; then
+		status="green"
+	else
+		status="red"
+	fi
+
+	"$ATTRDUP" -n "#health-${OCF_RESOURCE_INSTANCE}" -U "$status" -d "5s"
+	return $OCF_SUCCESS
+}
+
+# Start action: nothing to do when the monitor already reports success;
+# otherwise create the state file that marks the resource as started.
+# In that case the return status is the exit status of touch.
+storage-mon_start() {
+	storage-mon_monitor
+	[ $? -eq $OCF_SUCCESS ] && return $OCF_SUCCESS
+	touch "${OCF_RESKEY_state_file}"
+}
+
+# Stop action: remove the state file when the monitor reports the resource
+# as running. Always returns $OCF_SUCCESS.
+storage-mon_stop() {
+	storage-mon_monitor
+	[ $? -eq $OCF_SUCCESS ] && rm "${OCF_RESKEY_state_file}"
+	return $OCF_SUCCESS
+}
+
+# validate-all action: check the configuration and make sure the directory
+# holding the state file is writable.
+#
+# Returns $OCF_SUCCESS, or $OCF_ERR_CONFIGURED when a scratch file cannot be
+# created in that directory. storage-mon_init may exit the agent itself.
+storage-mon_validate() {
+	storage-mon_init
+
+	# Probe writability with a scratch file named after this shell's PID.
+	state_dir=$(dirname "${OCF_RESKEY_state_file}")
+	touch "$state_dir/$$" || return $OCF_ERR_CONFIGURED
+	rm "$state_dir/$$"
+
+	return $OCF_SUCCESS
+}
+
+# Dispatch the requested action to its handler. Unknown actions print the
+# usage text and yield $OCF_ERR_UNIMPLEMENTED.
+case "$__OCF_ACTION" in
+ start) storage-mon_start;;
+ stop) storage-mon_stop;;
+ monitor) storage-mon_monitor;;
+ validate-all) storage-mon_validate;;
+ meta-data) meta_data;;
+ usage|help) storage-mon_usage $OCF_SUCCESS;;
+ *) storage-mon_usage $OCF_ERR_UNIMPLEMENTED;;
+esac
+# Log the handler's status at debug level and use it as the exit code.
+rc=$?
+ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
+exit $rc
+# vim: set filetype=sh:
diff --git a/tools/Makefile.am b/tools/Makefile.am
index 1186967cfb..83ff43651d 100644
--- a/tools/Makefile.am
+++ b/tools/Makefile.am
@@ -29,7 +29,8 @@ EXTRA_DIST = ocf-tester.8 sfex_init.8
sbin_PROGRAMS =
sbin_SCRIPTS = ocf-tester
-halib_PROGRAMS = findif
+halib_PROGRAMS = findif \
+ storage_mon
man8_MANS = ocf-tester.8
@@ -67,6 +68,8 @@ sfex_stat_LDADD = $(GLIBLIB) -lplumb -lplumbgpl
findif_SOURCES = findif.c
+storage_mon_SOURCES = storage_mon.c
+
if BUILD_TICKLE
halib_PROGRAMS += tickle_tcp
tickle_tcp_SOURCES = tickle_tcp.c
diff --git a/tools/storage_mon.c b/tools/storage_mon.c
new file mode 100644
index 0000000000..7b65bb4191
--- /dev/null
+++ b/tools/storage_mon.c
@@ -0,0 +1,263 @@
+#include <stdio.h>
+#include <getopt.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <syslog.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/mount.h>
+#ifdef __FreeBSD__
+#include <sys/disk.h>
+#endif
+
+#define MAX_DEVICES 25
+#define DEFAULT_TIMEOUT 10
+
+/*
+ * Print the command line help text.
+ *
+ * name: program name shown in the synopsis line (normally argv[0]).
+ * f:    stream the text is written to.
+ */
+static void usage(char *name, FILE *f)
+{
+	/* Lines that carry no run-time values are emitted verbatim with fputs. */
+	fprintf(f, "usage: %s [-hv] [-d <device>]... [-s <score>]... [-t <secs>]\n", name);
+	fprintf(f, " --device <dev> device to test, up to %d instances\n", MAX_DEVICES);
+	fputs(" --score <n> score if device fails the test. Must match --device count\n", f);
+	fprintf(f, " --timeout <n> max time to wait for a device test to come back. in seconds (default %d)\n", DEFAULT_TIMEOUT);
+	fputs(" --inject-errors-percent <n> Generate EIO errors <n>% of the time (for testing only)\n", f);
+	fputs(" --verbose emit extra output to stdout\n"
+	      " --help print this messages\n", f);
+}
+
+/*
+ * Check one device: read a single 512-byte block from a pseudo-random,
+ * sector-aligned offset of the given device.
+ *
+ * This function never returns. It is meant to run in a forked child and
+ * reports its result through the process exit status: 0 when the block was
+ * read successfully, -1 on any failure (open, size query, seek, read, short
+ * read, close, or an injected error).
+ *
+ * device:               path of the device to test.
+ * verbose:              non-zero to print progress information.
+ * inject_error_percent: when non-zero, percentage of runs that report a
+ *                       failure even though the read succeeded (testing only).
+ */
+static void *test_device(const char *device, int verbose, int inject_error_percent)
+{
+	uint64_t devsize;
+	int device_fd;
+	int res;
+	off_t seek_spot;
+	ssize_t bytes_read;
+	char buffer[512];
+
+	if (verbose) {
+		printf("Testing device %s\n", device);
+	}
+
+	device_fd = open(device, O_RDONLY);
+	if (device_fd < 0) {
+		fprintf(stderr, "Failed to open %s: %s\n", device, strerror(errno));
+		exit(-1);
+	}
+#ifdef __FreeBSD__
+	res = ioctl(device_fd, DIOCGMEDIASIZE, &devsize);
+#else
+	res = ioctl(device_fd, BLKGETSIZE64, &devsize);
+#endif
+	if (res != 0) {
+		fprintf(stderr, "Failed to stat %s: %s\n", device, strerror(errno));
+		close(device_fd);
+		exit(-1);
+	}
+	if (verbose) {
+		/* uint64_t is not size_t everywhere; print through a type with a fixed specifier */
+		fprintf(stderr, "%s: size=%llu\n", device, (unsigned long long)devsize);
+	}
+	/* Don't fret about real randomness */
+	srand(time(NULL) + getpid());
+	/*
+	 * Pick a random place on the device - sector aligned.
+	 * Devices of 1024 bytes or less are read from offset 0: the unsigned
+	 * subtraction below would otherwise wrap around (or yield a modulo by
+	 * zero for exactly 1024 bytes).
+	 */
+	if (devsize > 1024) {
+		seek_spot = (off_t)(((uint64_t)rand() % (devsize - 1024)) & 0xFFFFFFFFFFFFFE00);
+	} else {
+		seek_spot = 0;
+	}
+	/* Compare the off_t result directly; storing it in an int could truncate it */
+	if (lseek(device_fd, seek_spot, SEEK_SET) == (off_t)-1) {
+		fprintf(stderr, "Failed to seek %s: %s\n", device, strerror(errno));
+		close(device_fd);
+		exit(-1);
+	}
+
+	if (verbose) {
+		printf("%s: reading from pos %lld\n", device, (long long)seek_spot);
+	}
+
+	bytes_read = read(device_fd, buffer, sizeof(buffer));
+	if (bytes_read < 0) {
+		fprintf(stderr, "Failed to read %s: %s\n", device, strerror(errno));
+		close(device_fd);
+		exit(-1);
+	}
+	if (bytes_read < (ssize_t)sizeof(buffer)) {
+		fprintf(stderr, "Failed to read %zu bytes from %s, got %zd\n", sizeof(buffer), device, bytes_read);
+		close(device_fd);
+		exit(-1);
+	}
+
+	/* Fake an error */
+	if (inject_error_percent && ((rand() % 100) < inject_error_percent)) {
+		fprintf(stderr, "People, please fasten your seatbelts, injecting errors!\n");
+		close(device_fd);
+		exit(-1);
+	}
+	res = close(device_fd);
+	if (res != 0) {
+		/* Do not close again: the descriptor state is unspecified after a failed close */
+		fprintf(stderr, "Failed to close %s: %s\n", device, strerror(errno));
+		exit(-1);
+	}
+
+	if (verbose) {
+		printf("%s: done\n", device);
+	}
+	exit(0);
+}
+
+/*
+ * storage_mon: test a list of devices in parallel and report a health score.
+ *
+ * Every --device must be paired with a --score (1..10). One child process is
+ * forked per device; each child performs a single read test. The parent
+ * polls the children until all have finished or --timeout seconds have
+ * elapsed.
+ *
+ * Return value (process exit status):
+ *   0   every device passed its test in time;
+ *   >0  sum of the scores of all devices that failed or did not finish
+ *       within the timeout;
+ *   -1  invalid command line or an unexpected waitpid() failure.
+ */
+int main(int argc, char *argv[])
+{
+	char *devices[MAX_DEVICES];
+	int scores[MAX_DEVICES];
+	pid_t test_forks[MAX_DEVICES];
+	size_t device_count = 0;
+	size_t score_count = 0;
+	size_t finished_count = 0;
+	int timeout = DEFAULT_TIMEOUT;
+	struct timespec ts;
+	time_t start_time;
+	size_t i;
+	int final_score = 0;
+	int opt, option_index;
+	int verbose = 0;
+	int inject_error_percent = 0;
+	struct option long_options[] = {
+		{"timeout", required_argument, 0, 't' },
+		{"device", required_argument, 0, 'd' },
+		{"score", required_argument, 0, 's' },
+		{"inject-errors-percent", required_argument, 0, 0 },
+		{"verbose", no_argument, 0, 'v' },
+		{"help", no_argument, 0, 'h' },
+		{0, 0, 0, 0 }
+	};
+
+	while ( (opt = getopt_long(argc, argv, "hvt:d:s:",
+				   long_options, &option_index)) != -1 ) {
+		switch (opt) {
+		case 0: /* Long-only options */
+			if (strcmp(long_options[option_index].name, "inject-errors-percent") == 0) {
+				inject_error_percent = atoi(optarg);
+				if (inject_error_percent < 1 || inject_error_percent > 100) {
+					fprintf(stderr, "inject_error_percent should be between 1 and 100\n");
+					return -1;
+				}
+			}
+			break;
+		case 'd':
+			if (device_count >= MAX_DEVICES) {
+				fprintf(stderr, "too many devices, max is %d\n", MAX_DEVICES);
+				return -1;
+			}
+			/* argv strings live for the whole run, so no copy is needed */
+			devices[device_count++] = optarg;
+			break;
+		case 's': {
+			int score;
+
+			/*
+			 * Bound the scores by their own counter. Checking the device
+			 * counter here rejected the score of the last allowed device
+			 * and did not protect scores[] when scores precede devices.
+			 */
+			if (score_count >= MAX_DEVICES) {
+				fprintf(stderr, "too many scores, max is %d\n", MAX_DEVICES);
+				return -1;
+			}
+			score = atoi(optarg);
+			if (score < 1 || score > 10) {
+				fprintf(stderr, "Score must be between 1 and 10 inclusive\n");
+				return -1;
+			}
+			scores[score_count++] = score;
+			break;
+		}
+		case 'v':
+			verbose++;
+			break;
+		case 't':
+			timeout = atoi(optarg);
+			if (timeout < 1) {
+				fprintf(stderr, "invalid timeout %d. Min 1, recommended %d (default)\n", timeout, DEFAULT_TIMEOUT);
+				return -1;
+			}
+			break;
+		case 'h':
+			/* Asking for help is not an error: print it and stop */
+			usage(argv[0], stdout);
+			return 0;
+		default:
+			/* Unknown option or missing argument: do not test with a partial command line */
+			usage(argv[0], stderr);
+			return -1;
+		}
+
+	}
+	if (device_count == 0) {
+		fprintf(stderr, "No devices to test, use the -d or --device argument\n");
+		return -1;
+	}
+
+	if (device_count != score_count) {
+		fprintf(stderr, "There must be the same number of devices and scores\n");
+		return -1;
+	}
+
+	openlog("storage_mon", 0, LOG_DAEMON);
+
+	/* One child per device; test_forks[i] holds the pid while the child is outstanding */
+	memset(test_forks, 0, sizeof(test_forks));
+	for (i=0; i<device_count; i++) {
+		test_forks[i] = fork();
+		if (test_forks[i] < 0) {
+			fprintf(stderr, "Error spawning fork for %s: %s\n", devices[i], strerror(errno));
+			syslog(LOG_ERR, "Error spawning fork for %s: %s\n", devices[i], strerror(errno));
+			/* Just test the devices we have */
+			break;
+		}
+		/* child */
+		if (test_forks[i] == 0) {
+			/* test_device() exits the child and never returns */
+			test_device(devices[i], verbose, inject_error_percent);
+		}
+	}
+
+	/*
+	 * See if they have finished. The deadline is measured on the monotonic
+	 * clock so that wall-clock adjustments cannot shorten or extend it.
+	 */
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+	start_time = ts.tv_sec;
+
+	while ((finished_count < device_count) && ((start_time + timeout) > ts.tv_sec)) {
+		for (i=0; i<device_count; i++) {
+			int wstatus;
+			pid_t w;
+
+			if (test_forks[i] > 0) {
+				w = waitpid(test_forks[i], &wstatus, WNOHANG);
+				if (w < 0) {
+					fprintf(stderr, "waitpid on %s failed: %s\n", devices[i], strerror(errno));
+					return -1;
+				}
+
+				if (w == test_forks[i]) {
+					/*
+					 * The child is gone (exited or killed by a signal).
+					 * Anything but a clean exit(0) counts as a failed test.
+					 */
+					if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus) != 0) {
+						syslog(LOG_ERR, "Error reading from device %s", devices[i]);
+						final_score += scores[i];
+					}
+					/*
+					 * Always mark a reaped child as done, otherwise the
+					 * next poll would call waitpid() on it again and fail.
+					 */
+					finished_count++;
+					test_forks[i] = 0;
+				}
+			}
+		}
+
+		usleep(100000);
+
+		clock_gettime(CLOCK_MONOTONIC, &ts);
+	}
+
+	/* See which children have not finished; they count as failed */
+	for (i=0; i<device_count; i++) {
+		if (test_forks[i] != 0) {
+			syslog(LOG_ERR, "Reading from device %s did not complete in %d seconds timeout", devices[i], timeout);
+			fprintf(stderr, "Thread for device %s did not complete in time\n", devices[i]);
+			final_score += scores[i];
+		}
+	}
+
+	if (verbose) {
+		printf("Final score is %d\n", final_score);
+	}
+	return final_score;
+}