diff --git a/.gitignore b/.gitignore index 0fb7102..ba457ce 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ /sbd-*.tar.gz +/sbd-*.src.rpm diff --git a/0001-Fix-regressions.sh-make-parameter-passing-consistent.patch b/0001-Fix-regressions.sh-make-parameter-passing-consistent.patch new file mode 100644 index 0000000..6f17a5a --- /dev/null +++ b/0001-Fix-regressions.sh-make-parameter-passing-consistent.patch @@ -0,0 +1,82 @@ +From 1d2a7b8d059d4f090b351b8decca0ddf274c82a0 Mon Sep 17 00:00:00 2001 +From: Klaus Wenninger +Date: Wed, 20 Nov 2019 15:20:19 +0100 +Subject: [PATCH] Fix: regressions.sh: make parameter passing consistent + +--- + tests/regressions.sh | 24 ++++++++++++------------ + 1 file changed, 12 insertions(+), 12 deletions(-) + +diff --git a/tests/regressions.sh b/tests/regressions.sh +index 6cfb303..7ab80be 100755 +--- a/tests/regressions.sh ++++ b/tests/regressions.sh +@@ -32,7 +32,7 @@ + : ${SBD_USE_DM:="yes"} + + sbd() { +- LD_PRELOAD=${SBD_PRELOAD} SBD_WATCHDOG_TIMEOUT=5 SBD_DEVICE="${SBD_DEVICE}" SBD_PRELOAD_LOG=${SBD_PRELOAD_LOG} SBD_WATCHDOG_DEV=/dev/watchdog setsid ${SBD_BINARY} -p ${SBD_PIDFILE} $* ++ LD_PRELOAD=${SBD_PRELOAD} SBD_WATCHDOG_TIMEOUT=5 SBD_DEVICE="${SBD_DEVICE}" SBD_PRELOAD_LOG=${SBD_PRELOAD_LOG} SBD_WATCHDOG_DEV=/dev/watchdog setsid ${SBD_BINARY} -p ${SBD_PIDFILE} "$@" + } + + sbd_wipe_disk() { +@@ -98,26 +98,26 @@ sbd_daemon_cleanup() { + pkill -TERM --pidfile ${SBD_PIDFILE} 2>/dev/null + sleep 5 + pkill -KILL --pidfile ${SBD_PIDFILE} 2>/dev/null +- pkill -KILL --parent $(cat ${SBD_PIDFILE} 2>/dev/null) 2>/dev/null ++ pkill -KILL --parent "$(cat ${SBD_PIDFILE} 2>/dev/null)" 2>/dev/null + echo > ${SBD_PIDFILE} + } + + _ok() { +- echo -- $@ +- $@ ++ echo "-- $*" ++ "$@" + rc=$? + if [ $rc -ne 0 ]; then +- echo "$@ failed with $rc" ++ echo "$* failed with $rc" + exit $rc + fi + } + + _no() { +- echo -- $@ +- $@ ++ echo "-- $*" ++ "$@" + rc=$? 
+ if [ $rc -eq 0 ]; then +- echo "$@ did NOT fail ($rc)" ++ echo "$* did NOT fail ($rc)" + exit $rc + fi + return 0 +@@ -126,7 +126,7 @@ _no() { + _in_log() { + grep "$@" ${SBD_PRELOAD_LOG} >/dev/null + if [ $? -ne 0 ]; then +- echo "didn't find '$@' in log:" ++ echo "didn't find '$*' in log:" + cat ${SBD_PRELOAD_LOG} + sbd_daemon_cleanup + exit 1 +@@ -227,10 +227,10 @@ test_stall_inquisitor() { + sbd_daemon_cleanup + sbd -d ${D[1]} -d ${D[2]} -d ${D[3]} -n test-1 watch + sleep 10 +- _ok kill -0 $(cat ${SBD_PIDFILE}) +- kill -STOP $(cat ${SBD_PIDFILE}) ++ _ok kill -0 "$(cat ${SBD_PIDFILE})" ++ kill -STOP "$(cat ${SBD_PIDFILE})" + sleep 10 +- kill -CONT $(cat ${SBD_PIDFILE}) 2>/dev/null ++ kill -CONT "$(cat ${SBD_PIDFILE})" 2>/dev/null + _in_log "watchdog fired" + } + +-- +1.8.3.1 + diff --git a/0001-Refactor-fail-earlier-on-invalid-servants.patch b/0001-Refactor-fail-earlier-on-invalid-servants.patch deleted file mode 100644 index cb71002..0000000 --- a/0001-Refactor-fail-earlier-on-invalid-servants.patch +++ /dev/null @@ -1,142 +0,0 @@ -From 8301cbafed191f30656a22876941cc7c9189b623 Mon Sep 17 00:00:00 2001 -From: Klaus Wenninger -Date: Thu, 31 Jan 2019 14:42:01 +0100 -Subject: [PATCH] Refactor: fail earlier on invalid servants - ---- - src/sbd-inquisitor.c | 51 ++++++++++++++++++++++++++++++++------------------- - src/sbd-md.c | 7 +------ - src/sbd.h | 2 +- - 3 files changed, 34 insertions(+), 26 deletions(-) - -diff --git a/src/sbd-inquisitor.c b/src/sbd-inquisitor.c -index 8e0bc87..9be6c99 100644 ---- a/src/sbd-inquisitor.c -+++ b/src/sbd-inquisitor.c -@@ -42,19 +42,36 @@ void recruit_servant(const char *devname, pid_t pid) - struct servants_list_item *newbie; - - if (lookup_servant_by_dev(devname)) { -- cl_log(LOG_DEBUG, "Servant %s already exists", devname); -- return; -+ cl_log(LOG_DEBUG, "Servant %s already exists", devname); -+ return; - } - - newbie = malloc(sizeof(*newbie)); -- if (!newbie) { -- fprintf(stderr, "malloc failed in recruit_servant.\n"); -- 
exit(1); -+ if (newbie) { -+ memset(newbie, 0, sizeof(*newbie)); -+ newbie->devname = strdup(devname); -+ newbie->pid = pid; -+ newbie->first_start = 1; -+ } -+ if (!newbie || !newbie->devname) { -+ fprintf(stderr, "heap allocation failed in recruit_servant.\n"); -+ exit(1); -+ } -+ -+ /* some sanity-check on our newbie */ -+ if (sbd_is_disk(newbie)) { -+ cl_log(LOG_INFO, "Monitoring %s", devname); -+ disk_count++; -+ } else if (sbd_is_pcmk(newbie) || sbd_is_cluster(newbie)) { -+ /* alive just after pcmk and cluster servants have shown up */ -+ newbie->outdated = 1; -+ } else { -+ /* toss our newbie */ -+ cl_log(LOG_ERR, "Refusing to recruit unrecognized servant %s", devname); -+ free((void *) newbie->devname); -+ free(newbie); -+ return; - } -- memset(newbie, 0, sizeof(*newbie)); -- newbie->devname = strdup(devname); -- newbie->pid = pid; -- newbie->first_start = 1; - - if (!s) { - servants_leader = newbie; -@@ -65,12 +82,6 @@ void recruit_servant(const char *devname, pid_t pid) - } - - servant_count++; -- if(sbd_is_disk(newbie)) { -- cl_log(LOG_INFO, "Monitoring %s", devname); -- disk_count++; -- } else { -- newbie->outdated = 1; -- } - } - - int assign_servant(const char* devname, functionp_t functionp, int mode, const void* argp) -@@ -148,7 +159,7 @@ void servant_start(struct servants_list_item *s) - if (sbd_is_disk(s)) { - #if SUPPORT_SHARED_DISK - DBGLOG(LOG_INFO, "Starting servant for device %s", s->devname); -- s->pid = assign_servant(s->devname, servant, start_mode, s); -+ s->pid = assign_servant(s->devname, servant_md, start_mode, s); - #else - cl_log(LOG_ERR, "Shared disk functionality not supported"); - return; -@@ -785,12 +796,14 @@ parse_device_line(const char *line) - - if (lpc > last) { - entry = calloc(1, 1 + lpc - last); -+ if (!entry) { -+ fprintf(stderr, "heap allocation failed parsing device-line.\n"); -+ exit(1); -+ } - rc = sscanf(line + last, "%[^;]", entry); - } - -- if (entry == NULL) { -- /* Skip */ -- } else if (rc != 1) { -+ if (rc != 
1) { - cl_log(LOG_WARNING, "Could not parse (%d %d): %s", last, lpc, line + last); - } else { - cl_log(LOG_DEBUG, "Adding '%s'", entry); -diff --git a/src/sbd-md.c b/src/sbd-md.c -index 579d273..ba2c34d 100644 ---- a/src/sbd-md.c -+++ b/src/sbd-md.c -@@ -1031,7 +1031,7 @@ static int servant_check_timeout_inconsistent(struct sector_header_s *hdr) - return 0; - } - --int servant(const char *diskname, int mode, const void* argp) -+int servant_md(const char *diskname, int mode, const void* argp) - { - struct sector_mbox_s *s_mbox = NULL; - struct sector_node_s *s_node = NULL; -@@ -1046,11 +1046,6 @@ int servant(const char *diskname, int mode, const void* argp) - char uuid[37]; - const struct servants_list_item *s = argp; - -- if (!diskname) { -- cl_log(LOG_ERR, "Empty disk name %s.", diskname); -- return -1; -- } -- - cl_log(LOG_INFO, "Servant starting for device %s", diskname); - - /* Block most of the signals */ -diff --git a/src/sbd.h b/src/sbd.h -index 386c85c..6fe07f9 100644 ---- a/src/sbd.h -+++ b/src/sbd.h -@@ -175,7 +175,7 @@ int ping_via_slots(const char *name, struct servants_list_item *servants); - int dump_headers(struct servants_list_item *servants); - unsigned long get_first_msgwait(struct servants_list_item *servants); - int messenger(const char *name, const char *msg, struct servants_list_item *servants); --int servant(const char *diskname, int mode, const void* argp); -+int servant_md(const char *diskname, int mode, const void* argp); - #endif - - int servant_pcmk(const char *diskname, int mode, const void* argp); --- -1.8.3.1 - diff --git a/0002-Doc-add-environment-section-to-man-page.patch b/0002-Doc-add-environment-section-to-man-page.patch new file mode 100644 index 0000000..2ad9556 --- /dev/null +++ b/0002-Doc-add-environment-section-to-man-page.patch @@ -0,0 +1,1459 @@ +From 9dd82a8b4daa5a7bd8ab3afa43b081f212efb1ac Mon Sep 17 00:00:00 2001 +From: Klaus Wenninger +Date: Wed, 29 Jan 2020 20:34:18 +0100 +Subject: [PATCH] Doc: add environment section 
to man-page + +Environment section is auto-generated from sbd.sysconfig. +--- + .gitignore | 1 + + Makefile.am | 6 +- + README.md | 3 +- + man/Makefile.am | 8 +- + man/sbd.8.pod | 668 ----------------------------------------------------- + man/sbd.8.pod.in | 675 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ + src/sbd.sysconfig | 3 +- + 7 files changed, 690 insertions(+), 674 deletions(-) + delete mode 100644 man/sbd.8.pod + create mode 100644 man/sbd.8.pod.in + +diff --git a/Makefile.am b/Makefile.am +index 1c29f75..bd4346d 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -9,8 +9,8 @@ TARFILE = $(distdir).tar.gz + DIST_ARCHIVES = $(TARFILE) + KEEP_EXISTING_TAR = no + INJECT_GIT_COMMIT = yes +-DISTCLEANFILES = sbd-* sbd-*/ + CLEANFILES = *.rpm *.tar.* sbd-* ++DISTCLEANFILES = sbd-* sbd-*/ + + RPM_ROOT = $(shell pwd) + RPM_OPTS = --define "_sourcedir $(RPM_ROOT)" \ +@@ -31,7 +31,7 @@ export SBD_BINARY := src/sbd + export SBD_PRELOAD := tests/.libs/libsbdtestbed.so + export SBD_USE_DM := no + +-EXTRA_DIST = sbd.spec tests/regressions.sh ++EXTRA_DIST = sbd.spec tests/regressions.sh man/sbd.8.pod.in + + export: + rm -f $(PACKAGE)-HEAD.tar.* +@@ -43,7 +43,7 @@ export: + echo `date`: Using existing tarball: $(TARFILE); \ + else \ + rm -f $(PACKAGE).tar.*; \ +- (git archive --prefix=$(distdir)/ $(shell echo $(TAG)|cut -f1 -d-) || tar -c --transform="s,^,$(distdir)/," --exclude="*.tar.*" --exclude="$(distdir)" --exclude="*.o" --exclude="*.8" --exclude="config.*" --exclude="libtool" --exclusive="ltmain.sh*" --exclude="Makefile" --exclude="Makefile.in" --exclude="stamp-*" --exclude="*.service" --exclude="sbd" --exclude="*.m4" --exclude="*.cache" --exclude="configure" --exclude="*.list" --exclude="depcomp" --exclude="install-sh" --exclude="missing" --exclude="compile" --exclude="sbd.sh" --exclude="~" --exclude="*.swp" --exclude="*.patch" --exclude="*.diff" --exclude="*.orig" --exclude="*.rej" --exclude="*.rpm" --exclude=".deps" --exclude="test-driver" *) | gzip > 
$(TARFILE); \ ++ (git archive --prefix=$(distdir)/ $(shell echo $(TAG)|cut -f1 -d-) || tar -c --transform="s,^,$(distdir)/," --exclude="*.tar.*" --exclude="$(distdir)" --exclude="*.o" --exclude="*.8" --exclude="config.*" --exclude="libtool" --exclude="ltmain.sh*" --exclude="Makefile" --exclude="Makefile.in" --exclude="stamp-*" --exclude="*.service" --exclude="sbd" --exclude="*.m4" --exclude="*.cache" --exclude="configure" --exclude="*.list" --exclude="depcomp" --exclude="install-sh" --exclude="missing" --exclude="compile" --exclude="sbd.sh" --exclude="~" --exclude="*.swp" --exclude="*.patch" --exclude="*.diff" --exclude="*.orig" --exclude="*.rej" --exclude="*.rpm" --exclude="*.pod" --exclude=".deps" --exclude="test-driver" *) | gzip > $(TARFILE); \ + if test -n "$$(git status -s)" || test "$(INJECT_GIT_COMMIT)" = "yes"; then \ + if test -n "$$(git status -s)"; then git diff HEAD --name-only|grep -v "^\."|xargs -n1 git diff HEAD > uncommitted.diff; fi; \ + rm -rf $(distdir); tar -xzf $(TARFILE); rm $(TARFILE); \ +diff --git a/README.md b/README.md +index d02a8bd..42a3fde 100644 +--- a/README.md ++++ b/README.md +@@ -5,5 +5,6 @@ A highly reliable fencing or Shoot-the-other-node-in-the-head (STONITH) mechanis + The component works with Pacemaker clusters, and is currently known to + compile and function on Pacemaker 1.1.7+ and corosync 1.4.x or 2.3.x. + +-Please see https://github.com/l-mb/sbd/blob/master/man/sbd.8.pod for the full documentation. ++Please see https://github.com/clusterlabs/sbd/blob/master/man/sbd.8.pod.in & ++https://github.com/clusterlabs/sbd/blob/master/src/sbd.sysconfig for the full documentation. 
+ +diff --git a/man/Makefile.am b/man/Makefile.am +index 3f89085..995712d 100644 +--- a/man/Makefile.am ++++ b/man/Makefile.am +@@ -1,6 +1,12 @@ + dist_man_MANS = sbd.8 + +-EXTRA_DIST = sbd.8.pod ++DISTCLEANFILES = sbd.8.pod sbd.8 sbd.sysconfig.pod ++ ++sbd.sysconfig.pod: ../src/sbd.sysconfig ++ sed -r -n -e "s/^## Type: (.*)/Allows C<\1>/;t type;s/^## Default: (.*)/ defaulting to C<\1>/;t default;s/^#*(.*)=.*/=item B<\1>\n/;t variable;s/^#*//;s/^ *//;H;d;:type;h;d;:default;H;x;s/\n//;x;d;:variable;G;p" $< > $@ ++ ++sbd.8.pod: sbd.8.pod.in sbd.sysconfig.pod ++ sed -e "s/@environment_section@//;t insert;p;d;:insert;rsbd.sysconfig.pod" $< > $@ + + sbd.8: sbd.8.pod + @POD2MAN@ -s 8 -c "STONITH Block Device" -r "SBD" -n "SBD" $< $@ +diff --git a/man/sbd.8.pod b/man/sbd.8.pod +deleted file mode 100644 +index 377c579..0000000 +--- a/man/sbd.8.pod ++++ /dev/null +@@ -1,668 +0,0 @@ +-=head1 NAME +- +-sbd - STONITH Block Device daemon +- +-=head1 SYNOPSIS +- +-sbd <-d F> [options] C +- +-=head1 SUMMARY +- +-SBD provides a node fencing mechanism (Shoot the other node in the head, +-STONITH) for Pacemaker-based clusters through the exchange of messages +-via shared block storage such as for example a SAN, iSCSI, FCoE. This +-isolates the fencing mechanism from changes in firmware version or +-dependencies on specific firmware controllers, and it can be used as a +-STONITH mechanism in all configurations that have reliable shared +-storage. +- +-SBD can also be used without any shared storage. In this mode, the +-watchdog device will be used to reset the node if it loses quorum, if +-any monitored daemon is lost and not recovered or if Pacemaker decides +-that the node requires fencing. +- +-The F binary implements both the daemon that watches the message +-slots as well as the management tool for interacting with the block +-storage device(s). This mode of operation is specified via the +-C parameter; some of these modes take additional parameters. 
+- +-To use SBD with shared storage, you must first C the messaging +-layout on one to three block devices. Second, configure +-F to list those devices (and possibly adjust other +-options), and restart the cluster stack on each node to ensure that +-C is started. Third, configure the C fencing +-resource in the Pacemaker CIB. +- +-Each of these steps is documented in more detail below the description +-of the command options. +- +-C can only be used as root. +- +-=head2 GENERAL OPTIONS +- +-=over +- +-=item B<-d> F +- +-Specify the block device(s) to be used. If you have more than one, +-specify this option up to three times. This parameter is mandatory for +-all modes, since SBD always needs a block device to interact with. +- +-This man page uses F, F, and F as +-example device names for brevity. However, in your production +-environment, you should instead always refer to them by using the long, +-stable device name (e.g., +-F). +- +-=item B<-v|-vv|-vvv> +- +-Enable verbose|debug|debug-library logging (optional) +- +-=item B<-h> +- +-Display a concise summary of C options. +- +-=item B<-n> I +- +-Set local node name; defaults to C. This should not need to be +-set. +- +-=item B<-R> +- +-Do B enable realtime priority. By default, C runs at realtime +-priority, locks itself into memory, and also acquires highest IO +-priority to protect itself against interference from other processes on +-the system. This is a debugging-only option. +- +-=item B<-I> I +- +-Async IO timeout (defaults to 3 seconds, optional). You should not need +-to adjust this unless your IO setup is really very slow. +- +-(In daemon mode, the watchdog is refreshed when the majority of devices +-could be read within this time.) +- +-=back +- +-=head2 create +- +-Example usage: +- +- sbd -d /dev/sdc2 -d /dev/sdd3 create +- +-If you specify the I command, sbd will write a metadata header +-to the device(s) specified and also initialize the messaging slots for +-up to 255 nodes. 
+- +-B: This command will not prompt for confirmation. Roughly the +-first megabyte of the specified block device(s) will be overwritten +-immediately and without backup. +- +-This command accepts a few options to adjust the default timings that +-are written to the metadata (to ensure they are identical across all +-nodes accessing the device). +- +-=over +- +-=item B<-1> I +- +-Set watchdog timeout to N seconds. This depends mostly on your storage +-latency; the majority of devices must be successfully read within this +-time, or else the node will self-fence. +- +-If your sbd device(s) reside on a multipath setup or iSCSI, this should +-be the time required to detect a path failure. You may be able to reduce +-this if your device outages are independent, or if you are using the +-Pacemaker integration. +- +-=item B<-2> I +- +-Set slot allocation timeout to N seconds. You should not need to tune +-this. +- +-=item B<-3> I +- +-Set daemon loop timeout to N seconds. You should not need to tune this. +- +-=item B<-4> I +- +-Set I timeout to N seconds. This should be twice the I +-timeout. This is the time after which a message written to a node's slot +-will be considered delivered. (Or long enough for the node to detect +-that it needed to self-fence.) +- +-This also affects the I in Pacemaker's CIB; see below. +- +-=back +- +-=head2 list +- +-Example usage: +- +- # sbd -d /dev/sda1 list +- 0 hex-0 clear +- 1 hex-7 clear +- 2 hex-9 clear +- +-List all allocated slots on device, and messages. You should see all +-cluster nodes that have ever been started against this device. Nodes +-that are currently running should have a I state; nodes that have +-been fenced, but not yet restarted, will show the appropriate fencing +-message. 
+- +-=head2 dump +- +-Example usage: +- +- # sbd -d /dev/sda1 dump +- ==Dumping header on disk /dev/sda1 +- Header version : 2 +- Number of slots : 255 +- Sector size : 512 +- Timeout (watchdog) : 15 +- Timeout (allocate) : 2 +- Timeout (loop) : 1 +- Timeout (msgwait) : 30 +- ==Header on disk /dev/sda1 is dumped +- +-Dump meta-data header from device. +- +-=head2 watch +- +-Example usage: +- +- sbd -d /dev/sdc2 -d /dev/sdd3 -P watch +- +-This command will make C start in daemon mode. It will constantly monitor +-the message slot of the local node for incoming messages, reachability, and +-optionally take Pacemaker's state into account. +- +-C B be started on boot before the cluster stack! See below +-for enabling this according to your boot environment. +- +-The options for this mode are rarely specified directly on the +-commandline directly, but most frequently set via F. +- +-It also constantly monitors connectivity to the storage device, and +-self-fences in case the partition becomes unreachable, guaranteeing that it +-does not disconnect from fencing messages. +- +-A node slot is automatically allocated on the device(s) the first time +-the daemon starts watching the device; hence, manual allocation is not +-usually required. +- +-If a watchdog is used together with the C as is strongly +-recommended, the watchdog is activated at initial start of the sbd +-daemon. The watchdog is refreshed every time the majority of SBD devices +-has been successfully read. Using a watchdog provides additional +-protection against C crashing. +- +-If the Pacemaker integration is activated, C will B self-fence +-if device majority is lost, if: +- +-=over +- +-=item 1. +- +-The partition the node is in is still quorate according to the CIB; +- +-=item 2. +- +-it is still quorate according to Corosync's node count; +- +-=item 3. +- +-the node itself is considered online and healthy by Pacemaker. 
+- +-=back +- +-This allows C to survive temporary outages of the majority of +-devices. However, while the cluster is in such a degraded state, it can +-neither successfully fence nor be shutdown cleanly (as taking the +-cluster below the quorum threshold will immediately cause all remaining +-nodes to self-fence). In short, it will not tolerate any further faults. +-Please repair the system before continuing. +- +-There is one C process that acts as a master to which all watchers +-report; one per device to monitor the node's slot; and, optionally, one +-that handles the Pacemaker integration. +- +-=over +- +-=item B<-W> +- +-Enable or disable use of the system watchdog to protect against the sbd +-processes failing and the node being left in an undefined state. Specify +-this once to enable, twice to disable. +- +-Defaults to I. +- +-=item B<-w> F +- +-This can be used to override the default watchdog device used and should not +-usually be necessary. +- +-=item B<-p> F +- +-This option can be used to specify a pidfile for the main sbd process. +- +-=item B<-F> I +- +-Number of failures before a failing servant process will not be restarted +-immediately until the dampening delay has expired. If set to zero, servants +-will be restarted immediately and indefinitely. If set to one, a failed +-servant will be restarted once every B<-t> seconds. If set to a different +-value, the servant will be restarted that many times within the dampening +-period and then delay. +- +-Defaults to I<1>. +- +-=item B<-t> I +- +-Dampening delay before faulty servants are restarted. Combined with C<-F 1>, +-the most logical way to tune the restart frequency of servant processes. +-Default is 5 seconds. +- +-If set to zero, processes will be restarted indefinitely and immediately. +- +-=item B<-P> +- +-Enable Pacemaker integration which checks Pacemaker quorum and node health. +-Specify this once to enable, twice to disable. +- +-Defaults to I. 
+- +-=item B<-S> I +- +-Set the start mode. (Defaults to I<0>.) +- +-If this is set to zero, sbd will always start up unconditionally, +-regardless of whether the node was previously fenced or not. +- +-If set to one, sbd will only start if the node was previously shutdown +-cleanly (as indicated by an exit request message in the slot), or if the +-slot is empty. A reset, crashdump, or power-off request in any slot will +-halt the start up. +- +-This is useful to prevent nodes from rejoining if they were faulty. The +-node must be manually "unfenced" by sending an empty message to it: +- +- sbd -d /dev/sda1 message node1 clear +- +-=item B<-s> I +- +-Set the start-up wait time for devices. (Defaults to I<120>.) +- +-Dynamic block devices such as iSCSI might not be fully initialized and +-present yet. This allows one to set a timeout for waiting for devices to +-appear on start-up. If set to 0, start-up will be aborted immediately if +-no devices are available. +- +-=item B<-Z> +- +-Enable trace mode. B Specifying this once will turn all reboots or power-offs, be +-they caused by self-fence decisions or messages, into a crashdump. +-Specifying this twice will just log them but not continue running. +- +-=item B<-T> +- +-By default, the daemon will set the watchdog timeout as specified in the +-device metadata. However, this does not work for every watchdog device. +-In this case, you must manually ensure that the watchdog timeout used by +-the system correctly matches the SBD settings, and then specify this +-option to allow C to continue with start-up. +- +-=item B<-5> I +- +-Warn if the time interval for tickling the watchdog exceeds this many seconds. +-Since the node is unable to log the watchdog expiry (it reboots immediately +-without a chance to write its logs to disk), this is very useful for getting +-an indication that the watchdog timeout is too short for the IO load of the +-system. +- +-Default is 3 seconds, set to zero to disable. 
+- +-=item B<-C> I +- +-Watchdog timeout to set before crashdumping. If SBD is set to crashdump +-instead of reboot - either via the trace mode settings or the I +-fencing agent's parameter -, SBD will adjust the watchdog timeout to this +-setting before triggering the dump. Otherwise, the watchdog might trigger and +-prevent a successful crashdump from ever being written. +- +-Set to zero (= default) to disable. +- +-=item B<-r> I +- +-Actions to be executed when the watchers don't timely report to the sbd +-master process or one of the watchers detects that the master process +-has died. +- +-Set timeout-action to comma-separated combination of +-noflush|flush plus reboot|crashdump|off. +-If just one of both is given the other stays at the default. +- +-This doesn't affect actions like off, crashdump, reboot explicitly +-triggered via message slots. +-And it does as well not configure the action a watchdog would +-trigger should it run off (there is no generic interface). +- +-Defaults to flush,reboot. +- +-=back +- +-=head2 allocate +- +-Example usage: +- +- sbd -d /dev/sda1 allocate node1 +- +-Explicitly allocates a slot for the specified node name. This should +-rarely be necessary, as every node will automatically allocate itself a +-slot the first time it starts up on watch mode. +- +-=head2 message +- +-Example usage: +- +- sbd -d /dev/sda1 message node1 test +- +-Writes the specified message to node's slot. This is rarely done +-directly, but rather abstracted via the C fencing agent +-configured as a cluster resource. +- +-Supported message types are: +- +-=over +- +-=item test +- +-This only generates a log message on the receiving node and can be used +-to check if SBD is seeing the device. Note that this could overwrite a +-fencing request send by the cluster, so should not be used during +-production. +- +-=item reset +- +-Reset the target upon receipt of this message. +- +-=item off +- +-Power-off the target. 
+- +-=item crashdump +- +-Cause the target node to crashdump. +- +-=item exit +- +-This will make the C daemon exit cleanly on the target. You should +-B send this message manually; this is handled properly during +-shutdown of the cluster stack. Manually stopping the daemon means the +-node is unprotected! +- +-=item clear +- +-This message indicates that no real message has been sent to the node. +-You should not set this manually; C will clear the message slot +-automatically during start-up, and setting this manually could overwrite +-a fencing message by the cluster. +- +-=back +- +-=head2 query-watchdog +- +-Example usage: +- +- sbd query-watchdog +- +-Check for available watchdog devices and print some info. +- +-B: This command will arm the watchdog during query, and if your +-watchdog refuses disarming (for example, if its kernel module has the +-'nowayout' parameter set) this will reset your system. +- +-=head2 test-watchdog +- +-Example usage: +- +- sbd test-watchdog [-w /dev/watchdog3] +- +-Test specified watchdog device (/dev/watchdog by default). +- +-B: This command will arm the watchdog and have your system reset +-in case your watchdog is working properly! If issued from an interactive +-session, it will prompt for confirmation. +- +-=head1 Base system configuration +- +-=head2 Configure a watchdog +- +-It is highly recommended that you configure your Linux system to load a +-watchdog driver with hardware assistance (as is available on most modern +-systems), such as I, I, or others. As a fall-back, you +-can use the I module. +- +-No other software must access the watchdog timer; it can only be +-accessed by one process at any given time. Some hardware vendors ship +-systems management software that use the watchdog for system resets +-(f.e. HP ASR daemon). Such software has to be disabled if the watchdog +-is to be used by SBD. 
+- +-=head2 Choosing and initializing the block device(s) +- +-First, you have to decide if you want to use one, two, or three devices. +- +-If you are using multiple ones, they should reside on independent +-storage setups. Putting all three of them on the same logical unit for +-example would not provide any additional redundancy. +- +-The SBD device can be connected via Fibre Channel, Fibre Channel over +-Ethernet, or even iSCSI. Thus, an iSCSI target can become a sort-of +-network-based quorum server; the advantage is that it does not require +-a smart host at your third location, just block storage. +- +-The SBD partitions themselves B be mirrored (via MD, +-DRBD, or the storage layer itself), since this could result in a +-split-mirror scenario. Nor can they reside on cLVM2 volume groups, since +-they must be accessed by the cluster stack before it has started the +-cLVM2 daemons; hence, these should be either raw partitions or logical +-units on (multipath) storage. +- +-The block device(s) must be accessible from all nodes. (While it is not +-necessary that they share the same path name on all nodes, this is +-considered a very good idea.) +- +-SBD will only use about one megabyte per device, so you can easily +-create a small partition, or very small logical units. (The size of the +-SBD device depends on the block size of the underlying device. Thus, 1MB +-is fine on plain SCSI devices and SAN storage with 512 byte blocks. On +-the IBM s390x architecture in particular, disks default to 4k blocks, +-and thus require roughly 4MB.) +- +-The number of devices will affect the operation of SBD as follows: +- +-=over +- +-=item One device +- +-In its most simple implementation, you use one device only. This is +-appropriate for clusters where all your data is on the same shared +-storage (with internal redundancy) anyway; the SBD device does not +-introduce an additional single point of failure then. 
+- +-If the SBD device is not accessible, the daemon will fail to start and +-inhibit startup of cluster services. +- +-=item Two devices +- +-This configuration is a trade-off, primarily aimed at environments where +-host-based mirroring is used, but no third storage device is available. +- +-SBD will not commit suicide if it loses access to one mirror leg; this +-allows the cluster to continue to function even in the face of one outage. +- +-However, SBD will not fence the other side while only one mirror leg is +-available, since it does not have enough knowledge to detect an asymmetric +-split of the storage. So it will not be able to automatically tolerate a +-second failure while one of the storage arrays is down. (Though you +-can use the appropriate crm command to acknowledge the fence manually.) +- +-It will not start unless both devices are accessible on boot. +- +-=item Three devices +- +-In this most reliable and recommended configuration, SBD will only +-self-fence if more than one device is lost; hence, this configuration is +-resilient against temporary single device outages (be it due to failures +-or maintenance). Fencing messages can still be successfully relayed if +-at least two devices remain accessible. +- +-This configuration is appropriate for more complex scenarios where +-storage is not confined to a single array. For example, host-based +-mirroring solutions could have one SBD per mirror leg (not mirrored +-itself), and an additional tie-breaker on iSCSI. +- +-It will only start if at least two devices are accessible on boot. +- +-=back +- +-After you have chosen the devices and created the appropriate partitions +-and perhaps multipath alias names to ease management, use the C +-command described above to initialize the SBD metadata on them. 
+- +-=head3 Sharing the block device(s) between multiple clusters +- +-It is possible to share the block devices between multiple clusters, +-provided the total number of nodes accessing them does not exceed I<255> +-nodes, and they all must share the same SBD timeouts (since these are +-part of the metadata). +- +-If you are using multiple devices this can reduce the setup overhead +-required. However, you should B share devices between clusters in +-different security domains. +- +-=head2 Configure SBD to start on boot +- +-On systems using C, the C or C system +-start-up scripts must handle starting or stopping C as required +-before starting the rest of the cluster stack. +- +-For C, sbd simply has to be enabled using +- +- systemctl enable sbd.service +- +-The daemon is brought online on each node before corosync and Pacemaker +-are started, and terminated only after all other cluster components have +-been shut down - ensuring that cluster resources are never activated +-without SBD supervision. +- +-=head2 Configuration via sysconfig +- +-The system instance of C is configured via F. +-In this file, you must specify the device(s) used, as well as any +-options to pass to the daemon: +- +- SBD_DEVICE="/dev/sda1;/dev/sdb1;/dev/sdc1" +- SBD_PACEMAKER="true" +- +-C will fail to start if no C is specified. See the +-installed template for more options that can be configured here. +-In general configuration done via parameters takes precedence over +-the configuration from the configuration file. 
+- +-=head2 Testing the sbd installation +- +-After a restart of the cluster stack on this node, you can now try +-sending a test message to it as root, from this or any other node: +- +- sbd -d /dev/sda1 message node1 test +- +-The node will acknowledge the receipt of the message in the system logs: +- +- Aug 29 14:10:00 node1 sbd: [13412]: info: Received command test from node2 +- +-This confirms that SBD is indeed up and running on the node, and that it +-is ready to receive messages. +- +-Make B that F is identical on all cluster +-nodes, and that all cluster nodes are running the daemon. +- +-=head1 Pacemaker CIB integration +- +-=head2 Fencing resource +- +-Pacemaker can only interact with SBD to issue a node fence if there is a +-configure fencing resource. This should be a primitive, not a clone, as +-follows: +- +- primitive fencing-sbd stonith:external/sbd \ +- params pcmk_delay_max=30 +- +-This will automatically use the same devices as configured in +-F. +- +-While you should not configure this as a clone (as Pacemaker will register +-the fencing device on each node automatically), the I +-setting enables random fencing delay which ensures, in a scenario where a +-split-brain scenario did occur in a two node cluster, that one of the nodes +-has a better chance to survive to avoid double fencing. +- +-SBD also supports turning the reset request into a crash request, which +-may be helpful for debugging if you have kernel crashdumping configured; +-then, every fence request will cause the node to dump core. You can +-enable this via the C parameter on the fencing +-resource. This is B recommended for production use, but only for +-debugging phases. +- +-=head2 General cluster properties +- +-You must also enable STONITH in general, and set the STONITH timeout to +-be at least twice the I timeout you have configured, to allow +-enough time for the fencing message to be delivered. 
If your I +-timeout is 60 seconds, this is a possible configuration: +- +- property stonith-enabled="true" +- property stonith-timeout="120s" +- +-B: if I is too low for I and the +-system overhead, sbd will never be able to successfully complete a fence +-request. This will create a fencing loop. +- +-Note that the sbd fencing agent will try to detect this and +-automatically extend the I setting to a reasonable +-value, on the assumption that sbd modifying your configuration is +-preferable to not fencing. +- +-=head1 Management tasks +- +-=head2 Recovering from temporary SBD device outage +- +-If you have multiple devices, failure of a single device is not immediately +-fatal. C will retry to restart the monitor for the device every 5 +-seconds by default. However, you can tune this via the options to the +-I command. +- +-In case you wish the immediately force a restart of all currently +-disabled monitor processes, you can send a I to the SBD +-I process. +- +- +-=head1 LICENSE +- +-Copyright (C) 2008-2013 Lars Marowsky-Bree +- +-This program is free software; you can redistribute it and/or +-modify it under the terms of the GNU General Public +-License as published by the Free Software Foundation; either +-version 2 of the License, or (at your option) any later version. +- +-This software is distributed in the hope that it will be useful, +-but WITHOUT ANY WARRANTY; without even the implied warranty of +-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +-General Public License for more details. +- +-For details see the GNU General Public License at +-http://www.gnu.org/licenses/gpl-2.0.html (version 2) and/or +-http://www.gnu.org/licenses/gpl.html (the newest as per "any later"). 
+diff --git a/man/sbd.8.pod.in b/man/sbd.8.pod.in +new file mode 100644 +index 0000000..ff89c82 +--- /dev/null ++++ b/man/sbd.8.pod.in +@@ -0,0 +1,675 @@ ++=head1 NAME ++ ++sbd - STONITH Block Device daemon ++ ++=head1 SYNOPSIS ++ ++sbd <-d F> [options] C ++ ++=head1 SUMMARY ++ ++SBD provides a node fencing mechanism (Shoot the other node in the head, ++STONITH) for Pacemaker-based clusters through the exchange of messages ++via shared block storage such as for example a SAN, iSCSI, FCoE. This ++isolates the fencing mechanism from changes in firmware version or ++dependencies on specific firmware controllers, and it can be used as a ++STONITH mechanism in all configurations that have reliable shared ++storage. ++ ++SBD can also be used without any shared storage. In this mode, the ++watchdog device will be used to reset the node if it loses quorum, if ++any monitored daemon is lost and not recovered or if Pacemaker decides ++that the node requires fencing. ++ ++The F binary implements both the daemon that watches the message ++slots as well as the management tool for interacting with the block ++storage device(s). This mode of operation is specified via the ++C parameter; some of these modes take additional parameters. ++ ++To use SBD with shared storage, you must first C the messaging ++layout on one to three block devices. Second, configure ++F to list those devices (and possibly adjust other ++options), and restart the cluster stack on each node to ensure that ++C is started. Third, configure the C fencing ++resource in the Pacemaker CIB. ++ ++Each of these steps is documented in more detail below the description ++of the command options. ++ ++C can only be used as root. ++ ++=head2 GENERAL OPTIONS ++ ++=over ++ ++=item B<-d> F ++ ++Specify the block device(s) to be used. If you have more than one, ++specify this option up to three times. This parameter is mandatory for ++all modes, since SBD always needs a block device to interact with. 
++ ++This man page uses F, F, and F as ++example device names for brevity. However, in your production ++environment, you should instead always refer to them by using the long, ++stable device name (e.g., ++F). ++ ++=item B<-v|-vv|-vvv> ++ ++Enable verbose|debug|debug-library logging (optional) ++ ++=item B<-h> ++ ++Display a concise summary of C options. ++ ++=item B<-n> I ++ ++Set local node name; defaults to C. This should not need to be ++set. ++ ++=item B<-R> ++ ++Do B enable realtime priority. By default, C runs at realtime ++priority, locks itself into memory, and also acquires highest IO ++priority to protect itself against interference from other processes on ++the system. This is a debugging-only option. ++ ++=item B<-I> I ++ ++Async IO timeout (defaults to 3 seconds, optional). You should not need ++to adjust this unless your IO setup is really very slow. ++ ++(In daemon mode, the watchdog is refreshed when the majority of devices ++could be read within this time.) ++ ++=back ++ ++=head2 create ++ ++Example usage: ++ ++ sbd -d /dev/sdc2 -d /dev/sdd3 create ++ ++If you specify the I command, sbd will write a metadata header ++to the device(s) specified and also initialize the messaging slots for ++up to 255 nodes. ++ ++B: This command will not prompt for confirmation. Roughly the ++first megabyte of the specified block device(s) will be overwritten ++immediately and without backup. ++ ++This command accepts a few options to adjust the default timings that ++are written to the metadata (to ensure they are identical across all ++nodes accessing the device). ++ ++=over ++ ++=item B<-1> I ++ ++Set watchdog timeout to N seconds. This depends mostly on your storage ++latency; the majority of devices must be successfully read within this ++time, or else the node will self-fence. ++ ++If your sbd device(s) reside on a multipath setup or iSCSI, this should ++be the time required to detect a path failure. 
You may be able to reduce ++this if your device outages are independent, or if you are using the ++Pacemaker integration. ++ ++=item B<-2> I<N> ++ ++Set slot allocation timeout to N seconds. You should not need to tune ++this. ++ ++=item B<-3> I<N> ++ ++Set daemon loop timeout to N seconds. You should not need to tune this. ++ ++=item B<-4> I<N> ++ ++Set I<msgwait> timeout to N seconds. This should be twice the I<watchdog> ++timeout. This is the time after which a message written to a node's slot ++will be considered delivered. (Or long enough for the node to detect ++that it needed to self-fence.) ++ ++This also affects the I<stonith-timeout> in Pacemaker's CIB; see below. ++ ++=back ++ ++=head2 list ++ ++Example usage: ++ ++ # sbd -d /dev/sda1 list ++ 0 hex-0 clear ++ 1 hex-7 clear ++ 2 hex-9 clear ++ ++List all allocated slots on device, and messages. You should see all ++cluster nodes that have ever been started against this device. Nodes ++that are currently running should have a I<clear> state; nodes that have ++been fenced, but not yet restarted, will show the appropriate fencing ++message. ++ ++=head2 dump ++ ++Example usage: ++ ++ # sbd -d /dev/sda1 dump ++ ==Dumping header on disk /dev/sda1 ++ Header version : 2 ++ Number of slots : 255 ++ Sector size : 512 ++ Timeout (watchdog) : 15 ++ Timeout (allocate) : 2 ++ Timeout (loop) : 1 ++ Timeout (msgwait) : 30 ++ ==Header on disk /dev/sda1 is dumped ++ ++Dump meta-data header from device. ++ ++=head2 watch ++ ++Example usage: ++ ++ sbd -d /dev/sdc2 -d /dev/sdd3 -P watch ++ ++This command will make C<sbd> start in daemon mode. It will constantly monitor ++the message slot of the local node for incoming messages, reachability, and ++optionally take Pacemaker's state into account. ++ ++C<sbd> B<must> be started on boot before the cluster stack! See below ++for enabling this according to your boot environment. ++ ++The options for this mode are rarely specified directly on the ++command line, but most frequently set via F</etc/sysconfig/sbd>.
++ ++It also constantly monitors connectivity to the storage device, and ++self-fences in case the partition becomes unreachable, guaranteeing that it ++does not disconnect from fencing messages. ++ ++A node slot is automatically allocated on the device(s) the first time ++the daemon starts watching the device; hence, manual allocation is not ++usually required. ++ ++If a watchdog is used together with the C as is strongly ++recommended, the watchdog is activated at initial start of the sbd ++daemon. The watchdog is refreshed every time the majority of SBD devices ++has been successfully read. Using a watchdog provides additional ++protection against C crashing. ++ ++If the Pacemaker integration is activated, C will B self-fence ++if device majority is lost, if: ++ ++=over ++ ++=item 1. ++ ++The partition the node is in is still quorate according to the CIB; ++ ++=item 2. ++ ++it is still quorate according to Corosync's node count; ++ ++=item 3. ++ ++the node itself is considered online and healthy by Pacemaker. ++ ++=back ++ ++This allows C to survive temporary outages of the majority of ++devices. However, while the cluster is in such a degraded state, it can ++neither successfully fence nor be shutdown cleanly (as taking the ++cluster below the quorum threshold will immediately cause all remaining ++nodes to self-fence). In short, it will not tolerate any further faults. ++Please repair the system before continuing. ++ ++There is one C process that acts as a master to which all watchers ++report; one per device to monitor the node's slot; and, optionally, one ++that handles the Pacemaker integration. ++ ++=over ++ ++=item B<-W> ++ ++Enable or disable use of the system watchdog to protect against the sbd ++processes failing and the node being left in an undefined state. Specify ++this once to enable, twice to disable. ++ ++Defaults to I. ++ ++=item B<-w> F ++ ++This can be used to override the default watchdog device used and should not ++usually be necessary. 
++ ++=item B<-p> F ++ ++This option can be used to specify a pidfile for the main sbd process. ++ ++=item B<-F> I ++ ++Number of failures before a failing servant process will not be restarted ++immediately until the dampening delay has expired. If set to zero, servants ++will be restarted immediately and indefinitely. If set to one, a failed ++servant will be restarted once every B<-t> seconds. If set to a different ++value, the servant will be restarted that many times within the dampening ++period and then delay. ++ ++Defaults to I<1>. ++ ++=item B<-t> I ++ ++Dampening delay before faulty servants are restarted. Combined with C<-F 1>, ++the most logical way to tune the restart frequency of servant processes. ++Default is 5 seconds. ++ ++If set to zero, processes will be restarted indefinitely and immediately. ++ ++=item B<-P> ++ ++Enable Pacemaker integration which checks Pacemaker quorum and node health. ++Specify this once to enable, twice to disable. ++ ++Defaults to I. ++ ++=item B<-S> I ++ ++Set the start mode. (Defaults to I<0>.) ++ ++If this is set to zero, sbd will always start up unconditionally, ++regardless of whether the node was previously fenced or not. ++ ++If set to one, sbd will only start if the node was previously shutdown ++cleanly (as indicated by an exit request message in the slot), or if the ++slot is empty. A reset, crashdump, or power-off request in any slot will ++halt the start up. ++ ++This is useful to prevent nodes from rejoining if they were faulty. The ++node must be manually "unfenced" by sending an empty message to it: ++ ++ sbd -d /dev/sda1 message node1 clear ++ ++=item B<-s> I ++ ++Set the start-up wait time for devices. (Defaults to I<120>.) ++ ++Dynamic block devices such as iSCSI might not be fully initialized and ++present yet. This allows one to set a timeout for waiting for devices to ++appear on start-up. If set to 0, start-up will be aborted immediately if ++no devices are available. 
++ ++=item B<-Z> ++ ++Enable trace mode. B Specifying this once will turn all reboots or power-offs, be ++they caused by self-fence decisions or messages, into a crashdump. ++Specifying this twice will just log them but not continue running. ++ ++=item B<-T> ++ ++By default, the daemon will set the watchdog timeout as specified in the ++device metadata. However, this does not work for every watchdog device. ++In this case, you must manually ensure that the watchdog timeout used by ++the system correctly matches the SBD settings, and then specify this ++option to allow C to continue with start-up. ++ ++=item B<-5> I ++ ++Warn if the time interval for tickling the watchdog exceeds this many seconds. ++Since the node is unable to log the watchdog expiry (it reboots immediately ++without a chance to write its logs to disk), this is very useful for getting ++an indication that the watchdog timeout is too short for the IO load of the ++system. ++ ++Default is 3 seconds, set to zero to disable. ++ ++=item B<-C> I ++ ++Watchdog timeout to set before crashdumping. If SBD is set to crashdump ++instead of reboot - either via the trace mode settings or the I ++fencing agent's parameter -, SBD will adjust the watchdog timeout to this ++setting before triggering the dump. Otherwise, the watchdog might trigger and ++prevent a successful crashdump from ever being written. ++ ++Set to zero (= default) to disable. ++ ++=item B<-r> I ++ ++Actions to be executed when the watchers don't timely report to the sbd ++master process or one of the watchers detects that the master process ++has died. ++ ++Set timeout-action to comma-separated combination of ++noflush|flush plus reboot|crashdump|off. ++If just one of both is given the other stays at the default. ++ ++This doesn't affect actions like off, crashdump, reboot explicitly ++triggered via message slots. ++And it does as well not configure the action a watchdog would ++trigger should it run off (there is no generic interface). 
++ ++Defaults to flush,reboot. ++ ++=back ++ ++=head2 allocate ++ ++Example usage: ++ ++ sbd -d /dev/sda1 allocate node1 ++ ++Explicitly allocates a slot for the specified node name. This should ++rarely be necessary, as every node will automatically allocate itself a ++slot the first time it starts up in watch mode. ++ ++=head2 message ++ ++Example usage: ++ ++ sbd -d /dev/sda1 message node1 test ++ ++Writes the specified message to node's slot. This is rarely done ++directly, but rather abstracted via the C<external/sbd> fencing agent ++configured as a cluster resource. ++ ++Supported message types are: ++ ++=over ++ ++=item test ++ ++This only generates a log message on the receiving node and can be used ++to check if SBD is seeing the device. Note that this could overwrite a ++fencing request sent by the cluster, so should not be used during ++production. ++ ++=item reset ++ ++Reset the target upon receipt of this message. ++ ++=item off ++ ++Power-off the target. ++ ++=item crashdump ++ ++Cause the target node to crashdump. ++ ++=item exit ++ ++This will make the C<sbd> daemon exit cleanly on the target. You should ++B<not> send this message manually; this is handled properly during ++shutdown of the cluster stack. Manually stopping the daemon means the ++node is unprotected! ++ ++=item clear ++ ++This message indicates that no real message has been sent to the node. ++You should not set this manually; C<sbd> will clear the message slot ++automatically during start-up, and setting this manually could overwrite ++a fencing message by the cluster. ++ ++=back ++ ++=head2 query-watchdog ++ ++Example usage: ++ ++ sbd query-watchdog ++ ++Check for available watchdog devices and print some info. ++ ++B<Warning>: This command will arm the watchdog during query, and if your ++watchdog refuses disarming (for example, if its kernel module has the ++'nowayout' parameter set) this will reset your system.
++ ++=head2 test-watchdog ++ ++Example usage: ++ ++ sbd test-watchdog [-w /dev/watchdog3] ++ ++Test specified watchdog device (/dev/watchdog by default). ++ ++B: This command will arm the watchdog and have your system reset ++in case your watchdog is working properly! If issued from an interactive ++session, it will prompt for confirmation. ++ ++=head1 Base system configuration ++ ++=head2 Configure a watchdog ++ ++It is highly recommended that you configure your Linux system to load a ++watchdog driver with hardware assistance (as is available on most modern ++systems), such as I, I, or others. As a fall-back, you ++can use the I module. ++ ++No other software must access the watchdog timer; it can only be ++accessed by one process at any given time. Some hardware vendors ship ++systems management software that use the watchdog for system resets ++(f.e. HP ASR daemon). Such software has to be disabled if the watchdog ++is to be used by SBD. ++ ++=head2 Choosing and initializing the block device(s) ++ ++First, you have to decide if you want to use one, two, or three devices. ++ ++If you are using multiple ones, they should reside on independent ++storage setups. Putting all three of them on the same logical unit for ++example would not provide any additional redundancy. ++ ++The SBD device can be connected via Fibre Channel, Fibre Channel over ++Ethernet, or even iSCSI. Thus, an iSCSI target can become a sort-of ++network-based quorum server; the advantage is that it does not require ++a smart host at your third location, just block storage. ++ ++The SBD partitions themselves B be mirrored (via MD, ++DRBD, or the storage layer itself), since this could result in a ++split-mirror scenario. Nor can they reside on cLVM2 volume groups, since ++they must be accessed by the cluster stack before it has started the ++cLVM2 daemons; hence, these should be either raw partitions or logical ++units on (multipath) storage. 
++ ++The block device(s) must be accessible from all nodes. (While it is not ++necessary that they share the same path name on all nodes, this is ++considered a very good idea.) ++ ++SBD will only use about one megabyte per device, so you can easily ++create a small partition, or very small logical units. (The size of the ++SBD device depends on the block size of the underlying device. Thus, 1MB ++is fine on plain SCSI devices and SAN storage with 512 byte blocks. On ++the IBM s390x architecture in particular, disks default to 4k blocks, ++and thus require roughly 4MB.) ++ ++The number of devices will affect the operation of SBD as follows: ++ ++=over ++ ++=item One device ++ ++In its most simple implementation, you use one device only. This is ++appropriate for clusters where all your data is on the same shared ++storage (with internal redundancy) anyway; the SBD device does not ++introduce an additional single point of failure then. ++ ++If the SBD device is not accessible, the daemon will fail to start and ++inhibit startup of cluster services. ++ ++=item Two devices ++ ++This configuration is a trade-off, primarily aimed at environments where ++host-based mirroring is used, but no third storage device is available. ++ ++SBD will not commit suicide if it loses access to one mirror leg; this ++allows the cluster to continue to function even in the face of one outage. ++ ++However, SBD will not fence the other side while only one mirror leg is ++available, since it does not have enough knowledge to detect an asymmetric ++split of the storage. So it will not be able to automatically tolerate a ++second failure while one of the storage arrays is down. (Though you ++can use the appropriate crm command to acknowledge the fence manually.) ++ ++It will not start unless both devices are accessible on boot. 
++ ++=item Three devices ++ ++In this most reliable and recommended configuration, SBD will only ++self-fence if more than one device is lost; hence, this configuration is ++resilient against temporary single device outages (be it due to failures ++or maintenance). Fencing messages can still be successfully relayed if ++at least two devices remain accessible. ++ ++This configuration is appropriate for more complex scenarios where ++storage is not confined to a single array. For example, host-based ++mirroring solutions could have one SBD per mirror leg (not mirrored ++itself), and an additional tie-breaker on iSCSI. ++ ++It will only start if at least two devices are accessible on boot. ++ ++=back ++ ++After you have chosen the devices and created the appropriate partitions ++and perhaps multipath alias names to ease management, use the C ++command described above to initialize the SBD metadata on them. ++ ++=head3 Sharing the block device(s) between multiple clusters ++ ++It is possible to share the block devices between multiple clusters, ++provided the total number of nodes accessing them does not exceed I<255> ++nodes, and they all must share the same SBD timeouts (since these are ++part of the metadata). ++ ++If you are using multiple devices this can reduce the setup overhead ++required. However, you should B share devices between clusters in ++different security domains. ++ ++=head2 Configure SBD to start on boot ++ ++On systems using C, the C or C system ++start-up scripts must handle starting or stopping C as required ++before starting the rest of the cluster stack. ++ ++For C, sbd simply has to be enabled using ++ ++ systemctl enable sbd.service ++ ++The daemon is brought online on each node before corosync and Pacemaker ++are started, and terminated only after all other cluster components have ++been shut down - ensuring that cluster resources are never activated ++without SBD supervision. 
++ ++=head2 Configuration via sysconfig ++ ++The system instance of C<sbd> is configured via F</etc/sysconfig/sbd>. ++In this file, you must specify the device(s) used, as well as any ++options to pass to the daemon: ++ ++ SBD_DEVICE="/dev/sda1;/dev/sdb1;/dev/sdc1" ++ SBD_PACEMAKER="true" ++ ++C<sbd> will fail to start if no C<SBD_DEVICE> is specified. See the ++installed template or the section on configuration via environment ++for more options that can be configured here. ++In general, configuration done via parameters takes precedence over ++the configuration from the configuration file. ++ ++=head2 Configuration via environment ++ ++=over ++@environment_section@ ++=back ++ ++=head2 Testing the sbd installation ++ ++After a restart of the cluster stack on this node, you can now try ++sending a test message to it as root, from this or any other node: ++ ++ sbd -d /dev/sda1 message node1 test ++ ++The node will acknowledge the receipt of the message in the system logs: ++ ++ Aug 29 14:10:00 node1 sbd: [13412]: info: Received command test from node2 ++ ++This confirms that SBD is indeed up and running on the node, and that it ++is ready to receive messages. ++ ++Make B<sure> that F</etc/sysconfig/sbd> is identical on all cluster ++nodes, and that all cluster nodes are running the daemon. ++ ++=head1 Pacemaker CIB integration ++ ++=head2 Fencing resource ++ ++Pacemaker can only interact with SBD to issue a node fence if there is a ++configured fencing resource. This should be a primitive, not a clone, as ++follows: ++ ++ primitive fencing-sbd stonith:external/sbd \ ++ params pcmk_delay_max=30 ++ ++This will automatically use the same devices as configured in ++F</etc/sysconfig/sbd>. ++ ++While you should not configure this as a clone (as Pacemaker will register ++the fencing device on each node automatically), the I<pcmk_delay_max> ++setting enables random fencing delay which ensures, in a situation where a ++split-brain scenario did occur in a two-node cluster, that one of the nodes ++has a better chance to survive to avoid double fencing.
++ ++SBD also supports turning the reset request into a crash request, which ++may be helpful for debugging if you have kernel crashdumping configured; ++then, every fence request will cause the node to dump core. You can ++enable this via the C<crashdump="true"> parameter on the fencing ++resource. This is B<not> recommended for production use, but only for ++debugging phases. ++ ++=head2 General cluster properties ++ ++You must also enable STONITH in general, and set the STONITH timeout to ++be at least twice the I<msgwait> timeout you have configured, to allow ++enough time for the fencing message to be delivered. If your I<msgwait> ++timeout is 60 seconds, this is a possible configuration: ++ ++ property stonith-enabled="true" ++ property stonith-timeout="120s" ++ ++B<Caution>: if I<stonith-timeout> is too low for I<msgwait> and the ++system overhead, sbd will never be able to successfully complete a fence ++request. This will create a fencing loop. ++ ++Note that the sbd fencing agent will try to detect this and ++automatically extend the I<stonith-timeout> setting to a reasonable ++value, on the assumption that sbd modifying your configuration is ++preferable to not fencing. ++ ++=head1 Management tasks ++ ++=head2 Recovering from temporary SBD device outage ++ ++If you have multiple devices, failure of a single device is not immediately ++fatal. C<sbd> will retry to restart the monitor for the device every 5 ++seconds by default. However, you can tune this via the options to the ++I<watch> command. ++ ++In case you wish to immediately force a restart of all currently ++disabled monitor processes, you can send a I<SIGUSR1> to the SBD ++I<inquisitor> process. ++ ++ ++=head1 LICENSE ++ ++Copyright (C) 2008-2013 Lars Marowsky-Bree ++ ++This program is free software; you can redistribute it and/or ++modify it under the terms of the GNU General Public ++License as published by the Free Software Foundation; either ++version 2 of the License, or (at your option) any later version.
++ ++This software is distributed in the hope that it will be useful, ++but WITHOUT ANY WARRANTY; without even the implied warranty of ++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++General Public License for more details. ++ ++For details see the GNU General Public License at ++http://www.gnu.org/licenses/gpl-2.0.html (version 2) and/or ++http://www.gnu.org/licenses/gpl.html (the newest as per "any later"). +diff --git a/src/sbd.sysconfig b/src/sbd.sysconfig +index e1a60ed..33b50d0 100644 +--- a/src/sbd.sysconfig ++++ b/src/sbd.sysconfig +@@ -14,7 +14,7 @@ + # + SBD_PACEMAKER=yes + +-## Type: list(always,clean) ++## Type: always / clean + ## Default: always + # + # Specify the start mode for sbd. Setting this to "clean" will only +@@ -103,6 +103,7 @@ SBD_TIMEOUT_ACTION=flush,reboot + # Thus in auto-mode sbd will check if the slice has RT-budget assigned. + # If that is the case sbd will stay in that slice while it will + # be moved to root-slice otherwise. ++# + SBD_MOVE_TO_ROOT_CGROUP=auto + + ## Type: string +-- +1.8.3.1 + diff --git a/0003-Fix-scheduling-overhaul-the-whole-thing.patch b/0003-Fix-scheduling-overhaul-the-whole-thing.patch new file mode 100644 index 0000000..05fab9d --- /dev/null +++ b/0003-Fix-scheduling-overhaul-the-whole-thing.patch @@ -0,0 +1,152 @@ +From 4bc08cf76fc01e98cbec76bf32bb333b77f69217 Mon Sep 17 00:00:00 2001 +From: Klaus Wenninger +Date: Thu, 27 Feb 2020 19:02:57 +0100 +Subject: [PATCH] Fix: scheduling: overhaul the whole thing + +- prevent possible lockup when format in proc changes +- properly get and handle scheduler policy & prio +- on SCHED_RR failing push to the max with SCHED_OTHER +--- + src/sbd-common.c | 56 ++++++++++++++++++++++++++++++++++++++++++++------------ + 1 file changed, 44 insertions(+), 12 deletions(-) + +diff --git a/src/sbd-common.c b/src/sbd-common.c +index 9ec43b2..c2da758 100644 +--- a/src/sbd-common.c ++++ b/src/sbd-common.c +@@ -26,6 +26,9 @@ + #include + #include + #include 
++#include ++#include ++#include + + #ifdef _POSIX_MEMLOCK + # include +@@ -298,7 +301,7 @@ watchdog_populate_list(void) + FILE *file; + + snprintf(entry_name, sizeof(entry_name), +- SYS_CLASS_WATCHDOG "/%s/dev", entry->d_name); ++ SYS_CLASS_WATCHDOG "/%s/dev", entry->d_name); + file = fopen(entry_name, "r"); + if (file) { + int major, minor; +@@ -667,7 +670,7 @@ static int get_realtime_budget(void) + { + FILE *f; + char fname[PATH_MAX]; +- int res = -1, lnum = 0; ++ int res = -1, lnum = 0, num; + char *cgroup = NULL, *namespecs = NULL; + + snprintf(fname, PATH_MAX, "/proc/%jd/cgroup", (intmax_t)getpid()); +@@ -677,7 +680,8 @@ static int get_realtime_budget(void) + (intmax_t)getpid()); + goto exit_res; + } +- while( fscanf(f, "%d:%m[^:]:%m[^\n]", &lnum, &namespecs, &cgroup) !=EOF ) { ++ while( (num = fscanf(f, "%d:%m[^:]:%m[^\n]\n", &lnum, ++ &namespecs, &cgroup)) !=EOF ) { + if (namespecs && strstr(namespecs, "cpuacct")) { + free(namespecs); + break; +@@ -690,6 +694,11 @@ static int get_realtime_budget(void) + free(namespecs); + namespecs = NULL; + } ++ /* not to get stuck if format changes */ ++ if ((num < 3) && ((fscanf(f, "%*[^\n]") == EOF) || ++ (fscanf(f, "\n") == EOF))) { ++ break; ++ } + } + fclose(f); + if (cgroup == NULL) { +@@ -776,15 +785,17 @@ sbd_make_realtime(int priority, int stackgrowK, int heapgrowK) + return; + } + ++do { + #ifdef SCHED_RR + if (move_to_root_cgroup) { + sbd_move_to_root_cgroup(enforce_moving_to_root_cgroup); + } + + { +- int pcurrent = 0; + int pmin = sched_get_priority_min(SCHED_RR); + int pmax = sched_get_priority_max(SCHED_RR); ++ struct sched_param sp; ++ int pcurrent; + + if (priority == 0) { + priority = pmax; +@@ -794,26 +805,47 @@ sbd_make_realtime(int priority, int stackgrowK, int heapgrowK) + priority = pmax; + } + +- pcurrent = sched_getscheduler(0); +- if (pcurrent < 0) { ++ if (sched_getparam(0, &sp) < 0) { + cl_perror("Unable to get scheduler priority"); + +- } else if(pcurrent < priority) { +- struct sched_param 
sp; ++ } else if ((pcurrent = sched_getscheduler(0)) < 0) { ++ cl_perror("Unable to get scheduler policy"); + ++ } else if ((pcurrent == SCHED_RR) && ++ (sp.sched_priority >= priority)) { ++ cl_log(LOG_INFO, ++ "Stay with priority (%d) for policy SCHED_RR", ++ sp.sched_priority); ++ break; ++ } else { + memset(&sp, 0, sizeof(sp)); + sp.sched_priority = priority; + + if (sched_setscheduler(0, SCHED_RR, &sp) < 0) { +- cl_perror("Unable to set scheduler priority to %d", priority); ++ cl_perror( ++ "Unable to set scheduler policy to SCHED_RR priority %d", ++ priority); + } else { +- cl_log(LOG_INFO, "Scheduler priority is now %d", priority); ++ cl_log(LOG_INFO, ++ "Scheduler policy is now SCHED_RR priority %d", ++ priority); ++ break; + } + } + } + #else +- cl_log(LOG_ERR, "System does not support updating the scheduler priority"); ++ cl_log(LOG_ERR, "System does not support updating the scheduler policy"); ++#endif ++#ifdef PRIO_PGRP ++ if (setpriority(PRIO_PGRP, 0, INT_MIN) < 0) { ++ cl_perror("Unable to raise the scheduler priority"); ++ } else { ++ cl_log(LOG_INFO, "Scheduler priority raised to the maximum"); ++ } ++#else ++ cl_perror("System does not support setting the scheduler priority"); + #endif ++} while (0); + + sbd_memlock(heapgrowK, stackgrowK); + } +@@ -826,7 +858,7 @@ maximize_priority(void) + return; + } + +- sbd_make_realtime(0, 256, 256); ++ sbd_make_realtime(0, 256, 256); + + if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), + IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 1)) != 0) { +-- +1.8.3.1 + diff --git a/gating.yaml b/gating.yaml new file mode 100644 index 0000000..9e15ced --- /dev/null +++ b/gating.yaml @@ -0,0 +1,15 @@ +--- !Policy +product_versions: + - fedora-* +decision_context: bodhi_update_push_testing +subject_type: koji_build +rules: + - !PassingTestCaseRule {test_case_name: fedora-ci.koji-build.tier0.functional} +--- !Policy +product_versions: + - fedora-* +decision_context: bodhi_update_push_stable +subject_type: koji_build +rules: + - 
!PassingTestCaseRule {test_case_name: fedora-ci.koji-build.tier0.functional} + diff --git a/sbd.spec b/sbd.spec index 86d6f85..d5f624c 100644 --- a/sbd.spec +++ b/sbd.spec @@ -15,24 +15,28 @@ # Please submit bugfixes or comments via http://bugs.opensuse.org/ # -%global commit 7f33d1a409d0a4e2cd69946688c48eaa8f3c5d26 +%global commit 25fce8a7d5e8cd5abc2379077381b10bd6cec183 %global shortcommit %(c=%{commit}; echo ${c:0:7}) -%global github_owner clusterlabs +%global github_owner Clusterlabs +%global buildnum 1 Name: sbd Summary: Storage-based death License: GPLv2+ -Version: 1.4.0 -Release: 3%{?dist} +Group: System Environment/Daemons +Version: 1.4.1 +Release: %{buildnum}%{?dist} Url: https://github.com/%{github_owner}/%{name} Source0: https://github.com/%{github_owner}/%{name}/archive/%{commit}/%{name}-%{commit}.tar.gz -Patch0: 0001-Refactor-fail-earlier-on-invalid-servants.patch +Patch0: 0001-Fix-regressions.sh-make-parameter-passing-consistent.patch +Patch1: 0002-Doc-add-environment-section-to-man-page.patch +Patch2: 0003-Fix-scheduling-overhaul-the-whole-thing.patch BuildRequires: autoconf BuildRequires: automake BuildRequires: libuuid-devel BuildRequires: glib2-devel BuildRequires: libaio-devel -BuildRequires: corosynclib-devel +BuildRequires: corosync-devel BuildRequires: pacemaker-libs-devel BuildRequires: libtool BuildRequires: libuuid-devel @@ -40,6 +44,7 @@ BuildRequires: libxml2-devel BuildRequires: pkgconfig BuildRequires: make BuildRequires: systemd +Conflicts: fence-agents-sbd < 4.5.0 %if 0%{?rhel} ExclusiveArch: i686 x86_64 s390x aarch64 ppc64le @@ -53,15 +58,28 @@ ExclusiveArch: i686 x86_64 s390x aarch64 ppc64le This package contains the storage-based death functionality. +%package tests +Summary: Storage-based death environment for regression tests +License: GPLv2+ +Group: System Environment/Daemons + +%description tests +This package provides an environment + testscripts for +regression-testing sbd. 
+ ########################################################### %prep %autosetup -n %{name}-%{commit} -p1 +%ifarch s390x s390 +sed -i src/sbd.sysconfig -e "s/Default: 5/Default: 15/" +sed -i src/sbd.sysconfig -e "s/SBD_WATCHDOG_TIMEOUT=5/SBD_WATCHDOG_TIMEOUT=15/" +%endif ########################################################### %build -autoreconf -i +./autogen.sh export CFLAGS="$RPM_OPT_FLAGS -Wall -Werror" %configure make %{?_smp_mflags} @@ -73,6 +91,7 @@ make %{?_smp_mflags} make DESTDIR=$RPM_BUILD_ROOT LIBDIR=%{_libdir} install rm -rf ${RPM_BUILD_ROOT}%{_libdir}/stonith +install -D -m 0755 tests/regressions.sh $RPM_BUILD_ROOT/usr/share/sbd/regressions.sh %if %{defined _unitdir} install -D -m 0644 src/sbd.service $RPM_BUILD_ROOT/%{_unitdir}/sbd.service install -D -m 0644 src/sbd_remote.service $RPM_BUILD_ROOT/%{_unitdir}/sbd_remote.service @@ -81,12 +100,26 @@ install -D -m 0644 src/sbd_remote.service $RPM_BUILD_ROOT/%{_unitdir}/sbd_remote mkdir -p ${RPM_BUILD_ROOT}%{_sysconfdir}/sysconfig install -m 644 src/sbd.sysconfig ${RPM_BUILD_ROOT}%{_sysconfdir}/sysconfig/sbd +# Don't package static libs +find %{buildroot} -name '*.a' -type f -print0 | xargs -0 rm -f +find %{buildroot} -name '*.la' -type f -print0 | xargs -0 rm -f + ########################################################### %if %{defined _unitdir} %post %systemd_post sbd.service %systemd_post sbd_remote.service +if [ $1 -ne 1 ] ; then + if systemctl --quiet is-enabled sbd.service 2>/dev/null + then + systemctl --quiet reenable sbd.service 2>/dev/null || : + fi + if systemctl --quiet is-enabled sbd_remote.service 2>/dev/null + then + systemctl --quiet reenable sbd_remote.service 2>/dev/null || : + fi +fi %preun %systemd_preun sbd.service @@ -102,6 +135,7 @@ install -m 644 src/sbd.sysconfig ${RPM_BUILD_ROOT}%{_sysconfdir}/sysconfig/sbd %defattr(-,root,root) %config(noreplace) %{_sysconfdir}/sysconfig/sbd %{_sbindir}/sbd +%exclude %{_datadir}/sbd/regressions.sh %doc %{_mandir}/man8/sbd* %if %{defined 
_unitdir} %{_unitdir}/sbd.service @@ -109,7 +143,21 @@ install -m 644 src/sbd.sysconfig ${RPM_BUILD_ROOT}%{_sysconfdir}/sysconfig/sbd %endif %doc COPYING +%files tests +########################################################### +%defattr(-,root,root) +%dir %{_datadir}/sbd +%{_datadir}/sbd/regressions.sh +%{_libdir}/libsbdtestbed* + %changelog +* Thu Mar 5 2020 Klaus Wenninger - 1.4.1-4 +- Rebase to upstream v1.4.1 +- Make coverity happy with parameter passing in regressions.sh +- Add auto generated environment section to man-page +- Overhaul setting scheduler policy/priority +- Enable Fedora CI Gating + * Thu Jan 30 2020 Fedora Release Engineering - 1.4.0-3 - Rebuilt for https://fedoraproject.org/wiki/Fedora_32_Mass_Rebuild diff --git a/sources b/sources index fab3041..50b7de1 100644 --- a/sources +++ b/sources @@ -1 +1 @@ -SHA512 (sbd-7f33d1a409d0a4e2cd69946688c48eaa8f3c5d26.tar.gz) = 1baca43ad95d8d0886cbd1db82eeb617a2055e5a31d16f2802c27b120894c6829bfc99663e2da402a37e6c52d0cda6cec63cbc15cb6c580895e5a1d30e5b1c62 +SHA512 (sbd-25fce8a7d5e8cd5abc2379077381b10bd6cec183.tar.gz) = 3b89ee0aa88282f17c8daf725a1e7a8c2f2affdcf6ff6f4ca4faf250760d778a65c5693e5df3fcc7554d60dd9b0cb1a0350e266fadb7668320f3c676d8799a29 diff --git a/tests/inventory b/tests/inventory new file mode 100755 index 0000000..e0cea17 --- /dev/null +++ b/tests/inventory @@ -0,0 +1,4 @@ +#!/bin/bash +export TEST_DOCKER_EXTRA_ARGS="--privileged --network host" +exec merge-standard-inventory "$@" + diff --git a/tests/tests.yml b/tests/tests.yml new file mode 100644 index 0000000..49bc170 --- /dev/null +++ b/tests/tests.yml @@ -0,0 +1,16 @@ +--- +- hosts: localhost + roles: + - role: standard-test-basic + tags: + - classic + - container + tests: + - smoke: + dir: . + run: /usr/share/sbd/regressions.sh + required_packages: + - sbd + - sbd-tests + - device-mapper +