From 03fe02f1d649987a9b7d372390ad11ef815f9bca Mon Sep 17 00:00:00 2001 From: DistroBaker Date: Fri, 30 Oct 2020 15:45:43 +0100 Subject: [PATCH] Merged update from upstream sources This is an automated DistroBaker update from upstream sources. If you do not know what this is about or would like to opt out, contact the OSCI team. Source: https://src.fedoraproject.org/rpms/sbd.git#3045a1b9b159a529231ddf470f2c82f435ee579f --- .gitignore | 2 + ...sh-make-parameter-passing-consistent.patch | 82 + ...-add-environment-section-to-man-page.patch | 1459 +++++++++++++++++ ...-scheduling-overhaul-the-whole-thing.patch | 152 ++ gating.yaml | 9 + sbd.spec | 227 +++ sources | 1 + tests/inventory | 4 + tests/tests.yml | 16 + 9 files changed, 1952 insertions(+) create mode 100644 0001-Fix-regressions.sh-make-parameter-passing-consistent.patch create mode 100644 0002-Doc-add-environment-section-to-man-page.patch create mode 100644 0003-Fix-scheduling-overhaul-the-whole-thing.patch create mode 100644 gating.yaml create mode 100644 sbd.spec create mode 100644 sources create mode 100755 tests/inventory create mode 100644 tests/tests.yml diff --git a/.gitignore b/.gitignore index e69de29..ba457ce 100644 --- a/.gitignore +++ b/.gitignore @@ -0,0 +1,2 @@ +/sbd-*.tar.gz +/sbd-*.src.rpm diff --git a/0001-Fix-regressions.sh-make-parameter-passing-consistent.patch b/0001-Fix-regressions.sh-make-parameter-passing-consistent.patch new file mode 100644 index 0000000..6f17a5a --- /dev/null +++ b/0001-Fix-regressions.sh-make-parameter-passing-consistent.patch @@ -0,0 +1,82 @@ +From 1d2a7b8d059d4f090b351b8decca0ddf274c82a0 Mon Sep 17 00:00:00 2001 +From: Klaus Wenninger +Date: Wed, 20 Nov 2019 15:20:19 +0100 +Subject: [PATCH] Fix: regressions.sh: make parameter passing consistent + +--- + tests/regressions.sh | 24 ++++++++++++------------ + 1 file changed, 12 insertions(+), 12 deletions(-) + +diff --git a/tests/regressions.sh b/tests/regressions.sh +index 6cfb303..7ab80be 100755 +--- 
a/tests/regressions.sh ++++ b/tests/regressions.sh +@@ -32,7 +32,7 @@ + : ${SBD_USE_DM:="yes"} + + sbd() { +- LD_PRELOAD=${SBD_PRELOAD} SBD_WATCHDOG_TIMEOUT=5 SBD_DEVICE="${SBD_DEVICE}" SBD_PRELOAD_LOG=${SBD_PRELOAD_LOG} SBD_WATCHDOG_DEV=/dev/watchdog setsid ${SBD_BINARY} -p ${SBD_PIDFILE} $* ++ LD_PRELOAD=${SBD_PRELOAD} SBD_WATCHDOG_TIMEOUT=5 SBD_DEVICE="${SBD_DEVICE}" SBD_PRELOAD_LOG=${SBD_PRELOAD_LOG} SBD_WATCHDOG_DEV=/dev/watchdog setsid ${SBD_BINARY} -p ${SBD_PIDFILE} "$@" + } + + sbd_wipe_disk() { +@@ -98,26 +98,26 @@ sbd_daemon_cleanup() { + pkill -TERM --pidfile ${SBD_PIDFILE} 2>/dev/null + sleep 5 + pkill -KILL --pidfile ${SBD_PIDFILE} 2>/dev/null +- pkill -KILL --parent $(cat ${SBD_PIDFILE} 2>/dev/null) 2>/dev/null ++ pkill -KILL --parent "$(cat ${SBD_PIDFILE} 2>/dev/null)" 2>/dev/null + echo > ${SBD_PIDFILE} + } + + _ok() { +- echo -- $@ +- $@ ++ echo "-- $*" ++ "$@" + rc=$? + if [ $rc -ne 0 ]; then +- echo "$@ failed with $rc" ++ echo "$* failed with $rc" + exit $rc + fi + } + + _no() { +- echo -- $@ +- $@ ++ echo "-- $*" ++ "$@" + rc=$? + if [ $rc -eq 0 ]; then +- echo "$@ did NOT fail ($rc)" ++ echo "$* did NOT fail ($rc)" + exit $rc + fi + return 0 +@@ -126,7 +126,7 @@ _no() { + _in_log() { + grep "$@" ${SBD_PRELOAD_LOG} >/dev/null + if [ $? 
-ne 0 ]; then +- echo "didn't find '$@' in log:" ++ echo "didn't find '$*' in log:" + cat ${SBD_PRELOAD_LOG} + sbd_daemon_cleanup + exit 1 +@@ -227,10 +227,10 @@ test_stall_inquisitor() { + sbd_daemon_cleanup + sbd -d ${D[1]} -d ${D[2]} -d ${D[3]} -n test-1 watch + sleep 10 +- _ok kill -0 $(cat ${SBD_PIDFILE}) +- kill -STOP $(cat ${SBD_PIDFILE}) ++ _ok kill -0 "$(cat ${SBD_PIDFILE})" ++ kill -STOP "$(cat ${SBD_PIDFILE})" + sleep 10 +- kill -CONT $(cat ${SBD_PIDFILE}) 2>/dev/null ++ kill -CONT "$(cat ${SBD_PIDFILE})" 2>/dev/null + _in_log "watchdog fired" + } + +-- +1.8.3.1 + diff --git a/0002-Doc-add-environment-section-to-man-page.patch b/0002-Doc-add-environment-section-to-man-page.patch new file mode 100644 index 0000000..2ad9556 --- /dev/null +++ b/0002-Doc-add-environment-section-to-man-page.patch @@ -0,0 +1,1459 @@ +From 9dd82a8b4daa5a7bd8ab3afa43b081f212efb1ac Mon Sep 17 00:00:00 2001 +From: Klaus Wenninger +Date: Wed, 29 Jan 2020 20:34:18 +0100 +Subject: [PATCH] Doc: add environment section to man-page + +Environment section is auto-generated from sbd.sysconfig. 
+--- + .gitignore | 1 + + Makefile.am | 6 +- + README.md | 3 +- + man/Makefile.am | 8 +- + man/sbd.8.pod | 668 ----------------------------------------------------- + man/sbd.8.pod.in | 675 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ + src/sbd.sysconfig | 3 +- + 7 files changed, 690 insertions(+), 674 deletions(-) + delete mode 100644 man/sbd.8.pod + create mode 100644 man/sbd.8.pod.in + +diff --git a/Makefile.am b/Makefile.am +index 1c29f75..bd4346d 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -9,8 +9,8 @@ TARFILE = $(distdir).tar.gz + DIST_ARCHIVES = $(TARFILE) + KEEP_EXISTING_TAR = no + INJECT_GIT_COMMIT = yes +-DISTCLEANFILES = sbd-* sbd-*/ + CLEANFILES = *.rpm *.tar.* sbd-* ++DISTCLEANFILES = sbd-* sbd-*/ + + RPM_ROOT = $(shell pwd) + RPM_OPTS = --define "_sourcedir $(RPM_ROOT)" \ +@@ -31,7 +31,7 @@ export SBD_BINARY := src/sbd + export SBD_PRELOAD := tests/.libs/libsbdtestbed.so + export SBD_USE_DM := no + +-EXTRA_DIST = sbd.spec tests/regressions.sh ++EXTRA_DIST = sbd.spec tests/regressions.sh man/sbd.8.pod.in + + export: + rm -f $(PACKAGE)-HEAD.tar.* +@@ -43,7 +43,7 @@ export: + echo `date`: Using existing tarball: $(TARFILE); \ + else \ + rm -f $(PACKAGE).tar.*; \ +- (git archive --prefix=$(distdir)/ $(shell echo $(TAG)|cut -f1 -d-) || tar -c --transform="s,^,$(distdir)/," --exclude="*.tar.*" --exclude="$(distdir)" --exclude="*.o" --exclude="*.8" --exclude="config.*" --exclude="libtool" --exclusive="ltmain.sh*" --exclude="Makefile" --exclude="Makefile.in" --exclude="stamp-*" --exclude="*.service" --exclude="sbd" --exclude="*.m4" --exclude="*.cache" --exclude="configure" --exclude="*.list" --exclude="depcomp" --exclude="install-sh" --exclude="missing" --exclude="compile" --exclude="sbd.sh" --exclude="~" --exclude="*.swp" --exclude="*.patch" --exclude="*.diff" --exclude="*.orig" --exclude="*.rej" --exclude="*.rpm" --exclude=".deps" --exclude="test-driver" *) | gzip > $(TARFILE); \ ++ (git archive --prefix=$(distdir)/ $(shell echo $(TAG)|cut 
-f1 -d-) || tar -c --transform="s,^,$(distdir)/," --exclude="*.tar.*" --exclude="$(distdir)" --exclude="*.o" --exclude="*.8" --exclude="config.*" --exclude="libtool" --exclude="ltmain.sh*" --exclude="Makefile" --exclude="Makefile.in" --exclude="stamp-*" --exclude="*.service" --exclude="sbd" --exclude="*.m4" --exclude="*.cache" --exclude="configure" --exclude="*.list" --exclude="depcomp" --exclude="install-sh" --exclude="missing" --exclude="compile" --exclude="sbd.sh" --exclude="~" --exclude="*.swp" --exclude="*.patch" --exclude="*.diff" --exclude="*.orig" --exclude="*.rej" --exclude="*.rpm" --exclude="*.pod" --exclude=".deps" --exclude="test-driver" *) | gzip > $(TARFILE); \ + if test -n "$$(git status -s)" || test "$(INJECT_GIT_COMMIT)" = "yes"; then \ + if test -n "$$(git status -s)"; then git diff HEAD --name-only|grep -v "^\."|xargs -n1 git diff HEAD > uncommitted.diff; fi; \ + rm -rf $(distdir); tar -xzf $(TARFILE); rm $(TARFILE); \ +diff --git a/README.md b/README.md +index d02a8bd..42a3fde 100644 +--- a/README.md ++++ b/README.md +@@ -5,5 +5,6 @@ A highly reliable fencing or Shoot-the-other-node-in-the-head (STONITH) mechanis + The component works with Pacemaker clusters, and is currently known to + compile and function on Pacemaker 1.1.7+ and corosync 1.4.x or 2.3.x. + +-Please see https://github.com/l-mb/sbd/blob/master/man/sbd.8.pod for the full documentation. ++Please see https://github.com/clusterlabs/sbd/blob/master/man/sbd.8.pod.in & ++https://github.com/clusterlabs/sbd/blob/master/src/sbd.sysconfig for the full documentation. 
+ +diff --git a/man/Makefile.am b/man/Makefile.am +index 3f89085..995712d 100644 +--- a/man/Makefile.am ++++ b/man/Makefile.am +@@ -1,6 +1,12 @@ + dist_man_MANS = sbd.8 + +-EXTRA_DIST = sbd.8.pod ++DISTCLEANFILES = sbd.8.pod sbd.8 sbd.sysconfig.pod ++ ++sbd.sysconfig.pod: ../src/sbd.sysconfig ++ sed -r -n -e "s/^## Type: (.*)/Allows C<\1>/;t type;s/^## Default: (.*)/ defaulting to C<\1>/;t default;s/^#*(.*)=.*/=item B<\1>\n/;t variable;s/^#*//;s/^ *//;H;d;:type;h;d;:default;H;x;s/\n//;x;d;:variable;G;p" $< > $@ ++ ++sbd.8.pod: sbd.8.pod.in sbd.sysconfig.pod ++ sed -e "s/@environment_section@//;t insert;p;d;:insert;rsbd.sysconfig.pod" $< > $@ + + sbd.8: sbd.8.pod + @POD2MAN@ -s 8 -c "STONITH Block Device" -r "SBD" -n "SBD" $< $@ +diff --git a/man/sbd.8.pod b/man/sbd.8.pod +deleted file mode 100644 +index 377c579..0000000 +--- a/man/sbd.8.pod ++++ /dev/null +@@ -1,668 +0,0 @@ +-=head1 NAME +- +-sbd - STONITH Block Device daemon +- +-=head1 SYNOPSIS +- +-sbd <-d F> [options] C +- +-=head1 SUMMARY +- +-SBD provides a node fencing mechanism (Shoot the other node in the head, +-STONITH) for Pacemaker-based clusters through the exchange of messages +-via shared block storage such as for example a SAN, iSCSI, FCoE. This +-isolates the fencing mechanism from changes in firmware version or +-dependencies on specific firmware controllers, and it can be used as a +-STONITH mechanism in all configurations that have reliable shared +-storage. +- +-SBD can also be used without any shared storage. In this mode, the +-watchdog device will be used to reset the node if it loses quorum, if +-any monitored daemon is lost and not recovered or if Pacemaker decides +-that the node requires fencing. +- +-The F binary implements both the daemon that watches the message +-slots as well as the management tool for interacting with the block +-storage device(s). This mode of operation is specified via the +-C parameter; some of these modes take additional parameters. 
+- +-To use SBD with shared storage, you must first C the messaging +-layout on one to three block devices. Second, configure +-F to list those devices (and possibly adjust other +-options), and restart the cluster stack on each node to ensure that +-C is started. Third, configure the C fencing +-resource in the Pacemaker CIB. +- +-Each of these steps is documented in more detail below the description +-of the command options. +- +-C can only be used as root. +- +-=head2 GENERAL OPTIONS +- +-=over +- +-=item B<-d> F +- +-Specify the block device(s) to be used. If you have more than one, +-specify this option up to three times. This parameter is mandatory for +-all modes, since SBD always needs a block device to interact with. +- +-This man page uses F, F, and F as +-example device names for brevity. However, in your production +-environment, you should instead always refer to them by using the long, +-stable device name (e.g., +-F). +- +-=item B<-v|-vv|-vvv> +- +-Enable verbose|debug|debug-library logging (optional) +- +-=item B<-h> +- +-Display a concise summary of C options. +- +-=item B<-n> I +- +-Set local node name; defaults to C. This should not need to be +-set. +- +-=item B<-R> +- +-Do B enable realtime priority. By default, C runs at realtime +-priority, locks itself into memory, and also acquires highest IO +-priority to protect itself against interference from other processes on +-the system. This is a debugging-only option. +- +-=item B<-I> I +- +-Async IO timeout (defaults to 3 seconds, optional). You should not need +-to adjust this unless your IO setup is really very slow. +- +-(In daemon mode, the watchdog is refreshed when the majority of devices +-could be read within this time.) +- +-=back +- +-=head2 create +- +-Example usage: +- +- sbd -d /dev/sdc2 -d /dev/sdd3 create +- +-If you specify the I command, sbd will write a metadata header +-to the device(s) specified and also initialize the messaging slots for +-up to 255 nodes. 
+- +-B: This command will not prompt for confirmation. Roughly the +-first megabyte of the specified block device(s) will be overwritten +-immediately and without backup. +- +-This command accepts a few options to adjust the default timings that +-are written to the metadata (to ensure they are identical across all +-nodes accessing the device). +- +-=over +- +-=item B<-1> I +- +-Set watchdog timeout to N seconds. This depends mostly on your storage +-latency; the majority of devices must be successfully read within this +-time, or else the node will self-fence. +- +-If your sbd device(s) reside on a multipath setup or iSCSI, this should +-be the time required to detect a path failure. You may be able to reduce +-this if your device outages are independent, or if you are using the +-Pacemaker integration. +- +-=item B<-2> I +- +-Set slot allocation timeout to N seconds. You should not need to tune +-this. +- +-=item B<-3> I +- +-Set daemon loop timeout to N seconds. You should not need to tune this. +- +-=item B<-4> I +- +-Set I timeout to N seconds. This should be twice the I +-timeout. This is the time after which a message written to a node's slot +-will be considered delivered. (Or long enough for the node to detect +-that it needed to self-fence.) +- +-This also affects the I in Pacemaker's CIB; see below. +- +-=back +- +-=head2 list +- +-Example usage: +- +- # sbd -d /dev/sda1 list +- 0 hex-0 clear +- 1 hex-7 clear +- 2 hex-9 clear +- +-List all allocated slots on device, and messages. You should see all +-cluster nodes that have ever been started against this device. Nodes +-that are currently running should have a I state; nodes that have +-been fenced, but not yet restarted, will show the appropriate fencing +-message. 
+- +-=head2 dump +- +-Example usage: +- +- # sbd -d /dev/sda1 dump +- ==Dumping header on disk /dev/sda1 +- Header version : 2 +- Number of slots : 255 +- Sector size : 512 +- Timeout (watchdog) : 15 +- Timeout (allocate) : 2 +- Timeout (loop) : 1 +- Timeout (msgwait) : 30 +- ==Header on disk /dev/sda1 is dumped +- +-Dump meta-data header from device. +- +-=head2 watch +- +-Example usage: +- +- sbd -d /dev/sdc2 -d /dev/sdd3 -P watch +- +-This command will make C start in daemon mode. It will constantly monitor +-the message slot of the local node for incoming messages, reachability, and +-optionally take Pacemaker's state into account. +- +-C B be started on boot before the cluster stack! See below +-for enabling this according to your boot environment. +- +-The options for this mode are rarely specified directly on the +-commandline directly, but most frequently set via F. +- +-It also constantly monitors connectivity to the storage device, and +-self-fences in case the partition becomes unreachable, guaranteeing that it +-does not disconnect from fencing messages. +- +-A node slot is automatically allocated on the device(s) the first time +-the daemon starts watching the device; hence, manual allocation is not +-usually required. +- +-If a watchdog is used together with the C as is strongly +-recommended, the watchdog is activated at initial start of the sbd +-daemon. The watchdog is refreshed every time the majority of SBD devices +-has been successfully read. Using a watchdog provides additional +-protection against C crashing. +- +-If the Pacemaker integration is activated, C will B self-fence +-if device majority is lost, if: +- +-=over +- +-=item 1. +- +-The partition the node is in is still quorate according to the CIB; +- +-=item 2. +- +-it is still quorate according to Corosync's node count; +- +-=item 3. +- +-the node itself is considered online and healthy by Pacemaker. 
+- +-=back +- +-This allows C to survive temporary outages of the majority of +-devices. However, while the cluster is in such a degraded state, it can +-neither successfully fence nor be shutdown cleanly (as taking the +-cluster below the quorum threshold will immediately cause all remaining +-nodes to self-fence). In short, it will not tolerate any further faults. +-Please repair the system before continuing. +- +-There is one C process that acts as a master to which all watchers +-report; one per device to monitor the node's slot; and, optionally, one +-that handles the Pacemaker integration. +- +-=over +- +-=item B<-W> +- +-Enable or disable use of the system watchdog to protect against the sbd +-processes failing and the node being left in an undefined state. Specify +-this once to enable, twice to disable. +- +-Defaults to I. +- +-=item B<-w> F +- +-This can be used to override the default watchdog device used and should not +-usually be necessary. +- +-=item B<-p> F +- +-This option can be used to specify a pidfile for the main sbd process. +- +-=item B<-F> I +- +-Number of failures before a failing servant process will not be restarted +-immediately until the dampening delay has expired. If set to zero, servants +-will be restarted immediately and indefinitely. If set to one, a failed +-servant will be restarted once every B<-t> seconds. If set to a different +-value, the servant will be restarted that many times within the dampening +-period and then delay. +- +-Defaults to I<1>. +- +-=item B<-t> I +- +-Dampening delay before faulty servants are restarted. Combined with C<-F 1>, +-the most logical way to tune the restart frequency of servant processes. +-Default is 5 seconds. +- +-If set to zero, processes will be restarted indefinitely and immediately. +- +-=item B<-P> +- +-Enable Pacemaker integration which checks Pacemaker quorum and node health. +-Specify this once to enable, twice to disable. +- +-Defaults to I. 
+- +-=item B<-S> I +- +-Set the start mode. (Defaults to I<0>.) +- +-If this is set to zero, sbd will always start up unconditionally, +-regardless of whether the node was previously fenced or not. +- +-If set to one, sbd will only start if the node was previously shutdown +-cleanly (as indicated by an exit request message in the slot), or if the +-slot is empty. A reset, crashdump, or power-off request in any slot will +-halt the start up. +- +-This is useful to prevent nodes from rejoining if they were faulty. The +-node must be manually "unfenced" by sending an empty message to it: +- +- sbd -d /dev/sda1 message node1 clear +- +-=item B<-s> I +- +-Set the start-up wait time for devices. (Defaults to I<120>.) +- +-Dynamic block devices such as iSCSI might not be fully initialized and +-present yet. This allows one to set a timeout for waiting for devices to +-appear on start-up. If set to 0, start-up will be aborted immediately if +-no devices are available. +- +-=item B<-Z> +- +-Enable trace mode. B Specifying this once will turn all reboots or power-offs, be +-they caused by self-fence decisions or messages, into a crashdump. +-Specifying this twice will just log them but not continue running. +- +-=item B<-T> +- +-By default, the daemon will set the watchdog timeout as specified in the +-device metadata. However, this does not work for every watchdog device. +-In this case, you must manually ensure that the watchdog timeout used by +-the system correctly matches the SBD settings, and then specify this +-option to allow C to continue with start-up. +- +-=item B<-5> I +- +-Warn if the time interval for tickling the watchdog exceeds this many seconds. +-Since the node is unable to log the watchdog expiry (it reboots immediately +-without a chance to write its logs to disk), this is very useful for getting +-an indication that the watchdog timeout is too short for the IO load of the +-system. +- +-Default is 3 seconds, set to zero to disable. 
+- +-=item B<-C> I +- +-Watchdog timeout to set before crashdumping. If SBD is set to crashdump +-instead of reboot - either via the trace mode settings or the I +-fencing agent's parameter -, SBD will adjust the watchdog timeout to this +-setting before triggering the dump. Otherwise, the watchdog might trigger and +-prevent a successful crashdump from ever being written. +- +-Set to zero (= default) to disable. +- +-=item B<-r> I +- +-Actions to be executed when the watchers don't timely report to the sbd +-master process or one of the watchers detects that the master process +-has died. +- +-Set timeout-action to comma-separated combination of +-noflush|flush plus reboot|crashdump|off. +-If just one of both is given the other stays at the default. +- +-This doesn't affect actions like off, crashdump, reboot explicitly +-triggered via message slots. +-And it does as well not configure the action a watchdog would +-trigger should it run off (there is no generic interface). +- +-Defaults to flush,reboot. +- +-=back +- +-=head2 allocate +- +-Example usage: +- +- sbd -d /dev/sda1 allocate node1 +- +-Explicitly allocates a slot for the specified node name. This should +-rarely be necessary, as every node will automatically allocate itself a +-slot the first time it starts up on watch mode. +- +-=head2 message +- +-Example usage: +- +- sbd -d /dev/sda1 message node1 test +- +-Writes the specified message to node's slot. This is rarely done +-directly, but rather abstracted via the C fencing agent +-configured as a cluster resource. +- +-Supported message types are: +- +-=over +- +-=item test +- +-This only generates a log message on the receiving node and can be used +-to check if SBD is seeing the device. Note that this could overwrite a +-fencing request send by the cluster, so should not be used during +-production. +- +-=item reset +- +-Reset the target upon receipt of this message. +- +-=item off +- +-Power-off the target. 
+- +-=item crashdump +- +-Cause the target node to crashdump. +- +-=item exit +- +-This will make the C daemon exit cleanly on the target. You should +-B send this message manually; this is handled properly during +-shutdown of the cluster stack. Manually stopping the daemon means the +-node is unprotected! +- +-=item clear +- +-This message indicates that no real message has been sent to the node. +-You should not set this manually; C will clear the message slot +-automatically during start-up, and setting this manually could overwrite +-a fencing message by the cluster. +- +-=back +- +-=head2 query-watchdog +- +-Example usage: +- +- sbd query-watchdog +- +-Check for available watchdog devices and print some info. +- +-B: This command will arm the watchdog during query, and if your +-watchdog refuses disarming (for example, if its kernel module has the +-'nowayout' parameter set) this will reset your system. +- +-=head2 test-watchdog +- +-Example usage: +- +- sbd test-watchdog [-w /dev/watchdog3] +- +-Test specified watchdog device (/dev/watchdog by default). +- +-B: This command will arm the watchdog and have your system reset +-in case your watchdog is working properly! If issued from an interactive +-session, it will prompt for confirmation. +- +-=head1 Base system configuration +- +-=head2 Configure a watchdog +- +-It is highly recommended that you configure your Linux system to load a +-watchdog driver with hardware assistance (as is available on most modern +-systems), such as I, I, or others. As a fall-back, you +-can use the I module. +- +-No other software must access the watchdog timer; it can only be +-accessed by one process at any given time. Some hardware vendors ship +-systems management software that use the watchdog for system resets +-(f.e. HP ASR daemon). Such software has to be disabled if the watchdog +-is to be used by SBD. 
+- +-=head2 Choosing and initializing the block device(s) +- +-First, you have to decide if you want to use one, two, or three devices. +- +-If you are using multiple ones, they should reside on independent +-storage setups. Putting all three of them on the same logical unit for +-example would not provide any additional redundancy. +- +-The SBD device can be connected via Fibre Channel, Fibre Channel over +-Ethernet, or even iSCSI. Thus, an iSCSI target can become a sort-of +-network-based quorum server; the advantage is that it does not require +-a smart host at your third location, just block storage. +- +-The SBD partitions themselves B be mirrored (via MD, +-DRBD, or the storage layer itself), since this could result in a +-split-mirror scenario. Nor can they reside on cLVM2 volume groups, since +-they must be accessed by the cluster stack before it has started the +-cLVM2 daemons; hence, these should be either raw partitions or logical +-units on (multipath) storage. +- +-The block device(s) must be accessible from all nodes. (While it is not +-necessary that they share the same path name on all nodes, this is +-considered a very good idea.) +- +-SBD will only use about one megabyte per device, so you can easily +-create a small partition, or very small logical units. (The size of the +-SBD device depends on the block size of the underlying device. Thus, 1MB +-is fine on plain SCSI devices and SAN storage with 512 byte blocks. On +-the IBM s390x architecture in particular, disks default to 4k blocks, +-and thus require roughly 4MB.) +- +-The number of devices will affect the operation of SBD as follows: +- +-=over +- +-=item One device +- +-In its most simple implementation, you use one device only. This is +-appropriate for clusters where all your data is on the same shared +-storage (with internal redundancy) anyway; the SBD device does not +-introduce an additional single point of failure then. 
+- +-If the SBD device is not accessible, the daemon will fail to start and +-inhibit startup of cluster services. +- +-=item Two devices +- +-This configuration is a trade-off, primarily aimed at environments where +-host-based mirroring is used, but no third storage device is available. +- +-SBD will not commit suicide if it loses access to one mirror leg; this +-allows the cluster to continue to function even in the face of one outage. +- +-However, SBD will not fence the other side while only one mirror leg is +-available, since it does not have enough knowledge to detect an asymmetric +-split of the storage. So it will not be able to automatically tolerate a +-second failure while one of the storage arrays is down. (Though you +-can use the appropriate crm command to acknowledge the fence manually.) +- +-It will not start unless both devices are accessible on boot. +- +-=item Three devices +- +-In this most reliable and recommended configuration, SBD will only +-self-fence if more than one device is lost; hence, this configuration is +-resilient against temporary single device outages (be it due to failures +-or maintenance). Fencing messages can still be successfully relayed if +-at least two devices remain accessible. +- +-This configuration is appropriate for more complex scenarios where +-storage is not confined to a single array. For example, host-based +-mirroring solutions could have one SBD per mirror leg (not mirrored +-itself), and an additional tie-breaker on iSCSI. +- +-It will only start if at least two devices are accessible on boot. +- +-=back +- +-After you have chosen the devices and created the appropriate partitions +-and perhaps multipath alias names to ease management, use the C +-command described above to initialize the SBD metadata on them. 
+- +-=head3 Sharing the block device(s) between multiple clusters +- +-It is possible to share the block devices between multiple clusters, +-provided the total number of nodes accessing them does not exceed I<255> +-nodes, and they all must share the same SBD timeouts (since these are +-part of the metadata). +- +-If you are using multiple devices this can reduce the setup overhead +-required. However, you should B share devices between clusters in +-different security domains. +- +-=head2 Configure SBD to start on boot +- +-On systems using C, the C or C system +-start-up scripts must handle starting or stopping C as required +-before starting the rest of the cluster stack. +- +-For C, sbd simply has to be enabled using +- +- systemctl enable sbd.service +- +-The daemon is brought online on each node before corosync and Pacemaker +-are started, and terminated only after all other cluster components have +-been shut down - ensuring that cluster resources are never activated +-without SBD supervision. +- +-=head2 Configuration via sysconfig +- +-The system instance of C is configured via F. +-In this file, you must specify the device(s) used, as well as any +-options to pass to the daemon: +- +- SBD_DEVICE="/dev/sda1;/dev/sdb1;/dev/sdc1" +- SBD_PACEMAKER="true" +- +-C will fail to start if no C is specified. See the +-installed template for more options that can be configured here. +-In general configuration done via parameters takes precedence over +-the configuration from the configuration file. 
+- +-=head2 Testing the sbd installation +- +-After a restart of the cluster stack on this node, you can now try +-sending a test message to it as root, from this or any other node: +- +- sbd -d /dev/sda1 message node1 test +- +-The node will acknowledge the receipt of the message in the system logs: +- +- Aug 29 14:10:00 node1 sbd: [13412]: info: Received command test from node2 +- +-This confirms that SBD is indeed up and running on the node, and that it +-is ready to receive messages. +- +-Make B that F is identical on all cluster +-nodes, and that all cluster nodes are running the daemon. +- +-=head1 Pacemaker CIB integration +- +-=head2 Fencing resource +- +-Pacemaker can only interact with SBD to issue a node fence if there is a +-configure fencing resource. This should be a primitive, not a clone, as +-follows: +- +- primitive fencing-sbd stonith:external/sbd \ +- params pcmk_delay_max=30 +- +-This will automatically use the same devices as configured in +-F. +- +-While you should not configure this as a clone (as Pacemaker will register +-the fencing device on each node automatically), the I +-setting enables random fencing delay which ensures, in a scenario where a +-split-brain scenario did occur in a two node cluster, that one of the nodes +-has a better chance to survive to avoid double fencing. +- +-SBD also supports turning the reset request into a crash request, which +-may be helpful for debugging if you have kernel crashdumping configured; +-then, every fence request will cause the node to dump core. You can +-enable this via the C parameter on the fencing +-resource. This is B recommended for production use, but only for +-debugging phases. +- +-=head2 General cluster properties +- +-You must also enable STONITH in general, and set the STONITH timeout to +-be at least twice the I timeout you have configured, to allow +-enough time for the fencing message to be delivered. 
If your I +-timeout is 60 seconds, this is a possible configuration: +- +- property stonith-enabled="true" +- property stonith-timeout="120s" +- +-B: if I is too low for I and the +-system overhead, sbd will never be able to successfully complete a fence +-request. This will create a fencing loop. +- +-Note that the sbd fencing agent will try to detect this and +-automatically extend the I setting to a reasonable +-value, on the assumption that sbd modifying your configuration is +-preferable to not fencing. +- +-=head1 Management tasks +- +-=head2 Recovering from temporary SBD device outage +- +-If you have multiple devices, failure of a single device is not immediately +-fatal. C will retry to restart the monitor for the device every 5 +-seconds by default. However, you can tune this via the options to the +-I command. +- +-In case you wish the immediately force a restart of all currently +-disabled monitor processes, you can send a I to the SBD +-I process. +- +- +-=head1 LICENSE +- +-Copyright (C) 2008-2013 Lars Marowsky-Bree +- +-This program is free software; you can redistribute it and/or +-modify it under the terms of the GNU General Public +-License as published by the Free Software Foundation; either +-version 2 of the License, or (at your option) any later version. +- +-This software is distributed in the hope that it will be useful, +-but WITHOUT ANY WARRANTY; without even the implied warranty of +-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +-General Public License for more details. +- +-For details see the GNU General Public License at +-http://www.gnu.org/licenses/gpl-2.0.html (version 2) and/or +-http://www.gnu.org/licenses/gpl.html (the newest as per "any later"). 
+diff --git a/man/sbd.8.pod.in b/man/sbd.8.pod.in +new file mode 100644 +index 0000000..ff89c82 +--- /dev/null ++++ b/man/sbd.8.pod.in +@@ -0,0 +1,675 @@ ++=head1 NAME ++ ++sbd - STONITH Block Device daemon ++ ++=head1 SYNOPSIS ++ ++sbd <-d F> [options] C ++ ++=head1 SUMMARY ++ ++SBD provides a node fencing mechanism (Shoot the other node in the head, ++STONITH) for Pacemaker-based clusters through the exchange of messages ++via shared block storage such as for example a SAN, iSCSI, FCoE. This ++isolates the fencing mechanism from changes in firmware version or ++dependencies on specific firmware controllers, and it can be used as a ++STONITH mechanism in all configurations that have reliable shared ++storage. ++ ++SBD can also be used without any shared storage. In this mode, the ++watchdog device will be used to reset the node if it loses quorum, if ++any monitored daemon is lost and not recovered or if Pacemaker decides ++that the node requires fencing. ++ ++The F binary implements both the daemon that watches the message ++slots as well as the management tool for interacting with the block ++storage device(s). This mode of operation is specified via the ++C parameter; some of these modes take additional parameters. ++ ++To use SBD with shared storage, you must first C the messaging ++layout on one to three block devices. Second, configure ++F to list those devices (and possibly adjust other ++options), and restart the cluster stack on each node to ensure that ++C is started. Third, configure the C fencing ++resource in the Pacemaker CIB. ++ ++Each of these steps is documented in more detail below the description ++of the command options. ++ ++C can only be used as root. ++ ++=head2 GENERAL OPTIONS ++ ++=over ++ ++=item B<-d> F ++ ++Specify the block device(s) to be used. If you have more than one, ++specify this option up to three times. This parameter is mandatory for ++all modes, since SBD always needs a block device to interact with. 
++ ++This man page uses F, F, and F as ++example device names for brevity. However, in your production ++environment, you should instead always refer to them by using the long, ++stable device name (e.g., ++F). ++ ++=item B<-v|-vv|-vvv> ++ ++Enable verbose|debug|debug-library logging (optional) ++ ++=item B<-h> ++ ++Display a concise summary of C options. ++ ++=item B<-n> I ++ ++Set local node name; defaults to C. This should not need to be ++set. ++ ++=item B<-R> ++ ++Do B enable realtime priority. By default, C runs at realtime ++priority, locks itself into memory, and also acquires highest IO ++priority to protect itself against interference from other processes on ++the system. This is a debugging-only option. ++ ++=item B<-I> I ++ ++Async IO timeout (defaults to 3 seconds, optional). You should not need ++to adjust this unless your IO setup is really very slow. ++ ++(In daemon mode, the watchdog is refreshed when the majority of devices ++could be read within this time.) ++ ++=back ++ ++=head2 create ++ ++Example usage: ++ ++ sbd -d /dev/sdc2 -d /dev/sdd3 create ++ ++If you specify the I command, sbd will write a metadata header ++to the device(s) specified and also initialize the messaging slots for ++up to 255 nodes. ++ ++B: This command will not prompt for confirmation. Roughly the ++first megabyte of the specified block device(s) will be overwritten ++immediately and without backup. ++ ++This command accepts a few options to adjust the default timings that ++are written to the metadata (to ensure they are identical across all ++nodes accessing the device). ++ ++=over ++ ++=item B<-1> I ++ ++Set watchdog timeout to N seconds. This depends mostly on your storage ++latency; the majority of devices must be successfully read within this ++time, or else the node will self-fence. ++ ++If your sbd device(s) reside on a multipath setup or iSCSI, this should ++be the time required to detect a path failure. 
You may be able to reduce ++this if your device outages are independent, or if you are using the ++Pacemaker integration. ++ ++=item B<-2> I<N> ++ ++Set slot allocation timeout to N seconds. You should not need to tune ++this. ++ ++=item B<-3> I<N> ++ ++Set daemon loop timeout to N seconds. You should not need to tune this. ++ ++=item B<-4> I<N> ++ ++Set I<msgwait> timeout to N seconds. This should be twice the I<watchdog> ++timeout. This is the time after which a message written to a node's slot ++will be considered delivered. (Or long enough for the node to detect ++that it needed to self-fence.) ++ ++This also affects the I<stonith-timeout> in Pacemaker's CIB; see below. ++ ++=back ++ ++=head2 list ++ ++Example usage: ++ ++ # sbd -d /dev/sda1 list ++ 0 hex-0 clear ++ 1 hex-7 clear ++ 2 hex-9 clear ++ ++List all allocated slots on device, and messages. You should see all ++cluster nodes that have ever been started against this device. Nodes ++that are currently running should have a I<clear> state; nodes that have ++been fenced, but not yet restarted, will show the appropriate fencing ++message. ++ ++=head2 dump ++ ++Example usage: ++ ++ # sbd -d /dev/sda1 dump ++ ==Dumping header on disk /dev/sda1 ++ Header version : 2 ++ Number of slots : 255 ++ Sector size : 512 ++ Timeout (watchdog) : 15 ++ Timeout (allocate) : 2 ++ Timeout (loop) : 1 ++ Timeout (msgwait) : 30 ++ ==Header on disk /dev/sda1 is dumped ++ ++Dump meta-data header from device. ++ ++=head2 watch ++ ++Example usage: ++ ++ sbd -d /dev/sdc2 -d /dev/sdd3 -P watch ++ ++This command will make C<sbd> start in daemon mode. It will constantly monitor ++the message slot of the local node for incoming messages, reachability, and ++optionally take Pacemaker's state into account. ++ ++C<sbd> B<must> be started on boot before the cluster stack! See below ++for enabling this according to your boot environment. ++ ++The options for this mode are rarely specified directly on the ++commandline, but most frequently set via F</etc/sysconfig/sbd>.
++ ++It also constantly monitors connectivity to the storage device, and ++self-fences in case the partition becomes unreachable, guaranteeing that it ++does not disconnect from fencing messages. ++ ++A node slot is automatically allocated on the device(s) the first time ++the daemon starts watching the device; hence, manual allocation is not ++usually required. ++ ++If a watchdog is used together with the C as is strongly ++recommended, the watchdog is activated at initial start of the sbd ++daemon. The watchdog is refreshed every time the majority of SBD devices ++has been successfully read. Using a watchdog provides additional ++protection against C crashing. ++ ++If the Pacemaker integration is activated, C will B self-fence ++if device majority is lost, if: ++ ++=over ++ ++=item 1. ++ ++The partition the node is in is still quorate according to the CIB; ++ ++=item 2. ++ ++it is still quorate according to Corosync's node count; ++ ++=item 3. ++ ++the node itself is considered online and healthy by Pacemaker. ++ ++=back ++ ++This allows C to survive temporary outages of the majority of ++devices. However, while the cluster is in such a degraded state, it can ++neither successfully fence nor be shutdown cleanly (as taking the ++cluster below the quorum threshold will immediately cause all remaining ++nodes to self-fence). In short, it will not tolerate any further faults. ++Please repair the system before continuing. ++ ++There is one C process that acts as a master to which all watchers ++report; one per device to monitor the node's slot; and, optionally, one ++that handles the Pacemaker integration. ++ ++=over ++ ++=item B<-W> ++ ++Enable or disable use of the system watchdog to protect against the sbd ++processes failing and the node being left in an undefined state. Specify ++this once to enable, twice to disable. ++ ++Defaults to I. ++ ++=item B<-w> F ++ ++This can be used to override the default watchdog device used and should not ++usually be necessary. 
++ ++=item B<-p> F ++ ++This option can be used to specify a pidfile for the main sbd process. ++ ++=item B<-F> I ++ ++Number of failures before a failing servant process will not be restarted ++immediately until the dampening delay has expired. If set to zero, servants ++will be restarted immediately and indefinitely. If set to one, a failed ++servant will be restarted once every B<-t> seconds. If set to a different ++value, the servant will be restarted that many times within the dampening ++period and then delay. ++ ++Defaults to I<1>. ++ ++=item B<-t> I ++ ++Dampening delay before faulty servants are restarted. Combined with C<-F 1>, ++the most logical way to tune the restart frequency of servant processes. ++Default is 5 seconds. ++ ++If set to zero, processes will be restarted indefinitely and immediately. ++ ++=item B<-P> ++ ++Enable Pacemaker integration which checks Pacemaker quorum and node health. ++Specify this once to enable, twice to disable. ++ ++Defaults to I. ++ ++=item B<-S> I ++ ++Set the start mode. (Defaults to I<0>.) ++ ++If this is set to zero, sbd will always start up unconditionally, ++regardless of whether the node was previously fenced or not. ++ ++If set to one, sbd will only start if the node was previously shutdown ++cleanly (as indicated by an exit request message in the slot), or if the ++slot is empty. A reset, crashdump, or power-off request in any slot will ++halt the start up. ++ ++This is useful to prevent nodes from rejoining if they were faulty. The ++node must be manually "unfenced" by sending an empty message to it: ++ ++ sbd -d /dev/sda1 message node1 clear ++ ++=item B<-s> I ++ ++Set the start-up wait time for devices. (Defaults to I<120>.) ++ ++Dynamic block devices such as iSCSI might not be fully initialized and ++present yet. This allows one to set a timeout for waiting for devices to ++appear on start-up. If set to 0, start-up will be aborted immediately if ++no devices are available. 
++ ++=item B<-Z> ++ ++Enable trace mode. B Specifying this once will turn all reboots or power-offs, be ++they caused by self-fence decisions or messages, into a crashdump. ++Specifying this twice will just log them but not continue running. ++ ++=item B<-T> ++ ++By default, the daemon will set the watchdog timeout as specified in the ++device metadata. However, this does not work for every watchdog device. ++In this case, you must manually ensure that the watchdog timeout used by ++the system correctly matches the SBD settings, and then specify this ++option to allow C to continue with start-up. ++ ++=item B<-5> I ++ ++Warn if the time interval for tickling the watchdog exceeds this many seconds. ++Since the node is unable to log the watchdog expiry (it reboots immediately ++without a chance to write its logs to disk), this is very useful for getting ++an indication that the watchdog timeout is too short for the IO load of the ++system. ++ ++Default is 3 seconds, set to zero to disable. ++ ++=item B<-C> I ++ ++Watchdog timeout to set before crashdumping. If SBD is set to crashdump ++instead of reboot - either via the trace mode settings or the I ++fencing agent's parameter -, SBD will adjust the watchdog timeout to this ++setting before triggering the dump. Otherwise, the watchdog might trigger and ++prevent a successful crashdump from ever being written. ++ ++Set to zero (= default) to disable. ++ ++=item B<-r> I ++ ++Actions to be executed when the watchers don't timely report to the sbd ++master process or one of the watchers detects that the master process ++has died. ++ ++Set timeout-action to comma-separated combination of ++noflush|flush plus reboot|crashdump|off. ++If just one of both is given the other stays at the default. ++ ++This doesn't affect actions like off, crashdump, reboot explicitly ++triggered via message slots. ++And it does as well not configure the action a watchdog would ++trigger should it run off (there is no generic interface). 
++ ++Defaults to flush,reboot. ++ ++=back ++ ++=head2 allocate ++ ++Example usage: ++ ++ sbd -d /dev/sda1 allocate node1 ++ ++Explicitly allocates a slot for the specified node name. This should ++rarely be necessary, as every node will automatically allocate itself a ++slot the first time it starts up in watch mode. ++ ++=head2 message ++ ++Example usage: ++ ++ sbd -d /dev/sda1 message node1 test ++ ++Writes the specified message to node's slot. This is rarely done ++directly, but rather abstracted via the C<external/sbd> fencing agent ++configured as a cluster resource. ++ ++Supported message types are: ++ ++=over ++ ++=item test ++ ++This only generates a log message on the receiving node and can be used ++to check if SBD is seeing the device. Note that this could overwrite a ++fencing request sent by the cluster, so should not be used during ++production. ++ ++=item reset ++ ++Reset the target upon receipt of this message. ++ ++=item off ++ ++Power-off the target. ++ ++=item crashdump ++ ++Cause the target node to crashdump. ++ ++=item exit ++ ++This will make the C<sbd> daemon exit cleanly on the target. You should ++B<not> send this message manually; this is handled properly during ++shutdown of the cluster stack. Manually stopping the daemon means the ++node is unprotected! ++ ++=item clear ++ ++This message indicates that no real message has been sent to the node. ++You should not set this manually; C<sbd> will clear the message slot ++automatically during start-up, and setting this manually could overwrite ++a fencing message by the cluster. ++ ++=back ++ ++=head2 query-watchdog ++ ++Example usage: ++ ++ sbd query-watchdog ++ ++Check for available watchdog devices and print some info. ++ ++B<Warning>: This command will arm the watchdog during query, and if your ++watchdog refuses disarming (for example, if its kernel module has the ++'nowayout' parameter set) this will reset your system.
++ ++=head2 test-watchdog ++ ++Example usage: ++ ++ sbd test-watchdog [-w /dev/watchdog3] ++ ++Test specified watchdog device (/dev/watchdog by default). ++ ++B: This command will arm the watchdog and have your system reset ++in case your watchdog is working properly! If issued from an interactive ++session, it will prompt for confirmation. ++ ++=head1 Base system configuration ++ ++=head2 Configure a watchdog ++ ++It is highly recommended that you configure your Linux system to load a ++watchdog driver with hardware assistance (as is available on most modern ++systems), such as I, I, or others. As a fall-back, you ++can use the I module. ++ ++No other software must access the watchdog timer; it can only be ++accessed by one process at any given time. Some hardware vendors ship ++systems management software that use the watchdog for system resets ++(f.e. HP ASR daemon). Such software has to be disabled if the watchdog ++is to be used by SBD. ++ ++=head2 Choosing and initializing the block device(s) ++ ++First, you have to decide if you want to use one, two, or three devices. ++ ++If you are using multiple ones, they should reside on independent ++storage setups. Putting all three of them on the same logical unit for ++example would not provide any additional redundancy. ++ ++The SBD device can be connected via Fibre Channel, Fibre Channel over ++Ethernet, or even iSCSI. Thus, an iSCSI target can become a sort-of ++network-based quorum server; the advantage is that it does not require ++a smart host at your third location, just block storage. ++ ++The SBD partitions themselves B be mirrored (via MD, ++DRBD, or the storage layer itself), since this could result in a ++split-mirror scenario. Nor can they reside on cLVM2 volume groups, since ++they must be accessed by the cluster stack before it has started the ++cLVM2 daemons; hence, these should be either raw partitions or logical ++units on (multipath) storage. 
++ ++The block device(s) must be accessible from all nodes. (While it is not ++necessary that they share the same path name on all nodes, this is ++considered a very good idea.) ++ ++SBD will only use about one megabyte per device, so you can easily ++create a small partition, or very small logical units. (The size of the ++SBD device depends on the block size of the underlying device. Thus, 1MB ++is fine on plain SCSI devices and SAN storage with 512 byte blocks. On ++the IBM s390x architecture in particular, disks default to 4k blocks, ++and thus require roughly 4MB.) ++ ++The number of devices will affect the operation of SBD as follows: ++ ++=over ++ ++=item One device ++ ++In its most simple implementation, you use one device only. This is ++appropriate for clusters where all your data is on the same shared ++storage (with internal redundancy) anyway; the SBD device does not ++introduce an additional single point of failure then. ++ ++If the SBD device is not accessible, the daemon will fail to start and ++inhibit startup of cluster services. ++ ++=item Two devices ++ ++This configuration is a trade-off, primarily aimed at environments where ++host-based mirroring is used, but no third storage device is available. ++ ++SBD will not commit suicide if it loses access to one mirror leg; this ++allows the cluster to continue to function even in the face of one outage. ++ ++However, SBD will not fence the other side while only one mirror leg is ++available, since it does not have enough knowledge to detect an asymmetric ++split of the storage. So it will not be able to automatically tolerate a ++second failure while one of the storage arrays is down. (Though you ++can use the appropriate crm command to acknowledge the fence manually.) ++ ++It will not start unless both devices are accessible on boot. 
++ ++=item Three devices ++ ++In this most reliable and recommended configuration, SBD will only ++self-fence if more than one device is lost; hence, this configuration is ++resilient against temporary single device outages (be it due to failures ++or maintenance). Fencing messages can still be successfully relayed if ++at least two devices remain accessible. ++ ++This configuration is appropriate for more complex scenarios where ++storage is not confined to a single array. For example, host-based ++mirroring solutions could have one SBD per mirror leg (not mirrored ++itself), and an additional tie-breaker on iSCSI. ++ ++It will only start if at least two devices are accessible on boot. ++ ++=back ++ ++After you have chosen the devices and created the appropriate partitions ++and perhaps multipath alias names to ease management, use the C ++command described above to initialize the SBD metadata on them. ++ ++=head3 Sharing the block device(s) between multiple clusters ++ ++It is possible to share the block devices between multiple clusters, ++provided the total number of nodes accessing them does not exceed I<255> ++nodes, and they all must share the same SBD timeouts (since these are ++part of the metadata). ++ ++If you are using multiple devices this can reduce the setup overhead ++required. However, you should B share devices between clusters in ++different security domains. ++ ++=head2 Configure SBD to start on boot ++ ++On systems using C, the C or C system ++start-up scripts must handle starting or stopping C as required ++before starting the rest of the cluster stack. ++ ++For C, sbd simply has to be enabled using ++ ++ systemctl enable sbd.service ++ ++The daemon is brought online on each node before corosync and Pacemaker ++are started, and terminated only after all other cluster components have ++been shut down - ensuring that cluster resources are never activated ++without SBD supervision. 
++ ++=head2 Configuration via sysconfig ++ ++The system instance of C<sbd> is configured via F</etc/sysconfig/sbd>. ++In this file, you must specify the device(s) used, as well as any ++options to pass to the daemon: ++ ++ SBD_DEVICE="/dev/sda1;/dev/sdb1;/dev/sdc1" ++ SBD_PACEMAKER="true" ++ ++C<sbd> will fail to start if no C<SBD_DEVICE> is specified. See the ++installed template or section for configuration via environment ++for more options that can be configured here. ++In general configuration done via parameters takes precedence over ++the configuration from the configuration file. ++ ++=head2 Configuration via environment ++ ++=over ++@environment_section@ ++=back ++ ++=head2 Testing the sbd installation ++ ++After a restart of the cluster stack on this node, you can now try ++sending a test message to it as root, from this or any other node: ++ ++ sbd -d /dev/sda1 message node1 test ++ ++The node will acknowledge the receipt of the message in the system logs: ++ ++ Aug 29 14:10:00 node1 sbd: [13412]: info: Received command test from node2 ++ ++This confirms that SBD is indeed up and running on the node, and that it ++is ready to receive messages. ++ ++Make B<sure> that F</etc/sysconfig/sbd> is identical on all cluster ++nodes, and that all cluster nodes are running the daemon. ++ ++=head1 Pacemaker CIB integration ++ ++=head2 Fencing resource ++ ++Pacemaker can only interact with SBD to issue a node fence if there is a ++configured fencing resource. This should be a primitive, not a clone, as ++follows: ++ ++ primitive fencing-sbd stonith:external/sbd \ ++ params pcmk_delay_max=30 ++ ++This will automatically use the same devices as configured in ++F</etc/sysconfig/sbd>. ++ ++While you should not configure this as a clone (as Pacemaker will register ++the fencing device on each node automatically), the I<pcmk_delay_max> ++setting enables random fencing delay which ensures, in a scenario where a ++split-brain scenario did occur in a two node cluster, that one of the nodes ++has a better chance to survive to avoid double fencing.
++ ++SBD also supports turning the reset request into a crash request, which ++may be helpful for debugging if you have kernel crashdumping configured; ++then, every fence request will cause the node to dump core. You can ++enable this via the C<crashdump="true"> parameter on the fencing ++resource. This is B<not> recommended for production use, but only for ++debugging phases. ++ ++=head2 General cluster properties ++ ++You must also enable STONITH in general, and set the STONITH timeout to ++be at least twice the I<msgwait> timeout you have configured, to allow ++enough time for the fencing message to be delivered. If your I<msgwait> ++timeout is 60 seconds, this is a possible configuration: ++ ++ property stonith-enabled="true" ++ property stonith-timeout="120s" ++ ++B<Caution>: if I<stonith-timeout> is too low for I<msgwait> and the ++system overhead, sbd will never be able to successfully complete a fence ++request. This will create a fencing loop. ++ ++Note that the sbd fencing agent will try to detect this and ++automatically extend the I<stonith-timeout> setting to a reasonable ++value, on the assumption that sbd modifying your configuration is ++preferable to not fencing. ++ ++=head1 Management tasks ++ ++=head2 Recovering from temporary SBD device outage ++ ++If you have multiple devices, failure of a single device is not immediately ++fatal. C<sbd> will retry to restart the monitor for the device every 5 ++seconds by default. However, you can tune this via the options to the ++I<watch> command. ++ ++In case you wish to immediately force a restart of all currently ++disabled monitor processes, you can send a I<SIGUSR1> to the SBD ++I<inquisitor> process. ++ ++ ++=head1 LICENSE ++ ++Copyright (C) 2008-2013 Lars Marowsky-Bree ++ ++This program is free software; you can redistribute it and/or ++modify it under the terms of the GNU General Public ++License as published by the Free Software Foundation; either ++version 2 of the License, or (at your option) any later version.
++ ++This software is distributed in the hope that it will be useful, ++but WITHOUT ANY WARRANTY; without even the implied warranty of ++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++General Public License for more details. ++ ++For details see the GNU General Public License at ++http://www.gnu.org/licenses/gpl-2.0.html (version 2) and/or ++http://www.gnu.org/licenses/gpl.html (the newest as per "any later"). +diff --git a/src/sbd.sysconfig b/src/sbd.sysconfig +index e1a60ed..33b50d0 100644 +--- a/src/sbd.sysconfig ++++ b/src/sbd.sysconfig +@@ -14,7 +14,7 @@ + # + SBD_PACEMAKER=yes + +-## Type: list(always,clean) ++## Type: always / clean + ## Default: always + # + # Specify the start mode for sbd. Setting this to "clean" will only +@@ -103,6 +103,7 @@ SBD_TIMEOUT_ACTION=flush,reboot + # Thus in auto-mode sbd will check if the slice has RT-budget assigned. + # If that is the case sbd will stay in that slice while it will + # be moved to root-slice otherwise. ++# + SBD_MOVE_TO_ROOT_CGROUP=auto + + ## Type: string +-- +1.8.3.1 + diff --git a/0003-Fix-scheduling-overhaul-the-whole-thing.patch b/0003-Fix-scheduling-overhaul-the-whole-thing.patch new file mode 100644 index 0000000..05fab9d --- /dev/null +++ b/0003-Fix-scheduling-overhaul-the-whole-thing.patch @@ -0,0 +1,152 @@ +From 4bc08cf76fc01e98cbec76bf32bb333b77f69217 Mon Sep 17 00:00:00 2001 +From: Klaus Wenninger +Date: Thu, 27 Feb 2020 19:02:57 +0100 +Subject: [PATCH] Fix: scheduling: overhaul the whole thing + +- prevent possible lockup when format in proc changes +- properly get and handle scheduler policy & prio +- on SCHED_RR failing push to the max with SCHED_OTHER +--- + src/sbd-common.c | 56 ++++++++++++++++++++++++++++++++++++++++++++------------ + 1 file changed, 44 insertions(+), 12 deletions(-) + +diff --git a/src/sbd-common.c b/src/sbd-common.c +index 9ec43b2..c2da758 100644 +--- a/src/sbd-common.c ++++ b/src/sbd-common.c +@@ -26,6 +26,9 @@ + #include + #include + #include 
++#include ++#include ++#include + + #ifdef _POSIX_MEMLOCK + # include +@@ -298,7 +301,7 @@ watchdog_populate_list(void) + FILE *file; + + snprintf(entry_name, sizeof(entry_name), +- SYS_CLASS_WATCHDOG "/%s/dev", entry->d_name); ++ SYS_CLASS_WATCHDOG "/%s/dev", entry->d_name); + file = fopen(entry_name, "r"); + if (file) { + int major, minor; +@@ -667,7 +670,7 @@ static int get_realtime_budget(void) + { + FILE *f; + char fname[PATH_MAX]; +- int res = -1, lnum = 0; ++ int res = -1, lnum = 0, num; + char *cgroup = NULL, *namespecs = NULL; + + snprintf(fname, PATH_MAX, "/proc/%jd/cgroup", (intmax_t)getpid()); +@@ -677,7 +680,8 @@ static int get_realtime_budget(void) + (intmax_t)getpid()); + goto exit_res; + } +- while( fscanf(f, "%d:%m[^:]:%m[^\n]", &lnum, &namespecs, &cgroup) !=EOF ) { ++ while( (num = fscanf(f, "%d:%m[^:]:%m[^\n]\n", &lnum, ++ &namespecs, &cgroup)) !=EOF ) { + if (namespecs && strstr(namespecs, "cpuacct")) { + free(namespecs); + break; +@@ -690,6 +694,11 @@ static int get_realtime_budget(void) + free(namespecs); + namespecs = NULL; + } ++ /* not to get stuck if format changes */ ++ if ((num < 3) && ((fscanf(f, "%*[^\n]") == EOF) || ++ (fscanf(f, "\n") == EOF))) { ++ break; ++ } + } + fclose(f); + if (cgroup == NULL) { +@@ -776,15 +785,17 @@ sbd_make_realtime(int priority, int stackgrowK, int heapgrowK) + return; + } + ++do { + #ifdef SCHED_RR + if (move_to_root_cgroup) { + sbd_move_to_root_cgroup(enforce_moving_to_root_cgroup); + } + + { +- int pcurrent = 0; + int pmin = sched_get_priority_min(SCHED_RR); + int pmax = sched_get_priority_max(SCHED_RR); ++ struct sched_param sp; ++ int pcurrent; + + if (priority == 0) { + priority = pmax; +@@ -794,26 +805,47 @@ sbd_make_realtime(int priority, int stackgrowK, int heapgrowK) + priority = pmax; + } + +- pcurrent = sched_getscheduler(0); +- if (pcurrent < 0) { ++ if (sched_getparam(0, &sp) < 0) { + cl_perror("Unable to get scheduler priority"); + +- } else if(pcurrent < priority) { +- struct sched_param 
sp; ++ } else if ((pcurrent = sched_getscheduler(0)) < 0) { ++ cl_perror("Unable to get scheduler policy"); + ++ } else if ((pcurrent == SCHED_RR) && ++ (sp.sched_priority >= priority)) { ++ cl_log(LOG_INFO, ++ "Stay with priority (%d) for policy SCHED_RR", ++ sp.sched_priority); ++ break; ++ } else { + memset(&sp, 0, sizeof(sp)); + sp.sched_priority = priority; + + if (sched_setscheduler(0, SCHED_RR, &sp) < 0) { +- cl_perror("Unable to set scheduler priority to %d", priority); ++ cl_perror( ++ "Unable to set scheduler policy to SCHED_RR priority %d", ++ priority); + } else { +- cl_log(LOG_INFO, "Scheduler priority is now %d", priority); ++ cl_log(LOG_INFO, ++ "Scheduler policy is now SCHED_RR priority %d", ++ priority); ++ break; + } + } + } + #else +- cl_log(LOG_ERR, "System does not support updating the scheduler priority"); ++ cl_log(LOG_ERR, "System does not support updating the scheduler policy"); ++#endif ++#ifdef PRIO_PGRP ++ if (setpriority(PRIO_PGRP, 0, INT_MIN) < 0) { ++ cl_perror("Unable to raise the scheduler priority"); ++ } else { ++ cl_log(LOG_INFO, "Scheduler priority raised to the maximum"); ++ } ++#else ++ cl_perror("System does not support setting the scheduler priority"); + #endif ++} while (0); + + sbd_memlock(heapgrowK, stackgrowK); + } +@@ -826,7 +858,7 @@ maximize_priority(void) + return; + } + +- sbd_make_realtime(0, 256, 256); ++ sbd_make_realtime(0, 256, 256); + + if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), + IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 1)) != 0) { +-- +1.8.3.1 + diff --git a/gating.yaml b/gating.yaml new file mode 100644 index 0000000..1af3453 --- /dev/null +++ b/gating.yaml @@ -0,0 +1,9 @@ +--- !Policy +product_versions: + - fedora-* +decision_context: bodhi_update_push_testing +subject_type: koji_build +rules: + - !PassingTestCaseRule {test_case_name: dist.depcheck} + - !PassingTestCaseRule {test_case_name: fedora-ci.koji-build.tier0.functional} + diff --git a/sbd.spec b/sbd.spec new file mode 100644 index 0000000..61b4e8a --- 
/dev/null
+++ b/sbd.spec
@@ -0,0 +1,230 @@
+#
+# spec file for package sbd
+#
+# Copyright (c) 2014 SUSE LINUX Products GmbH, Nuernberg, Germany.
+# Copyright (c) 2013 Lars Marowsky-Bree
+#
+# All modifications and additions to the file contributed by third parties
+# remain the property of their copyright owners, unless otherwise agreed
+# upon. The license for this file, and modifications and additions to the
+# file, is the same license as for the pristine package itself (unless the
+# license for the pristine package is not an Open Source License, in which
+# case the license is the MIT License). An "Open Source License" is a
+# license that conforms to the Open Source Definition (Version 1.9)
+# published by the Open Source Initiative.
+
+# Please submit bugfixes or comments via http://bugs.opensuse.org/
+#
+# Upstream commit to package; Source0 and the unpack directory are derived from it.
+%global commit 25fce8a7d5e8cd5abc2379077381b10bd6cec183
+%global shortcommit %(c=%{commit}; echo ${c:0:7})
+%global github_owner Clusterlabs
+%global buildnum 7
+
+Name:           sbd
+Summary:        Storage-based death
+License:        GPLv2+
+Version:        1.4.1
+Release:        %{buildnum}%{?dist}.1
+Url:            https://github.com/%{github_owner}/%{name}
+Source0:        https://github.com/%{github_owner}/%{name}/archive/%{commit}/%{name}-%{commit}.tar.gz
+Patch0:         0001-Fix-regressions.sh-make-parameter-passing-consistent.patch
+Patch1:         0002-Doc-add-environment-section-to-man-page.patch
+Patch2:         0003-Fix-scheduling-overhaul-the-whole-thing.patch
+BuildRequires:  autoconf
+BuildRequires:  automake
+BuildRequires:  libuuid-devel
+BuildRequires:  glib2-devel
+BuildRequires:  libaio-devel
+BuildRequires:  corosync-devel
+BuildRequires:  pacemaker-libs-devel
+BuildRequires:  libtool
+BuildRequires:  libxml2-devel
+BuildRequires:  pkgconfig
+BuildRequires:  make
+BuildRequires:  systemd
+Conflicts:      fence-agents-sbd < 4.5.0
+
+# RHEL builds are restricted to the architectures listed here.
+%if 0%{?rhel}
+ExclusiveArch: i686 x86_64 s390x aarch64 ppc64le
+%endif
+
+%if %{defined systemd_requires}
+%systemd_requires
+%endif
+
+%description
+
+This package contains the storage-based death functionality.
+
+%package tests
+Summary:        Storage-based death environment for regression tests
+License:        GPLv2+
+
+%description tests
+This package provides an environment + testscripts for
+regression-testing sbd.
+
+###########################################################
+
+%prep
+%autosetup -n %{name}-%{commit} -p1
+# On s390/s390x, raise the watchdog timeout default in sbd.sysconfig from 5 to 15.
+%ifarch s390x s390
+sed -i src/sbd.sysconfig -e "s/Default: 5/Default: 15/"
+sed -i src/sbd.sysconfig -e "s/SBD_WATCHDOG_TIMEOUT=5/SBD_WATCHDOG_TIMEOUT=15/"
+%endif
+
+###########################################################
+
+%build
+./autogen.sh
+# Build with the distribution flags and treat every compiler warning as an error.
+export CFLAGS="$RPM_OPT_FLAGS -Wall -Werror"
+%configure
+%make_build
+
+###########################################################
+
+%install
+
+%make_install LIBDIR=%{_libdir}
+rm -rf %{buildroot}%{_libdir}/stonith
+
+# Install the regression script under the datadir, matching the tests file list.
+install -D -m 0755 tests/regressions.sh %{buildroot}%{_datadir}/sbd/regressions.sh
+%if %{defined _unitdir}
+install -D -m 0644 src/sbd.service %{buildroot}%{_unitdir}/sbd.service
+install -D -m 0644 src/sbd_remote.service %{buildroot}%{_unitdir}/sbd_remote.service
+%endif
+
+mkdir -p %{buildroot}%{_sysconfdir}/sysconfig
+install -m 644 src/sbd.sysconfig %{buildroot}%{_sysconfdir}/sysconfig/sbd
+
+# Don't package static libs
+find %{buildroot} -name '*.a' -type f -print0 | xargs -0 rm -f
+find %{buildroot} -name '*.la' -type f -print0 | xargs -0 rm -f
+
+###########################################################
+
+%if %{defined _unitdir}
+%post
+%systemd_post sbd.service
+%systemd_post sbd_remote.service
+# When the scriptlet argument is not 1, re-enable the units that are currently enabled.
+if [ $1 -ne 1 ] ; then
+    if systemctl --quiet is-enabled sbd.service 2>/dev/null
+    then
+        systemctl --quiet reenable sbd.service 2>/dev/null || :
+    fi
+    if systemctl --quiet is-enabled sbd_remote.service 2>/dev/null
+    then
+        systemctl --quiet reenable sbd_remote.service 2>/dev/null || :
+    fi
+fi
+
+%preun
+%systemd_preun sbd.service
+%systemd_preun sbd_remote.service
+
+%postun
+%systemd_postun sbd.service
+%systemd_postun sbd_remote.service
+%endif
+
+%files
+###########################################################
+%config(noreplace) %{_sysconfdir}/sysconfig/sbd
+%{_sbindir}/sbd
+%exclude %{_datadir}/sbd/regressions.sh
+%doc %{_mandir}/man8/sbd*
+%if %{defined _unitdir}
+%{_unitdir}/sbd.service
+%{_unitdir}/sbd_remote.service
+%endif
+%license COPYING
+
+%files tests
+###########################################################
+%dir %{_datadir}/sbd
+%{_datadir}/sbd/regressions.sh
+%{_libdir}/libsbdtestbed*
+
+%changelog
+* Wed Jul 29 2020 Fedora Release Engineering - 1.4.1-7.1
+- Rebuilt for https://fedoraproject.org/wiki/Fedora_33_Mass_Rebuild
+
+* Thu May 14 2020 Klaus Wenninger - 1.4.1-7
+- Rebuild against libqb2.0 (f33-build-side-23348)
+
+* Wed Mar 11 2020 Klaus Wenninger - 1.4.1-6
+- Rebuild because tagging the build failed
+
+* Thu Mar 5 2020 Klaus Wenninger - 1.4.1-5
+- Rebase to upstream v1.4.1
+- Make coverity happy with parameter passing in regressions.sh
+- Add auto generated environment section to man-page
+- Overhaul setting scheduler policy/priority
+- Enable Fedora CI Gating
+
+* Thu Jan 30 2020 Fedora Release Engineering - 1.4.0-3
+- Rebuilt for https://fedoraproject.org/wiki/Fedora_32_Mass_Rebuild
+
+* Fri Jul 26 2019 Fedora Release Engineering - 1.4.0-2
+- Rebuilt for https://fedoraproject.org/wiki/Fedora_31_Mass_Rebuild
+
+* Fri Feb 1 2019 Klaus Wenninger - 1.4.0-1
+- Rebase to upstream v1.4.0
+- Fail earlier on invalid servants (solves GCC9 build issue as well)
+
+* Wed Nov 21 2018 Klaus Wenninger - 1.3.1-1.git4927571
+- Rebased to commit 4927571f8e9b00db8242654b1329dfbd71dcfe99
+- Removed disabling of shared-disk-support
+  Resolves rhbz#1606301
+
+* Sat Jul 14 2018 Fedora Release Engineering - 1.2.1-4.3
+- Rebuilt for https://fedoraproject.org/wiki/Fedora_29_Mass_Rebuild
+
+* Sun Mar 18 2018 Iryna Shcherbina - 1.2.1-4.2
+- Update Python 2 dependency declarations to new packaging standards
+  (See https://fedoraproject.org/wiki/FinalizingFedoraSwitchtoPython3)
+
+* Fri Feb 09 2018 Fedora Release Engineering - 1.2.1-4.1
+- Rebuilt for https://fedoraproject.org/wiki/Fedora_28_Mass_Rebuild
+
+* Tue Jan 30 2018 Merlin Mathesius - 1.2.1-4
+- Patch to use correct C inline function semantics to fix FTBFS (BZ#1424417)
+  Cleanup spec file conditionals
+
+* Thu Aug 03 2017 Fedora Release Engineering - 1.2.1-3.4
+- Rebuilt for https://fedoraproject.org/wiki/Fedora_27_Binutils_Mass_Rebuild
+
+* Thu Jul 27 2017 Fedora Release Engineering - 1.2.1-3.3
+- Rebuilt for https://fedoraproject.org/wiki/Fedora_27_Mass_Rebuild
+
+* Sat Feb 11 2017 Fedora Release Engineering - 1.2.1-3.2
+- Rebuilt for https://fedoraproject.org/wiki/Fedora_26_Mass_Rebuild
+
+* Thu Feb 04 2016 Fedora Release Engineering - 1.2.1-3.1
+- Rebuilt for https://fedoraproject.org/wiki/Fedora_24_Mass_Rebuild
+
+* Tue Jan 19 2016 Jan Pokorný - 1.2.1-3
+- Rebuilt for libpe_status soname bump arising from Pacemaker 1.1.14
+
+* Thu Jul 02 2015 Ralf Corsépius - 1.2.1-2
+- Add dist-tag (RHBZ #1237187).
+
+* Fri Jun 19 2015 Fedora Release Engineering - 1.2.1-1.1
+- Rebuilt for https://fedoraproject.org/wiki/Fedora_23_Mass_Rebuild
+
+* Thu Oct 30 2014 - 1.2.1-1
+- Correctly enable /proc/pid validation for sbd_lock_running()
+- Improved integration with the el7 environment
+
+* Fri Aug 29 2014 - 1.2.1-0.2.8f912945.git
+- Remove some additional SUSE-isms
+
+* Fri Aug 29 2014 - 1.2.1-0.1.8f912945.git
+- Prepare for package review
+  Resolves: rhbz#1134245
diff --git a/sources b/sources
new file mode 100644
index 0000000..50b7de1
--- /dev/null
+++ b/sources
@@ -0,0 +1 @@
+SHA512 (sbd-25fce8a7d5e8cd5abc2379077381b10bd6cec183.tar.gz) = 3b89ee0aa88282f17c8daf725a1e7a8c2f2affdcf6ff6f4ca4faf250760d778a65c5693e5df3fcc7554d60dd9b0cb1a0350e266fadb7668320f3c676d8799a29
diff --git a/tests/inventory b/tests/inventory
new file mode 100755
index 0000000..e0cea17
--- /dev/null
+++ b/tests/inventory
@@ -0,0 +1,4 @@
+#!/bin/bash
+export TEST_DOCKER_EXTRA_ARGS="--privileged --network host"
+exec merge-standard-inventory "$@"
+
diff --git a/tests/tests.yml b/tests/tests.yml
new file mode 100644
index 0000000..49bc170
--- /dev/null
+++ b/tests/tests.yml
@@ -0,0 +1,16 @@
+---
+- hosts: localhost
+  roles:
+  - role: standard-test-basic
+    tags:
+    - classic
+    - container
+    tests:
+    - smoke:
+        dir: .
+        run: /usr/share/sbd/regressions.sh
+    required_packages:
+    - sbd
+    - sbd-tests
+    - device-mapper
+