1460 lines
52 KiB
Diff
1460 lines
52 KiB
Diff
|
From 9dd82a8b4daa5a7bd8ab3afa43b081f212efb1ac Mon Sep 17 00:00:00 2001
|
||
|
From: Klaus Wenninger <klaus.wenninger@aon.at>
|
||
|
Date: Wed, 29 Jan 2020 20:34:18 +0100
|
||
|
Subject: [PATCH] Doc: add environment section to man-page
|
||
|
|
||
|
Environment section is auto-generated from sbd.sysconfig.
|
||
|
---
|
||
|
.gitignore | 1 +
|
||
|
Makefile.am | 6 +-
|
||
|
README.md | 3 +-
|
||
|
man/Makefile.am | 8 +-
|
||
|
man/sbd.8.pod | 668 -----------------------------------------------------
|
||
|
man/sbd.8.pod.in | 675 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||
|
src/sbd.sysconfig | 3 +-
|
||
|
7 files changed, 690 insertions(+), 674 deletions(-)
|
||
|
delete mode 100644 man/sbd.8.pod
|
||
|
create mode 100644 man/sbd.8.pod.in
|
||
|
|
||
|
diff --git a/Makefile.am b/Makefile.am
|
||
|
index 1c29f75..bd4346d 100644
|
||
|
--- a/Makefile.am
|
||
|
+++ b/Makefile.am
|
||
|
@@ -9,8 +9,8 @@ TARFILE = $(distdir).tar.gz
|
||
|
DIST_ARCHIVES = $(TARFILE)
|
||
|
KEEP_EXISTING_TAR = no
|
||
|
INJECT_GIT_COMMIT = yes
|
||
|
-DISTCLEANFILES = sbd-* sbd-*/
|
||
|
CLEANFILES = *.rpm *.tar.* sbd-*
|
||
|
+DISTCLEANFILES = sbd-* sbd-*/
|
||
|
|
||
|
RPM_ROOT = $(shell pwd)
|
||
|
RPM_OPTS = --define "_sourcedir $(RPM_ROOT)" \
|
||
|
@@ -31,7 +31,7 @@ export SBD_BINARY := src/sbd
|
||
|
export SBD_PRELOAD := tests/.libs/libsbdtestbed.so
|
||
|
export SBD_USE_DM := no
|
||
|
|
||
|
-EXTRA_DIST = sbd.spec tests/regressions.sh
|
||
|
+EXTRA_DIST = sbd.spec tests/regressions.sh man/sbd.8.pod.in
|
||
|
|
||
|
export:
|
||
|
rm -f $(PACKAGE)-HEAD.tar.*
|
||
|
@@ -43,7 +43,7 @@ export:
|
||
|
echo `date`: Using existing tarball: $(TARFILE); \
|
||
|
else \
|
||
|
rm -f $(PACKAGE).tar.*; \
|
||
|
- (git archive --prefix=$(distdir)/ $(shell echo $(TAG)|cut -f1 -d-) || tar -c --transform="s,^,$(distdir)/," --exclude="*.tar.*" --exclude="$(distdir)" --exclude="*.o" --exclude="*.8" --exclude="config.*" --exclude="libtool" --exclude="ltmain.sh*" --exclude="Makefile" --exclude="Makefile.in" --exclude="stamp-*" --exclude="*.service" --exclude="sbd" --exclude="*.m4" --exclude="*.cache" --exclude="configure" --exclude="*.list" --exclude="depcomp" --exclude="install-sh" --exclude="missing" --exclude="compile" --exclude="sbd.sh" --exclude="~" --exclude="*.swp" --exclude="*.patch" --exclude="*.diff" --exclude="*.orig" --exclude="*.rej" --exclude="*.rpm" --exclude=".deps" --exclude="test-driver" *) | gzip > $(TARFILE); \
|
||
|
+ (git archive --prefix=$(distdir)/ $(shell echo $(TAG)|cut -f1 -d-) || tar -c --transform="s,^,$(distdir)/," --exclude="*.tar.*" --exclude="$(distdir)" --exclude="*.o" --exclude="*.8" --exclude="config.*" --exclude="libtool" --exclude="ltmain.sh*" --exclude="Makefile" --exclude="Makefile.in" --exclude="stamp-*" --exclude="*.service" --exclude="sbd" --exclude="*.m4" --exclude="*.cache" --exclude="configure" --exclude="*.list" --exclude="depcomp" --exclude="install-sh" --exclude="missing" --exclude="compile" --exclude="sbd.sh" --exclude="~" --exclude="*.swp" --exclude="*.patch" --exclude="*.diff" --exclude="*.orig" --exclude="*.rej" --exclude="*.rpm" --exclude="*.pod" --exclude=".deps" --exclude="test-driver" *) | gzip > $(TARFILE); \
|
||
|
if test -n "$$(git status -s)" || test "$(INJECT_GIT_COMMIT)" = "yes"; then \
|
||
|
if test -n "$$(git status -s)"; then git diff HEAD --name-only|grep -v "^\."|xargs -n1 git diff HEAD > uncommitted.diff; fi; \
|
||
|
rm -rf $(distdir); tar -xzf $(TARFILE); rm $(TARFILE); \
|
||
|
diff --git a/README.md b/README.md
|
||
|
index d02a8bd..42a3fde 100644
|
||
|
--- a/README.md
|
||
|
+++ b/README.md
|
||
|
@@ -5,5 +5,6 @@ A highly reliable fencing or Shoot-the-other-node-in-the-head (STONITH) mechanis
|
||
|
The component works with Pacemaker clusters, and is currently known to
|
||
|
compile and function on Pacemaker 1.1.7+ and corosync 1.4.x or 2.3.x.
|
||
|
|
||
|
-Please see https://github.com/l-mb/sbd/blob/master/man/sbd.8.pod for the full documentation.
|
||
|
+Please see https://github.com/clusterlabs/sbd/blob/master/man/sbd.8.pod.in &
|
||
|
+https://github.com/clusterlabs/sbd/blob/master/src/sbd.sysconfig for the full documentation.
|
||
|
|
||
|
diff --git a/man/Makefile.am b/man/Makefile.am
|
||
|
index 3f89085..995712d 100644
|
||
|
--- a/man/Makefile.am
|
||
|
+++ b/man/Makefile.am
|
||
|
@@ -1,6 +1,12 @@
|
||
|
dist_man_MANS = sbd.8
|
||
|
|
||
|
-EXTRA_DIST = sbd.8.pod
|
||
|
+DISTCLEANFILES = sbd.8.pod sbd.8 sbd.sysconfig.pod
|
||
|
+
|
||
|
+sbd.sysconfig.pod: ../src/sbd.sysconfig
|
||
|
+ sed -r -n -e "s/^## Type: (.*)/Allows C<\1>/;t type;s/^## Default: (.*)/ defaulting to C<\1>/;t default;s/^#*(.*)=.*/=item B<\1>\n/;t variable;s/^#*//;s/^ *//;H;d;:type;h;d;:default;H;x;s/\n//;x;d;:variable;G;p" $< > $@
|
||
|
+
|
||
|
+sbd.8.pod: sbd.8.pod.in sbd.sysconfig.pod
|
||
|
+ sed -e "s/@environment_section@//;t insert;p;d;:insert;rsbd.sysconfig.pod" $< > $@
|
||
|
|
||
|
sbd.8: sbd.8.pod
|
||
|
@POD2MAN@ -s 8 -c "STONITH Block Device" -r "SBD" -n "SBD" $< $@
|
||
|
diff --git a/man/sbd.8.pod b/man/sbd.8.pod
|
||
|
deleted file mode 100644
|
||
|
index 377c579..0000000
|
||
|
--- a/man/sbd.8.pod
|
||
|
+++ /dev/null
|
||
|
@@ -1,668 +0,0 @@
|
||
|
-=head1 NAME
|
||
|
-
|
||
|
-sbd - STONITH Block Device daemon
|
||
|
-
|
||
|
-=head1 SYNOPSIS
|
||
|
-
|
||
|
-sbd <-d F</dev/...>> [options] C<command>
|
||
|
-
|
||
|
-=head1 SUMMARY
|
||
|
-
|
||
|
-SBD provides a node fencing mechanism (Shoot the other node in the head,
|
||
|
-STONITH) for Pacemaker-based clusters through the exchange of messages
|
||
|
-via shared block storage such as for example a SAN, iSCSI, FCoE. This
|
||
|
-isolates the fencing mechanism from changes in firmware version or
|
||
|
-dependencies on specific firmware controllers, and it can be used as a
|
||
|
-STONITH mechanism in all configurations that have reliable shared
|
||
|
-storage.
|
||
|
-
|
||
|
-SBD can also be used without any shared storage. In this mode, the
|
||
|
-watchdog device will be used to reset the node if it loses quorum, if
|
||
|
-any monitored daemon is lost and not recovered or if Pacemaker decides
|
||
|
-that the node requires fencing.
|
||
|
-
|
||
|
-The F<sbd> binary implements both the daemon that watches the message
|
||
|
-slots as well as the management tool for interacting with the block
|
||
|
-storage device(s). This mode of operation is specified via the
|
||
|
-C<command> parameter; some of these modes take additional parameters.
|
||
|
-
|
||
|
-To use SBD with shared storage, you must first C<create> the messaging
|
||
|
-layout on one to three block devices. Second, configure
|
||
|
-F</etc/sysconfig/sbd> to list those devices (and possibly adjust other
|
||
|
-options), and restart the cluster stack on each node to ensure that
|
||
|
-C<sbd> is started. Third, configure the C<external/sbd> fencing
|
||
|
-resource in the Pacemaker CIB.
|
||
|
-
|
||
|
-Each of these steps is documented in more detail below the description
|
||
|
-of the command options.
|
||
|
-
|
||
|
-C<sbd> can only be used as root.
|
||
|
-
|
||
|
-=head2 GENERAL OPTIONS
|
||
|
-
|
||
|
-=over
|
||
|
-
|
||
|
-=item B<-d> F</dev/...>
|
||
|
-
|
||
|
-Specify the block device(s) to be used. If you have more than one,
|
||
|
-specify this option up to three times. This parameter is mandatory for
|
||
|
-all modes, since SBD always needs a block device to interact with.
|
||
|
-
|
||
|
-This man page uses F</dev/sda1>, F</dev/sdb1>, and F</dev/sdc1> as
|
||
|
-example device names for brevity. However, in your production
|
||
|
-environment, you should instead always refer to them by using the long,
|
||
|
-stable device name (e.g.,
|
||
|
-F</dev/disk/by-id/dm-uuid-part1-mpath-3600508b400105b5a0001500000250000>).
|
||
|
-
|
||
|
-=item B<-v|-vv|-vvv>
|
||
|
-
|
||
|
-Enable verbose|debug|debug-library logging (optional)
|
||
|
-
|
||
|
-=item B<-h>
|
||
|
-
|
||
|
-Display a concise summary of C<sbd> options.
|
||
|
-
|
||
|
-=item B<-n> I<node>
|
||
|
-
|
||
|
-Set local node name; defaults to C<uname -n>. This should not need to be
|
||
|
-set.
|
||
|
-
|
||
|
-=item B<-R>
|
||
|
-
|
||
|
-Do B<not> enable realtime priority. By default, C<sbd> runs at realtime
|
||
|
-priority, locks itself into memory, and also acquires highest IO
|
||
|
-priority to protect itself against interference from other processes on
|
||
|
-the system. This is a debugging-only option.
|
||
|
-
|
||
|
-=item B<-I> I<N>
|
||
|
-
|
||
|
-Async IO timeout (defaults to 3 seconds, optional). You should not need
|
||
|
-to adjust this unless your IO setup is really very slow.
|
||
|
-
|
||
|
-(In daemon mode, the watchdog is refreshed when the majority of devices
|
||
|
-could be read within this time.)
|
||
|
-
|
||
|
-=back
|
||
|
-
|
||
|
-=head2 create
|
||
|
-
|
||
|
-Example usage:
|
||
|
-
|
||
|
- sbd -d /dev/sdc2 -d /dev/sdd3 create
|
||
|
-
|
||
|
-If you specify the I<create> command, sbd will write a metadata header
|
||
|
-to the device(s) specified and also initialize the messaging slots for
|
||
|
-up to 255 nodes.
|
||
|
-
|
||
|
-B<Warning>: This command will not prompt for confirmation. Roughly the
|
||
|
-first megabyte of the specified block device(s) will be overwritten
|
||
|
-immediately and without backup.
|
||
|
-
|
||
|
-This command accepts a few options to adjust the default timings that
|
||
|
-are written to the metadata (to ensure they are identical across all
|
||
|
-nodes accessing the device).
|
||
|
-
|
||
|
-=over
|
||
|
-
|
||
|
-=item B<-1> I<N>
|
||
|
-
|
||
|
-Set watchdog timeout to N seconds. This depends mostly on your storage
|
||
|
-latency; the majority of devices must be successfully read within this
|
||
|
-time, or else the node will self-fence.
|
||
|
-
|
||
|
-If your sbd device(s) reside on a multipath setup or iSCSI, this should
|
||
|
-be the time required to detect a path failure. You may be able to reduce
|
||
|
-this if your device outages are independent, or if you are using the
|
||
|
-Pacemaker integration.
|
||
|
-
|
||
|
-=item B<-2> I<N>
|
||
|
-
|
||
|
-Set slot allocation timeout to N seconds. You should not need to tune
|
||
|
-this.
|
||
|
-
|
||
|
-=item B<-3> I<N>
|
||
|
-
|
||
|
-Set daemon loop timeout to N seconds. You should not need to tune this.
|
||
|
-
|
||
|
-=item B<-4> I<N>
|
||
|
-
|
||
|
-Set I<msgwait> timeout to N seconds. This should be twice the I<watchdog>
|
||
|
-timeout. This is the time after which a message written to a node's slot
|
||
|
-will be considered delivered. (Or long enough for the node to detect
|
||
|
-that it needed to self-fence.)
|
||
|
-
|
||
|
-This also affects the I<stonith-timeout> in Pacemaker's CIB; see below.
|
||
|
-
|
||
|
-=back
|
||
|
-
|
||
|
-=head2 list
|
||
|
-
|
||
|
-Example usage:
|
||
|
-
|
||
|
- # sbd -d /dev/sda1 list
|
||
|
- 0 hex-0 clear
|
||
|
- 1 hex-7 clear
|
||
|
- 2 hex-9 clear
|
||
|
-
|
||
|
-List all allocated slots on device, and messages. You should see all
|
||
|
-cluster nodes that have ever been started against this device. Nodes
|
||
|
-that are currently running should have a I<clear> state; nodes that have
|
||
|
-been fenced, but not yet restarted, will show the appropriate fencing
|
||
|
-message.
|
||
|
-
|
||
|
-=head2 dump
|
||
|
-
|
||
|
-Example usage:
|
||
|
-
|
||
|
- # sbd -d /dev/sda1 dump
|
||
|
- ==Dumping header on disk /dev/sda1
|
||
|
- Header version : 2
|
||
|
- Number of slots : 255
|
||
|
- Sector size : 512
|
||
|
- Timeout (watchdog) : 15
|
||
|
- Timeout (allocate) : 2
|
||
|
- Timeout (loop) : 1
|
||
|
- Timeout (msgwait) : 30
|
||
|
- ==Header on disk /dev/sda1 is dumped
|
||
|
-
|
||
|
-Dump meta-data header from device.
|
||
|
-
|
||
|
-=head2 watch
|
||
|
-
|
||
|
-Example usage:
|
||
|
-
|
||
|
- sbd -d /dev/sdc2 -d /dev/sdd3 -P watch
|
||
|
-
|
||
|
-This command will make C<sbd> start in daemon mode. It will constantly monitor
|
||
|
-the message slot of the local node for incoming messages, reachability, and
|
||
|
-optionally take Pacemaker's state into account.
|
||
|
-
|
||
|
-C<sbd> B<must> be started on boot before the cluster stack! See below
|
||
|
-for enabling this according to your boot environment.
|
||
|
-
|
||
|
-The options for this mode are rarely specified directly on the
|
||
|
-commandline directly, but most frequently set via F</etc/sysconfig/sbd>.
|
||
|
-
|
||
|
-It also constantly monitors connectivity to the storage device, and
|
||
|
-self-fences in case the partition becomes unreachable, guaranteeing that it
|
||
|
-does not disconnect from fencing messages.
|
||
|
-
|
||
|
-A node slot is automatically allocated on the device(s) the first time
|
||
|
-the daemon starts watching the device; hence, manual allocation is not
|
||
|
-usually required.
|
||
|
-
|
||
|
-If a watchdog is used together with the C<sbd> as is strongly
|
||
|
-recommended, the watchdog is activated at initial start of the sbd
|
||
|
-daemon. The watchdog is refreshed every time the majority of SBD devices
|
||
|
-has been successfully read. Using a watchdog provides additional
|
||
|
-protection against C<sbd> crashing.
|
||
|
-
|
||
|
-If the Pacemaker integration is activated, C<sbd> will B<not> self-fence
|
||
|
-if device majority is lost, if:
|
||
|
-
|
||
|
-=over
|
||
|
-
|
||
|
-=item 1.
|
||
|
-
|
||
|
-The partition the node is in is still quorate according to the CIB;
|
||
|
-
|
||
|
-=item 2.
|
||
|
-
|
||
|
-it is still quorate according to Corosync's node count;
|
||
|
-
|
||
|
-=item 3.
|
||
|
-
|
||
|
-the node itself is considered online and healthy by Pacemaker.
|
||
|
-
|
||
|
-=back
|
||
|
-
|
||
|
-This allows C<sbd> to survive temporary outages of the majority of
|
||
|
-devices. However, while the cluster is in such a degraded state, it can
|
||
|
-neither successfully fence nor be shutdown cleanly (as taking the
|
||
|
-cluster below the quorum threshold will immediately cause all remaining
|
||
|
-nodes to self-fence). In short, it will not tolerate any further faults.
|
||
|
-Please repair the system before continuing.
|
||
|
-
|
||
|
-There is one C<sbd> process that acts as a master to which all watchers
|
||
|
-report; one per device to monitor the node's slot; and, optionally, one
|
||
|
-that handles the Pacemaker integration.
|
||
|
-
|
||
|
-=over
|
||
|
-
|
||
|
-=item B<-W>
|
||
|
-
|
||
|
-Enable or disable use of the system watchdog to protect against the sbd
|
||
|
-processes failing and the node being left in an undefined state. Specify
|
||
|
-this once to enable, twice to disable.
|
||
|
-
|
||
|
-Defaults to I<enabled>.
|
||
|
-
|
||
|
-=item B<-w> F</dev/watchdog>
|
||
|
-
|
||
|
-This can be used to override the default watchdog device used and should not
|
||
|
-usually be necessary.
|
||
|
-
|
||
|
-=item B<-p> F</var/run/sbd.pid>
|
||
|
-
|
||
|
-This option can be used to specify a pidfile for the main sbd process.
|
||
|
-
|
||
|
-=item B<-F> I<N>
|
||
|
-
|
||
|
-Number of failures before a failing servant process will not be restarted
|
||
|
-immediately until the dampening delay has expired. If set to zero, servants
|
||
|
-will be restarted immediately and indefinitely. If set to one, a failed
|
||
|
-servant will be restarted once every B<-t> seconds. If set to a different
|
||
|
-value, the servant will be restarted that many times within the dampening
|
||
|
-period and then delay.
|
||
|
-
|
||
|
-Defaults to I<1>.
|
||
|
-
|
||
|
-=item B<-t> I<N>
|
||
|
-
|
||
|
-Dampening delay before faulty servants are restarted. Combined with C<-F 1>,
|
||
|
-the most logical way to tune the restart frequency of servant processes.
|
||
|
-Default is 5 seconds.
|
||
|
-
|
||
|
-If set to zero, processes will be restarted indefinitely and immediately.
|
||
|
-
|
||
|
-=item B<-P>
|
||
|
-
|
||
|
-Enable Pacemaker integration which checks Pacemaker quorum and node health.
|
||
|
-Specify this once to enable, twice to disable.
|
||
|
-
|
||
|
-Defaults to I<enabled>.
|
||
|
-
|
||
|
-=item B<-S> I<N>
|
||
|
-
|
||
|
-Set the start mode. (Defaults to I<0>.)
|
||
|
-
|
||
|
-If this is set to zero, sbd will always start up unconditionally,
|
||
|
-regardless of whether the node was previously fenced or not.
|
||
|
-
|
||
|
-If set to one, sbd will only start if the node was previously shutdown
|
||
|
-cleanly (as indicated by an exit request message in the slot), or if the
|
||
|
-slot is empty. A reset, crashdump, or power-off request in any slot will
|
||
|
-halt the start up.
|
||
|
-
|
||
|
-This is useful to prevent nodes from rejoining if they were faulty. The
|
||
|
-node must be manually "unfenced" by sending an empty message to it:
|
||
|
-
|
||
|
- sbd -d /dev/sda1 message node1 clear
|
||
|
-
|
||
|
-=item B<-s> I<N>
|
||
|
-
|
||
|
-Set the start-up wait time for devices. (Defaults to I<120>.)
|
||
|
-
|
||
|
-Dynamic block devices such as iSCSI might not be fully initialized and
|
||
|
-present yet. This allows one to set a timeout for waiting for devices to
|
||
|
-appear on start-up. If set to 0, start-up will be aborted immediately if
|
||
|
-no devices are available.
|
||
|
-
|
||
|
-=item B<-Z>
|
||
|
-
|
||
|
-Enable trace mode. B<Warning: this is unsafe for production, use at your
|
||
|
-own risk!> Specifying this once will turn all reboots or power-offs, be
|
||
|
-they caused by self-fence decisions or messages, into a crashdump.
|
||
|
-Specifying this twice will just log them but not continue running.
|
||
|
-
|
||
|
-=item B<-T>
|
||
|
-
|
||
|
-By default, the daemon will set the watchdog timeout as specified in the
|
||
|
-device metadata. However, this does not work for every watchdog device.
|
||
|
-In this case, you must manually ensure that the watchdog timeout used by
|
||
|
-the system correctly matches the SBD settings, and then specify this
|
||
|
-option to allow C<sbd> to continue with start-up.
|
||
|
-
|
||
|
-=item B<-5> I<N>
|
||
|
-
|
||
|
-Warn if the time interval for tickling the watchdog exceeds this many seconds.
|
||
|
-Since the node is unable to log the watchdog expiry (it reboots immediately
|
||
|
-without a chance to write its logs to disk), this is very useful for getting
|
||
|
-an indication that the watchdog timeout is too short for the IO load of the
|
||
|
-system.
|
||
|
-
|
||
|
-Default is 3 seconds, set to zero to disable.
|
||
|
-
|
||
|
-=item B<-C> I<N>
|
||
|
-
|
||
|
-Watchdog timeout to set before crashdumping. If SBD is set to crashdump
|
||
|
-instead of reboot - either via the trace mode settings or the I<external/sbd>
|
||
|
-fencing agent's parameter -, SBD will adjust the watchdog timeout to this
|
||
|
-setting before triggering the dump. Otherwise, the watchdog might trigger and
|
||
|
-prevent a successful crashdump from ever being written.
|
||
|
-
|
||
|
-Set to zero (= default) to disable.
|
||
|
-
|
||
|
-=item B<-r> I<N>
|
||
|
-
|
||
|
-Actions to be executed when the watchers don't timely report to the sbd
|
||
|
-master process or one of the watchers detects that the master process
|
||
|
-has died.
|
||
|
-
|
||
|
-Set timeout-action to comma-separated combination of
|
||
|
-noflush|flush plus reboot|crashdump|off.
|
||
|
-If just one of both is given the other stays at the default.
|
||
|
-
|
||
|
-This doesn't affect actions like off, crashdump, reboot explicitly
|
||
|
-triggered via message slots.
|
||
|
-And it does as well not configure the action a watchdog would
|
||
|
-trigger should it run off (there is no generic interface).
|
||
|
-
|
||
|
-Defaults to flush,reboot.
|
||
|
-
|
||
|
-=back
|
||
|
-
|
||
|
-=head2 allocate
|
||
|
-
|
||
|
-Example usage:
|
||
|
-
|
||
|
- sbd -d /dev/sda1 allocate node1
|
||
|
-
|
||
|
-Explicitly allocates a slot for the specified node name. This should
|
||
|
-rarely be necessary, as every node will automatically allocate itself a
|
||
|
-slot the first time it starts up on watch mode.
|
||
|
-
|
||
|
-=head2 message
|
||
|
-
|
||
|
-Example usage:
|
||
|
-
|
||
|
- sbd -d /dev/sda1 message node1 test
|
||
|
-
|
||
|
-Writes the specified message to node's slot. This is rarely done
|
||
|
-directly, but rather abstracted via the C<external/sbd> fencing agent
|
||
|
-configured as a cluster resource.
|
||
|
-
|
||
|
-Supported message types are:
|
||
|
-
|
||
|
-=over
|
||
|
-
|
||
|
-=item test
|
||
|
-
|
||
|
-This only generates a log message on the receiving node and can be used
|
||
|
-to check if SBD is seeing the device. Note that this could overwrite a
|
||
|
-fencing request send by the cluster, so should not be used during
|
||
|
-production.
|
||
|
-
|
||
|
-=item reset
|
||
|
-
|
||
|
-Reset the target upon receipt of this message.
|
||
|
-
|
||
|
-=item off
|
||
|
-
|
||
|
-Power-off the target.
|
||
|
-
|
||
|
-=item crashdump
|
||
|
-
|
||
|
-Cause the target node to crashdump.
|
||
|
-
|
||
|
-=item exit
|
||
|
-
|
||
|
-This will make the C<sbd> daemon exit cleanly on the target. You should
|
||
|
-B<not> send this message manually; this is handled properly during
|
||
|
-shutdown of the cluster stack. Manually stopping the daemon means the
|
||
|
-node is unprotected!
|
||
|
-
|
||
|
-=item clear
|
||
|
-
|
||
|
-This message indicates that no real message has been sent to the node.
|
||
|
-You should not set this manually; C<sbd> will clear the message slot
|
||
|
-automatically during start-up, and setting this manually could overwrite
|
||
|
-a fencing message by the cluster.
|
||
|
-
|
||
|
-=back
|
||
|
-
|
||
|
-=head2 query-watchdog
|
||
|
-
|
||
|
-Example usage:
|
||
|
-
|
||
|
- sbd query-watchdog
|
||
|
-
|
||
|
-Check for available watchdog devices and print some info.
|
||
|
-
|
||
|
-B<Warning>: This command will arm the watchdog during query, and if your
|
||
|
-watchdog refuses disarming (for example, if its kernel module has the
|
||
|
-'nowayout' parameter set) this will reset your system.
|
||
|
-
|
||
|
-=head2 test-watchdog
|
||
|
-
|
||
|
-Example usage:
|
||
|
-
|
||
|
- sbd test-watchdog [-w /dev/watchdog3]
|
||
|
-
|
||
|
-Test specified watchdog device (/dev/watchdog by default).
|
||
|
-
|
||
|
-B<Warning>: This command will arm the watchdog and have your system reset
|
||
|
-in case your watchdog is working properly! If issued from an interactive
|
||
|
-session, it will prompt for confirmation.
|
||
|
-
|
||
|
-=head1 Base system configuration
|
||
|
-
|
||
|
-=head2 Configure a watchdog
|
||
|
-
|
||
|
-It is highly recommended that you configure your Linux system to load a
|
||
|
-watchdog driver with hardware assistance (as is available on most modern
|
||
|
-systems), such as I<hpwdt>, I<iTCO_wdt>, or others. As a fall-back, you
|
||
|
-can use the I<softdog> module.
|
||
|
-
|
||
|
-No other software must access the watchdog timer; it can only be
|
||
|
-accessed by one process at any given time. Some hardware vendors ship
|
||
|
-systems management software that use the watchdog for system resets
|
||
|
-(f.e. HP ASR daemon). Such software has to be disabled if the watchdog
|
||
|
-is to be used by SBD.
|
||
|
-
|
||
|
-=head2 Choosing and initializing the block device(s)
|
||
|
-
|
||
|
-First, you have to decide if you want to use one, two, or three devices.
|
||
|
-
|
||
|
-If you are using multiple ones, they should reside on independent
|
||
|
-storage setups. Putting all three of them on the same logical unit for
|
||
|
-example would not provide any additional redundancy.
|
||
|
-
|
||
|
-The SBD device can be connected via Fibre Channel, Fibre Channel over
|
||
|
-Ethernet, or even iSCSI. Thus, an iSCSI target can become a sort-of
|
||
|
-network-based quorum server; the advantage is that it does not require
|
||
|
-a smart host at your third location, just block storage.
|
||
|
-
|
||
|
-The SBD partitions themselves B<must not> be mirrored (via MD,
|
||
|
-DRBD, or the storage layer itself), since this could result in a
|
||
|
-split-mirror scenario. Nor can they reside on cLVM2 volume groups, since
|
||
|
-they must be accessed by the cluster stack before it has started the
|
||
|
-cLVM2 daemons; hence, these should be either raw partitions or logical
|
||
|
-units on (multipath) storage.
|
||
|
-
|
||
|
-The block device(s) must be accessible from all nodes. (While it is not
|
||
|
-necessary that they share the same path name on all nodes, this is
|
||
|
-considered a very good idea.)
|
||
|
-
|
||
|
-SBD will only use about one megabyte per device, so you can easily
|
||
|
-create a small partition, or very small logical units. (The size of the
|
||
|
-SBD device depends on the block size of the underlying device. Thus, 1MB
|
||
|
-is fine on plain SCSI devices and SAN storage with 512 byte blocks. On
|
||
|
-the IBM s390x architecture in particular, disks default to 4k blocks,
|
||
|
-and thus require roughly 4MB.)
|
||
|
-
|
||
|
-The number of devices will affect the operation of SBD as follows:
|
||
|
-
|
||
|
-=over
|
||
|
-
|
||
|
-=item One device
|
||
|
-
|
||
|
-In its most simple implementation, you use one device only. This is
|
||
|
-appropriate for clusters where all your data is on the same shared
|
||
|
-storage (with internal redundancy) anyway; the SBD device does not
|
||
|
-introduce an additional single point of failure then.
|
||
|
-
|
||
|
-If the SBD device is not accessible, the daemon will fail to start and
|
||
|
-inhibit startup of cluster services.
|
||
|
-
|
||
|
-=item Two devices
|
||
|
-
|
||
|
-This configuration is a trade-off, primarily aimed at environments where
|
||
|
-host-based mirroring is used, but no third storage device is available.
|
||
|
-
|
||
|
-SBD will not commit suicide if it loses access to one mirror leg; this
|
||
|
-allows the cluster to continue to function even in the face of one outage.
|
||
|
-
|
||
|
-However, SBD will not fence the other side while only one mirror leg is
|
||
|
-available, since it does not have enough knowledge to detect an asymmetric
|
||
|
-split of the storage. So it will not be able to automatically tolerate a
|
||
|
-second failure while one of the storage arrays is down. (Though you
|
||
|
-can use the appropriate crm command to acknowledge the fence manually.)
|
||
|
-
|
||
|
-It will not start unless both devices are accessible on boot.
|
||
|
-
|
||
|
-=item Three devices
|
||
|
-
|
||
|
-In this most reliable and recommended configuration, SBD will only
|
||
|
-self-fence if more than one device is lost; hence, this configuration is
|
||
|
-resilient against temporary single device outages (be it due to failures
|
||
|
-or maintenance). Fencing messages can still be successfully relayed if
|
||
|
-at least two devices remain accessible.
|
||
|
-
|
||
|
-This configuration is appropriate for more complex scenarios where
|
||
|
-storage is not confined to a single array. For example, host-based
|
||
|
-mirroring solutions could have one SBD per mirror leg (not mirrored
|
||
|
-itself), and an additional tie-breaker on iSCSI.
|
||
|
-
|
||
|
-It will only start if at least two devices are accessible on boot.
|
||
|
-
|
||
|
-=back
|
||
|
-
|
||
|
-After you have chosen the devices and created the appropriate partitions
|
||
|
-and perhaps multipath alias names to ease management, use the C<sbd create>
|
||
|
-command described above to initialize the SBD metadata on them.
|
||
|
-
|
||
|
-=head3 Sharing the block device(s) between multiple clusters
|
||
|
-
|
||
|
-It is possible to share the block devices between multiple clusters,
|
||
|
-provided the total number of nodes accessing them does not exceed I<255>
|
||
|
-nodes, and they all must share the same SBD timeouts (since these are
|
||
|
-part of the metadata).
|
||
|
-
|
||
|
-If you are using multiple devices this can reduce the setup overhead
|
||
|
-required. However, you should B<not> share devices between clusters in
|
||
|
-different security domains.
|
||
|
-
|
||
|
-=head2 Configure SBD to start on boot
|
||
|
-
|
||
|
-On systems using C<sysvinit>, the C<openais> or C<corosync> system
|
||
|
-start-up scripts must handle starting or stopping C<sbd> as required
|
||
|
-before starting the rest of the cluster stack.
|
||
|
-
|
||
|
-For C<systemd>, sbd simply has to be enabled using
|
||
|
-
|
||
|
- systemctl enable sbd.service
|
||
|
-
|
||
|
-The daemon is brought online on each node before corosync and Pacemaker
|
||
|
-are started, and terminated only after all other cluster components have
|
||
|
-been shut down - ensuring that cluster resources are never activated
|
||
|
-without SBD supervision.
|
||
|
-
|
||
|
-=head2 Configuration via sysconfig
|
||
|
-
|
||
|
-The system instance of C<sbd> is configured via F</etc/sysconfig/sbd>.
|
||
|
-In this file, you must specify the device(s) used, as well as any
|
||
|
-options to pass to the daemon:
|
||
|
-
|
||
|
- SBD_DEVICE="/dev/sda1;/dev/sdb1;/dev/sdc1"
|
||
|
- SBD_PACEMAKER="true"
|
||
|
-
|
||
|
-C<sbd> will fail to start if no C<SBD_DEVICE> is specified. See the
|
||
|
-installed template for more options that can be configured here.
|
||
|
-In general configuration done via parameters takes precedence over
|
||
|
-the configuration from the configuration file.
|
||
|
-
|
||
|
-=head2 Testing the sbd installation
|
||
|
-
|
||
|
-After a restart of the cluster stack on this node, you can now try
|
||
|
-sending a test message to it as root, from this or any other node:
|
||
|
-
|
||
|
- sbd -d /dev/sda1 message node1 test
|
||
|
-
|
||
|
-The node will acknowledge the receipt of the message in the system logs:
|
||
|
-
|
||
|
- Aug 29 14:10:00 node1 sbd: [13412]: info: Received command test from node2
|
||
|
-
|
||
|
-This confirms that SBD is indeed up and running on the node, and that it
|
||
|
-is ready to receive messages.
|
||
|
-
|
||
|
-Make B<sure> that F</etc/sysconfig/sbd> is identical on all cluster
|
||
|
-nodes, and that all cluster nodes are running the daemon.
|
||
|
-
|
||
|
-=head1 Pacemaker CIB integration
|
||
|
-
|
||
|
-=head2 Fencing resource
|
||
|
-
|
||
|
-Pacemaker can only interact with SBD to issue a node fence if there is a
|
||
|
-configure fencing resource. This should be a primitive, not a clone, as
|
||
|
-follows:
|
||
|
-
|
||
|
- primitive fencing-sbd stonith:external/sbd \
|
||
|
- params pcmk_delay_max=30
|
||
|
-
|
||
|
-This will automatically use the same devices as configured in
|
||
|
-F</etc/sysconfig/sbd>.
|
||
|
-
|
||
|
-While you should not configure this as a clone (as Pacemaker will register
|
||
|
-the fencing device on each node automatically), the I<pcmk_delay_max>
|
||
|
-setting enables random fencing delay which ensures, in a scenario where a
|
||
|
-split-brain scenario did occur in a two node cluster, that one of the nodes
|
||
|
-has a better chance to survive to avoid double fencing.
|
||
|
-
|
||
|
-SBD also supports turning the reset request into a crash request, which
|
||
|
-may be helpful for debugging if you have kernel crashdumping configured;
|
||
|
-then, every fence request will cause the node to dump core. You can
|
||
|
-enable this via the C<crashdump="true"> parameter on the fencing
|
||
|
-resource. This is B<not> recommended for production use, but only for
|
||
|
-debugging phases.
|
||
|
-
|
||
|
-=head2 General cluster properties
|
||
|
-
|
||
|
-You must also enable STONITH in general, and set the STONITH timeout to
|
||
|
-be at least twice the I<msgwait> timeout you have configured, to allow
|
||
|
-enough time for the fencing message to be delivered. If your I<msgwait>
|
||
|
-timeout is 60 seconds, this is a possible configuration:
|
||
|
-
|
||
|
- property stonith-enabled="true"
|
||
|
- property stonith-timeout="120s"
|
||
|
-
|
||
|
-B<Caution>: if I<stonith-timeout> is too low for I<msgwait> and the
|
||
|
-system overhead, sbd will never be able to successfully complete a fence
|
||
|
-request. This will create a fencing loop.
|
||
|
-
|
||
|
-Note that the sbd fencing agent will try to detect this and
|
||
|
-automatically extend the I<stonith-timeout> setting to a reasonable
|
||
|
-value, on the assumption that sbd modifying your configuration is
|
||
|
-preferable to not fencing.
|
||
|
-
|
||
|
-=head1 Management tasks
|
||
|
-
|
||
|
-=head2 Recovering from temporary SBD device outage
|
||
|
-
|
||
|
-If you have multiple devices, failure of a single device is not immediately
|
||
|
-fatal. C<sbd> will retry to restart the monitor for the device every 5
|
||
|
-seconds by default. However, you can tune this via the options to the
|
||
|
-I<watch> command.
|
||
|
-
|
||
|
-In case you wish the immediately force a restart of all currently
|
||
|
-disabled monitor processes, you can send a I<SIGUSR1> to the SBD
|
||
|
-I<inquisitor> process.
|
||
|
-
|
||
|
-
|
||
|
-=head1 LICENSE
|
||
|
-
|
||
|
-Copyright (C) 2008-2013 Lars Marowsky-Bree
|
||
|
-
|
||
|
-This program is free software; you can redistribute it and/or
|
||
|
-modify it under the terms of the GNU General Public
|
||
|
-License as published by the Free Software Foundation; either
|
||
|
-version 2 of the License, or (at your option) any later version.
|
||
|
-
|
||
|
-This software is distributed in the hope that it will be useful,
|
||
|
-but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
-General Public License for more details.
|
||
|
-
|
||
|
-For details see the GNU General Public License at
|
||
|
-http://www.gnu.org/licenses/gpl-2.0.html (version 2) and/or
|
||
|
-http://www.gnu.org/licenses/gpl.html (the newest as per "any later").
|
||
|
diff --git a/man/sbd.8.pod.in b/man/sbd.8.pod.in
|
||
|
new file mode 100644
|
||
|
index 0000000..ff89c82
|
||
|
--- /dev/null
|
||
|
+++ b/man/sbd.8.pod.in
|
||
|
@@ -0,0 +1,675 @@
|
||
|
+=head1 NAME
|
||
|
+
|
||
|
+sbd - STONITH Block Device daemon
|
||
|
+
|
||
|
+=head1 SYNOPSIS
|
||
|
+
|
||
|
+sbd <-d F</dev/...>> [options] C<command>
|
||
|
+
|
||
|
+=head1 SUMMARY
|
||
|
+
|
||
|
+SBD provides a node fencing mechanism (Shoot the other node in the head,
|
||
|
+STONITH) for Pacemaker-based clusters through the exchange of messages
|
||
|
+via shared block storage such as for example a SAN, iSCSI, FCoE. This
|
||
|
+isolates the fencing mechanism from changes in firmware version or
|
||
|
+dependencies on specific firmware controllers, and it can be used as a
|
||
|
+STONITH mechanism in all configurations that have reliable shared
|
||
|
+storage.
|
||
|
+
|
||
|
+SBD can also be used without any shared storage. In this mode, the
|
||
|
+watchdog device will be used to reset the node if it loses quorum, if
|
||
|
+any monitored daemon is lost and not recovered or if Pacemaker decides
|
||
|
+that the node requires fencing.
|
||
|
+
|
||
|
+The F<sbd> binary implements both the daemon that watches the message
|
||
|
+slots as well as the management tool for interacting with the block
|
||
|
+storage device(s). This mode of operation is specified via the
|
||
|
+C<command> parameter; some of these modes take additional parameters.
|
||
|
+
|
||
|
+To use SBD with shared storage, you must first C<create> the messaging
|
||
|
+layout on one to three block devices. Second, configure
|
||
|
+F</etc/sysconfig/sbd> to list those devices (and possibly adjust other
|
||
|
+options), and restart the cluster stack on each node to ensure that
|
||
|
+C<sbd> is started. Third, configure the C<external/sbd> fencing
|
||
|
+resource in the Pacemaker CIB.
|
||
|
+
|
||
|
+Each of these steps is documented in more detail below the description
|
||
|
+of the command options.
|
||
|
+
|
||
|
+C<sbd> can only be used as root.
|
||
|
+
|
||
|
+=head2 GENERAL OPTIONS
|
||
|
+
|
||
|
+=over
|
||
|
+
|
||
|
+=item B<-d> F</dev/...>
|
||
|
+
|
||
|
+Specify the block device(s) to be used. If you have more than one,
|
||
|
+specify this option up to three times. This parameter is mandatory for
|
||
|
+all modes, since SBD always needs a block device to interact with.
|
||
|
+
|
||
|
+This man page uses F</dev/sda1>, F</dev/sdb1>, and F</dev/sdc1> as
|
||
|
+example device names for brevity. However, in your production
|
||
|
+environment, you should instead always refer to them by using the long,
|
||
|
+stable device name (e.g.,
|
||
|
+F</dev/disk/by-id/dm-uuid-part1-mpath-3600508b400105b5a0001500000250000>).
|
||
|
+
|
||
|
+=item B<-v|-vv|-vvv>
|
||
|
+
|
||
|
+Enable verbose|debug|debug-library logging (optional)
|
||
|
+
|
||
|
+=item B<-h>
|
||
|
+
|
||
|
+Display a concise summary of C<sbd> options.
|
||
|
+
|
||
|
+=item B<-n> I<node>
|
||
|
+
|
||
|
+Set local node name; defaults to C<uname -n>. This should not need to be
|
||
|
+set.
|
||
|
+
|
||
|
+=item B<-R>
|
||
|
+
|
||
|
+Do B<not> enable realtime priority. By default, C<sbd> runs at realtime
|
||
|
+priority, locks itself into memory, and also acquires highest IO
|
||
|
+priority to protect itself against interference from other processes on
|
||
|
+the system. This is a debugging-only option.
|
||
|
+
|
||
|
+=item B<-I> I<N>
|
||
|
+
|
||
|
+Async IO timeout (defaults to 3 seconds, optional). You should not need
|
||
|
+to adjust this unless your IO setup is really very slow.
|
||
|
+
|
||
|
+(In daemon mode, the watchdog is refreshed when the majority of devices
|
||
|
+could be read within this time.)
|
||
|
+
|
||
|
+=back
|
||
|
+
|
||
|
+=head2 create
|
||
|
+
|
||
|
+Example usage:
|
||
|
+
|
||
|
+ sbd -d /dev/sdc2 -d /dev/sdd3 create
|
||
|
+
|
||
|
+If you specify the I<create> command, sbd will write a metadata header
|
||
|
+to the device(s) specified and also initialize the messaging slots for
|
||
|
+up to 255 nodes.
|
||
|
+
|
||
|
+B<Warning>: This command will not prompt for confirmation. Roughly the
|
||
|
+first megabyte of the specified block device(s) will be overwritten
|
||
|
+immediately and without backup.
|
||
|
+
|
||
|
+This command accepts a few options to adjust the default timings that
|
||
|
+are written to the metadata (to ensure they are identical across all
|
||
|
+nodes accessing the device).
|
||
|
+
|
||
|
+=over
|
||
|
+
|
||
|
+=item B<-1> I<N>
|
||
|
+
|
||
|
+Set watchdog timeout to N seconds. This depends mostly on your storage
|
||
|
+latency; the majority of devices must be successfully read within this
|
||
|
+time, or else the node will self-fence.
|
||
|
+
|
||
|
+If your sbd device(s) reside on a multipath setup or iSCSI, this should
|
||
|
+be the time required to detect a path failure. You may be able to reduce
|
||
|
+this if your device outages are independent, or if you are using the
|
||
|
+Pacemaker integration.
|
||
|
+
|
||
|
+=item B<-2> I<N>
|
||
|
+
|
||
|
+Set slot allocation timeout to N seconds. You should not need to tune
|
||
|
+this.
|
||
|
+
|
||
|
+=item B<-3> I<N>
|
||
|
+
|
||
|
+Set daemon loop timeout to N seconds. You should not need to tune this.
|
||
|
+
|
||
|
+=item B<-4> I<N>
|
||
|
+
|
||
|
+Set I<msgwait> timeout to N seconds. This should be twice the I<watchdog>
|
||
|
+timeout. This is the time after which a message written to a node's slot
|
||
|
+will be considered delivered. (Or long enough for the node to detect
|
||
|
+that it needed to self-fence.)
|
||
|
+
|
||
|
+This also affects the I<stonith-timeout> in Pacemaker's CIB; see below.
|
||
|
+
|
||
|
+=back
|
||
|
+
|
||
|
+=head2 list
|
||
|
+
|
||
|
+Example usage:
|
||
|
+
|
||
|
+ # sbd -d /dev/sda1 list
|
||
|
+ 0 hex-0 clear
|
||
|
+ 1 hex-7 clear
|
||
|
+ 2 hex-9 clear
|
||
|
+
|
||
|
+List all allocated slots on device, and messages. You should see all
|
||
|
+cluster nodes that have ever been started against this device. Nodes
|
||
|
+that are currently running should have a I<clear> state; nodes that have
|
||
|
+been fenced, but not yet restarted, will show the appropriate fencing
|
||
|
+message.
|
||
|
+
|
||
|
+=head2 dump
|
||
|
+
|
||
|
+Example usage:
|
||
|
+
|
||
|
+ # sbd -d /dev/sda1 dump
|
||
|
+ ==Dumping header on disk /dev/sda1
|
||
|
+ Header version : 2
|
||
|
+ Number of slots : 255
|
||
|
+ Sector size : 512
|
||
|
+ Timeout (watchdog) : 15
|
||
|
+ Timeout (allocate) : 2
|
||
|
+ Timeout (loop) : 1
|
||
|
+ Timeout (msgwait) : 30
|
||
|
+ ==Header on disk /dev/sda1 is dumped
|
||
|
+
|
||
|
+Dump meta-data header from device.
|
||
|
+
|
||
|
+=head2 watch
|
||
|
+
|
||
|
+Example usage:
|
||
|
+
|
||
|
+ sbd -d /dev/sdc2 -d /dev/sdd3 -P watch
|
||
|
+
|
||
|
+This command will make C<sbd> start in daemon mode. It will constantly monitor
|
||
|
+the message slot of the local node for incoming messages, reachability, and
|
||
|
+optionally take Pacemaker's state into account.
|
||
|
+
|
||
|
+C<sbd> B<must> be started on boot before the cluster stack! See below
|
||
|
+for enabling this according to your boot environment.
|
||
|
+
|
||
|
+The options for this mode are rarely specified directly on the
|
||
|
+command line, but are most frequently set via F</etc/sysconfig/sbd>.
|
||
|
+
|
||
|
+It also constantly monitors connectivity to the storage device, and
|
||
|
+self-fences in case the partition becomes unreachable, guaranteeing that it
|
||
|
+does not disconnect from fencing messages.
|
||
|
+
|
||
|
+A node slot is automatically allocated on the device(s) the first time
|
||
|
+the daemon starts watching the device; hence, manual allocation is not
|
||
|
+usually required.
|
||
|
+
|
||
|
+If a watchdog is used together with C<sbd>, as is strongly
|
||
|
+recommended, the watchdog is activated at initial start of the sbd
|
||
|
+daemon. The watchdog is refreshed every time the majority of SBD devices
|
||
|
+has been successfully read. Using a watchdog provides additional
|
||
|
+protection against C<sbd> crashing.
|
||
|
+
|
||
|
+If the Pacemaker integration is activated, C<sbd> will B<not> self-fence
|
||
|
+if device majority is lost, if:
|
||
|
+
|
||
|
+=over
|
||
|
+
|
||
|
+=item 1.
|
||
|
+
|
||
|
+The partition the node is in is still quorate according to the CIB;
|
||
|
+
|
||
|
+=item 2.
|
||
|
+
|
||
|
+it is still quorate according to Corosync's node count;
|
||
|
+
|
||
|
+=item 3.
|
||
|
+
|
||
|
+the node itself is considered online and healthy by Pacemaker.
|
||
|
+
|
||
|
+=back
|
||
|
+
|
||
|
+This allows C<sbd> to survive temporary outages of the majority of
|
||
|
+devices. However, while the cluster is in such a degraded state, it can
|
||
|
+neither successfully fence nor be shut down cleanly (as taking the
|
||
|
+cluster below the quorum threshold will immediately cause all remaining
|
||
|
+nodes to self-fence). In short, it will not tolerate any further faults.
|
||
|
+Please repair the system before continuing.
|
||
|
+
|
||
|
+There is one C<sbd> process that acts as a master to which all watchers
|
||
|
+report; one per device to monitor the node's slot; and, optionally, one
|
||
|
+that handles the Pacemaker integration.
|
||
|
+
|
||
|
+=over
|
||
|
+
|
||
|
+=item B<-W>
|
||
|
+
|
||
|
+Enable or disable use of the system watchdog to protect against the sbd
|
||
|
+processes failing and the node being left in an undefined state. Specify
|
||
|
+this once to enable, twice to disable.
|
||
|
+
|
||
|
+Defaults to I<enabled>.
|
||
|
+
|
||
|
+=item B<-w> F</dev/watchdog>
|
||
|
+
|
||
|
+This can be used to override the default watchdog device used and should not
|
||
|
+usually be necessary.
|
||
|
+
|
||
|
+=item B<-p> F</var/run/sbd.pid>
|
||
|
+
|
||
|
+This option can be used to specify a pidfile for the main sbd process.
|
||
|
+
|
||
|
+=item B<-F> I<N>
|
||
|
+
|
||
|
+Number of failures before a failing servant process will not be restarted
|
||
|
+immediately until the dampening delay has expired. If set to zero, servants
|
||
|
+will be restarted immediately and indefinitely. If set to one, a failed
|
||
|
+servant will be restarted once every B<-t> seconds. If set to a different
|
||
|
+value, the servant will be restarted that many times within the dampening
|
||
|
+period and then delay.
|
||
|
+
|
||
|
+Defaults to I<1>.
|
||
|
+
|
||
|
+=item B<-t> I<N>
|
||
|
+
|
||
|
+Dampening delay before faulty servants are restarted. Combined with C<-F 1>,
|
||
|
+the most logical way to tune the restart frequency of servant processes.
|
||
|
+Default is 5 seconds.
|
||
|
+
|
||
|
+If set to zero, processes will be restarted indefinitely and immediately.
|
||
|
+
|
||
|
+=item B<-P>
|
||
|
+
|
||
|
+Enable Pacemaker integration which checks Pacemaker quorum and node health.
|
||
|
+Specify this once to enable, twice to disable.
|
||
|
+
|
||
|
+Defaults to I<enabled>.
|
||
|
+
|
||
|
+=item B<-S> I<N>
|
||
|
+
|
||
|
+Set the start mode. (Defaults to I<0>.)
|
||
|
+
|
||
|
+If this is set to zero, sbd will always start up unconditionally,
|
||
|
+regardless of whether the node was previously fenced or not.
|
||
|
+
|
||
|
+If set to one, sbd will only start if the node was previously shut down
|
||
|
+cleanly (as indicated by an exit request message in the slot), or if the
|
||
|
+slot is empty. A reset, crashdump, or power-off request in any slot will
|
||
|
+halt the start up.
|
||
|
+
|
||
|
+This is useful to prevent nodes from rejoining if they were faulty. The
|
||
|
+node must be manually "unfenced" by sending an empty message to it:
|
||
|
+
|
||
|
+ sbd -d /dev/sda1 message node1 clear
|
||
|
+
|
||
|
+=item B<-s> I<N>
|
||
|
+
|
||
|
+Set the start-up wait time for devices. (Defaults to I<120>.)
|
||
|
+
|
||
|
+Dynamic block devices such as iSCSI might not be fully initialized and
|
||
|
+present yet. This allows one to set a timeout for waiting for devices to
|
||
|
+appear on start-up. If set to 0, start-up will be aborted immediately if
|
||
|
+no devices are available.
|
||
|
+
|
||
|
+=item B<-Z>
|
||
|
+
|
||
|
+Enable trace mode. B<Warning: this is unsafe for production, use at your
|
||
|
+own risk!> Specifying this once will turn all reboots or power-offs, be
|
||
|
+they caused by self-fence decisions or messages, into a crashdump.
|
||
|
+Specifying this twice will just log them but not continue running.
|
||
|
+
|
||
|
+=item B<-T>
|
||
|
+
|
||
|
+By default, the daemon will set the watchdog timeout as specified in the
|
||
|
+device metadata. However, this does not work for every watchdog device.
|
||
|
+In this case, you must manually ensure that the watchdog timeout used by
|
||
|
+the system correctly matches the SBD settings, and then specify this
|
||
|
+option to allow C<sbd> to continue with start-up.
|
||
|
+
|
||
|
+=item B<-5> I<N>
|
||
|
+
|
||
|
+Warn if the time interval for tickling the watchdog exceeds this many seconds.
|
||
|
+Since the node is unable to log the watchdog expiry (it reboots immediately
|
||
|
+without a chance to write its logs to disk), this is very useful for getting
|
||
|
+an indication that the watchdog timeout is too short for the IO load of the
|
||
|
+system.
|
||
|
+
|
||
|
+Default is 3 seconds, set to zero to disable.
|
||
|
+
|
||
|
+=item B<-C> I<N>
|
||
|
+
|
||
|
+Watchdog timeout to set before crashdumping. If SBD is set to crashdump
|
||
|
+instead of reboot - either via the trace mode settings or the I<external/sbd>
|
||
|
+fencing agent's parameter -, SBD will adjust the watchdog timeout to this
|
||
|
+setting before triggering the dump. Otherwise, the watchdog might trigger and
|
||
|
+prevent a successful crashdump from ever being written.
|
||
|
+
|
||
|
+Set to zero (= default) to disable.
|
||
|
+
|
||
|
+=item B<-r> I<N>
|
||
|
+
|
||
|
+Actions to be executed when the watchers don't report in time to the sbd
|
||
|
+master process or one of the watchers detects that the master process
|
||
|
+has died.
|
||
|
+
|
||
|
+Set timeout-action to comma-separated combination of
|
||
|
+noflush|flush plus reboot|crashdump|off.
|
||
|
+If only one of the two is given, the other stays at the default.
|
||
|
+
|
||
|
+This doesn't affect actions like off, crashdump, reboot explicitly
|
||
|
+triggered via message slots.
|
||
|
+Nor does it configure the action a watchdog would
|
||
|
+trigger should it run off (there is no generic interface).
|
||
|
+
|
||
|
+Defaults to flush,reboot.
|
||
|
+
|
||
|
+=back
|
||
|
+
|
||
|
+=head2 allocate
|
||
|
+
|
||
|
+Example usage:
|
||
|
+
|
||
|
+ sbd -d /dev/sda1 allocate node1
|
||
|
+
|
||
|
+Explicitly allocates a slot for the specified node name. This should
|
||
|
+rarely be necessary, as every node will automatically allocate itself a
|
||
|
+slot the first time it starts up in watch mode.
|
||
|
+
|
||
|
+=head2 message
|
||
|
+
|
||
|
+Example usage:
|
||
|
+
|
||
|
+ sbd -d /dev/sda1 message node1 test
|
||
|
+
|
||
|
+Writes the specified message to node's slot. This is rarely done
|
||
|
+directly, but rather abstracted via the C<external/sbd> fencing agent
|
||
|
+configured as a cluster resource.
|
||
|
+
|
||
|
+Supported message types are:
|
||
|
+
|
||
|
+=over
|
||
|
+
|
||
|
+=item test
|
||
|
+
|
||
|
+This only generates a log message on the receiving node and can be used
|
||
|
+to check if SBD is seeing the device. Note that this could overwrite a
|
||
|
+fencing request sent by the cluster, so should not be used during
|
||
|
+production.
|
||
|
+
|
||
|
+=item reset
|
||
|
+
|
||
|
+Reset the target upon receipt of this message.
|
||
|
+
|
||
|
+=item off
|
||
|
+
|
||
|
+Power-off the target.
|
||
|
+
|
||
|
+=item crashdump
|
||
|
+
|
||
|
+Cause the target node to crashdump.
|
||
|
+
|
||
|
+=item exit
|
||
|
+
|
||
|
+This will make the C<sbd> daemon exit cleanly on the target. You should
|
||
|
+B<not> send this message manually; this is handled properly during
|
||
|
+shutdown of the cluster stack. Manually stopping the daemon means the
|
||
|
+node is unprotected!
|
||
|
+
|
||
|
+=item clear
|
||
|
+
|
||
|
+This message indicates that no real message has been sent to the node.
|
||
|
+You should not set this manually; C<sbd> will clear the message slot
|
||
|
+automatically during start-up, and setting this manually could overwrite
|
||
|
+a fencing message by the cluster.
|
||
|
+
|
||
|
+=back
|
||
|
+
|
||
|
+=head2 query-watchdog
|
||
|
+
|
||
|
+Example usage:
|
||
|
+
|
||
|
+ sbd query-watchdog
|
||
|
+
|
||
|
+Check for available watchdog devices and print some info.
|
||
|
+
|
||
|
+B<Warning>: This command will arm the watchdog during query, and if your
|
||
|
+watchdog refuses disarming (for example, if its kernel module has the
|
||
|
+'nowayout' parameter set) this will reset your system.
|
||
|
+
|
||
|
+=head2 test-watchdog
|
||
|
+
|
||
|
+Example usage:
|
||
|
+
|
||
|
+ sbd test-watchdog [-w /dev/watchdog3]
|
||
|
+
|
||
|
+Test specified watchdog device (/dev/watchdog by default).
|
||
|
+
|
||
|
+B<Warning>: This command will arm the watchdog and have your system reset
|
||
|
+in case your watchdog is working properly! If issued from an interactive
|
||
|
+session, it will prompt for confirmation.
|
||
|
+
|
||
|
+=head1 Base system configuration
|
||
|
+
|
||
|
+=head2 Configure a watchdog
|
||
|
+
|
||
|
+It is highly recommended that you configure your Linux system to load a
|
||
|
+watchdog driver with hardware assistance (as is available on most modern
|
||
|
+systems), such as I<hpwdt>, I<iTCO_wdt>, or others. As a fall-back, you
|
||
|
+can use the I<softdog> module.
|
||
|
+
|
||
|
+No other software must access the watchdog timer; it can only be
|
||
|
+accessed by one process at any given time. Some hardware vendors ship
|
||
|
+systems management software that uses the watchdog for system resets
|
||
|
+(e.g., HP ASR daemon). Such software has to be disabled if the watchdog
|
||
|
+is to be used by SBD.
|
||
|
+
|
||
|
+=head2 Choosing and initializing the block device(s)
|
||
|
+
|
||
|
+First, you have to decide if you want to use one, two, or three devices.
|
||
|
+
|
||
|
+If you are using multiple ones, they should reside on independent
|
||
|
+storage setups. Putting all three of them on the same logical unit for
|
||
|
+example would not provide any additional redundancy.
|
||
|
+
|
||
|
+The SBD device can be connected via Fibre Channel, Fibre Channel over
|
||
|
+Ethernet, or even iSCSI. Thus, an iSCSI target can become a sort-of
|
||
|
+network-based quorum server; the advantage is that it does not require
|
||
|
+a smart host at your third location, just block storage.
|
||
|
+
|
||
|
+The SBD partitions themselves B<must not> be mirrored (via MD,
|
||
|
+DRBD, or the storage layer itself), since this could result in a
|
||
|
+split-mirror scenario. Nor can they reside on cLVM2 volume groups, since
|
||
|
+they must be accessed by the cluster stack before it has started the
|
||
|
+cLVM2 daemons; hence, these should be either raw partitions or logical
|
||
|
+units on (multipath) storage.
|
||
|
+
|
||
|
+The block device(s) must be accessible from all nodes. (While it is not
|
||
|
+necessary that they share the same path name on all nodes, this is
|
||
|
+considered a very good idea.)
|
||
|
+
|
||
|
+SBD will only use about one megabyte per device, so you can easily
|
||
|
+create a small partition, or very small logical units. (The size of the
|
||
|
+SBD device depends on the block size of the underlying device. Thus, 1MB
|
||
|
+is fine on plain SCSI devices and SAN storage with 512 byte blocks. On
|
||
|
+the IBM s390x architecture in particular, disks default to 4k blocks,
|
||
|
+and thus require roughly 4MB.)
|
||
|
+
|
||
|
+The number of devices will affect the operation of SBD as follows:
|
||
|
+
|
||
|
+=over
|
||
|
+
|
||
|
+=item One device
|
||
|
+
|
||
|
+In its most simple implementation, you use one device only. This is
|
||
|
+appropriate for clusters where all your data is on the same shared
|
||
|
+storage (with internal redundancy) anyway; the SBD device does not
|
||
|
+introduce an additional single point of failure then.
|
||
|
+
|
||
|
+If the SBD device is not accessible, the daemon will fail to start and
|
||
|
+inhibit startup of cluster services.
|
||
|
+
|
||
|
+=item Two devices
|
||
|
+
|
||
|
+This configuration is a trade-off, primarily aimed at environments where
|
||
|
+host-based mirroring is used, but no third storage device is available.
|
||
|
+
|
||
|
+SBD will not commit suicide if it loses access to one mirror leg; this
|
||
|
+allows the cluster to continue to function even in the face of one outage.
|
||
|
+
|
||
|
+However, SBD will not fence the other side while only one mirror leg is
|
||
|
+available, since it does not have enough knowledge to detect an asymmetric
|
||
|
+split of the storage. So it will not be able to automatically tolerate a
|
||
|
+second failure while one of the storage arrays is down. (Though you
|
||
|
+can use the appropriate crm command to acknowledge the fence manually.)
|
||
|
+
|
||
|
+It will not start unless both devices are accessible on boot.
|
||
|
+
|
||
|
+=item Three devices
|
||
|
+
|
||
|
+In this most reliable and recommended configuration, SBD will only
|
||
|
+self-fence if more than one device is lost; hence, this configuration is
|
||
|
+resilient against temporary single device outages (be it due to failures
|
||
|
+or maintenance). Fencing messages can still be successfully relayed if
|
||
|
+at least two devices remain accessible.
|
||
|
+
|
||
|
+This configuration is appropriate for more complex scenarios where
|
||
|
+storage is not confined to a single array. For example, host-based
|
||
|
+mirroring solutions could have one SBD per mirror leg (not mirrored
|
||
|
+itself), and an additional tie-breaker on iSCSI.
|
||
|
+
|
||
|
+It will only start if at least two devices are accessible on boot.
|
||
|
+
|
||
|
+=back
|
||
|
+
|
||
|
+After you have chosen the devices and created the appropriate partitions
|
||
|
+and perhaps multipath alias names to ease management, use the C<sbd create>
|
||
|
+command described above to initialize the SBD metadata on them.
|
||
|
+
|
||
|
+=head3 Sharing the block device(s) between multiple clusters
|
||
|
+
|
||
|
+It is possible to share the block devices between multiple clusters,
|
||
|
+provided the total number of nodes accessing them does not exceed I<255>
|
||
|
+nodes, and they all must share the same SBD timeouts (since these are
|
||
|
+part of the metadata).
|
||
|
+
|
||
|
+If you are using multiple devices this can reduce the setup overhead
|
||
|
+required. However, you should B<not> share devices between clusters in
|
||
|
+different security domains.
|
||
|
+
|
||
|
+=head2 Configure SBD to start on boot
|
||
|
+
|
||
|
+On systems using C<sysvinit>, the C<openais> or C<corosync> system
|
||
|
+start-up scripts must handle starting or stopping C<sbd> as required
|
||
|
+before starting the rest of the cluster stack.
|
||
|
+
|
||
|
+For C<systemd>, sbd simply has to be enabled using
|
||
|
+
|
||
|
+ systemctl enable sbd.service
|
||
|
+
|
||
|
+The daemon is brought online on each node before corosync and Pacemaker
|
||
|
+are started, and terminated only after all other cluster components have
|
||
|
+been shut down - ensuring that cluster resources are never activated
|
||
|
+without SBD supervision.
|
||
|
+
|
||
|
+=head2 Configuration via sysconfig
|
||
|
+
|
||
|
+The system instance of C<sbd> is configured via F</etc/sysconfig/sbd>.
|
||
|
+In this file, you must specify the device(s) used, as well as any
|
||
|
+options to pass to the daemon:
|
||
|
+
|
||
|
+ SBD_DEVICE="/dev/sda1;/dev/sdb1;/dev/sdc1"
|
||
|
+ SBD_PACEMAKER="true"
|
||
|
+
|
||
|
+C<sbd> will fail to start if no C<SBD_DEVICE> is specified. See the
|
||
|
+installed template or section for configuration via environment
|
||
|
+for more options that can be configured here.
|
||
|
+In general, configuration done via parameters takes precedence over
|
||
|
+the configuration from the configuration file.
|
||
|
+
|
||
|
+=head2 Configuration via environment
|
||
|
+
|
||
|
+=over
|
||
|
+@environment_section@
|
||
|
+=back
|
||
|
+
|
||
|
+=head2 Testing the sbd installation
|
||
|
+
|
||
|
+After a restart of the cluster stack on this node, you can now try
|
||
|
+sending a test message to it as root, from this or any other node:
|
||
|
+
|
||
|
+ sbd -d /dev/sda1 message node1 test
|
||
|
+
|
||
|
+The node will acknowledge the receipt of the message in the system logs:
|
||
|
+
|
||
|
+ Aug 29 14:10:00 node1 sbd: [13412]: info: Received command test from node2
|
||
|
+
|
||
|
+This confirms that SBD is indeed up and running on the node, and that it
|
||
|
+is ready to receive messages.
|
||
|
+
|
||
|
+Make B<sure> that F</etc/sysconfig/sbd> is identical on all cluster
|
||
|
+nodes, and that all cluster nodes are running the daemon.
|
||
|
+
|
||
|
+=head1 Pacemaker CIB integration
|
||
|
+
|
||
|
+=head2 Fencing resource
|
||
|
+
|
||
|
+Pacemaker can only interact with SBD to issue a node fence if there is a
|
||
|
+configured fencing resource. This should be a primitive, not a clone, as
|
||
|
+follows:
|
||
|
+
|
||
|
+ primitive fencing-sbd stonith:external/sbd \
|
||
|
+ params pcmk_delay_max=30
|
||
|
+
|
||
|
+This will automatically use the same devices as configured in
|
||
|
+F</etc/sysconfig/sbd>.
|
||
|
+
|
||
|
+While you should not configure this as a clone (as Pacemaker will register
|
||
|
+the fencing device on each node automatically), the I<pcmk_delay_max>
|
||
|
+setting enables a random fencing delay which ensures, in the event that a
|
||
|
+split-brain scenario did occur in a two node cluster, that one of the nodes
|
||
|
+has a better chance to survive to avoid double fencing.
|
||
|
+
|
||
|
+SBD also supports turning the reset request into a crash request, which
|
||
|
+may be helpful for debugging if you have kernel crashdumping configured;
|
||
|
+then, every fence request will cause the node to dump core. You can
|
||
|
+enable this via the C<crashdump="true"> parameter on the fencing
|
||
|
+resource. This is B<not> recommended for production use, but only for
|
||
|
+debugging phases.
|
||
|
+
|
||
|
+=head2 General cluster properties
|
||
|
+
|
||
|
+You must also enable STONITH in general, and set the STONITH timeout to
|
||
|
+be at least twice the I<msgwait> timeout you have configured, to allow
|
||
|
+enough time for the fencing message to be delivered. If your I<msgwait>
|
||
|
+timeout is 60 seconds, this is a possible configuration:
|
||
|
+
|
||
|
+ property stonith-enabled="true"
|
||
|
+ property stonith-timeout="120s"
|
||
|
+
|
||
|
+B<Caution>: if I<stonith-timeout> is too low for I<msgwait> and the
|
||
|
+system overhead, sbd will never be able to successfully complete a fence
|
||
|
+request. This will create a fencing loop.
|
||
|
+
|
||
|
+Note that the sbd fencing agent will try to detect this and
|
||
|
+automatically extend the I<stonith-timeout> setting to a reasonable
|
||
|
+value, on the assumption that sbd modifying your configuration is
|
||
|
+preferable to not fencing.
|
||
|
+
|
||
|
+=head1 Management tasks
|
||
|
+
|
||
|
+=head2 Recovering from temporary SBD device outage
|
||
|
+
|
||
|
+If you have multiple devices, failure of a single device is not immediately
|
||
|
+fatal. C<sbd> will retry to restart the monitor for the device every 5
|
||
|
+seconds by default. However, you can tune this via the options to the
|
||
|
+I<watch> command.
|
||
|
+
|
||
|
+In case you wish to immediately force a restart of all currently
|
||
|
+disabled monitor processes, you can send a I<SIGUSR1> to the SBD
|
||
|
+I<inquisitor> process.
|
||
|
+
|
||
|
+
|
||
|
+=head1 LICENSE
|
||
|
+
|
||
|
+Copyright (C) 2008-2013 Lars Marowsky-Bree
|
||
|
+
|
||
|
+This program is free software; you can redistribute it and/or
|
||
|
+modify it under the terms of the GNU General Public
|
||
|
+License as published by the Free Software Foundation; either
|
||
|
+version 2 of the License, or (at your option) any later version.
|
||
|
+
|
||
|
+This software is distributed in the hope that it will be useful,
|
||
|
+but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
+General Public License for more details.
|
||
|
+
|
||
|
+For details see the GNU General Public License at
|
||
|
+http://www.gnu.org/licenses/gpl-2.0.html (version 2) and/or
|
||
|
+http://www.gnu.org/licenses/gpl.html (the newest as per "any later").
|
||
|
diff --git a/src/sbd.sysconfig b/src/sbd.sysconfig
|
||
|
index e1a60ed..33b50d0 100644
|
||
|
--- a/src/sbd.sysconfig
|
||
|
+++ b/src/sbd.sysconfig
|
||
|
@@ -14,7 +14,7 @@
|
||
|
#
|
||
|
SBD_PACEMAKER=yes
|
||
|
|
||
|
-## Type: list(always,clean)
|
||
|
+## Type: always / clean
|
||
|
## Default: always
|
||
|
#
|
||
|
# Specify the start mode for sbd. Setting this to "clean" will only
|
||
|
@@ -103,6 +103,7 @@ SBD_TIMEOUT_ACTION=flush,reboot
|
||
|
# Thus in auto-mode sbd will check if the slice has RT-budget assigned.
|
||
|
# If that is the case sbd will stay in that slice while it will
|
||
|
# be moved to root-slice otherwise.
|
||
|
+#
|
||
|
SBD_MOVE_TO_ROOT_CGROUP=auto
|
||
|
|
||
|
## Type: string
|
||
|
--
|
||
|
1.8.3.1
|
||
|
|