13178ca5fa
This synchronises sos in master to the current upstream sos-2.2 branch and includes all patches from RHEL6 not already present.
diff -up sos-2.2/sos/plugins/cluster.py.orig sos-2.2/sos/plugins/cluster.py
--- sos-2.2/sos/plugins/cluster.py.orig 2011-02-04 14:50:31.510521528 -0500
+++ sos-2.2/sos/plugins/cluster.py 2011-02-04 14:51:19.485399800 -0500
@@ -14,12 +14,12 @@

import sos.plugintools
import os, re
-import time, libxml2
-import glob
+from glob import glob

class cluster(sos.plugintools.PluginBase):
"""cluster suite and GFS related information
"""
+
optionList = [("gfslockdump", 'gather output of gfs lockdumps', 'slow', False),
('lockdump', 'gather dlm lockdumps', 'slow', False)]

@@ -40,190 +40,9 @@ class cluster(sos.plugintools.PluginBase
self.files = [ "/etc/cluster/cluster.conf" ]
return sos.plugintools.PluginBase.checkenabled(self)

- def has_gfs(self):
- return (len(self.doRegexFindAll(r'^\S+\s+\S+\s+gfs\s+.*$', "/etc/mtab")) > 0)
-
- def diagnose(self):
+ def setup(self):
rhelver = self.policy().rhelVersion()

- # check if the minimum set of packages is installed
- # for RHEL4 RHCS(ccs, cman, cman-kernel, magma, magma-plugins, (dlm, dlm-kernel) || gulm, perl-Net-Telnet, rgmanager, fence)
- # RHEL4 GFS (GFS, GFS-kernel, ccs, lvm2-cluster, fence)
-
- pkgs_check = []
- mods_check = []
- serv_check = []
-
- if rhelver == 4:
- pkgs_check.extend( [ "ccs", "cman", "magma", "magma-plugins", "perl-Net-Telnet", "rgmanager", "fence" ] )
- mods_check.extend( [ "cman", "dlm" ] )
- if self.has_gfs():
- mods_check.append("gfs")
- serv_check.extend( [ "cman", "ccsd", "rgmanager", "fenced" ] )
- if self.has_gfs():
- serv_check.extend( ["gfs", "clvmd"] )
- elif rhelver == 5:
- pkgs_check.extend ( [ "cman", "perl-Net-Telnet", "rgmanager" ] )
- mods_check.extend( [ "dlm" ] )
- if self.has_gfs():
- mods_check.extend( ["gfs", "gfs2"] )
- serv_check.extend( [ "cman", "rgmanager" ] )
- if self.has_gfs():
- serv_check.extend( ["gfs", "clvmd"] )
- elif rhelver == 6:
- serv_check.extend( [ "fenced", "corosync", "dlm_controld"] )
- if self.has_gfs():
- serv_check.extend( ["gfs_controld"] )
-
- # check that kernel module packages are installed for
- # running kernel version
-
- for modname in mods_check:
- found = 0
-
- if self.policy().allPkgsByNameRegex( "^" + modname ):
- found = 1
-
- ret, out, time = self.callExtProg('/sbin/modinfo -F vermagic ' + modname)
-
- if ret == 0:
- found = 2
-
- if len(self.fileGrep("^%s\s+" % modname, "/proc/modules")) > 0:
- found = 3
-
- if found == 0:
- self.addDiagnose("required kernel module is missing: %s" % modname)
- elif found == 1:
- self.addDiagnose("required module is not available for current kernel: %s" % modname)
- elif found == 2:
- self.addDiagnose("required module is available but not loaded: %s" % modname)
-
- for pkg in pkgs_check:
- if not self.isInstalled(pkg):
- self.addDiagnose("required package is missing: %s" % pkg)
-
- if rhelver == "4":
- # (dlm, dlm-kernel) || gulm
- if not ((self.isInstalled("dlm") and self.isInstalled("dlm-kernel")) or self.isInstalled("gulm")):
- self.addDiagnose("required packages are missing: (dlm, dlm-kernel) || gulm")
-
- # check if all the needed daemons are active at sosreport time
- # check if they are started at boot time in RHEL4 RHCS (cman, ccsd, rgmanager, fenced)
- # and GFS (gfs, ccsd, clvmd, fenced)
-
- for service in serv_check:
- ret, out, time = self.callExtProg("/sbin/service %s status &> /dev/null" % service)
- if ret != 0:
- self.addDiagnose("service %s is not running" % service)
-
- if not self.policy().runlevelDefault() in self.policy().runlevelByService(service):
- self.addDiagnose("service %s is not started in default runlevel" % service)
-
- # FIXME: missing important cman services
- # FIXME: any cman service whose state != run ?
- # Fence Domain: "default" 2 2 run -
-
- # is cluster quorate
- if not self.is_cluster_quorate():
- self.addDiagnose("cluster node is not quorate")
-
- # if there is no cluster.conf, diagnose() finishes here.
- try:
- os.stat("/etc/cluster/cluster.conf")
- except:
- self.addDiagnose("/etc/cluster/cluster.conf is missing")
- return
-
- # setup XML xpath context
- xml = libxml2.parseFile("/etc/cluster/cluster.conf")
- xpathContext = xml.xpathNewContext()
-
- # make sure that the node names are valid according to RFC 2181
- for hostname in xpathContext.xpathEval('/cluster/clusternodes/clusternode/@name'):
- if not re.match('^[a-zA-Z]([a-zA-Z0-9-]*[a-zA-Z0-9])?(\.[a-zA-Z]([a-zA-Z0-9-]*[a-zA-Z0-9])?)*$', hostname.content):
- self.addDiagnose("node name (%s) contains invalid characters" % hostname.content)
-
- # do not rely on DNS to resolve node names, must have them in /etc/hosts
- for hostname in xpathContext.xpathEval('/cluster/clusternodes/clusternode/@name'):
- if len(self.fileGrep(r'^.*\W+%s' % hostname.content , "/etc/hosts")) == 0:
- self.addDiagnose("node %s is not defined in /etc/hosts" % hostname.content)
-
- # check fencing (warn on no fencing)
- if len(xpathContext.xpathEval("/cluster/clusternodes/clusternode[not(fence/method/device)]")):
- if self.has_gfs():
- self.addDiagnose("one or more nodes have no fencing agent configured: fencing is required for GFS to work")
- else:
- self.addDiagnose("one or more nodes have no fencing agent configured: the cluster infrastructure might not work as intended")
-
- # check fencing (warn on manual)
- if len(xpathContext.xpathEval("/cluster/clusternodes/clusternode[/cluster/fencedevices/fencedevice[@agent='fence_manual']/@name=fence/method/device/@name]")):
- self.addDiagnose("one or more nodes have manual fencing agent configured (data integrity is not guaranteed)")
-
- # if fence_ilo or fence_drac, make sure acpid is not running
- ret, hostname, time = self.callExtProg("/bin/uname -n")
- hostname = hostname.split(".")[0]
- if len(xpathContext.xpathEval('/cluster/clusternodes/clusternode[@name = "%s" and /cluster/fencedevices/fencedevice[@agent="fence_rsa" or @agent="fence_drac"]/@name=fence/method/device/@name]' % hostname )):
- ret, out, time = self.callExtProg("/sbin/service acpid status")
- if ret == 0 or self.policy().runlevelDefault() in self.policy().runlevelByService("acpid"):
- self.addDiagnose("acpid is enabled, this may cause problems with your fencing method.")
-
- # check for fs exported via nfs without nfsid attribute
- if len(xpathContext.xpathEval("/cluster/rm/service//fs[not(@fsid)]/nfsexport")):
- for xmlNode in xpathContext.xpathEval("/cluster/rm/service//fs[not(@fsid)]"):
- fsRefAttribute = xmlNode.xpathEval("@ref")
- if (len(fsRefAttribute) > 0) :
- fsRefName = fsRefAttribute[0].content
- if len(xpathContext.xpathEval("cluster/rm/resources/fs[@name='%s'][not(@fsid)]" % fsRefName)):
- self.addDiagnose("one or more nfs export do not have a fsid attribute set.")
- break
- else:
- self.addDiagnose("one or more nfs export do not have a fsid attribute set.")
-
- # cluster.conf file version and the in-memory cluster configuration version matches
- status, cluster_version, time = self.callExtProg("cman_tool status | grep 'Config version'")
- if not status:
- cluster_version = cluster_version[16:]
- else:
- cluster_version = None
- conf_version = xpathContext.xpathEval("/cluster/@config_version")[0].content
-
- if status == 0 and conf_version != cluster_version:
- self.addDiagnose("cluster.conf and in-memory configuration version differ (%s != %s)" % (conf_version, cluster_version) )
-
- status, output, time = self.callExtProg("/usr/sbin/rg_test test /etc/cluster/cluster.conf")
- if output.find("Error: ") > 0:
- self.addDiagnose("configuration errors are present according to rg_test")
-
- # make sure the first part of the lock table matches the cluster name
- # and that the locking protocol is sane
- cluster_name = xpathContext.xpathEval("/cluster/@name")[0].content
-
- for fs in self.fileGrep(r'^[^#][/\w]*\W*[/\w]*\W*gfs', "/etc/fstab"):
- # for each gfs entry
- fs = fs.split()
- lockproto = self.get_gfs_sb_field(fs[0], "sb_lockproto")
- if lockproto and lockproto != self.get_locking_proto():
- self.addDiagnose("gfs mountpoint (%s) is using the wrong locking protocol (%s)" % (fs[0], lockproto) )
-
- locktable = self.get_gfs_sb_field(fs[0], "sb_locktable")
- try: locktable = locktable.split(":")[0]
- except: continue
- if locktable != cluster_name:
- self.addDiagnose("gfs mountpoint (%s) is using the wrong locking table" % fs[0])
-
- # Check for existence of weak-updates in gfs2 prior to 2.6.18-128
- if rhelver == 5:
- ret, vermagic, time = self.callExtProg("modinfo -F vermagic gfs2")
- # just kernel release from vermagic line
- vermagic = vermagic.split()[0].lstrip('2.6.18-')
- vermagic = vermagic[:vermagic.find('.')]
- if int(vermagic) < 128:
- self.addDiagnose('GFS2 is being used via weak-updates, kmod-gfs2 should be uninstalled and system reboot' \
- 'to allow for kernel provided gfs2 module to be used.')
-
- def setup(self):
- self.collectExtOutput("/sbin/fdisk -l")
self.addCopySpec("/etc/cluster.conf")
self.addCopySpec("/etc/cluster.xml")
self.addCopySpec("/etc/cluster")
@@ -231,79 +50,82 @@ class cluster(sos.plugintools.PluginBase
self.addCopySpec("/etc/sysconfig/cman")
self.addCopySpec("/var/lib/ricci")
self.addCopySpec("/var/lib/luci")
+ self.addCopySpec("/var/log/cluster")
+ self.addCopySpec("/var/log/luci/luci.log")
+
+ if self.getOption('gfslockdump'):
+ self.do_gfslockdump()
+
+ if self.getOption('lockdump'):
+ self.do_lockdump()
+
self.collectExtOutput("/usr/sbin/rg_test test /etc/cluster/cluster.conf")
- self.collectExtOutput("cman_tool status")
- self.collectExtOutput("cman_tool -a nodes")
- self.collectOutputNow("group_tool ls -g1")
- self.collectOutputNow("group_tool dump")
- self.collectExtOutput("ccs_tool lsnode")
self.collectExtOutput("fence_tool ls -n")
- self.collectExtOutput("dlm_tool ls -n")
self.collectExtOutput("gfs_control ls -n")
- self.collectExtOutput("fence_tool dump", root_symlink="fenced.txt")
- self.collectExtOutput("dlm_tool dump", root_symlink="dlm_controld.txt")
- self.collectExtOutput("gfs_control dump", root_symlink="gfs_controld.txt")
- self.collectExtOutput("dlm_tool log_plock", root_symlink="log_plock.txt")
- self.addCopySpec("/var/log/cluster")
- self.addCopySpec("/var/log/luci/luci.log")
-
- self.collectExtOutput("clustat")
+ self.collectExtOutput("dlm_tool log_plock")

+ self.collectExtOutput("/sbin/fdisk -l")
+ self.collectOutputNow("clustat")
+ self.collectOutputNow("group_tool dump")
+ self.collectExtOutput("cman_tool services")
+ self.collectExtOutput("cman_tool nodes")
+ self.collectExtOutput("cman_tool status")
+ self.collectExtOutput("ccs_tool lsnode")
self.collectExtOutput("/sbin/ipvsadm -L")

- if self.getOption('gfslockdump'): self.do_gfslockdump()
- if self.getOption('lockdump'): self.do_lockdump()
-
- return
+ if rhelver is 4:
+ self.addCopySpec("/proc/cluster/*")
+ self.collectExtOutput("cman_tool nodes")
+
+ if rhelver is not 4: # 5+
+ self.collectExtOutput("cman_tool -a nodes")
+
+ if rhelver is 5:
+ self.collectExtOutput("group_tool -v")
+ self.collectExtOutput("group_tool dump fence")
+ self.collectExtOutput("group_tool dump gfs")
+
+ if rhelver not in (4,5): # 6+
+ self.collectExtOutput("corosync-quorumtool -l")
+ self.collectExtOutput("corosync-quorumtool -s")
+ self.collectExtOutput("corosync-cpgtool")
+ self.collectExtOutput("corosync-objctl")
+ self.collectExtOutput("group_tool ls -g1")
+ self.collectExtOutput("gfs_control ls -n")
+ self.collectExtOutput("gfs_control dump")
+ self.collectExtOutput("fence_tool dump")
+ self.collectExtOutput("dlm_tool dump")
+ self.collectExtOutput("dlm_tool ls -n")

def do_lockdump(self):
- status, output, time= self.callExtProg("cman_tool services")
- if status:
- # command somehow failed
- return False
-
rhelver = self.policy().rhelVersion()

- if rhelver == "4":
- regex = r'^DLM Lock Space:\s*"([^"]*)".*$'
- elif rhelver == "5Server" or rhelver == "5Client":
- regex = r'^dlm\s+[^\s]+\s+([^\s]+)\s.*$'
-
- reg=re.compile(regex,re.MULTILINE)
- for lockspace in reg.findall(output):
- ret, out, time = self.callExtProg("echo %s > /proc/cluster/dlm_locks" % lockspace)
- self.collectOutputNow("cat /proc/cluster/dlm_locks", root_symlink = "dlm_locks_%s" % lockspace)
-
- def get_locking_proto(self):
- # FIXME: what's the best way to find out ?
- return "lock_dlm"
- return "lock_gulm"
+ if rhelver is 4:
+ status, output, time = self.callExtProg("cman_tool services")
+ for lockspace in re.compile(r'^DLM Lock Space:\s*"([^"]*)".*$', re.MULTILINE).findall(output):
+ self.callExtProg("echo %s > /proc/cluster/dlm_locks" % lockspace)
+ self.collectOutputNow("cat /proc/cluster/dlm_locks",
+ suggest_filename = "dlm_locks_%s" % lockspace)
+
+ if rhelver is 5:
+ status, output, time = self.callExtProg("group_tool")
+ for lockspace in re.compile(r'^dlm\s+[^\s]+\s+([^\s]+)$', re.MULTILINE).findall(output):
+ self.collectExtOutput("dlm_tool lockdebug '%s'" % lockspace,
+ suggest_filename = "dlm_locks_%s" % lockspace)
+
+ else: # RHEL6 or recent Fedora
+ status, output, time = self.callExtProg("dlm_tool ls")
+ for lockspace in re.compile(r'^name\s+([^\s]+)$', re.MULTILINE).findall(output):
+ self.collectExtOutput("dlm_tool lockdebug -svw '%s'" % lockspace,
+ suggest_filename = "dlm_locks_%s" % lockspace)

def do_gfslockdump(self):
for mntpoint in self.doRegexFindAll(r'^\S+\s+([^\s]+)\s+gfs\s+.*$', "/proc/mounts"):
- self.collectExtOutput("/sbin/gfs_tool lockdump %s" % mntpoint, root_symlink = "gfs_lockdump_" + self.mangleCommand(mntpoint) )
-
- def do_rgmanager_bt(self):
- # FIXME: threads backtrace via SIGALRM
- return
+ self.collectExtOutput("/sbin/gfs_tool lockdump %s" % mntpoint,
+ suggest_filename = "gfs_lockdump_" + self.mangleCommand(mntpoint))

def postproc(self):
- for cluster_conf in glob.glob("/etc/cluster/cluster.conf*"):
+ for cluster_conf in glob("/etc/cluster/cluster.conf*"):
self.doRegexSub(cluster_conf, r"(\s*\<fencedevice\s*.*\s*passwd\s*=\s*)\S+(\")", r"\1%s" %('"***"'))
return

- def is_cluster_quorate(self):
- ret, output, time = self.callExtProg("cman_tool status | grep '^Membership state: '")
- try:
- if output[18:] == "Cluster-Member":
- return True
- except:
- pass
- return False
-
- def get_gfs_sb_field(self, device, field):
- for ret, line, time in self.callExtProg("/sbin/gfs_tool sb %s all" % device).split("\n"):
- if re.match('^\W*%s = ' % field, line):
- return line.split("=")[1].strip()
- return False
-
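
Note (not part of the patch): the reworked do_lockdump() above enumerates DLM lockspaces per RHEL release and dumps each one with a per-lockspace filename. Below is a minimal standalone sketch of the RHEL6-style path only, using subprocess and plain files in place of sos's callExtProg()/collectExtOutput() helpers; the dlm_tool commands and the regex are taken from the diff, everything else (function names, output filenames) is illustrative and requires a cluster node with dlm_tool available.

import re
import subprocess

def list_dlm_lockspaces():
    # "dlm_tool ls" prints a "name <lockspace>" line for each lockspace
    proc = subprocess.run(["dlm_tool", "ls"], capture_output=True, text=True)
    if proc.returncode != 0:
        return []
    return re.findall(r'^name\s+(\S+)$', proc.stdout, re.MULTILINE)

def dump_lockspaces():
    # write one dlm_locks_<name> file per lockspace, mirroring suggest_filename above
    for lockspace in list_dlm_lockspaces():
        out = subprocess.run(["dlm_tool", "lockdebug", "-svw", lockspace],
                             capture_output=True, text=True).stdout
        with open("dlm_locks_%s" % lockspace, "w") as fd:
            fd.write(out)

if __name__ == "__main__":
    dump_lockspaces()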