237 lines
8.6 KiB
Diff
237 lines
8.6 KiB
Diff
From e66ab728426e147bf4fc594109137ebfb1f2dda6 Mon Sep 17 00:00:00 2001
|
|
From: Vinayak hariharmath <65405035+VHariharmath-rh@users.noreply.github.com>
|
|
Date: Mon, 23 Nov 2020 08:09:44 +0530
|
|
Subject: [PATCH 566/584] enhancement/debug: Option to generate core dump
|
|
without killing the process
|
|
|
|
Comments and idea proposed by: Xavi Hernandez (jahernan@redhat.com):
|
|
|
|
On production systems sometimes we see a log message saying that an assertion
|
|
has failed. But it's hard to track why it failed without additional information
|
|
(on debug builds, a GF_ASSERT() generates a core dump and kills the process,
|
|
so it can be used to debug the issue, but many times we are only able to
|
|
reproduce assertion failures on production systems, where GF_ASSERT() only logs
|
|
a message and continues).
|
|
|
|
In other cases we may have a core dump caused by a bug, but the core dump doesn't
|
|
necessarily happen when the bug has happened. Sometimes the crash happens so much
|
|
later that the causes that triggered the bug are lost. In these cases we can add
|
|
more assertions to the places that touch the potential candidates to cause the bug,
|
|
but the only thing we'll get is a log message, which may not be enough.
|
|
|
|
One solution would be to always generate a core dump in case of assertion failure,
|
|
but this was already discussed and it was decided that it was too drastic. If a
|
|
core dump was really needed, a new macro was created to do so: GF_ABORT(),
|
|
but GF_ASSERT() would continue to not kill the process on production systems.
|
|
|
|
I'm proposing to modify GF_ASSERT() on production builds so that it conditionally
|
|
triggers a signal when a debugger is attached. When this happens, the debugger
|
|
will generate a core dump and continue the process as if nothing had happened.
|
|
If there's no debugger attached, GF_ASSERT() will behave as always.
|
|
|
|
The idea I have is to use SIGCONT to do that. This signal is harmless, so we can
|
|
unmask it (we currently mask all unneeded signals) and raise it inside a GF_ASSERT()
|
|
when some global variable is set to true.
|
|
|
|
To produce the core dump, run the script extras/debug/gfcore.py in another
|
|
terminal. gdb breaks and produces a core dump when GF_ASSERT() is hit.
|
|
|
|
The script is copied from #1810, which was written by Xavi Hernandez (jahernan@redhat.com).
|
|
|
|
Backport of:
|
|
> Upstream-patch: https://github.com/gluster/glusterfs/pull/1814
|
|
> Fixes: #1810
|
|
> Change-Id: I6566ca2cae15501d8835c36f56be4c6950cb2a53
|
|
> Signed-off-by: Vinayakswami Hariharmath <vharihar@redhat.com>
|
|
|
|
BUG: 1927640
|
|
Change-Id: I6566ca2cae15501d8835c36f56be4c6950cb2a53
|
|
Signed-off-by: Vinayakswami Hariharmath <vharihar@redhat.com>
|
|
Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244960
|
|
Tested-by: RHGS Build Bot <nigelb@redhat.com>
|
|
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
|
|
---
|
|
extras/debug/gfcore.py | 77 +++++++++++++++++++++++++++++++
|
|
libglusterfs/src/common-utils.c | 11 +++++
|
|
libglusterfs/src/glusterfs/common-utils.h | 10 +++-
|
|
libglusterfs/src/libglusterfs.sym | 16 +++++++
|
|
4 files changed, 112 insertions(+), 2 deletions(-)
|
|
create mode 100755 extras/debug/gfcore.py
|
|
|
|
diff --git a/extras/debug/gfcore.py b/extras/debug/gfcore.py
|
|
new file mode 100755
|
|
index 0000000..9f097f0
|
|
--- /dev/null
|
|
+++ b/extras/debug/gfcore.py
|
|
@@ -0,0 +1,77 @@
|
|
+#!/usr/bin/env python3
|
|
+
|
|
+def launch():
|
|
+ if len(sys.argv) < 3:
|
|
+ sys.stderr.write("Syntax: {} <pid> <count> [<dir>]\n".format(os.path.basename(sys.argv[0])))
|
|
+ sys.exit(1)
|
|
+
|
|
+ pid = int(sys.argv[1])
|
|
+ count = int(sys.argv[2])
|
|
+ base = os.getcwd()
|
|
+ if len(sys.argv) > 3:
|
|
+ base = sys.argv[3]
|
|
+ base = os.path.realpath(base)
|
|
+
|
|
+ subprocess.run([
|
|
+ "gdb", "-batch",
|
|
+ "-p", str(pid),
|
|
+ "-ex", "py arg_count = {}".format(count),
|
|
+ "-ex", "py arg_dir = '{}'".format(base),
|
|
+ "-x", __file__
|
|
+ ])
|
|
+
|
|
+class GFCore(object):
|
|
+ def __init__(self, count, base):
|
|
+ self.count = count
|
|
+ self.base = base
|
|
+ gdb.execute('set pagination off')
|
|
+ gdb.execute('set gf_signal_on_assert = 1')
|
|
+ gdb.events.stop.connect(self.gf_stop)
|
|
+
|
|
+ self.cont()
|
|
+
|
|
+ def cont(self, quit = False):
|
|
+ if not(quit) and (self.count > 0):
|
|
+ gdb.execute('continue')
|
|
+ else:
|
|
+ gdb.execute('set gf_signal_on_assert = 0')
|
|
+ gdb.execute('quit')
|
|
+
|
|
+ def gf_stop(self, event):
|
|
+ quit = False
|
|
+
|
|
+ if isinstance(event, gdb.SignalEvent):
|
|
+ if event.stop_signal == 'SIGCONT':
|
|
+ now = datetime.utcnow().isoformat()
|
|
+ pid = gdb.selected_inferior().pid
|
|
+ name = "{}/gfcore.{}.{}".format(self.base, pid, now)
|
|
+ print("Generating coredump '{}'".format(name))
|
|
+ gdb.execute('gcore {}'.format(name))
|
|
+ self.count -= 1
|
|
+
|
|
+ elif event.stop_signal == 'SIGINT':
|
|
+ print("SIGINT received. Exiting")
|
|
+ quit = True
|
|
+
|
|
+ else:
|
|
+ print("Ignoring signal {}".format(event.stop_signal))
|
|
+ else:
|
|
+ print("Unexpected event {}".format(type(event)))
|
|
+
|
|
+ self.cont(quit)
|
|
+
|
|
+# Module 'gdb' is not available when running outside gdb.
|
|
+try:
|
|
+ import gdb
|
|
+ from datetime import datetime
|
|
+
|
|
+ GFCore(arg_count, arg_dir)
|
|
+except ModuleNotFoundError:
|
|
+ import sys
|
|
+ import os
|
|
+ import subprocess
|
|
+
|
|
+ try:
|
|
+ launch()
|
|
+ except KeyboardInterrupt:
|
|
+ pass
|
|
diff --git a/libglusterfs/src/common-utils.c b/libglusterfs/src/common-utils.c
|
|
index 70d5d21..d351b93 100644
|
|
--- a/libglusterfs/src/common-utils.c
|
|
+++ b/libglusterfs/src/common-utils.c
|
|
@@ -77,9 +77,19 @@ char *vol_type_str[] = {
|
|
"Distributed-Disperse",
|
|
};
|
|
|
|
+gf_boolean_t gf_signal_on_assert = false;
|
|
+
|
|
typedef int32_t (*rw_op_t)(int32_t fd, char *buf, int32_t size);
|
|
typedef int32_t (*rwv_op_t)(int32_t fd, const struct iovec *buf, int32_t size);
|
|
|
|
+void gf_assert(void)
|
|
+{
|
|
+ if (gf_signal_on_assert) {
|
|
+ raise(SIGCONT);
|
|
+ }
|
|
+
|
|
+}
|
|
+
|
|
void
|
|
gf_xxh64_wrapper(const unsigned char *data, size_t const len,
|
|
unsigned long long const seed, char *xxh64)
|
|
@@ -4021,6 +4031,7 @@ gf_thread_vcreate(pthread_t *thread, const pthread_attr_t *attr,
|
|
sigdelset(&set, SIGSYS);
|
|
sigdelset(&set, SIGFPE);
|
|
sigdelset(&set, SIGABRT);
|
|
+ sigdelset(&set, SIGCONT);
|
|
|
|
pthread_sigmask(SIG_BLOCK, &set, &old);
|
|
|
|
diff --git a/libglusterfs/src/glusterfs/common-utils.h b/libglusterfs/src/glusterfs/common-utils.h
|
|
index f0a0a41..604afd0 100644
|
|
--- a/libglusterfs/src/glusterfs/common-utils.h
|
|
+++ b/libglusterfs/src/glusterfs/common-utils.h
|
|
@@ -25,6 +25,7 @@
|
|
#include <limits.h>
|
|
#include <fnmatch.h>
|
|
#include <uuid/uuid.h>
|
|
+#include <urcu/compiler.h>
|
|
|
|
#ifndef ffsll
|
|
#define ffsll(x) __builtin_ffsll(x)
|
|
@@ -431,14 +432,19 @@ BIT_VALUE(unsigned char *array, unsigned int index)
|
|
#define GF_FILE_CONTENT_REQUESTED(_xattr_req, _content_limit) \
|
|
(dict_get_uint64(_xattr_req, "glusterfs.content", _content_limit) == 0)
|
|
|
|
+void gf_assert(void);
|
|
+
|
|
#ifdef DEBUG
|
|
#define GF_ASSERT(x) assert(x);
|
|
#else
|
|
#define GF_ASSERT(x) \
|
|
do { \
|
|
- if (!(x)) { \
|
|
+ if (caa_unlikely(!(x))) { \
|
|
+ gf_assert(); \
|
|
gf_msg_callingfn("", GF_LOG_ERROR, 0, LG_MSG_ASSERTION_FAILED, \
|
|
- "Assertion failed: " #x); \
|
|
+ "Assertion failed: To attach gdb and coredump," \
|
|
+ " Run the script under " \
|
|
+ "\"glusterfs/extras/debug/gfcore.py\""); \
|
|
} \
|
|
} while (0)
|
|
#endif
|
|
diff --git a/libglusterfs/src/libglusterfs.sym b/libglusterfs/src/libglusterfs.sym
|
|
index 0a0862e..9072afa 100644
|
|
--- a/libglusterfs/src/libglusterfs.sym
|
|
+++ b/libglusterfs/src/libglusterfs.sym
|
|
@@ -1167,3 +1167,19 @@ gf_changelog_register_generic
|
|
gf_gfid_generate_from_xxh64
|
|
find_xlator_option_in_cmd_args_t
|
|
gf_d_type_from_ia_type
|
|
+glusterfs_graph_fini
|
|
+glusterfs_process_svc_attach_volfp
|
|
+glusterfs_mux_volfile_reconfigure
|
|
+glusterfs_process_svc_detach
|
|
+mgmt_is_multiplexed_daemon
|
|
+xlator_is_cleanup_starting
|
|
+gf_nanosleep
|
|
+gf_syncfs
|
|
+graph_total_client_xlator
|
|
+get_xattrs_to_heal
|
|
+gf_latency_statedump_and_reset
|
|
+gf_latency_new
|
|
+gf_latency_reset
|
|
+gf_latency_update
|
|
+gf_frame_latency_update
|
|
+gf_assert
|
|
--
|
|
1.8.3.1
|
|
|