glusterfs/0566-enahancement-debug-Option-to-generate-core-dump-with.patch
2023-02-27 13:17:02 -05:00

237 lines
8.6 KiB
Diff

From e66ab728426e147bf4fc594109137ebfb1f2dda6 Mon Sep 17 00:00:00 2001
From: Vinayak hariharmath <65405035+VHariharmath-rh@users.noreply.github.com>
Date: Mon, 23 Nov 2020 08:09:44 +0530
Subject: [PATCH 566/584] enhancement/debug: Option to generate core dump
without killing the process
Comments and idea proposed by Xavi Hernandez (jahernan@redhat.com):
On production systems sometimes we see a log message saying that an assertion
has failed. But it's hard to track why it failed without additional information
(on debug builds, a GF_ASSERT() generates a core dump and kills the process,
so it can be used to debug the issue, but many times we are only able to
reproduce assertion failures on production systems, where GF_ASSERT() only logs
a message and continues).
In other cases we may have a core dump caused by a bug, but the crash doesn't
necessarily happen at the moment the bug is triggered. Sometimes it happens so much
later that the causes that triggered the bug are lost. In these cases we can add
more assertions to the places that touch the potential candidates to cause the bug,
but the only thing we'll get is a log message, which may not be enough.
One solution would be to always generate a core dump in case of assertion failure,
but this was already discussed and it was decided that it was too drastic. If a
core dump was really needed, a new macro was created to do so: GF_ABORT(),
but GF_ASSERT() would continue to not kill the process on production systems.
I'm proposing to modify GF_ASSERT() on production builds so that it conditionally
triggers a signal when a debugger is attached. When this happens, the debugger
will generate a core dump and continue the process as if nothing had happened.
If there's no debugger attached, GF_ASSERT() will behave as always.
The idea I have is to use SIGCONT to do that. This signal is harmless, so we can
unmask it (we currently mask all unneeded signals) and raise it inside a GF_ASSERT()
when some global variable is set to true.
To produce the core dump, run the script extras/debug/gfcore.py in another
terminal (syntax: gfcore.py <pid> <count> [<dir>]). gdb stops and generates a
core dump each time GF_ASSERT() is hit.
The script is copied from #1810 and was written by Xavi Hernandez (jahernan@redhat.com).
Backport of:
> Upstream-patch: https://github.com/gluster/glusterfs/pull/1814
> Fixes: #1810
> Change-Id: I6566ca2cae15501d8835c36f56be4c6950cb2a53
> Signed-off-by: Vinayakswami Hariharmath <vharihar@redhat.com>
BUG: 1927640
Change-Id: I6566ca2cae15501d8835c36f56be4c6950cb2a53
Signed-off-by: Vinayakswami Hariharmath <vharihar@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244960
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
---
extras/debug/gfcore.py | 77 +++++++++++++++++++++++++++++++
libglusterfs/src/common-utils.c | 11 +++++
libglusterfs/src/glusterfs/common-utils.h | 10 +++-
libglusterfs/src/libglusterfs.sym | 16 +++++++
4 files changed, 112 insertions(+), 2 deletions(-)
create mode 100755 extras/debug/gfcore.py
diff --git a/extras/debug/gfcore.py b/extras/debug/gfcore.py
new file mode 100755
index 0000000..9f097f0
--- /dev/null
+++ b/extras/debug/gfcore.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+
+def launch():
+ if len(sys.argv) < 3:
+ sys.stderr.write("Syntax: {} <pid> <count> [<dir>]\n".format(os.path.basename(sys.argv[0])))
+ sys.exit(1)
+
+ pid = int(sys.argv[1])
+ count = int(sys.argv[2])
+ base = os.getcwd()
+ if len(sys.argv) > 3:
+ base = sys.argv[3]
+ base = os.path.realpath(base)
+
+ subprocess.run([
+ "gdb", "-batch",
+ "-p", str(pid),
+ "-ex", "py arg_count = {}".format(count),
+ "-ex", "py arg_dir = '{}'".format(base),
+ "-x", __file__
+ ])
+
+class GFCore(object):
+ def __init__(self, count, base):
+ self.count = count
+ self.base = base
+ gdb.execute('set pagination off')
+ gdb.execute('set gf_signal_on_assert = 1')
+ gdb.events.stop.connect(self.gf_stop)
+
+ self.cont()
+
+ def cont(self, quit = False):
+ if not(quit) and (self.count > 0):
+ gdb.execute('continue')
+ else:
+ gdb.execute('set gf_signal_on_assert = 0')
+ gdb.execute('quit')
+
+ def gf_stop(self, event):
+ quit = False
+
+ if isinstance(event, gdb.SignalEvent):
+ if event.stop_signal == 'SIGCONT':
+ now = datetime.utcnow().isoformat()
+ pid = gdb.selected_inferior().pid
+ name = "{}/gfcore.{}.{}".format(self.base, pid, now)
+ print("Generating coredump '{}'".format(name))
+ gdb.execute('gcore {}'.format(name))
+ self.count -= 1
+
+ elif event.stop_signal == 'SIGINT':
+ print("SIGINT received. Exiting")
+ quit = True
+
+ else:
+ print("Ignoring signal {}".format(event.stop_signal))
+ else:
+ print("Unexpected event {}".format(type(event)))
+
+ self.cont(quit)
+
+# Module 'gdb' is not available when running outside gdb.
+try:
+ import gdb
+ from datetime import datetime
+
+ GFCore(arg_count, arg_dir)
+except ModuleNotFoundError:
+ import sys
+ import os
+ import subprocess
+
+ try:
+ launch()
+ except KeyboardInterrupt:
+ pass
diff --git a/libglusterfs/src/common-utils.c b/libglusterfs/src/common-utils.c
index 70d5d21..d351b93 100644
--- a/libglusterfs/src/common-utils.c
+++ b/libglusterfs/src/common-utils.c
@@ -77,9 +77,19 @@ char *vol_type_str[] = {
"Distributed-Disperse",
};
+gf_boolean_t gf_signal_on_assert = false;
+
typedef int32_t (*rw_op_t)(int32_t fd, char *buf, int32_t size);
typedef int32_t (*rwv_op_t)(int32_t fd, const struct iovec *buf, int32_t size);
+void gf_assert(void)
+{
+ if (gf_signal_on_assert) {
+ raise(SIGCONT);
+ }
+
+}
+
void
gf_xxh64_wrapper(const unsigned char *data, size_t const len,
unsigned long long const seed, char *xxh64)
@@ -4021,6 +4031,7 @@ gf_thread_vcreate(pthread_t *thread, const pthread_attr_t *attr,
sigdelset(&set, SIGSYS);
sigdelset(&set, SIGFPE);
sigdelset(&set, SIGABRT);
+ sigdelset(&set, SIGCONT);
pthread_sigmask(SIG_BLOCK, &set, &old);
diff --git a/libglusterfs/src/glusterfs/common-utils.h b/libglusterfs/src/glusterfs/common-utils.h
index f0a0a41..604afd0 100644
--- a/libglusterfs/src/glusterfs/common-utils.h
+++ b/libglusterfs/src/glusterfs/common-utils.h
@@ -25,6 +25,7 @@
#include <limits.h>
#include <fnmatch.h>
#include <uuid/uuid.h>
+#include <urcu/compiler.h>
#ifndef ffsll
#define ffsll(x) __builtin_ffsll(x)
@@ -431,14 +432,19 @@ BIT_VALUE(unsigned char *array, unsigned int index)
#define GF_FILE_CONTENT_REQUESTED(_xattr_req, _content_limit) \
(dict_get_uint64(_xattr_req, "glusterfs.content", _content_limit) == 0)
+void gf_assert(void);
+
#ifdef DEBUG
#define GF_ASSERT(x) assert(x);
#else
#define GF_ASSERT(x) \
do { \
- if (!(x)) { \
+ if (caa_unlikely(!(x))) { \
+ gf_assert(); \
gf_msg_callingfn("", GF_LOG_ERROR, 0, LG_MSG_ASSERTION_FAILED, \
- "Assertion failed: " #x); \
+ "Assertion failed: To attach gdb and coredump," \
+ " Run the script under " \
+ "\"glusterfs/extras/debug/gfcore.py\""); \
} \
} while (0)
#endif
diff --git a/libglusterfs/src/libglusterfs.sym b/libglusterfs/src/libglusterfs.sym
index 0a0862e..9072afa 100644
--- a/libglusterfs/src/libglusterfs.sym
+++ b/libglusterfs/src/libglusterfs.sym
@@ -1167,3 +1167,19 @@ gf_changelog_register_generic
gf_gfid_generate_from_xxh64
find_xlator_option_in_cmd_args_t
gf_d_type_from_ia_type
+glusterfs_graph_fini
+glusterfs_process_svc_attach_volfp
+glusterfs_mux_volfile_reconfigure
+glusterfs_process_svc_detach
+mgmt_is_multiplexed_daemon
+xlator_is_cleanup_starting
+gf_nanosleep
+gf_syncfs
+graph_total_client_xlator
+get_xattrs_to_heal
+gf_latency_statedump_and_reset
+gf_latency_new
+gf_latency_reset
+gf_latency_update
+gf_frame_latency_update
+gf_assert
--
1.8.3.1