2014-05-08 11:37:14 +00:00
|
|
|
# These variables and functions are useful in 2nd kernel.
# This file is meant to be sourced; it defines shared settings and helpers.

. /lib/kdump-lib.sh
. /lib/kdump-logger.sh

# Directory below the dump target's mount point that receives the dump;
# overridden by the "path" directive in get_kdump_confs.
KDUMP_PATH="/var/crash"
# File that save_log writes the 2nd kernel's dmesg/journal into.
KDUMP_LOG_FILE="/run/initramfs/kexec-dmesg.log"
# Core collector command line; filled in by get_kdump_confs, which falls back
# to DEFAULT_CORE_COLLECTOR when the configuration does not provide one.
CORE_COLLECTOR=""
DEFAULT_CORE_COLLECTOR="makedumpfile -l --message-level 7 -d 31"
# Program dump_fs hands to save_vmcore_dmesg_fs to extract the kernel log.
DMESG_COLLECTOR="/sbin/vmcore-dmesg"
# Command string evaluated by do_failure_action.
FAILURE_ACTION="systemctl reboot -f"
# DATEDIR and HOST_IP together name the per-dump directory (see dump_fs).
DATEDIR=$(date +%Y-%m-%d-%T)
HOST_IP='127.0.0.1'
DUMP_INSTRUCTION=""
# Replaced by the "sshkey" directive when that file exists.
SSH_KEY_LOCATION="/root/.ssh/kdump_id_rsa"
KDUMP_SCRIPT_DIR="/kdumpscripts"
DD_BLKSIZE=512
# Command string evaluated by do_final_action.
FINAL_ACTION="systemctl reboot -f"
# Values of the "kdump_pre" / "kdump_post" directives.
KDUMP_PRE=""
KDUMP_POST=""
# Mount point dump_to_rootfs passes to dump_fs.
NEWROOT="/sysroot"
# Preferred opalcore location; save_opalcore_fs falls back to an older path.
OPALCORE="/sys/firmware/opal/mpipl/core"

#initiate the kdump logger
if ! dlog_init; then
    echo "failed to initiate the kdump logger."
    exit 1
fi
|
|
|
|
|
2014-05-08 11:37:14 +00:00
|
|
|
# Load the kdump configuration into the global settings of this file.
#
# Reads "<option> <value>" lines from the output of kdump_read_conf and sets
# KDUMP_PATH, CORE_COLLECTOR, SSH_KEY_LOCATION, KDUMP_PRE, KDUMP_POST,
# FENCE_KDUMP_ARGS, FENCE_KDUMP_NODES, FAILURE_ACTION and FINAL_ACTION.
# Unknown options and unknown action values are ignored, leaving the
# previous value in place.
get_kdump_confs()
{
    local config_opt config_val

    # -r keeps backslashes in configured values intact.
    while read -r config_opt config_val; do
        case "$config_opt" in
        path)
            KDUMP_PATH="$config_val"
            ;;
        core_collector)
            [ -n "$config_val" ] && CORE_COLLECTOR="$config_val"
            ;;
        sshkey)
            # Only accept a key file that actually exists.
            if [ -f "$config_val" ]; then
                SSH_KEY_LOCATION=$config_val
            fi
            ;;
        kdump_pre)
            KDUMP_PRE="$config_val"
            ;;
        kdump_post)
            KDUMP_POST="$config_val"
            ;;
        fence_kdump_args)
            FENCE_KDUMP_ARGS="$config_val"
            ;;
        fence_kdump_nodes)
            FENCE_KDUMP_NODES="$config_val"
            ;;
        failure_action | default)
            case "$config_val" in
            shell)
                FAILURE_ACTION="kdump_emergency_shell"
                ;;
            reboot)
                FAILURE_ACTION="systemctl reboot -f && exit"
                ;;
            halt)
                FAILURE_ACTION="halt && exit"
                ;;
            poweroff)
                FAILURE_ACTION="systemctl poweroff -f && exit"
                ;;
            dump_to_rootfs)
                FAILURE_ACTION="dump_to_rootfs"
                ;;
            esac
            ;;
        final_action)
            case "$config_val" in
            reboot)
                FINAL_ACTION="systemctl reboot -f"
                ;;
            halt)
                FINAL_ACTION="halt"
                ;;
            poweroff)
                FINAL_ACTION="systemctl poweroff -f"
                ;;
            esac
            ;;
        esac
    done <<< "$(kdump_read_conf)"

    # No collector configured: use the default, adding -F for ssh and raw
    # dump targets.
    if [ -z "$CORE_COLLECTOR" ]; then
        CORE_COLLECTOR="$DEFAULT_CORE_COLLECTOR"
        if is_ssh_dump_target || is_raw_dump_target; then
            CORE_COLLECTOR="$CORE_COLLECTOR -F"
        fi
    fi
}
|
|
|
|
|
2020-10-27 09:04:25 +00:00
|
|
|
# store the kexec kernel log to a file.
#
# Writes `dmesg -T` to $KDUMP_LOG_FILE, appends `journalctl -ab` when
# journalctl is available, and restricts the file to mode 600.
save_log()
{
    # The path is quoted so a log location containing whitespace still works.
    dmesg -T > "$KDUMP_LOG_FILE"

    if command -v journalctl > /dev/null; then
        journalctl -ab >> "$KDUMP_LOG_FILE"
    fi
    chmod 600 "$KDUMP_LOG_FILE"
}
|
|
|
|
|
2020-03-12 12:57:08 +00:00
|
|
|
# dump_fs <mount point>
#
# Save the crash data below <mount point>/$KDUMP_PATH/$HOST_IP-$DATEDIR/:
# vmcore-dmesg (save_vmcore_dmesg_fs), opalcore (save_opalcore_fs), the vmcore
# produced by $CORE_COLLECTOR, and the 2nd kernel log (save_log).
#
# Side effect: strips "-F" from the global CORE_COLLECTOR when it is a
# makedumpfile command line.
#
# Returns 1 if the target cannot be mounted or remounted read-write, the dump
# directory cannot be created, or the core collector fails; 0 otherwise.
dump_fs()
{
    local _exitcode
    local _mp=$1
    local _op _dump_path

    # Declared separately from the assignment so `local` does not mask the
    # command; empty options simply mean no remount below.
    _op=$(get_mount_info OPTIONS target "$_mp" -f)
    ddebug "dump_fs _mp=$_mp _opts=$_op"

    if ! is_mounted "$_mp"; then
        dinfo "dump path \"$_mp\" is not mounted, trying to mount..."
        if ! mount --target "$_mp"; then
            derror "failed to dump to \"$_mp\", it's not a mount point!"
            return 1
        fi
    fi

    # Remove -F in makedumpfile case. We don't want a flat format dump here.
    if [[ $CORE_COLLECTOR = *makedumpfile* ]]; then
        CORE_COLLECTOR=${CORE_COLLECTOR//-F/}
    fi

    # tr -s collapses the duplicate slashes produced by joining the parts.
    _dump_path=$(echo "$_mp/$KDUMP_PATH/$HOST_IP-$DATEDIR/" | tr -s /)

    dinfo "saving to $_dump_path"

    # Only remount to read-write mode if the dump target is mounted read-only.
    if [[ "$_op" = "ro"* ]]; then
        dinfo "Remounting the dump target in rw mode."
        mount -o remount,rw "$_mp" || return 1
    fi

    mkdir -p "$_dump_path" || return 1

    # Failures of these two helpers do not abort the dump.
    save_vmcore_dmesg_fs "$DMESG_COLLECTOR" "$_dump_path"
    save_opalcore_fs "$_dump_path"

    dinfo "saving vmcore"
    # CORE_COLLECTOR is a command line (program plus options), so it is
    # deliberately left unquoted to be split into words.
    $CORE_COLLECTOR /proc/vmcore "$_dump_path/vmcore-incomplete"
    _exitcode=$?
    if [ $_exitcode -eq 0 ]; then
        # Rename only after a complete dump so a partial file is recognizable.
        mv "$_dump_path/vmcore-incomplete" "$_dump_path/vmcore"
        sync
        dinfo "saving vmcore complete"
    else
        derror "saving vmcore failed, _exitcode:$_exitcode"
    fi

    # The log is saved even when the vmcore dump failed.
    dinfo "saving the $KDUMP_LOG_FILE to $_dump_path/"
    save_log
    mv "$KDUMP_LOG_FILE" "$_dump_path/"
    if [ $_exitcode -ne 0 ]; then
        return 1
    fi

    # improper kernel cmdline can cause the failure of echo, we can ignore this kind of failure
    return 0
}
|
|
|
|
|
|
|
|
# save_vmcore_dmesg_fs <dmesg collector> <directory>
#
# Runs "<dmesg collector> /proc/vmcore" and stores its stdout in
# <directory>/vmcore-dmesg.txt (mode 600). The output is first written to
# vmcore-dmesg-incomplete.txt and renamed only if the collector succeeds; on
# failure the incomplete file, if present, is kept with mode 600.
save_vmcore_dmesg_fs() {
    local _dmesg_collector=$1
    local _path=$2
    # Kept local so the caller's (or a global) _exitcode is not overwritten.
    local _exitcode

    dinfo "saving vmcore-dmesg.txt to ${_path}"
    # The collector is left unquoted so a value carrying extra arguments is
    # still split into words, as before.
    $_dmesg_collector /proc/vmcore > "${_path}/vmcore-dmesg-incomplete.txt"
    _exitcode=$?
    if [ $_exitcode -eq 0 ]; then
        mv "${_path}/vmcore-dmesg-incomplete.txt" "${_path}/vmcore-dmesg.txt"
        chmod 600 "${_path}/vmcore-dmesg.txt"

        # Make sure file is on disk. There have been instances where later
        # saving vmcore failed and system rebooted without sync and there
        # was no vmcore-dmesg.txt available.
        sync
        dinfo "saving vmcore-dmesg.txt complete"
    else
        if [ -f "${_path}/vmcore-dmesg-incomplete.txt" ]; then
            chmod 600 "${_path}/vmcore-dmesg-incomplete.txt"
        fi
        derror "saving vmcore-dmesg.txt failed"
    fi
}
|
|
|
|
|
2020-01-28 19:34:48 +00:00
|
|
|
# save_opalcore_fs <directory>
#
# Copies the opalcore file to <directory>/opalcore. If $OPALCORE is not a
# regular file, the older path /sys/firmware/opal/core is tried (and stored in
# the global OPALCORE); when neither exists there is nothing to do.
#
# Returns 0 on success or when no opalcore exists, 1 if the copy fails.
save_opalcore_fs() {
    local _path=$1

    if [ ! -f "$OPALCORE" ]; then
        # Check if we are on an old kernel that uses a different path
        if [ -f /sys/firmware/opal/core ]; then
            OPALCORE="/sys/firmware/opal/core"
        else
            return 0
        fi
    fi

    dinfo "saving opalcore:$OPALCORE to ${_path}/opalcore"
    if ! cp "$OPALCORE" "${_path}/opalcore"; then
        derror "saving opalcore failed"
        return 1
    fi

    sync
    dinfo "saving opalcore complete"
    return 0
}
|
|
|
|
|
Introduce kdump error handling service
Now upon failure kdump script might not be called at all and it might
not be able to execute default action. It results in a hang.
Because we disable emergency shell and rely on kdump.sh being invoked
through dracut-pre-pivot hook. But it might happen that we never call
into dracut-pre-pivot hook because certain systemd targets could not
reach due to failure in their dependencies. In those cases error
handling code does not run and system hangs. For example:
sysroot-var-crash.mount --> initrd-root-fs.target --> initrd.target \
--> dracut-pre-pivot.service --> kdump.sh
If /sysroot/var/crash mount fails, initrd-root-fs.target will not be
reached. And then initrd.target will not be reached,
dracut-pre-pivot.service wouldn't run. Finally kdump.sh wouldn't run.
To solve this problem, we need to separate the error handling code from
dracut-pre-pivot hook, and every time when a failure shows up, the
separated code can be called by the emergency service.
By default systemd provides an emergency service which will drop us into
shell every time upon a critical failure. It's very convenient for us to
re-use the framework of systemd emergency, because we don't have to
touch the other parts of systemd. We can use our own script instead of
the default one.
This new scheme will overwrite emergency shell and replace with kdump
error handling code. And this code will do the error handling as needed.
Now, we will not rely on dracut-pre-pivot hook running always. Instead
whenever error happens and it is serious enough that emergency shell
needed to run, now kdump error handler will run.
dracut-emergency is also replaced by kdump error handler and it's
enabled again all the way down. So all the failure (including systemd
and dracut) in 2nd kernel could be captured, and trigger kdump error
handler.
dracut-initqueue is a special case, which calls "systemctl start
emergency" directly, not via "OnFailure=emergency". In case of failure,
emergency is started, but not in isolation mode, which means
dracut-initqueue is still running. On the other hand, emergency will
call dracut-initqueue again when the default action is dump_to_rootfs.
systemd would block on the last dracut-initqueue, waiting for the first
instance to exit, which leaves us hanging. It looks like the following:
dracut-initqueue (running)
--> call dracut-emergency:
--> dracut-emergency (running)
--> kdump-error-handler.sh (running)
--> call dracut-initqueue:
--> blocking and waiting for the original instance to exit.
To fix this, I'd like to introduce a wrapper emergency service. This
emergency service will replace both the systemd and dracut emergency
services. This service does nothing but isolate to the real kdump error
handler service:
dracut-initqueue (running)
--> call dracut-emergency:
--> dracut-emergency isolate to kdump-error-handler.service
--> dracut-emergency and dracut-initqueue will both be stopped
and kdump-error-handler.service will run kdump-error-handler.sh.
In a normal failure case, this still works:
foo.service fails
--> trigger emergency.service
--> emergency.service isolates to kdump-error-handler.service
--> kdump-error-handler.service will run kdump-error-handler.sh
Signed-off-by: WANG Chao <chaowang@redhat.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Acked-by: Dave Young <dyoung@redhat.com>
2014-05-08 11:37:15 +00:00
|
|
|
# Failure action: dump to the root filesystem.
#
# Starts dracut-initqueue if systemd reports it inactive, requests
# sysroot.mount without blocking, waits up to 90 seconds for /sysroot to be
# mounted, and then calls dump_fs on $NEWROOT.
#
# Returns 1 if /sysroot is not mounted in time, otherwise dump_fs's status.
dump_to_rootfs()
{
    local _loop

    if [[ $(systemctl status dracut-initqueue | sed -n "s/^\s*Active: \(\S*\)\s.*$/\1/p") == "inactive" ]]; then
        dinfo "Trying to bring up initqueue for rootfs mount"
        systemctl start dracut-initqueue
    fi

    dinfo "Clean up dead systemd services"
    systemctl cancel
    dinfo "Waiting for rootfs mount, will timeout after 90 seconds"
    systemctl start --no-block sysroot.mount

    # Poll once per second, for at most 90 iterations.
    _loop=0
    while [ $_loop -lt 90 ] && ! is_mounted /sysroot; do
        sleep 1
        _loop=$((_loop + 1))
    done

    if ! is_mounted /sysroot; then
        derror "Failed to mount rootfs"
        # Explicit status: a bare return would pass on derror's exit status.
        return 1
    fi

    ddebug "NEWROOT=$NEWROOT"
    dump_fs "$NEWROOT"
}
|
|
|
|
|
|
|
|
# Failure action: drop into an interactive shell in the kdump environment.
#
# Sources the profile and dracut helper/state files, runs the "emergency"
# hooks, prints a banner on every console listed in /proc/consoles, starts an
# interactive login shell, and removes /.console_lock once that shell exits.
kdump_emergency_shell()
{
    local _tty rest

    ddebug "Switching to kdump emergency shell..."

    [ -f /etc/profile ] && . /etc/profile
    # Single quotes on purpose: ${PWD} is expanded each time the prompt is shown.
    export PS1='kdump:${PWD}# '

    . /lib/dracut-lib.sh
    if [ -f /dracut-state.sh ]; then
        . /dracut-state.sh 2>/dev/null
    fi

    source_conf /etc/conf.d

    type plymouth >/dev/null 2>&1 && plymouth quit

    source_hook "emergency"
    # The first field of each /proc/consoles line is used as the device name
    # under /dev; the remaining fields are ignored.
    while read -r _tty rest; do
        (
            echo
            echo
            echo 'Entering kdump emergency mode.'
            echo 'Type "journalctl" to view system logs.'
            echo 'Type "rdsosreport" to generate a sosreport, you can then'
            echo 'save it elsewhere and attach it to a bug report.'
            echo
            echo
        ) > "/dev/$_tty"
    done < /proc/consoles
    sh -i -l
    /bin/rm -f -- /.console_lock
}
|
|
|
|
|
2019-01-17 20:31:23 +00:00
|
|
|
# Run the configured failure action.
#
# $FAILURE_ACTION is a shell command string (it may contain "&&"), so it is
# run through eval. It is quoted so the string reaches eval unmodified rather
# than being word-split and glob-expanded first.
# Returns the status of the evaluated command.
do_failure_action()
{
    dinfo "Executing failure action $FAILURE_ACTION"
    eval "$FAILURE_ACTION"
}
|
|
|
|
|
|
|
|
# Run the configured final action.
#
# $FINAL_ACTION is a shell command string and is run through eval. It is
# quoted so the string reaches eval unmodified rather than being word-split
# and glob-expanded first.
# Returns the status of the evaluated command.
do_final_action()
{
    dinfo "Executing final action $FINAL_ACTION"
    eval "$FINAL_ACTION"
}
|