From 3aa0dda4e0c2a3b801d65aeacc4fdfd713a604f2 Mon Sep 17 00:00:00 2001 From: Damien Ciabrini Date: Tue, 27 Oct 2020 18:01:36 +0100 Subject: [PATCH] podman: recover from killed conmon side process When podman containers are created by the resource-agent, the podman runtime spawns a side process (conmon) to monitor the container and record the exit status. If the conmon process dies unexpectedly (e.g. kill -9), the podman container can still be stopped, even if the cli returns a generic error. Try to distinguish this specific failure condition and make the stop operation resilient; when it happens, just log a warning and finish the usual stop actions. --- heartbeat/podman | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/heartbeat/podman b/heartbeat/podman index 81b00ee6f..9f8c2a091 100755 --- a/heartbeat/podman +++ b/heartbeat/podman @@ -419,6 +419,7 @@ podman_start() podman_stop() { local timeout=60 + local rc podman_simple_status if [ $? -eq $OCF_NOT_RUNNING ]; then remove_container @@ -434,16 +435,27 @@ podman_stop() if ocf_is_true "$OCF_RESKEY_force_kill"; then ocf_run podman kill $CONTAINER + rc=$? else ocf_log debug "waiting $timeout second[s] before killing container" ocf_run podman stop -t=$timeout $CONTAINER + rc=$? # on stop, systemd will automatically delete any transient # drop-in conf that has been created earlier fi - if [ $? -ne 0 ]; then - ocf_exit_reason "Failed to stop container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}." - return $OCF_ERR_GENERIC + if [ $rc -ne 0 ]; then + # If the stop failed, it could be because the controlling conmon + # process died unexpectedly. If so, a generic error code is returned + # but the associated container exit code is -1. If that's the case, + # assume there's no failure and continue with the rm as usual. + if [ $rc -eq 125 ] && \ + podman inspect --format '{{.State.Status}}:{{.State.ExitCode}}' $CONTAINER | grep -wq "stopped:-1"; then + ocf_log warn "Container ${CONTAINER} had an unexpected stop outcome. Trying to remove it anyway." + else + ocf_exit_reason "Failed to stop container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}." + return $OCF_ERR_GENERIC + fi fi remove_container