resource-agents/SOURCES/bz1886262-podman-recover-from-killed-conmon.patch

From 3aa0dda4e0c2a3b801d65aeacc4fdfd713a604f2 Mon Sep 17 00:00:00 2001
From: Damien Ciabrini <damien.ciabrini@gmail.com>
Date: Tue, 27 Oct 2020 18:01:36 +0100
Subject: [PATCH] podman: recover from killed conmon side process

When podman containers are created by the resource-agent, the podman
runtime spawns a side process (conmon) to monitor the container and
record the exit status.

If the conmon process dies unexpectedly (e.g. kill -9), the podman
container can still be stopped, even if the podman CLI returns a
generic error.

Try to distinguish this specific failure condition and make the stop
operation resilient; when it happens, just log a warning and finish
the usual stop actions.
---
 heartbeat/podman | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/heartbeat/podman b/heartbeat/podman
index 81b00ee6f..9f8c2a091 100755
--- a/heartbeat/podman
+++ b/heartbeat/podman
@@ -419,6 +419,7 @@ podman_start()
 podman_stop()
 {
 	local timeout=60
+	local rc
 	podman_simple_status
 	if [ $? -eq  $OCF_NOT_RUNNING ]; then
 		remove_container
@@ -434,16 +435,27 @@ podman_stop()
 
 	if ocf_is_true "$OCF_RESKEY_force_kill"; then
 		ocf_run podman kill $CONTAINER
+		rc=$?
 	else
 		ocf_log debug "waiting $timeout second[s] before killing container"
 		ocf_run podman stop -t=$timeout $CONTAINER
+		rc=$?
 		# on stop, systemd will automatically delete any transient
 		# drop-in conf that has been created earlier
 	fi
 
-	if [ $? -ne 0 ]; then
-		ocf_exit_reason "Failed to stop container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}."
-		return $OCF_ERR_GENERIC
+	if [ $rc -ne 0 ]; then
+		# If the stop failed, it could be because the controlling conmon
+		# process died unexpectedly. If so, a generic error code is returned
+		# but the associated container exit code is -1. If that's the case,
+		# assume there's no failure and continue with the rm as usual.
+		if [ $rc -eq 125 ] && \
+		   podman inspect --format '{{.State.Status}}:{{.State.ExitCode}}' $CONTAINER | grep -wq "stopped:-1"; then
+			ocf_log warn "Container ${CONTAINER} had an unexpected stop outcome. Trying to remove it anyway."
+		else
+			ocf_exit_reason "Failed to stop container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}."
+			return $OCF_ERR_GENERIC
+		fi
 	fi
 
 	remove_container
import resource-agents-4.1.1-94.el8 2021-05-14 04:17:31 +00:00

			`From 3aa0dda4e0c2a3b801d65aeacc4fdfd713a604f2 Mon Sep 17 00:00:00 2001`
			`From: Damien Ciabrini <damien.ciabrini@gmail.com>`
			`Date: Tue, 27 Oct 2020 18:01:36 +0100`
			`Subject: [PATCH] podman: recover from killed conmon side process`

			`When podman containers are created by the resource-agent, the podman`
			`runtime spawns a side process (conmon) to monitor the container and`
			`record the exit status.`

			`If the conmon process dies unexpectedly (e.g. kill -9), the podman`
			`container can still be stopped, even if the cli returns a generic`
			`error.`

			`Try to distinguish this specific failure condition and make the stop`
			`operation resilient; when it happens, just log a warning and finish`
			`the usual stop actions.`
			`---`
			`heartbeat/podman | 18 +++++++++++++++---`
			`1 file changed, 15 insertions(+), 3 deletions(-)`

			`diff --git a/heartbeat/podman b/heartbeat/podman`
			`index 81b00ee6f..9f8c2a091 100755`
			`--- a/heartbeat/podman`
			`+++ b/heartbeat/podman`
			`@@ -419,6 +419,7 @@ podman_start()`
			`podman_stop()`
			`{`
			`local timeout=60`
			`+ local rc`
			`podman_simple_status`
			`if [ $? -eq $OCF_NOT_RUNNING ]; then`
			`remove_container`
			`@@ -434,16 +435,27 @@ podman_stop()`

			`if ocf_is_true "$OCF_RESKEY_force_kill"; then`
			`ocf_run podman kill $CONTAINER`
			`+ rc=$?`
			`else`
			`ocf_log debug "waiting $timeout second[s] before killing container"`
			`ocf_run podman stop -t=$timeout $CONTAINER`
			`+ rc=$?`
			`# on stop, systemd will automatically delete any transient`
			`# drop-in conf that has been created earlier`
			`fi`

			`- if [ $? -ne 0 ]; then`
			`- ocf_exit_reason "Failed to stop container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}."`
			`- return $OCF_ERR_GENERIC`
			`+ if [ $rc -ne 0 ]; then`
			`+ # If the stop failed, it could be because the controlling conmon`
			`+ # process died unexpectedly. If so, a generic error code is returned`
			`+ # but the associated container exit code is -1. If that's the case,`
			`+ # assume there's no failure and continue with the rm as usual.`
			`+ if [ $rc -eq 125 ] && \`
			`+ podman inspect --format '{{.State.Status}}:{{.State.ExitCode}}' $CONTAINER | grep -wq "stopped:-1"; then`
			`+ ocf_log warn "Container ${CONTAINER} had an unexpected stop outcome. Trying to remove it anyway."`
			`+ else`
			`+ ocf_exit_reason "Failed to stop container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}."`
			`+ return $OCF_ERR_GENERIC`
			`+ fi`
			`fi`

			`remove_container`