- podman-etcd: add support for cert rotation

- podman-etcd: compute dynamic revision bump from maxRaftIndex Resolves: RHEL-124203, RHEL-124206
2025-10-28 13:06:41 +01:00 · 2025-10-28 13:06:41 +01:00 · 8890b6688b
commit 8890b6688b
parent 3e111eae9a
3 changed files with 292 additions and 1 deletions
--- a/RHEL-124203-podman-etcd-certificate-rotation.patch
+++ b/RHEL-124203-podman-etcd-certificate-rotation.patch
@ -0,0 +1,166 @@
 From 6bfbe1dc3a0dad234decd77330ca6189e932bb89 Mon Sep 17 00:00:00 2001
 From: ehila <ehila@redhat.com>
 Date: Thu, 16 Oct 2025 23:39:32 -0400
 Subject: [PATCH] feat: add support for podman-etcd cert rotation
 added a cert check function to the monitor call to force a restart of etcd when the certs have been changed
 Signed-off-by: ehila <ehila@redhat.com>
 ---
 heartbeat/podman-etcd | 87 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 86 insertions(+), 1 deletion(-)
 diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
 index e1425ec02..b8dfb2f9e 100755
 --- a/heartbeat/podman-etcd
 +++ b/heartbeat/podman-etcd
@@ -40,6 +40,7 @@
 # Parameter defaults
 OCF_RESKEY_image_default="default"
 OCF_RESKEY_pod_manifest_default="/etc/kubernetes/static-pod-resources/etcd-certs/configmaps/external-etcd-pod/pod.yaml"
 +OCF_RESKEY_etcd_certs_dir_default="/etc/kubernetes/static-pod-resources/etcd-certs"
 OCF_RESKEY_name_default="etcd"
 OCF_RESKEY_nic_default="br-ex"
 OCF_RESKEY_authfile_default="/var/lib/kubelet/config.json"
@@ -51,6 +52,7 @@ OCF_RESKEY_backup_location_default="/var/lib/etcd"
 : ${OCF_RESKEY_image=${OCF_RESKEY_image_default}}
 : ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}}
 +: ${OCF_RESKEY_etcd_certs_dir=${OCF_RESKEY_etcd_certs_dir_default}}
 : ${OCF_RESKEY_name=${OCF_RESKEY_name_default}}
 : ${OCF_RESKEY_nic=${OCF_RESKEY_nic_default}}
 : ${OCF_RESKEY_authfile=${OCF_RESKEY_authfile_default}}
@@ -88,6 +90,15 @@ The Pod manifest with the configuration for Etcd.
 <content type="string" default="${OCF_RESKEY_pod_manifest_default}"/>
 </parameter>
 +<parameter name="etcd_certs_dir" required="0" unique="0">
 +<longdesc lang="en">
 +The Etcd certificates directory mounted into the etcd container.
 +The agent will monitor this directory for changes and restart the etcd container if the certificates have changed.
 +</longdesc>
 +<shortdesc lang="en">Etcd certificates directory</shortdesc>
 +<content type="string" default="${OCF_RESKEY_etcd_certs_dir_default}"/>
 +</parameter>
 +
 <parameter name="image" required="0" unique="0">
 <longdesc lang="en">
 The podman image to base this container off of.
@@ -289,6 +300,59 @@ Expects to have a fully populated OCF RA-compliant environment set.
 END
 }
 +etcd_certificates_hash_manager()
 +{
 +	local action="$1"
 +	local current_hash
 +	local stored_hash
 +
 +	# If the certs directory doesn't exist, consider it unchanged
 +	if [ ! -d "$OCF_RESKEY_etcd_certs_dir" ]; then
 +		ocf_log warn "certificates directory $OCF_RESKEY_etcd_certs_dir does not exist, skipping certificate monitoring"
 +		return $OCF_SUCCESS
 +	fi
 +
 +	# Calculate hash of all certificate files, ignore key files to avoid accidental disclosure of sensitive information
 +	# we only need to monitor the certificate files to detect changes.
 +	if ! current_hash=$(find "$OCF_RESKEY_etcd_certs_dir" -type f \( -name "*.crt" \) -exec sha256sum {} \; | sort | sha256sum | cut -d' ' -f1); then
 +		ocf_log err "failed to calculate certificate files hash"
 +		return $OCF_ERR_GENERIC
 +	fi
 +
 +	# If no stored hash exists, create one and return success
 +	if [ ! -f "$ETCD_CERTS_HASH_FILE" ]; then
 +		echo "$current_hash" > "$ETCD_CERTS_HASH_FILE"
 +		ocf_log info "created initial certificate hash: $current_hash"
 +		return $OCF_SUCCESS
 +	fi
 +
 +	case "$action" in
 +		"update")
 +			if ! echo "$current_hash" > "$ETCD_CERTS_HASH_FILE"; then
 +				ocf_log err "failed to update certificate hash file $ETCD_CERTS_HASH_FILE"
 +			fi
 +			ocf_log info "updated certificate hash: $current_hash"
 +			;;
 +		"check")
 +			if ! stored_hash=$(cat "$ETCD_CERTS_HASH_FILE"); then
 +				ocf_log err "failed to read stored certificate hash from $ETCD_CERTS_HASH_FILE"
 +				# This should not happen but if for some reason we can not read the stored hash,
 +				# use the current hash and log the error but allow etcd to run as long as possible.
 +				stored_hash="$current_hash"
 +			fi
 +			if [ "$current_hash" != "$stored_hash" ]; then
 +				ocf_exit_reason "$NODENAME etcd certificate files have changed (stored: $stored_hash, current: $current_hash)"
 +				return $OCF_ERR_GENERIC
 +			fi
 +			;;
 +		*)
 +			ocf_log err "unsupported action: $action"
 +			return $OCF_ERR_GENERIC
 +			;;
 +	esac
 +
 +	return $OCF_SUCCESS
 +}
 monitor_cmd_exec()
 {
@@ -357,7 +421,7 @@ archive_current_container()
 	# archive corresponding etcd configuration files
 	local files_to_archive=""
 -	for file in "$OCF_RESKEY_authfile" "$POD_MANIFEST_COPY" "$ETCD_CONFIGURATION_FILE"; do
 +	for file in "$OCF_RESKEY_authfile" "$POD_MANIFEST_COPY" "$ETCD_CONFIGURATION_FILE" "$ETCD_CERTS_HASH_FILE"; do
 		if [ -f "$file" ]; then
 			files_to_archive="$files_to_archive $file"
 		else
@@ -1178,6 +1242,11 @@ podman_monitor()
 		return $rc
 	fi
 +	# Check if certificate files have changed, if they have, etcd needs to be restarted
 +	if ! etcd_certificates_hash_manager "check"; then
 +		return $OCF_ERR_GENERIC
 +	fi
 +
 	if is_learner; then
 		ocf_log info "$NODENAME is learner. Cannot get member id"
 		return "$OCF_SUCCESS"
@@ -1483,6 +1552,14 @@ podman_start()
 		return $OCF_ERR_GENERIC
 	fi
 +	# Update the certificate hash after the container has started successfully
 +	# this is to ensure that the certificate hash is updated after a restart is initiated
 +	# by a cert rotation event from the monitor command.
 +	if ! etcd_certificates_hash_manager "update"; then
 +		ocf_exit_reason "etcd certificate hash manager failed to update the certificate hash"
 +		return $OCF_ERR_GENERIC
 +	fi
 +
 	# check if the container has already started
 	podman_simple_status
 	if [ $? -eq $OCF_SUCCESS ]; then
@@ -1888,6 +1965,13 @@ podman_validate()
 		exit $OCF_ERR_CONFIGURED
 	fi
 +	if ! echo "validation test" > "$ETCD_CERTS_HASH_FILE" \
 +		|| ! cat "$ETCD_CERTS_HASH_FILE" >/dev/null 2>&1 \
 +		|| ! rm "$ETCD_CERTS_HASH_FILE"; then
 +		ocf_exit_reason "cannot read/write to certificate hash file $ETCD_CERTS_HASH_FILE"
 +		exit $OCF_ERR_GENERIC
 +	fi
 +
 	return $OCF_SUCCESS
 }
@@ -1922,6 +2006,7 @@ CONTAINER=$OCF_RESKEY_name
 POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml"
 ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml"
 ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz"
 +ETCD_CERTS_HASH_FILE="${OCF_RESKEY_config_location}/certs.hash"
 # Note: we currently monitor podman containers by with the "podman exec"
 # command, so make sure that invocation is always valid by enforcing the
--- a/RHEL-124206-podman-etcd-compute-dynamic-revision-bump-from-maxRaftIndex.patch
+++ b/RHEL-124206-podman-etcd-compute-dynamic-revision-bump-from-maxRaftIndex.patch
@ -0,0 +1,115 @@
 From 6a5608f02a657cf006b6d44d31200342c4bd19b9 Mon Sep 17 00:00:00 2001
 From: Carlo Lobrano <c.lobrano@gmail.com>
 Date: Tue, 28 Oct 2025 12:47:10 +0100
 Subject: [PATCH]  podman-etcd: compute dynamic revision bump from maxRaftIndex
 (#2087)
 Replace hardcoded 1 billion revision bump with dynamic calculation based
 on 20% of the last known maxRaftIndex from revision.json.
 This aligns with the logic used by cluster-etcd-operator's
 quorum-restore-pod utility and ensures the bump amount is proportional
 to the cluster's actual revision state.
 The implementation:
 - Adds compute_bump_revision() function with safe fallback to 1bn
  default
 - Extracts magic values to named constants
  (ETCD_REVISION_BUMP_PERCENTAGE, ETCD_BUMP_REV_DEFAULT,
  ETCD_REVISION_JSON)
 - Validates computed values (non-zero, not exceeding default)
 - Logs computation results for debugging
 Reference:
 https://github.com/openshift/cluster-etcd-operator/blob/215998939f5223da9166
 22c71fd07d17656faf6b/bindata/etcd/quorum-restore-pod.yaml#L26-L34
 ---
 heartbeat/podman-etcd | 38 ++++++++++++++++++++++++++++++++++----
 1 file changed, 34 insertions(+), 4 deletions(-)
 diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
 index b8dfb2f9e..551d37a20 100755
 --- a/heartbeat/podman-etcd
 +++ b/heartbeat/podman-etcd
@@ -619,16 +619,43 @@ prepare_env() {
 	LISTEN_METRICS_URLS="0.0.0.0"
 }
 +compute_bump_revision() {
 +	# Same logic used by cluster-etcd-operator quorum-restore-pod utility.
 +	# see https://github.com/openshift/cluster-etcd-operator/blob/215998939f5223da916622c71fd07d17656faf6b/bindata/etcd/quorum-restore-pod.yaml#L26-L34
 +	# set a default value: 1bn would be an etcd running at 1000 writes/s for about eleven days.
 +	BUMP_REV=$ETCD_BUMP_REV_DEFAULT
 +	if [ ! -f "${ETCD_REVISION_JSON}" ]; then
 +		ocf_log err "could not compute bump revision: ${ETCD_REVISION_JSON} not found. Defaulting to ${ETCD_BUMP_REV_DEFAULT} revision bump"
 +		return
 +	fi
 +
 +	# this will bump by the amount of 20% of the last known live revision.
 +	if ! COMPUTED_BUMP=$(jq -r "(.maxRaftIndex*${ETCD_REVISION_BUMP_PERCENTAGE}|floor)" "${ETCD_REVISION_JSON}"); then
 +		ocf_log err "could not compute maxRaftIndex for bump revision, jq error code: $?. Defaulting to ${ETCD_BUMP_REV_DEFAULT} revision bump"
 +		return
 +	fi
 +
 +	if [ -z "${COMPUTED_BUMP}" ] || [ "${COMPUTED_BUMP}" -le 0 ] || [ "${COMPUTED_BUMP}" -gt "${ETCD_BUMP_REV_DEFAULT}" ]; then
 +		ocf_log err "computed bump revision (${COMPUTED_BUMP}) is invalid. Defaulting to ${ETCD_BUMP_REV_DEFAULT} revision bump"
 +		return
 +	fi
 +
 +	BUMP_REV="${COMPUTED_BUMP}"
 +	ocf_log info "bumping etcd revisions by ${BUMP_REV}"
 +}
 generate_etcd_configuration() {
 	if is_force_new_cluster; then
 +		compute_bump_revision
 		# The embedded newline is required for correct YAML formatting.
 		FORCE_NEW_CLUSTER_CONFIG="force-new-cluster: true
 -force-new-cluster-bump-amount: 1000000000"
 +force-new-cluster-bump-amount: $BUMP_REV"
 	else
 		FORCE_NEW_CLUSTER_CONFIG="force-new-cluster: false"
 	fi
 +	# the space indentation for client-transport-security and peer-transport-security
 +	# is required for correct YAML formatting.
 	cat > "$ETCD_CONFIGURATION_FILE" << EOF
 logger: zap
 log-level: info
@@ -707,7 +734,7 @@ attribute_node_cluster_id()
 {
 	local action="$1"
 	local value
 -	if ! value=$(jq -r ".clusterId" /var/lib/etcd/revision.json); then
 +	if ! value=$(jq -r ".clusterId" "$ETCD_REVISION_JSON"); then
 		rc=$?
 		ocf_log err "could not get cluster_id, error code: $rc"
 		return "$rc"
@@ -745,7 +772,7 @@ attribute_node_revision()
 	local value
 	local attribute="revision"
 -	if ! value=$(jq -r ".maxRaftIndex" /var/lib/etcd/revision.json); then
 +	if ! value=$(jq -r ".maxRaftIndex" "$ETCD_REVISION_JSON"); then
 		rc=$?
 		ocf_log err "could not get $attribute, error code: $rc"
 		return "$rc"
@@ -1456,7 +1483,7 @@ can_reuse_container() {
 	# If the container does not exist it cannot be reused
 -	if ! container_exists; then 
 +	if ! container_exists; then
 		OCF_RESKEY_reuse=0
 		return "$OCF_SUCCESS"
 	fi
@@ -2006,6 +2033,9 @@ CONTAINER=$OCF_RESKEY_name
 POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml"
 ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml"
 ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz"
 +ETCD_REVISION_JSON="/var/lib/etcd/revision.json"
 +ETCD_REVISION_BUMP_PERCENTAGE=0.2
 +ETCD_BUMP_REV_DEFAULT=1000000000
 ETCD_CERTS_HASH_FILE="${OCF_RESKEY_config_location}/certs.hash"
 # Note: we currently monitor podman containers by with the "podman exec"
--- a/resource-agents.spec
+++ b/resource-agents.spec
@ -45,7 +45,7 @@
 Name:		resource-agents
 Summary:	Open Source HA Reusable Cluster Resource Scripts
 Version:	4.16.0
-Release:	32%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
+Release:	33%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
 License:	GPL-2.0-or-later AND LGPL-2.1-or-later
 URL:		https://github.com/ClusterLabs/resource-agents
 Source0:	%{upstream_prefix}-%{upstream_version}.tar.gz
@ -91,6 +91,8 @@ Patch38:	RHEL-116149-RHEL-116152-2-portblock-fix-incorrect-promotable-descriptio
 Patch39:	RHEL-116149-RHEL-116152-3-portblock-fixes-add-method-and-status_check-parameters.patch
 Patch40:	RHEL-119504-podman-etcd-add-automatic-learner-member-promotion.patch
 Patch41:	RHEL-115495-db2-use-reintegration-flag-to-avoid-race-condition-on-cluster-reintegration.patch
 Patch42:	RHEL-124203-podman-etcd-certificate-rotation.patch
 Patch43:	RHEL-124206-podman-etcd-compute-dynamic-revision-bump-from-maxRaftIndex.patch
 # bundled ha-cloud-support libs
 Patch500:	ha-cloud-support-aliyun.patch
@ -303,6 +305,8 @@ exit 1
 %patch -p1 -P 39
 %patch -p1 -P 40
 %patch -p1 -P 41
 %patch -p1 -P 42
 %patch -p1 -P 43
 # bundled ha-cloud-support libs
 %patch -p1 -P 500
@ -635,6 +639,12 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
 %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm
 %changelog
 * Tue Oct 28 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.16.0-33
 - podman-etcd: add support for cert rotation
 - podman-etcd: compute dynamic revision bump from maxRaftIndex
  Resolves: RHEL-124203, RHEL-124206
 * Wed Oct 22 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.16.0-32
 - portblock: add promotable and nftables support, and method and
  status_check parameters