- podman-etcd: add support for cert rotation

- podman-etcd: compute dynamic revision bump from maxRaftIndex

  Resolves: RHEL-124203, RHEL-124206
This commit is contained in:
Oyvind Albrigtsen 2025-10-28 13:06:41 +01:00
parent 3e111eae9a
commit 8890b6688b
3 changed files with 292 additions and 1 deletions

View File

@ -0,0 +1,166 @@
From 6bfbe1dc3a0dad234decd77330ca6189e932bb89 Mon Sep 17 00:00:00 2001
From: ehila <ehila@redhat.com>
Date: Thu, 16 Oct 2025 23:39:32 -0400
Subject: [PATCH] feat: add support for podman-etcd cert rotation
added a cert check function to the monitor call to force a restart of etcd when the certs have been changed
Signed-off-by: ehila <ehila@redhat.com>
---
heartbeat/podman-etcd | 87 ++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 86 insertions(+), 1 deletion(-)
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index e1425ec02..b8dfb2f9e 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -40,6 +40,7 @@
# Parameter defaults
OCF_RESKEY_image_default="default"
OCF_RESKEY_pod_manifest_default="/etc/kubernetes/static-pod-resources/etcd-certs/configmaps/external-etcd-pod/pod.yaml"
+OCF_RESKEY_etcd_certs_dir_default="/etc/kubernetes/static-pod-resources/etcd-certs"
OCF_RESKEY_name_default="etcd"
OCF_RESKEY_nic_default="br-ex"
OCF_RESKEY_authfile_default="/var/lib/kubelet/config.json"
@@ -51,6 +52,7 @@ OCF_RESKEY_backup_location_default="/var/lib/etcd"
: ${OCF_RESKEY_image=${OCF_RESKEY_image_default}}
: ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}}
+: ${OCF_RESKEY_etcd_certs_dir=${OCF_RESKEY_etcd_certs_dir_default}}
: ${OCF_RESKEY_name=${OCF_RESKEY_name_default}}
: ${OCF_RESKEY_nic=${OCF_RESKEY_nic_default}}
: ${OCF_RESKEY_authfile=${OCF_RESKEY_authfile_default}}
@@ -88,6 +90,15 @@ The Pod manifest with the configuration for Etcd.
<content type="string" default="${OCF_RESKEY_pod_manifest_default}"/>
</parameter>
+<parameter name="etcd_certs_dir" required="0" unique="0">
+<longdesc lang="en">
+The Etcd certificates directory mounted into the etcd container.
+The agent will monitor this directory for changes and restart the etcd container if the certificates have changed.
+</longdesc>
+<shortdesc lang="en">Etcd certificates directory</shortdesc>
+<content type="string" default="${OCF_RESKEY_etcd_certs_dir_default}"/>
+</parameter>
+
<parameter name="image" required="0" unique="0">
<longdesc lang="en">
The podman image to base this container off of.
@@ -289,6 +300,59 @@ Expects to have a fully populated OCF RA-compliant environment set.
END
}
+etcd_certificates_hash_manager()
+{
+ local action="$1"
+ local current_hash
+ local stored_hash
+
+ # If the certs directory doesn't exist, consider it unchanged
+ if [ ! -d "$OCF_RESKEY_etcd_certs_dir" ]; then
+ ocf_log warn "certificates directory $OCF_RESKEY_etcd_certs_dir does not exist, skipping certificate monitoring"
+ return $OCF_SUCCESS
+ fi
+
+ # Calculate hash of all certificate files, ignore key files to avoid accidental disclosure of sensitive information
+ # we only need to monitor the certificate files to detect changes.
+ if ! current_hash=$(find "$OCF_RESKEY_etcd_certs_dir" -type f \( -name "*.crt" \) -exec sha256sum {} \; | sort | sha256sum | cut -d' ' -f1); then
+ ocf_log err "failed to calculate certificate files hash"
+ return $OCF_ERR_GENERIC
+ fi
+
+ # If no stored hash exists, create one and return success
+ if [ ! -f "$ETCD_CERTS_HASH_FILE" ]; then
+ echo "$current_hash" > "$ETCD_CERTS_HASH_FILE"
+ ocf_log info "created initial certificate hash: $current_hash"
+ return $OCF_SUCCESS
+ fi
+
+ case "$action" in
+ "update")
+ if ! echo "$current_hash" > "$ETCD_CERTS_HASH_FILE"; then
+ ocf_log err "failed to update certificate hash file $ETCD_CERTS_HASH_FILE"
+ fi
+ ocf_log info "updated certificate hash: $current_hash"
+ ;;
+ "check")
+ if ! stored_hash=$(cat "$ETCD_CERTS_HASH_FILE"); then
+ ocf_log err "failed to read stored certificate hash from $ETCD_CERTS_HASH_FILE"
+ # This should not happen but if for some reason we can not read the stored hash,
+ # use the current hash and log the error but allow etcd to run as long as possible.
+ stored_hash="$current_hash"
+ fi
+ if [ "$current_hash" != "$stored_hash" ]; then
+ ocf_exit_reason "$NODENAME etcd certificate files have changed (stored: $stored_hash, current: $current_hash)"
+ return $OCF_ERR_GENERIC
+ fi
+ ;;
+ *)
+ ocf_log err "unsupported action: $action"
+ return $OCF_ERR_GENERIC
+ ;;
+ esac
+
+ return $OCF_SUCCESS
+}
monitor_cmd_exec()
{
@@ -357,7 +421,7 @@ archive_current_container()
# archive corresponding etcd configuration files
local files_to_archive=""
- for file in "$OCF_RESKEY_authfile" "$POD_MANIFEST_COPY" "$ETCD_CONFIGURATION_FILE"; do
+ for file in "$OCF_RESKEY_authfile" "$POD_MANIFEST_COPY" "$ETCD_CONFIGURATION_FILE" "$ETCD_CERTS_HASH_FILE"; do
if [ -f "$file" ]; then
files_to_archive="$files_to_archive $file"
else
@@ -1178,6 +1242,11 @@ podman_monitor()
return $rc
fi
+ # Check if certificate files have changed, if they have, etcd needs to be restarted
+ if ! etcd_certificates_hash_manager "check"; then
+ return $OCF_ERR_GENERIC
+ fi
+
if is_learner; then
ocf_log info "$NODENAME is learner. Cannot get member id"
return "$OCF_SUCCESS"
@@ -1483,6 +1552,14 @@ podman_start()
return $OCF_ERR_GENERIC
fi
+ # Update the certificate hash after the container has started successfully
+ # this is to ensure that the certificate hash is updated after a restart is initiated
+ # by a cert rotation event from the monitor command.
+ if ! etcd_certificates_hash_manager "update"; then
+ ocf_exit_reason "etcd certificate hash manager failed to update the certificate hash"
+ return $OCF_ERR_GENERIC
+ fi
+
# check if the container has already started
podman_simple_status
if [ $? -eq $OCF_SUCCESS ]; then
@@ -1888,6 +1965,13 @@ podman_validate()
exit $OCF_ERR_CONFIGURED
fi
+ if ! echo "validation test" > "$ETCD_CERTS_HASH_FILE" \
+ || ! cat "$ETCD_CERTS_HASH_FILE" >/dev/null 2>&1 \
+ || ! rm "$ETCD_CERTS_HASH_FILE"; then
+ ocf_exit_reason "cannot read/write to certificate hash file $ETCD_CERTS_HASH_FILE"
+ exit $OCF_ERR_GENERIC
+ fi
+
return $OCF_SUCCESS
}
@@ -1922,6 +2006,7 @@ CONTAINER=$OCF_RESKEY_name
POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml"
ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml"
ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz"
+ETCD_CERTS_HASH_FILE="${OCF_RESKEY_config_location}/certs.hash"
# Note: we currently monitor podman containers by with the "podman exec"
# command, so make sure that invocation is always valid by enforcing the

View File

@ -0,0 +1,115 @@
From 6a5608f02a657cf006b6d44d31200342c4bd19b9 Mon Sep 17 00:00:00 2001
From: Carlo Lobrano <c.lobrano@gmail.com>
Date: Tue, 28 Oct 2025 12:47:10 +0100
Subject: [PATCH] podman-etcd: compute dynamic revision bump from maxRaftIndex
(#2087)
Replace hardcoded 1 billion revision bump with dynamic calculation based
on 20% of the last known maxRaftIndex from revision.json.
This aligns with the logic used by cluster-etcd-operator's
quorum-restore-pod utility and ensures the bump amount is proportional
to the cluster's actual revision state.
The implementation:
- Adds compute_bump_revision() function with safe fallback to 1bn
default
- Extracts magic values to named constants
(ETCD_REVISION_BUMP_PERCENTAGE, ETCD_BUMP_REV_DEFAULT,
ETCD_REVISION_JSON)
- Validates computed values (non-zero, not exceeding default)
- Logs computation results for debugging
Reference:
https://github.com/openshift/cluster-etcd-operator/blob/215998939f5223da9166
22c71fd07d17656faf6b/bindata/etcd/quorum-restore-pod.yaml#L26-L34
---
heartbeat/podman-etcd | 38 ++++++++++++++++++++++++++++++++++----
1 file changed, 34 insertions(+), 4 deletions(-)
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index b8dfb2f9e..551d37a20 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -619,16 +619,43 @@ prepare_env() {
LISTEN_METRICS_URLS="0.0.0.0"
}
+compute_bump_revision() {
+ # Same logic used by cluster-etcd-operator quorum-restore-pod utility.
+ # see https://github.com/openshift/cluster-etcd-operator/blob/215998939f5223da916622c71fd07d17656faf6b/bindata/etcd/quorum-restore-pod.yaml#L26-L34
+ # set a default value: 1bn would be an etcd running at 1000 writes/s for about eleven days.
+ BUMP_REV=$ETCD_BUMP_REV_DEFAULT
+ if [ ! -f "${ETCD_REVISION_JSON}" ]; then
+ ocf_log err "could not compute bump revision: ${ETCD_REVISION_JSON} not found. Defaulting to ${ETCD_BUMP_REV_DEFAULT} revision bump"
+ return
+ fi
+
+ # this will bump by the amount of 20% of the last known live revision.
+ if ! COMPUTED_BUMP=$(jq -r "(.maxRaftIndex*${ETCD_REVISION_BUMP_PERCENTAGE}|floor)" "${ETCD_REVISION_JSON}"); then
+ ocf_log err "could not compute maxRaftIndex for bump revision, jq error code: $?. Defaulting to ${ETCD_BUMP_REV_DEFAULT} revision bump"
+ return
+ fi
+
+ if [ -z "${COMPUTED_BUMP}" ] || [ "${COMPUTED_BUMP}" -le 0 ] || [ "${COMPUTED_BUMP}" -gt "${ETCD_BUMP_REV_DEFAULT}" ]; then
+ ocf_log err "computed bump revision (${COMPUTED_BUMP}) is invalid. Defaulting to ${ETCD_BUMP_REV_DEFAULT} revision bump"
+ return
+ fi
+
+ BUMP_REV="${COMPUTED_BUMP}"
+ ocf_log info "bumping etcd revisions by ${BUMP_REV}"
+}
generate_etcd_configuration() {
if is_force_new_cluster; then
+ compute_bump_revision
# The embedded newline is required for correct YAML formatting.
FORCE_NEW_CLUSTER_CONFIG="force-new-cluster: true
-force-new-cluster-bump-amount: 1000000000"
+force-new-cluster-bump-amount: $BUMP_REV"
else
FORCE_NEW_CLUSTER_CONFIG="force-new-cluster: false"
fi
+ # the space indentation for client-transport-security and peer-transport-security
+ # is required for correct YAML formatting.
cat > "$ETCD_CONFIGURATION_FILE" << EOF
logger: zap
log-level: info
@@ -707,7 +734,7 @@ attribute_node_cluster_id()
{
local action="$1"
local value
- if ! value=$(jq -r ".clusterId" /var/lib/etcd/revision.json); then
+ if ! value=$(jq -r ".clusterId" "$ETCD_REVISION_JSON"); then
rc=$?
ocf_log err "could not get cluster_id, error code: $rc"
return "$rc"
@@ -745,7 +772,7 @@ attribute_node_revision()
local value
local attribute="revision"
- if ! value=$(jq -r ".maxRaftIndex" /var/lib/etcd/revision.json); then
+ if ! value=$(jq -r ".maxRaftIndex" "$ETCD_REVISION_JSON"); then
rc=$?
ocf_log err "could not get $attribute, error code: $rc"
return "$rc"
@@ -1456,7 +1483,7 @@ can_reuse_container() {
# If the container does not exist it cannot be reused
- if ! container_exists; then
+ if ! container_exists; then
OCF_RESKEY_reuse=0
return "$OCF_SUCCESS"
fi
@@ -2006,6 +2033,9 @@ CONTAINER=$OCF_RESKEY_name
POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml"
ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml"
ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz"
+ETCD_REVISION_JSON="/var/lib/etcd/revision.json"
+ETCD_REVISION_BUMP_PERCENTAGE=0.2
+ETCD_BUMP_REV_DEFAULT=1000000000
ETCD_CERTS_HASH_FILE="${OCF_RESKEY_config_location}/certs.hash"
# Note: we currently monitor podman containers by with the "podman exec"

View File

@ -45,7 +45,7 @@
Name: resource-agents Name: resource-agents
Summary: Open Source HA Reusable Cluster Resource Scripts Summary: Open Source HA Reusable Cluster Resource Scripts
Version: 4.16.0 Version: 4.16.0
Release: 32%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist} Release: 33%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
License: GPL-2.0-or-later AND LGPL-2.1-or-later License: GPL-2.0-or-later AND LGPL-2.1-or-later
URL: https://github.com/ClusterLabs/resource-agents URL: https://github.com/ClusterLabs/resource-agents
Source0: %{upstream_prefix}-%{upstream_version}.tar.gz Source0: %{upstream_prefix}-%{upstream_version}.tar.gz
@ -91,6 +91,8 @@ Patch38: RHEL-116149-RHEL-116152-2-portblock-fix-incorrect-promotable-descriptio
Patch39: RHEL-116149-RHEL-116152-3-portblock-fixes-add-method-and-status_check-parameters.patch Patch39: RHEL-116149-RHEL-116152-3-portblock-fixes-add-method-and-status_check-parameters.patch
Patch40: RHEL-119504-podman-etcd-add-automatic-learner-member-promotion.patch Patch40: RHEL-119504-podman-etcd-add-automatic-learner-member-promotion.patch
Patch41: RHEL-115495-db2-use-reintegration-flag-to-avoid-race-condition-on-cluster-reintegration.patch Patch41: RHEL-115495-db2-use-reintegration-flag-to-avoid-race-condition-on-cluster-reintegration.patch
Patch42: RHEL-124203-podman-etcd-certificate-rotation.patch
Patch43: RHEL-124206-podman-etcd-compute-dynamic-revision-bump-from-maxRaftIndex.patch
# bundled ha-cloud-support libs # bundled ha-cloud-support libs
Patch500: ha-cloud-support-aliyun.patch Patch500: ha-cloud-support-aliyun.patch
@ -303,6 +305,8 @@ exit 1
%patch -p1 -P 39 %patch -p1 -P 39
%patch -p1 -P 40 %patch -p1 -P 40
%patch -p1 -P 41 %patch -p1 -P 41
%patch -p1 -P 42
%patch -p1 -P 43
# bundled ha-cloud-support libs # bundled ha-cloud-support libs
%patch -p1 -P 500 %patch -p1 -P 500
@ -635,6 +639,12 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
%{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm
%changelog %changelog
* Tue Oct 28 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.16.0-33
- podman-etcd: add support for cert rotation
- podman-etcd: compute dynamic revision bump from maxRaftIndex
Resolves: RHEL-124203, RHEL-124206
* Wed Oct 22 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.16.0-32 * Wed Oct 22 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.16.0-32
- portblock: add promotable and nftables support, and method and - portblock: add promotable and nftables support, and method and
status_check parameters status_check parameters