From 8890b6688b4d3032b13d22ed5f7a3f057135312a Mon Sep 17 00:00:00 2001 From: Oyvind Albrigtsen Date: Tue, 28 Oct 2025 13:06:41 +0100 Subject: [PATCH] - podman-etcd: add support for cert rotation - podman-etcd: compute dynamic revision bump from maxRaftIndex Resolves: RHEL-124203, RHEL-124206 --- ...203-podman-etcd-certificate-rotation.patch | 166 ++++++++++++++++++ ...amic-revision-bump-from-maxRaftIndex.patch | 115 ++++++++++++ resource-agents.spec | 12 +- 3 files changed, 292 insertions(+), 1 deletion(-) create mode 100644 RHEL-124203-podman-etcd-certificate-rotation.patch create mode 100644 RHEL-124206-podman-etcd-compute-dynamic-revision-bump-from-maxRaftIndex.patch diff --git a/RHEL-124203-podman-etcd-certificate-rotation.patch b/RHEL-124203-podman-etcd-certificate-rotation.patch new file mode 100644 index 0000000..7774492 --- /dev/null +++ b/RHEL-124203-podman-etcd-certificate-rotation.patch @@ -0,0 +1,166 @@ +From 6bfbe1dc3a0dad234decd77330ca6189e932bb89 Mon Sep 17 00:00:00 2001 +From: ehila +Date: Thu, 16 Oct 2025 23:39:32 -0400 +Subject: [PATCH] feat: add support for podman-etcd cert rotation + +added a cert check function to the monitor call to force a restart of etcd when the certs have been changed + +Signed-off-by: ehila +--- + heartbeat/podman-etcd | 87 ++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 86 insertions(+), 1 deletion(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index e1425ec02..b8dfb2f9e 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -40,6 +40,7 @@ + # Parameter defaults + OCF_RESKEY_image_default="default" + OCF_RESKEY_pod_manifest_default="/etc/kubernetes/static-pod-resources/etcd-certs/configmaps/external-etcd-pod/pod.yaml" ++OCF_RESKEY_etcd_certs_dir_default="/etc/kubernetes/static-pod-resources/etcd-certs" + OCF_RESKEY_name_default="etcd" + OCF_RESKEY_nic_default="br-ex" + OCF_RESKEY_authfile_default="/var/lib/kubelet/config.json" +@@ -51,6 +52,7 @@ 
OCF_RESKEY_backup_location_default="/var/lib/etcd" + + : ${OCF_RESKEY_image=${OCF_RESKEY_image_default}} + : ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}} ++: ${OCF_RESKEY_etcd_certs_dir=${OCF_RESKEY_etcd_certs_dir_default}} + : ${OCF_RESKEY_name=${OCF_RESKEY_name_default}} + : ${OCF_RESKEY_nic=${OCF_RESKEY_nic_default}} + : ${OCF_RESKEY_authfile=${OCF_RESKEY_authfile_default}} +@@ -88,6 +90,15 @@ The Pod manifest with the configuration for Etcd. + + + ++ ++ ++The Etcd certificates directory mounted into the etcd container. ++The agent will monitor this directory for changes and restart the etcd container if the certificates have changed. ++ ++Etcd certificates directory ++ ++ ++ + + + The podman image to base this container off of. +@@ -289,6 +300,59 @@ Expects to have a fully populated OCF RA-compliant environment set. + END + } + ++etcd_certificates_hash_manager() ++{ ++ local action="$1" ++ local current_hash ++ local stored_hash ++ ++ # If the certs directory doesn't exist, consider it unchanged ++ if [ ! -d "$OCF_RESKEY_etcd_certs_dir" ]; then ++ ocf_log warn "certificates directory $OCF_RESKEY_etcd_certs_dir does not exist, skipping certificate monitoring" ++ return $OCF_SUCCESS ++ fi ++ ++ # Calculate hash of all certificate files, ignore key files to avoid accidental disclosure of sensitive information ++ # we only need to monitor the certificate files to detect changes. ++ if ! current_hash=$(find "$OCF_RESKEY_etcd_certs_dir" -type f \( -name "*.crt" \) -exec sha256sum {} \; | sort | sha256sum | cut -d' ' -f1); then ++ ocf_log err "failed to calculate certificate files hash" ++ return $OCF_ERR_GENERIC ++ fi ++ ++ # If no stored hash exists, create one and return success ++ if [ ! -f "$ETCD_CERTS_HASH_FILE" ]; then ++ echo "$current_hash" > "$ETCD_CERTS_HASH_FILE" ++ ocf_log info "created initial certificate hash: $current_hash" ++ return $OCF_SUCCESS ++ fi ++ ++ case "$action" in ++ "update") ++ if ! 
echo "$current_hash" > "$ETCD_CERTS_HASH_FILE"; then ++ ocf_log err "failed to update certificate hash file $ETCD_CERTS_HASH_FILE"; return $OCF_ERR_GENERIC # do not fall through and report success when the write failed ++ fi ++ ocf_log info "updated certificate hash: $current_hash" ++ ;; ++ "check") ++ if ! stored_hash=$(cat "$ETCD_CERTS_HASH_FILE"); then ++ ocf_log err "failed to read stored certificate hash from $ETCD_CERTS_HASH_FILE" ++ # This should not happen but if for some reason we can not read the stored hash, ++ # use the current hash and log the error but allow etcd to run as long as possible. ++ stored_hash="$current_hash" ++ fi ++ if [ "$current_hash" != "$stored_hash" ]; then ++ ocf_exit_reason "$NODENAME etcd certificate files have changed (stored: $stored_hash, current: $current_hash)" ++ return $OCF_ERR_GENERIC ++ fi ++ ;; ++ *) ++ ocf_log err "unsupported action: $action" ++ return $OCF_ERR_GENERIC ++ ;; ++ esac ++ ++ return $OCF_SUCCESS ++} + + monitor_cmd_exec() + { +@@ -357,7 +421,7 @@ archive_current_container() + + # archive corresponding etcd configuration files + local files_to_archive="" +- for file in "$OCF_RESKEY_authfile" "$POD_MANIFEST_COPY" "$ETCD_CONFIGURATION_FILE"; do ++ for file in "$OCF_RESKEY_authfile" "$POD_MANIFEST_COPY" "$ETCD_CONFIGURATION_FILE" "$ETCD_CERTS_HASH_FILE"; do + if [ -f "$file" ]; then + files_to_archive="$files_to_archive $file" + else +@@ -1178,6 +1242,11 @@ podman_monitor() + return $rc + fi + ++ # Check if certificate files have changed, if they have, etcd needs to be restarted ++ if ! etcd_certificates_hash_manager "check"; then ++ return $OCF_ERR_GENERIC ++ fi ++ + if is_learner; then + ocf_log info "$NODENAME is learner. Cannot get member id" + return "$OCF_SUCCESS" +@@ -1483,6 +1552,14 @@ podman_start() + return $OCF_ERR_GENERIC + fi + ++ # Update the certificate hash after the container has started successfully ++ # this is to ensure that the certificate hash is updated after a restart is initiated ++ # by a cert rotation event from the monitor command. ++ if ! 
etcd_certificates_hash_manager "update"; then ++ ocf_exit_reason "etcd certificate hash manager failed to update the certificate hash" ++ return $OCF_ERR_GENERIC ++ fi ++ + # check if the container has already started + podman_simple_status + if [ $? -eq $OCF_SUCCESS ]; then +@@ -1888,6 +1965,13 @@ podman_validate() + exit $OCF_ERR_CONFIGURED + fi + ++ if ! echo "validation test" > "$ETCD_CERTS_HASH_FILE" \ ++ || ! cat "$ETCD_CERTS_HASH_FILE" >/dev/null 2>&1 \ ++ || ! rm "$ETCD_CERTS_HASH_FILE"; then ++ ocf_exit_reason "cannot read/write to certificate hash file $ETCD_CERTS_HASH_FILE" ++ exit $OCF_ERR_GENERIC ++ fi ++ + return $OCF_SUCCESS + } + +@@ -1922,6 +2006,7 @@ CONTAINER=$OCF_RESKEY_name + POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml" + ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml" + ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz" ++ETCD_CERTS_HASH_FILE="${OCF_RESKEY_config_location}/certs.hash" + + # Note: we currently monitor podman containers by with the "podman exec" + # command, so make sure that invocation is always valid by enforcing the diff --git a/RHEL-124206-podman-etcd-compute-dynamic-revision-bump-from-maxRaftIndex.patch b/RHEL-124206-podman-etcd-compute-dynamic-revision-bump-from-maxRaftIndex.patch new file mode 100644 index 0000000..00a31ec --- /dev/null +++ b/RHEL-124206-podman-etcd-compute-dynamic-revision-bump-from-maxRaftIndex.patch @@ -0,0 +1,115 @@ +From 6a5608f02a657cf006b6d44d31200342c4bd19b9 Mon Sep 17 00:00:00 2001 +From: Carlo Lobrano +Date: Tue, 28 Oct 2025 12:47:10 +0100 +Subject: [PATCH] podman-etcd: compute dynamic revision bump from maxRaftIndex + (#2087) + +Replace hardcoded 1 billion revision bump with dynamic calculation based +on 20% of the last known maxRaftIndex from revision.json. + +This aligns with the logic used by cluster-etcd-operator's +quorum-restore-pod utility and ensures the bump amount is proportional +to the cluster's actual revision state. 
+ +The implementation: +- Adds compute_bump_revision() function with safe fallback to 1bn + default +- Extracts magic values to named constants + (ETCD_REVISION_BUMP_PERCENTAGE, ETCD_BUMP_REV_DEFAULT, + ETCD_REVISION_JSON) +- Validates computed values (non-zero, not exceeding default) +- Logs computation results for debugging + +Reference: +https://github.com/openshift/cluster-etcd-operator/blob/215998939f5223da9166 +22c71fd07d17656faf6b/bindata/etcd/quorum-restore-pod.yaml#L26-L34 +--- + heartbeat/podman-etcd | 38 ++++++++++++++++++++++++++++++++++---- + 1 file changed, 34 insertions(+), 4 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index b8dfb2f9e..551d37a20 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -619,16 +619,43 @@ prepare_env() { + LISTEN_METRICS_URLS="0.0.0.0" + } + ++compute_bump_revision() { ++ # Same logic used by cluster-etcd-operator quorum-restore-pod utility. ++ # see https://github.com/openshift/cluster-etcd-operator/blob/215998939f5223da916622c71fd07d17656faf6b/bindata/etcd/quorum-restore-pod.yaml#L26-L34 ++ # set a default value: 1bn would be an etcd running at 1000 writes/s for about eleven days. ++ BUMP_REV=$ETCD_BUMP_REV_DEFAULT ++ if [ ! -f "${ETCD_REVISION_JSON}" ]; then ++ ocf_log err "could not compute bump revision: ${ETCD_REVISION_JSON} not found. Defaulting to ${ETCD_BUMP_REV_DEFAULT} revision bump" ++ return ++ fi ++ ++ # this will bump by the amount of 20% of the last known live revision. ++ COMPUTED_BUMP=$(jq -r "(.maxRaftIndex*${ETCD_REVISION_BUMP_PERCENTAGE}|floor)" "${ETCD_REVISION_JSON}") || { # written as "cmd || {" rather than "if ! cmd" so that $? below is still the exit status of jq ++ ocf_log err "could not compute maxRaftIndex for bump revision, jq error code: $?. Defaulting to ${ETCD_BUMP_REV_DEFAULT} revision bump" ++ return ++ } ++ ++ if [ -z "${COMPUTED_BUMP}" ] || [ "${COMPUTED_BUMP}" -le 0 ] || [ "${COMPUTED_BUMP}" -gt "${ETCD_BUMP_REV_DEFAULT}" ]; then ++ ocf_log err "computed bump revision (${COMPUTED_BUMP}) is invalid. 
Defaulting to ${ETCD_BUMP_REV_DEFAULT} revision bump" ++ return ++ fi ++ ++ BUMP_REV="${COMPUTED_BUMP}" ++ ocf_log info "bumping etcd revisions by ${BUMP_REV}" ++} + + generate_etcd_configuration() { + if is_force_new_cluster; then ++ compute_bump_revision + # The embedded newline is required for correct YAML formatting. + FORCE_NEW_CLUSTER_CONFIG="force-new-cluster: true +-force-new-cluster-bump-amount: 1000000000" ++force-new-cluster-bump-amount: $BUMP_REV" + else + FORCE_NEW_CLUSTER_CONFIG="force-new-cluster: false" + fi + ++ # the space indentation for client-transport-security and peer-transport-security ++ # is required for correct YAML formatting. + cat > "$ETCD_CONFIGURATION_FILE" << EOF + logger: zap + log-level: info +@@ -707,7 +734,7 @@ attribute_node_cluster_id() + { + local action="$1" + local value +- if ! value=$(jq -r ".clusterId" /var/lib/etcd/revision.json); then ++ if ! value=$(jq -r ".clusterId" "$ETCD_REVISION_JSON"); then + rc=$? + ocf_log err "could not get cluster_id, error code: $rc" + return "$rc" +@@ -745,7 +772,7 @@ attribute_node_revision() + local value + local attribute="revision" + +- if ! value=$(jq -r ".maxRaftIndex" /var/lib/etcd/revision.json); then ++ if ! value=$(jq -r ".maxRaftIndex" "$ETCD_REVISION_JSON"); then + rc=$? + ocf_log err "could not get $attribute, error code: $rc" + return "$rc" +@@ -1456,7 +1483,7 @@ can_reuse_container() { + + + # If the container does not exist it cannot be reused +- if ! container_exists; then ++ if ! 
container_exists; then + OCF_RESKEY_reuse=0 + return "$OCF_SUCCESS" + fi +@@ -2006,6 +2033,9 @@ CONTAINER=$OCF_RESKEY_name + POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml" + ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml" + ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz" ++ETCD_REVISION_JSON="/var/lib/etcd/revision.json" ++ETCD_REVISION_BUMP_PERCENTAGE=0.2 ++ETCD_BUMP_REV_DEFAULT=1000000000 + ETCD_CERTS_HASH_FILE="${OCF_RESKEY_config_location}/certs.hash" + + # Note: we currently monitor podman containers by with the "podman exec" diff --git a/resource-agents.spec b/resource-agents.spec index 11a05b5..847cde0 100644 --- a/resource-agents.spec +++ b/resource-agents.spec @@ -45,7 +45,7 @@ Name: resource-agents Summary: Open Source HA Reusable Cluster Resource Scripts Version: 4.16.0 -Release: 32%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist} +Release: 33%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist} License: GPL-2.0-or-later AND LGPL-2.1-or-later URL: https://github.com/ClusterLabs/resource-agents Source0: %{upstream_prefix}-%{upstream_version}.tar.gz @@ -91,6 +91,8 @@ Patch38: RHEL-116149-RHEL-116152-2-portblock-fix-incorrect-promotable-descriptio Patch39: RHEL-116149-RHEL-116152-3-portblock-fixes-add-method-and-status_check-parameters.patch Patch40: RHEL-119504-podman-etcd-add-automatic-learner-member-promotion.patch Patch41: RHEL-115495-db2-use-reintegration-flag-to-avoid-race-condition-on-cluster-reintegration.patch +Patch42: RHEL-124203-podman-etcd-certificate-rotation.patch +Patch43: RHEL-124206-podman-etcd-compute-dynamic-revision-bump-from-maxRaftIndex.patch # bundled ha-cloud-support libs Patch500: ha-cloud-support-aliyun.patch @@ -303,6 +305,8 @@ exit 1 %patch -p1 -P 39 %patch -p1 -P 40 %patch -p1 -P 41 +%patch -p1 -P 42 +%patch -p1 -P 43 # bundled ha-cloud-support libs %patch -p1 -P 500 @@ -635,6 +639,12 @@ rm 
-rf %{buildroot}/usr/share/doc/resource-agents %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm %changelog +* Tue Oct 28 2025 Oyvind Albrigtsen - 4.16.0-33 +- podman-etcd: add support for cert rotation +- podman-etcd: compute dynamic revision bump from maxRaftIndex + + Resolves: RHEL-124203, RHEL-124206 + * Wed Oct 22 2025 Oyvind Albrigtsen - 4.16.0-32 - portblock: add promotable and nftables support, and method and status_check parameters