- podman-etcd: add support for cert rotation
- podman-etcd: compute dynamic revision bump from maxRaftIndex Resolves: RHEL-124203, RHEL-124206
This commit is contained in:
parent
3e111eae9a
commit
8890b6688b
166
RHEL-124203-podman-etcd-certificate-rotation.patch
Normal file
166
RHEL-124203-podman-etcd-certificate-rotation.patch
Normal file
@ -0,0 +1,166 @@
|
||||
From 6bfbe1dc3a0dad234decd77330ca6189e932bb89 Mon Sep 17 00:00:00 2001
|
||||
From: ehila <ehila@redhat.com>
|
||||
Date: Thu, 16 Oct 2025 23:39:32 -0400
|
||||
Subject: [PATCH] feat: add support for podman-etcd cert rotation
|
||||
|
||||
added a cert check function to the monitor call to force a restart of etcd when the certs have been changed
|
||||
|
||||
Signed-off-by: ehila <ehila@redhat.com>
|
||||
---
|
||||
heartbeat/podman-etcd | 87 ++++++++++++++++++++++++++++++++++++++++++-
|
||||
1 file changed, 86 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index e1425ec02..b8dfb2f9e 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -40,6 +40,7 @@
|
||||
# Parameter defaults
|
||||
OCF_RESKEY_image_default="default"
|
||||
OCF_RESKEY_pod_manifest_default="/etc/kubernetes/static-pod-resources/etcd-certs/configmaps/external-etcd-pod/pod.yaml"
|
||||
+OCF_RESKEY_etcd_certs_dir_default="/etc/kubernetes/static-pod-resources/etcd-certs"
|
||||
OCF_RESKEY_name_default="etcd"
|
||||
OCF_RESKEY_nic_default="br-ex"
|
||||
OCF_RESKEY_authfile_default="/var/lib/kubelet/config.json"
|
||||
@@ -51,6 +52,7 @@ OCF_RESKEY_backup_location_default="/var/lib/etcd"
|
||||
|
||||
: ${OCF_RESKEY_image=${OCF_RESKEY_image_default}}
|
||||
: ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}}
|
||||
+: ${OCF_RESKEY_etcd_certs_dir=${OCF_RESKEY_etcd_certs_dir_default}}
|
||||
: ${OCF_RESKEY_name=${OCF_RESKEY_name_default}}
|
||||
: ${OCF_RESKEY_nic=${OCF_RESKEY_nic_default}}
|
||||
: ${OCF_RESKEY_authfile=${OCF_RESKEY_authfile_default}}
|
||||
@@ -88,6 +90,15 @@ The Pod manifest with the configuration for Etcd.
|
||||
<content type="string" default="${OCF_RESKEY_pod_manifest_default}"/>
|
||||
</parameter>
|
||||
|
||||
+<parameter name="etcd_certs_dir" required="0" unique="0">
|
||||
+<longdesc lang="en">
|
||||
+The Etcd certificates directory mounted into the etcd container.
|
||||
+The agent will monitor this directory for changes and restart the etcd container if the certificates have changed.
|
||||
+</longdesc>
|
||||
+<shortdesc lang="en">Etcd certificates directory</shortdesc>
|
||||
+<content type="string" default="${OCF_RESKEY_etcd_certs_dir_default}"/>
|
||||
+</parameter>
|
||||
+
|
||||
<parameter name="image" required="0" unique="0">
|
||||
<longdesc lang="en">
|
||||
The podman image to base this container off of.
|
||||
@@ -289,6 +300,59 @@ Expects to have a fully populated OCF RA-compliant environment set.
|
||||
END
|
||||
}
|
||||
|
||||
+# etcd_certificates_hash_manager ACTION
+#
+# Keeps a fingerprint of the etcd certificate files so that a certificate
+# rotation can be detected. The fingerprint is the sha256 of the sorted
+# per-file sha256sum output of every *.crt file found under
+# $OCF_RESKEY_etcd_certs_dir, and it is stored in $ETCD_CERTS_HASH_FILE.
+#
+# ACTION is one of:
+#   check   compare the current fingerprint with the stored one
+#           (used by podman_monitor). If no fingerprint is stored yet, the
+#           current one is recorded as the baseline.
+#   update  store the current fingerprint (used by podman_start once the
+#           container has started).
+#
+# Returns:
+#   $OCF_SUCCESS      certificates directory missing (monitoring skipped),
+#                     fingerprint unchanged, baseline recorded, or
+#                     fingerprint updated.
+#   $OCF_ERR_GENERIC  fingerprint could not be computed, fingerprint changed
+#                     (check), fingerprint could not be stored (update), or
+#                     ACTION is not supported.
+etcd_certificates_hash_manager()
+{
+	local action="$1"
+	local file_hashes
+	local current_hash
+	local stored_hash
+
+	# If the certs directory doesn't exist, consider it unchanged
+	if [ ! -d "$OCF_RESKEY_etcd_certs_dir" ]; then
+		ocf_log warn "certificates directory $OCF_RESKEY_etcd_certs_dir does not exist, skipping certificate monitoring"
+		return $OCF_SUCCESS
+	fi
+
+	# Calculate hash of all certificate files, ignore key files to avoid accidental disclosure of sensitive information
+	# we only need to monitor the certificate files to detect changes.
+	#
+	# The per-file hashes are collected in a separate step: the exit status of
+	# a pipeline is the one of its last command only, so a failing find or
+	# sha256sum would go unnoticed if everything was piped straight into cut.
+	# "-exec ... +" is used because find then reports a non-zero status when
+	# sha256sum fails; the lines produced are the same as with "-exec ... \;".
+	if ! file_hashes=$(find "$OCF_RESKEY_etcd_certs_dir" -type f \( -name "*.crt" \) -exec sha256sum {} +); then
+		ocf_log err "failed to calculate certificate files hash"
+		return $OCF_ERR_GENERIC
+	fi
+
+	# Nothing is printed when no certificate was found, so that the resulting
+	# fingerprint stays the hash of an empty input. The command substitution
+	# above stripped the trailing newline, printf puts it back.
+	if ! current_hash=$({ [ -z "$file_hashes" ] || printf '%s\n' "$file_hashes"; } | sort | sha256sum | cut -d' ' -f1) \
+		|| [ -z "$current_hash" ]; then
+		ocf_log err "failed to calculate certificate files hash"
+		return $OCF_ERR_GENERIC
+	fi
+
+	case "$action" in
+		"update")
+			# The caller treats a failure as a failed start, so it must be
+			# reported instead of only being logged.
+			if ! echo "$current_hash" > "$ETCD_CERTS_HASH_FILE"; then
+				ocf_log err "failed to update certificate hash file $ETCD_CERTS_HASH_FILE"
+				return $OCF_ERR_GENERIC
+			fi
+			ocf_log info "updated certificate hash: $current_hash"
+			;;
+		"check")
+			# If no stored hash exists, create one and return success.
+			# A failure to store the baseline is logged but not returned,
+			# to allow etcd to run as long as possible.
+			if [ ! -f "$ETCD_CERTS_HASH_FILE" ]; then
+				if echo "$current_hash" > "$ETCD_CERTS_HASH_FILE"; then
+					ocf_log info "created initial certificate hash: $current_hash"
+				else
+					ocf_log err "failed to create certificate hash file $ETCD_CERTS_HASH_FILE"
+				fi
+				return $OCF_SUCCESS
+			fi
+			if ! stored_hash=$(cat "$ETCD_CERTS_HASH_FILE"); then
+				ocf_log err "failed to read stored certificate hash from $ETCD_CERTS_HASH_FILE"
+				# This should not happen but if for some reason we can not read the stored hash,
+				# use the current hash and log the error but allow etcd to run as long as possible.
+				stored_hash="$current_hash"
+			fi
+			if [ "$current_hash" != "$stored_hash" ]; then
+				ocf_exit_reason "$NODENAME etcd certificate files have changed (stored: $stored_hash, current: $current_hash)"
+				return $OCF_ERR_GENERIC
+			fi
+			;;
+		*)
+			ocf_log err "unsupported action: $action"
+			return $OCF_ERR_GENERIC
+			;;
+	esac
+
+	return $OCF_SUCCESS
+}
|
||||
|
||||
monitor_cmd_exec()
|
||||
{
|
||||
@@ -357,7 +421,7 @@ archive_current_container()
|
||||
|
||||
# archive corresponding etcd configuration files
|
||||
local files_to_archive=""
|
||||
- for file in "$OCF_RESKEY_authfile" "$POD_MANIFEST_COPY" "$ETCD_CONFIGURATION_FILE"; do
|
||||
+ for file in "$OCF_RESKEY_authfile" "$POD_MANIFEST_COPY" "$ETCD_CONFIGURATION_FILE" "$ETCD_CERTS_HASH_FILE"; do
|
||||
if [ -f "$file" ]; then
|
||||
files_to_archive="$files_to_archive $file"
|
||||
else
|
||||
@@ -1178,6 +1242,11 @@ podman_monitor()
|
||||
return $rc
|
||||
fi
|
||||
|
||||
+ # Check if certificate files have changed, if they have, etcd needs to be restarted
|
||||
+ if ! etcd_certificates_hash_manager "check"; then
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+
|
||||
if is_learner; then
|
||||
ocf_log info "$NODENAME is learner. Cannot get member id"
|
||||
return "$OCF_SUCCESS"
|
||||
@@ -1483,6 +1552,14 @@ podman_start()
|
||||
return $OCF_ERR_GENERIC
|
||||
fi
|
||||
|
||||
+ # Update the certificate hash after the container has started successfully
|
||||
+ # this is to ensure that the certificate hash is updated after a restart is initiated
|
||||
+ # by a cert rotation event from the monitor command.
|
||||
+ if ! etcd_certificates_hash_manager "update"; then
|
||||
+ ocf_exit_reason "etcd certificate hash manager failed to update the certificate hash"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+
|
||||
# check if the container has already started
|
||||
podman_simple_status
|
||||
if [ $? -eq $OCF_SUCCESS ]; then
|
||||
@@ -1888,6 +1965,13 @@ podman_validate()
|
||||
exit $OCF_ERR_CONFIGURED
|
||||
fi
|
||||
|
||||
+ if ! echo "validation test" > "$ETCD_CERTS_HASH_FILE" \
|
||||
+ || ! cat "$ETCD_CERTS_HASH_FILE" >/dev/null 2>&1 \
|
||||
+ || ! rm "$ETCD_CERTS_HASH_FILE"; then
|
||||
+ ocf_exit_reason "cannot read/write to certificate hash file $ETCD_CERTS_HASH_FILE"
|
||||
+ exit $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+
|
||||
return $OCF_SUCCESS
|
||||
}
|
||||
|
||||
@@ -1922,6 +2006,7 @@ CONTAINER=$OCF_RESKEY_name
|
||||
POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml"
|
||||
ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml"
|
||||
ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz"
|
||||
+ETCD_CERTS_HASH_FILE="${OCF_RESKEY_config_location}/certs.hash"
|
||||
|
||||
# Note: we currently monitor podman containers by with the "podman exec"
|
||||
# command, so make sure that invocation is always valid by enforcing the
|
||||
@ -0,0 +1,115 @@
|
||||
From 6a5608f02a657cf006b6d44d31200342c4bd19b9 Mon Sep 17 00:00:00 2001
|
||||
From: Carlo Lobrano <c.lobrano@gmail.com>
|
||||
Date: Tue, 28 Oct 2025 12:47:10 +0100
|
||||
Subject: [PATCH] podman-etcd: compute dynamic revision bump from maxRaftIndex
|
||||
(#2087)
|
||||
|
||||
Replace hardcoded 1 billion revision bump with dynamic calculation based
|
||||
on 20% of the last known maxRaftIndex from revision.json.
|
||||
|
||||
This aligns with the logic used by cluster-etcd-operator's
|
||||
quorum-restore-pod utility and ensures the bump amount is proportional
|
||||
to the cluster's actual revision state.
|
||||
|
||||
The implementation:
|
||||
- Adds compute_bump_revision() function with safe fallback to 1bn
|
||||
default
|
||||
- Extracts magic values to named constants
|
||||
(ETCD_REVISION_BUMP_PERCENTAGE, ETCD_BUMP_REV_DEFAULT,
|
||||
ETCD_REVISION_JSON)
|
||||
- Validates computed values (non-zero, not exceeding default)
|
||||
- Logs computation results for debugging
|
||||
|
||||
Reference:
|
||||
https://github.com/openshift/cluster-etcd-operator/blob/215998939f5223da9166
|
||||
22c71fd07d17656faf6b/bindata/etcd/quorum-restore-pod.yaml#L26-L34
|
||||
---
|
||||
heartbeat/podman-etcd | 38 ++++++++++++++++++++++++++++++++++----
|
||||
1 file changed, 34 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index b8dfb2f9e..551d37a20 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -619,16 +619,43 @@ prepare_env() {
|
||||
LISTEN_METRICS_URLS="0.0.0.0"
|
||||
}
|
||||
|
||||
+# compute_bump_revision
+#
+# Sets the global BUMP_REV to the amount used for etcd's
+# force-new-cluster-bump-amount setting.
+#
+# The amount is maxRaftIndex (read from $ETCD_REVISION_JSON) multiplied by
+# $ETCD_REVISION_BUMP_PERCENTAGE and rounded down. $ETCD_BUMP_REV_DEFAULT is
+# used instead whenever the file is missing, jq fails, or the computed value
+# is not an integer in the range 1..$ETCD_BUMP_REV_DEFAULT.
+#
+# Takes no arguments. The outcome is reported through BUMP_REV only; the
+# computed value is also left in the global COMPUTED_BUMP.
+compute_bump_revision() {
+	local jq_rc
+	local bump_is_valid="false"
+
+	# Same logic used by cluster-etcd-operator quorum-restore-pod utility.
+	# see https://github.com/openshift/cluster-etcd-operator/blob/215998939f5223da916622c71fd07d17656faf6b/bindata/etcd/quorum-restore-pod.yaml#L26-L34
+	# set a default value: 1bn would be an etcd running at 1000 writes/s for about eleven days.
+	BUMP_REV=$ETCD_BUMP_REV_DEFAULT
+	if [ ! -f "${ETCD_REVISION_JSON}" ]; then
+		ocf_log err "could not compute bump revision: ${ETCD_REVISION_JSON} not found. Defaulting to ${ETCD_BUMP_REV_DEFAULT} revision bump"
+		return
+	fi
+
+	# this will bump by the amount of 20% of the last known live revision.
+	# The exit status is saved right away: inside the body of "if ! cmd", $?
+	# holds the negated status (always 0), not the jq error code.
+	COMPUTED_BUMP=$(jq -r "(.maxRaftIndex*${ETCD_REVISION_BUMP_PERCENTAGE}|floor)" "${ETCD_REVISION_JSON}")
+	jq_rc=$?
+	if [ "$jq_rc" -ne 0 ]; then
+		ocf_log err "could not compute maxRaftIndex for bump revision, jq error code: $jq_rc. Defaulting to ${ETCD_BUMP_REV_DEFAULT} revision bump"
+		return
+	fi
+
+	# Accept the value only when it is made of digits and within range.
+	# Numeric tests on a non-integer string (e.g. "null" or "1e+18") fail
+	# with an error, so the check is written to reject on any doubt instead
+	# of letting such a value through to the etcd configuration.
+	case "${COMPUTED_BUMP}" in
+		""|*[!0-9]*)
+			;;
+		*)
+			if [ "${COMPUTED_BUMP}" -gt 0 ] && [ "${COMPUTED_BUMP}" -le "${ETCD_BUMP_REV_DEFAULT}" ]; then
+				bump_is_valid="true"
+			fi
+			;;
+	esac
+
+	if [ "$bump_is_valid" != "true" ]; then
+		ocf_log err "computed bump revision (${COMPUTED_BUMP}) is invalid. Defaulting to ${ETCD_BUMP_REV_DEFAULT} revision bump"
+		return
+	fi
+
+	BUMP_REV="${COMPUTED_BUMP}"
+	ocf_log info "bumping etcd revisions by ${BUMP_REV}"
+}
|
||||
|
||||
generate_etcd_configuration() {
|
||||
if is_force_new_cluster; then
|
||||
+ compute_bump_revision
|
||||
# The embedded newline is required for correct YAML formatting.
|
||||
FORCE_NEW_CLUSTER_CONFIG="force-new-cluster: true
|
||||
-force-new-cluster-bump-amount: 1000000000"
|
||||
+force-new-cluster-bump-amount: $BUMP_REV"
|
||||
else
|
||||
FORCE_NEW_CLUSTER_CONFIG="force-new-cluster: false"
|
||||
fi
|
||||
|
||||
+ # the space indentation for client-transport-security and peer-transport-security
|
||||
+ # is required for correct YAML formatting.
|
||||
cat > "$ETCD_CONFIGURATION_FILE" << EOF
|
||||
logger: zap
|
||||
log-level: info
|
||||
@@ -707,7 +734,7 @@ attribute_node_cluster_id()
|
||||
{
|
||||
local action="$1"
|
||||
local value
|
||||
- if ! value=$(jq -r ".clusterId" /var/lib/etcd/revision.json); then
|
||||
+ if ! value=$(jq -r ".clusterId" "$ETCD_REVISION_JSON"); then
|
||||
rc=$?
|
||||
ocf_log err "could not get cluster_id, error code: $rc"
|
||||
return "$rc"
|
||||
@@ -745,7 +772,7 @@ attribute_node_revision()
|
||||
local value
|
||||
local attribute="revision"
|
||||
|
||||
- if ! value=$(jq -r ".maxRaftIndex" /var/lib/etcd/revision.json); then
|
||||
+ if ! value=$(jq -r ".maxRaftIndex" "$ETCD_REVISION_JSON"); then
|
||||
rc=$?
|
||||
ocf_log err "could not get $attribute, error code: $rc"
|
||||
return "$rc"
|
||||
@@ -1456,7 +1483,7 @@ can_reuse_container() {
|
||||
|
||||
|
||||
# If the container does not exist it cannot be reused
|
||||
- if ! container_exists; then
|
||||
+ if ! container_exists; then
|
||||
OCF_RESKEY_reuse=0
|
||||
return "$OCF_SUCCESS"
|
||||
fi
|
||||
@@ -2006,6 +2033,9 @@ CONTAINER=$OCF_RESKEY_name
|
||||
POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml"
|
||||
ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml"
|
||||
ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz"
|
||||
+ETCD_REVISION_JSON="/var/lib/etcd/revision.json"
|
||||
+ETCD_REVISION_BUMP_PERCENTAGE=0.2
|
||||
+ETCD_BUMP_REV_DEFAULT=1000000000
|
||||
ETCD_CERTS_HASH_FILE="${OCF_RESKEY_config_location}/certs.hash"
|
||||
|
||||
# Note: we currently monitor podman containers by with the "podman exec"
|
||||
@ -45,7 +45,7 @@
|
||||
Name: resource-agents
|
||||
Summary: Open Source HA Reusable Cluster Resource Scripts
|
||||
Version: 4.16.0
|
||||
Release: 32%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
|
||||
Release: 33%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
|
||||
License: GPL-2.0-or-later AND LGPL-2.1-or-later
|
||||
URL: https://github.com/ClusterLabs/resource-agents
|
||||
Source0: %{upstream_prefix}-%{upstream_version}.tar.gz
|
||||
@ -91,6 +91,8 @@ Patch38: RHEL-116149-RHEL-116152-2-portblock-fix-incorrect-promotable-descriptio
|
||||
Patch39: RHEL-116149-RHEL-116152-3-portblock-fixes-add-method-and-status_check-parameters.patch
|
||||
Patch40: RHEL-119504-podman-etcd-add-automatic-learner-member-promotion.patch
|
||||
Patch41: RHEL-115495-db2-use-reintegration-flag-to-avoid-race-condition-on-cluster-reintegration.patch
|
||||
Patch42: RHEL-124203-podman-etcd-certificate-rotation.patch
|
||||
Patch43: RHEL-124206-podman-etcd-compute-dynamic-revision-bump-from-maxRaftIndex.patch
|
||||
|
||||
# bundled ha-cloud-support libs
|
||||
Patch500: ha-cloud-support-aliyun.patch
|
||||
@ -303,6 +305,8 @@ exit 1
|
||||
%patch -p1 -P 39
|
||||
%patch -p1 -P 40
|
||||
%patch -p1 -P 41
|
||||
%patch -p1 -P 42
|
||||
%patch -p1 -P 43
|
||||
|
||||
# bundled ha-cloud-support libs
|
||||
%patch -p1 -P 500
|
||||
@ -635,6 +639,12 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
|
||||
%{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm
|
||||
|
||||
%changelog
|
||||
* Tue Oct 28 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.16.0-33
|
||||
- podman-etcd: add support for cert rotation
|
||||
- podman-etcd: compute dynamic revision bump from maxRaftIndex
|
||||
|
||||
Resolves: RHEL-124203, RHEL-124206
|
||||
|
||||
* Wed Oct 22 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.16.0-32
|
||||
- portblock: add promotable and nftables support, and method and
|
||||
status_check parameters
|
||||
|
||||
Loading…
Reference in New Issue
Block a user