- podman-etcd: enhance etcd data backup with snapshots and retention
Resolves: RHEL-145622
This commit is contained in:
parent
ab29d76183
commit
4fdf3a778a
@ -0,0 +1,278 @@
|
||||
From 8df1e4dfdee960b971fb598c043b4ccb2b9fefca Mon Sep 17 00:00:00 2001
|
||||
From: Carlo Lobrano <c.lobrano@gmail.com>
|
||||
Date: Mon, 3 Nov 2025 12:34:29 +0100
|
||||
Subject: [PATCH] podman-etcd: enhance etcd data backup with snapshots and
|
||||
retention
|
||||
|
||||
Replace basic data directory backup with proper etcd database snapshot
|
||||
functionality. The new implementation:
|
||||
- Creates timestamped snapshot files instead of moving the entire data directory
|
||||
- Stores backups in a non-volatile location (backup_location parameter) instead
|
||||
of the previous volatile HA_RSCTMP directory
|
||||
- Validates backup file existence and size after creation
|
||||
- Implements configurable retention policy via max_backup_snapshots parameter
|
||||
- Automatically cleans up old snapshots to control storage usage
|
||||
|
||||
Default retention is set to 3 snapshots, with backups stored in /var/lib/etcd
|
||||
by default. This provides better backup reliability, persistence across reboots,
|
||||
and storage management for etcd databases.
|
||||
---
|
||||
heartbeat/podman-etcd | 205 ++++++++++++++++++++++++++++++++++++++++--
|
||||
1 file changed, 196 insertions(+), 9 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
||||
index bb2900536..1d717ec00 100755
|
||||
--- a/heartbeat/podman-etcd
|
||||
+++ b/heartbeat/podman-etcd
|
||||
@@ -49,6 +49,7 @@ OCF_RESKEY_reuse_default="0"
|
||||
OCF_RESKEY_oom_default="-997"
|
||||
OCF_RESKEY_config_location_default="/var/lib/etcd"
|
||||
OCF_RESKEY_backup_location_default="/var/lib/etcd"
|
||||
+OCF_RESKEY_max_backup_snapshots_default="3"
|
||||
|
||||
: ${OCF_RESKEY_image=${OCF_RESKEY_image_default}}
|
||||
: ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}}
|
||||
@@ -61,6 +62,7 @@ OCF_RESKEY_backup_location_default="/var/lib/etcd"
|
||||
: ${OCF_RESKEY_oom=${OCF_RESKEY_oom_default}}
|
||||
: ${OCF_RESKEY_config_location=${OCF_RESKEY_config_location_default}}
|
||||
: ${OCF_RESKEY_backup_location=${OCF_RESKEY_backup_location_default}}
|
||||
+: ${OCF_RESKEY_max_backup_snapshots=${OCF_RESKEY_max_backup_snapshots_default}}
|
||||
|
||||
|
||||
#######################################################################
|
||||
@@ -275,6 +277,17 @@ The directory where the resource agent stores its backups.
|
||||
<content type="string" default="${OCF_RESKEY_backup_location_default}"/>
|
||||
</parameter>
|
||||
|
||||
+<parameter name="max_backup_snapshots" required="0" unique="0">
|
||||
+<longdesc lang="en">
|
||||
+Maximum number of etcd database snapshots to retain. When a new snapshot is created,
|
||||
+older snapshots will be automatically removed to maintain this limit. This helps
|
||||
+control storage usage while ensuring recent backups are available for recovery.
|
||||
+Set max_backup_snapshots=0 to disable backups.
|
||||
+</longdesc>
|
||||
+<shortdesc lang="en">Maximum number of backup snapshots to retain</shortdesc>
|
||||
+<content type="integer" default="${OCF_RESKEY_max_backup_snapshots_default}"/>
|
||||
+</parameter>
|
||||
+
|
||||
</parameters>
|
||||
|
||||
<actions>
|
||||
@@ -720,20 +733,190 @@ EOF
|
||||
return $OCF_SUCCESS
|
||||
}
|
||||
|
||||
+# Remove etcd member directory to allow the node to rejoin the cluster as a learner.
|
||||
+#
|
||||
+# When a node rejoins an etcd cluster, it must start fresh as a learner to prevent
|
||||
+# data inconsistencies. This function removes the member directory and syncs to disk.
|
||||
+#
|
||||
+# Returns:
|
||||
+# OCF_SUCCESS - Member directory successfully removed
|
||||
+# OCF_ERR_GENERIC - Failed to remove member directory (critical error)
|
||||
+wipe_data_folder_for_learner()
|
||||
+{
|
||||
+ ocf_log info "deleting etcd member directory ($ETCD_MEMBER_DIR) to enable learner rejoin"
|
||||
+ rm -rf "$ETCD_MEMBER_DIR" || { # '||' keeps rm's exit status in $? (an 'if !' test would reset it to 0)
|
||||
+ ocf_log err "could not delete etcd member directory ($ETCD_MEMBER_DIR), error code: $?"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ }
|
||||
+ sync
|
||||
+ return $OCF_SUCCESS
|
||||
+}
|
||||
+
|
||||
+
|
||||
+# Calculate available disk space in bytes for a given directory.
|
||||
+#
|
||||
+# This function queries the filesystem and returns available space in bytes.
|
||||
+# It converts df output (KB) to bytes for consistent size comparisons.
|
||||
+#
|
||||
+# Arguments:
|
||||
+# $1 - Target directory path to check
|
||||
+#
|
||||
+# Returns:
|
||||
+# OCF_SUCCESS - Available space in bytes (via stdout)
|
||||
+# OCF_ERR_GENERIC - Failed to determine available space (error message via stdout)
|
||||
+get_available_space_in_directory()
|
||||
+{
|
||||
+ local target_dir=$1
|
||||
+ local df_output
|
||||
+ local available_space_kb
|
||||
+
|
||||
+ # -k pins the unit to 1024-byte blocks (plain -P reports 512-byte blocks when POSIXLY_CORRECT is set)
|
||||
+ df_output=$(df -Pk "$target_dir" 2>&1)
|
||||
+ available_space_kb=$(echo "$df_output" | awk 'NR==2 {print $4}')
|
||||
+ # Validate output is numeric; otherwise hand df's own output back to the caller as the error message
|
||||
+ if ! echo "$available_space_kb" | grep -q '^[0-9]\+$'; then
|
||||
+ echo "df command failed or returned invalid value: $df_output"
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+
|
||||
+ echo "$((available_space_kb*1024))"
|
||||
+ return $OCF_SUCCESS
|
||||
+}
|
||||
+
|
||||
+# Archive etcd database with backup and cleanup
|
||||
+#
|
||||
+# This function creates a backup copy of the etcd database, validates it, and
|
||||
+# removes old backups according to the retention policy. Backups are optional
|
||||
+# and can be disabled by setting max_backup_snapshots=0.
|
||||
+#
|
||||
+# Error handling strategy:
|
||||
+# All backup failures return OCF_SUCCESS to prevent blocking cluster recovery.
|
||||
+# Backups are beneficial but not critical for recovery operations.
|
||||
+#
|
||||
+# NOTE: This function cannot use etcdctl/etcdutl utilities because the etcd
|
||||
+# server is not running when this backup is performed.
|
||||
archive_data_folder()
|
||||
{
|
||||
- # TODO: use etcd snapshots
|
||||
- local dest_dir_name
|
||||
- local data_dir="/var/lib/etcd/member"
|
||||
+ local backup_dir="$OCF_RESKEY_backup_location"
|
||||
+ local etcd_db_path="$ETCD_MEMBER_DIR/snap/db"
|
||||
|
||||
- dest_dir_name="members-snapshot-$(date +%Y%M%d%H%M%S)"
|
||||
- if [ ! -d $data_dir ]; then
|
||||
- ocf_log info "no data dir to backup"
|
||||
+ if [ "$OCF_RESKEY_max_backup_snapshots" -eq 0 ]; then
|
||||
+ ocf_log debug "etcd backup disabled (max_backup_snapshots=0)"
|
||||
return $OCF_SUCCESS
|
||||
fi
|
||||
- ocf_log info "backing up $data_dir under $HA_RSCTMP/$dest_dir_name"
|
||||
- mv "$data_dir" "$HA_RSCTMP/$dest_dir_name"
|
||||
- sync
|
||||
+
|
||||
+ # Check if the etcd database file exists
|
||||
+ if [ ! -f "$etcd_db_path" ]; then
|
||||
+ ocf_log warn "backup skipped: etcd database file not found at '$etcd_db_path'"
|
||||
+ return $OCF_SUCCESS
|
||||
+ fi
|
||||
+
|
||||
+ # Ensure backup directory exists
|
||||
+ if [ ! -d "$backup_dir" ]; then
|
||||
+ ocf_log debug "creating backup directory: '$backup_dir'"
|
||||
+ if ! mkdir -p "$backup_dir"; then
|
||||
+ ocf_log warn "backup skipped: failed to create backup directory '$backup_dir'"
|
||||
+ return $OCF_SUCCESS
|
||||
+ fi
|
||||
+ fi
|
||||
+
|
||||
+ ocf_log debug "checking disk space: backup_dir=$backup_dir"
|
||||
+ local available_space_bytes
|
||||
+ if ! available_space_bytes=$(get_available_space_in_directory "$backup_dir"); then
|
||||
+ ocf_log warn "backup skipped: could not compute available disk space in '$backup_dir', error msg: $available_space_bytes"
|
||||
+ return $OCF_SUCCESS
|
||||
+ fi
|
||||
+
|
||||
+ local required_space_bytes
|
||||
+ required_space_bytes=$(stat -c %s "$etcd_db_path" 2>&1)
|
||||
+ if ! echo "$required_space_bytes" | grep -q '^[0-9]\+$'; then
|
||||
+ ocf_log warn "backup skipped: could not compute etcd database size at '$etcd_db_path', error msg: $required_space_bytes"
|
||||
+ return $OCF_SUCCESS
|
||||
+ fi
|
||||
+
|
||||
+ if [ "$required_space_bytes" -gt "$available_space_bytes" ]; then
|
||||
+ ocf_log warn "backup skipped: insufficient disk space (required: ${required_space_bytes}B, available: ${available_space_bytes}B)"
|
||||
+ return $OCF_SUCCESS
|
||||
+ fi
|
||||
+
|
||||
+ # Generate timestamp and backup filename
|
||||
+ local timestamp
|
||||
+ timestamp=$(date +%Y%m%d-%H%M%S)
|
||||
+
|
||||
+ local backup_file="$backup_dir/snapshot-$timestamp.db"
|
||||
+
|
||||
+ ocf_log info "creating etcd database backup: '$backup_file'"
|
||||
+
|
||||
+ # Copy the database file (--reflink=auto: copy-on-write clone where supported, regular copy otherwise)
|
||||
+ cp --reflink=auto "$etcd_db_path" "$backup_file" || { # '||' keeps cp's exit status in $? (an 'if !' test would reset it to 0)
|
||||
+ ocf_log warn "backup creation failed: could not copy '$etcd_db_path' to '$backup_file', error code: $?"
|
||||
+ rm -f "$backup_file" # drop any partial copy so it is never counted as a valid snapshot
|
||||
+ return $OCF_SUCCESS
|
||||
+ }
|
||||
+
|
||||
+ # Validate the backup file exists and has the expected size
|
||||
+ if [ ! -f "$backup_file" ]; then
|
||||
+ ocf_log warn "backup validation failed: snapshot file '$backup_file' does not exist"
|
||||
+ return $OCF_SUCCESS
|
||||
+ fi
|
||||
+
|
||||
+ local backup_size_bytes
|
||||
+ backup_size_bytes=$(stat -c %s "$backup_file" 2>/dev/null || echo "0")
|
||||
+ if [ "$backup_size_bytes" -ne "$required_space_bytes" ]; then
|
||||
+ ocf_log warn "backup validation failed: size mismatch (expected: ${required_space_bytes}B, got: ${backup_size_bytes}B)"
|
||||
+ rm -f "$backup_file"
|
||||
+ return $OCF_SUCCESS
|
||||
+ fi
|
||||
+
|
||||
+ ocf_log info "backup created successfully: $backup_file (${backup_size_bytes}B)"
|
||||
+
|
||||
+ # Cleanup old backups based on retention policy
|
||||
+ cleanup_old_backups "$backup_dir"
|
||||
+
|
||||
+ return $OCF_SUCCESS
|
||||
+}
|
||||
+
|
||||
+cleanup_old_backups()
|
||||
+{
|
||||
+ local backup_dir="$1"
|
||||
+ local max_snapshots="$OCF_RESKEY_max_backup_snapshots"
|
||||
+ local backup_count
|
||||
+ local backups_to_remove
|
||||
+ local old_backups
|
||||
+
|
||||
+ # Validate max_snapshots is a positive integer
|
||||
+ if ! echo "$max_snapshots" | grep -q '^[1-9][0-9]*$'; then
|
||||
+ ocf_log warn "invalid max_backup_snapshots value. Positive integer expected, got '$max_snapshots' instead, skipping cleanup"
|
||||
+ return $OCF_SUCCESS
|
||||
+ fi
|
||||
+
|
||||
+ # Count existing backup files
|
||||
+ backup_count=$(find "$backup_dir" -maxdepth 1 -name "snapshot-*.db" -type f 2>/dev/null | wc -l)
|
||||
+
|
||||
+ if [ "$backup_count" -le "$max_snapshots" ]; then
|
||||
+ ocf_log info "backup count ($backup_count) is within retention limit ($max_snapshots), no cleanup needed"
|
||||
+ return $OCF_SUCCESS
|
||||
+ fi
|
||||
+
|
||||
+ # Calculate how many backups to remove
|
||||
+ backups_to_remove=$((backup_count - max_snapshots))
|
||||
+ ocf_log info "removing $backups_to_remove old backup(s) to maintain retention limit of $max_snapshots"
|
||||
+
|
||||
+ # Find oldest backups sorted by modification time
|
||||
+ # -t sorts by modification time, -r reverses (oldest first)
|
||||
+ # -print0 and -0 pass the names to ls intact; ls then prints one path per line for head
|
||||
+ old_backups=$(find "$backup_dir" -maxdepth 1 -name "snapshot-*.db" -type f -print0 2>/dev/null | \
|
||||
+ xargs -0 -r ls -tr | \
|
||||
+ head -n "$backups_to_remove")
|
||||
+
|
||||
+ if [ -n "$old_backups" ]; then
|
||||
+ ocf_log info "removing old backups: $old_backups"
|
||||
+ echo "$old_backups" | xargs -r -d '\n' rm -f -- || { # one path per line (spaces/quotes stay literal); '||' keeps the failing status in $?
|
||||
+ ocf_log warn "failed to remove some old backups, error code: $?"
|
||||
+ }
|
||||
+ fi
|
||||
+
|
||||
+ return $OCF_SUCCESS
|
||||
}
|
||||
|
||||
etcd_pod_container_exists() {
|
||||
@@ -1902,6 +2085,9 @@ podman_start()
|
||||
fi
|
||||
|
||||
archive_data_folder
|
||||
+ if ! wipe_data_folder_for_learner; then
|
||||
+ return "$OCF_ERR_GENERIC"
|
||||
+ fi
|
||||
fi
|
||||
|
||||
ocf_log info "check for changes in pod manifest to decide if the container should be reused or replaced"
|
||||
@@ -2251,6 +2437,7 @@ CONTAINER=$OCF_RESKEY_name
|
||||
POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml"
|
||||
ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml"
|
||||
ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz"
|
||||
+ETCD_MEMBER_DIR="/var/lib/etcd/member"
|
||||
ETCD_REVISION_JSON="/var/lib/etcd/revision.json"
|
||||
ETCD_REVISION_BUMP_PERCENTAGE=0.2
|
||||
ETCD_BUMP_REV_DEFAULT=1000000000
|
||||
@ -45,7 +45,7 @@
|
||||
Name: resource-agents
|
||||
Summary: Open Source HA Reusable Cluster Resource Scripts
|
||||
Version: 4.16.0
|
||||
Release: 51%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
|
||||
Release: 52%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
|
||||
License: GPL-2.0-or-later AND LGPL-2.1-or-later
|
||||
URL: https://github.com/ClusterLabs/resource-agents
|
||||
Source0: %{upstream_prefix}-%{upstream_version}.tar.gz
|
||||
@ -116,6 +116,7 @@ Patch63: RHEL-139066-podman-etcd-verify-no-containers-running-or-being-deleted.p
|
||||
Patch64: RHEL-50380-powervs-subnet-wait-for-IP.patch
|
||||
Patch65: RHEL-143524-powervs-move-ip-powervs-subnet-fix-error-logging.patch
|
||||
Patch66: RHEL-116149-RHEL-116152-4-check-correct-binary-during-validate-all.patch
|
||||
Patch67: RHEL-145622-podman-etcd-enhance-etcd-data-backup-with-snapshots-and-retention.patch
|
||||
|
||||
# bundled ha-cloud-support libs
|
||||
Patch500: ha-cloud-support-aliyun.patch
|
||||
@ -353,6 +354,7 @@ exit 1
|
||||
%patch -p1 -P 64
|
||||
%patch -p1 -P 65
|
||||
%patch -p1 -P 66
|
||||
%patch -p1 -P 67
|
||||
|
||||
# bundled ha-cloud-support libs
|
||||
%patch -p1 -P 500
|
||||
@ -685,6 +687,11 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
|
||||
%{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm
|
||||
|
||||
%changelog
|
||||
* Wed Feb 4 2026 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.16.0-52
|
||||
- podman-etcd: enhance etcd data backup with snapshots and retention
|
||||
|
||||
Resolves: RHEL-145622
|
||||
|
||||
* Tue Feb 3 2026 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.16.0-51
|
||||
- portblock: add promotable and nftables support, and method and
|
||||
status_check parameters
|
||||
|
||||
Loading…
Reference in New Issue
Block a user