diff --git a/RHEL-145622-podman-etcd-enhance-etcd-data-backup-with-snapshots-and-retention.patch b/RHEL-145622-podman-etcd-enhance-etcd-data-backup-with-snapshots-and-retention.patch new file mode 100644 index 0000000..56d7701 --- /dev/null +++ b/RHEL-145622-podman-etcd-enhance-etcd-data-backup-with-snapshots-and-retention.patch @@ -0,0 +1,278 @@ +From 8df1e4dfdee960b971fb598c043b4ccb2b9fefca Mon Sep 17 00:00:00 2001 +From: Carlo Lobrano +Date: Mon, 3 Nov 2025 12:34:29 +0100 +Subject: [PATCH] podman-etcd: enhance etcd data backup with snapshots and + retention + +Replace basic data directory backup with proper etcd database snapshot +functionality. The new implementation: +- Creates timestamped snapshot files instead of moving the entire data directory +- Stores backups in a non-volatile location (backup_location parameter) instead + of the previous volatile HA_RSCTMP directory +- Validates backup file existence and size after creation +- Implements configurable retention policy via max_backup_snapshots parameter +- Automatically cleans up old snapshots to control storage usage + +Default retention is set to 3 snapshots, with backups stored in /var/lib/etcd +by default. This provides better backup reliability, persistence across reboots, +and storage management for etcd databases. 
+--- + heartbeat/podman-etcd | 205 ++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 196 insertions(+), 9 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index bb2900536..1d717ec00 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -49,6 +49,7 @@ OCF_RESKEY_reuse_default="0" + OCF_RESKEY_oom_default="-997" + OCF_RESKEY_config_location_default="/var/lib/etcd" + OCF_RESKEY_backup_location_default="/var/lib/etcd" ++OCF_RESKEY_max_backup_snapshots_default="3" + + : ${OCF_RESKEY_image=${OCF_RESKEY_image_default}} + : ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}} +@@ -61,6 +62,7 @@ OCF_RESKEY_backup_location_default="/var/lib/etcd" + : ${OCF_RESKEY_oom=${OCF_RESKEY_oom_default}} + : ${OCF_RESKEY_config_location=${OCF_RESKEY_config_location_default}} + : ${OCF_RESKEY_backup_location=${OCF_RESKEY_backup_location_default}} ++: ${OCF_RESKEY_max_backup_snapshots=${OCF_RESKEY_max_backup_snapshots_default}} + + + ####################################################################### +@@ -275,6 +277,17 @@ The directory where the resource agent stores its backups. + + + ++ ++ ++Maximum number of etcd database snapshots to retain. When a new snapshot is created, ++older snapshots will be automatically removed to maintain this limit. This helps ++control storage usage while ensuring recent backups are available for recovery. ++Set max_backup_snapshots=0 to disable backups. ++ ++Maximum number of backup snapshots to retain ++ ++ ++ + + + +@@ -720,20 +733,190 @@ EOF + return $OCF_SUCCESS + } + ++# Remove etcd member directory to allow the node to rejoin the cluster as a learner. ++# ++# When a node rejoins an etcd cluster, it must start fresh as a learner to prevent ++# data inconsistencies. This function removes the member directory and syncs to disk. 
++#
++# Returns:
++#   OCF_SUCCESS - Member directory successfully removed
++#   OCF_ERR_GENERIC - Failed to remove member directory (critical error)
++wipe_data_folder_for_learner()
++{
++	ocf_log info "deleting etcd member directory ($ETCD_MEMBER_DIR) to enable learner rejoin"
++	rm -rf "$ETCD_MEMBER_DIR" || { # "||" (unlike "if !") leaves rm's own exit status in $?
++		ocf_log err "could not delete etcd member directory ($ETCD_MEMBER_DIR), error code: $?"
++		return $OCF_ERR_GENERIC
++	}
++	sync
++	return $OCF_SUCCESS
++}
++
++
++# Calculate available disk space in bytes for a given directory.
++#
++# This function queries the filesystem and returns available space in bytes.
++# df is forced to 1024-byte blocks (-k) so its output (KB) converts reliably to bytes.
++#
++# Arguments:
++#   $1 - Target directory path to check
++#
++# Returns:
++#   OCF_SUCCESS - Available space in bytes (via stdout)
++#   OCF_ERR_GENERIC - Failed to determine available space (error message via stdout)
++get_available_space_in_directory()
++{
++	local target_dir=$1
++	local available_space_kb
++	local available_space_bytes
++
++	available_space_kb=$(df -Pk "$target_dir" | awk 'NR==2 {print $4}')
++
++	# Validate output is numeric
++	if ! echo "$available_space_kb" | grep -q '^[0-9]\+$'; then
++		echo "df command failed or returned invalid value: $available_space_kb"
++		return $OCF_ERR_GENERIC
++	fi
++
++	available_space_bytes=$((available_space_kb*1024))
++	echo "$available_space_bytes"
++	return $OCF_SUCCESS
++}
++
++# Archive etcd database with backup and cleanup
++#
++# This function creates a backup copy of the etcd database, validates it, and
++# removes old backups according to the retention policy. Backups are optional
++# and can be disabled by setting max_backup_snapshots=0.
++#
++# Error handling strategy:
++# All backup failures return OCF_SUCCESS to prevent blocking cluster recovery.
++# Backups are beneficial but not critical for recovery operations.
++#
++# NOTE: This function cannot use etcdctl/etcdutl utilities because the etcd
++# server is not running when this backup is performed.
+ archive_data_folder()
+ {
+-	# TODO: use etcd snapshots
+-	local dest_dir_name
+-	local data_dir="/var/lib/etcd/member"
++	local backup_dir="$OCF_RESKEY_backup_location"
++	local etcd_db_path="$ETCD_MEMBER_DIR/snap/db"
+ 
+-	dest_dir_name="members-snapshot-$(date +%Y%M%d%H%M%S)"
+-	if [ ! -d $data_dir ]; then
+-		ocf_log info "no data dir to backup"
++	# Match the literal value instead of using "-eq": a malformed setting must not
++	# make test(1) fail with "integer expression expected" on the recovery path.
++	if echo "$OCF_RESKEY_max_backup_snapshots" | grep -q '^0\+$'; then
++		ocf_log debug "etcd backup disabled (max_backup_snapshots=0)"
+ 		return $OCF_SUCCESS
+ 	fi
+-	ocf_log info "backing up $data_dir under $HA_RSCTMP/$dest_dir_name"
+-	mv "$data_dir" "$HA_RSCTMP/$dest_dir_name"
+-	sync
++
++	# Check if the etcd database file exists
++	if [ ! -f "$etcd_db_path" ]; then
++		ocf_log warn "backup skipped: etcd database file not found at '$etcd_db_path'"
++		return $OCF_SUCCESS
++	fi
++
++	# Ensure backup directory exists
++	if [ ! -d "$backup_dir" ]; then
++		ocf_log debug "creating backup directory: '$backup_dir'"
++		if ! mkdir -p "$backup_dir"; then
++			ocf_log warn "backup skipped: failed to create backup directory '$backup_dir'"
++			return $OCF_SUCCESS
++		fi
++	fi
++
++	ocf_log debug "checking disk space: backup_dir=$backup_dir"
++	local available_space_bytes
++	# On failure the helper prints its error text on stdout, so the variable
++	# holds the message that is logged below.
++	if ! available_space_bytes=$(get_available_space_in_directory "$backup_dir"); then
++		ocf_log warn "backup skipped: could not compute available disk space in '$backup_dir', error msg: $available_space_bytes"
++		return $OCF_SUCCESS
++	fi
++
++	local required_space_bytes
++	required_space_bytes=$(stat -c %s "$etcd_db_path" 2>&1)
++	if ! echo "$required_space_bytes" | grep -q '^[0-9]\+$'; then
++		ocf_log warn "backup skipped: could not compute etcd database size at '$etcd_db_path', error msg: $required_space_bytes"
++		return $OCF_SUCCESS
++	fi
++
++	if [ "$required_space_bytes" -gt "$available_space_bytes" ]; then
++		ocf_log warn "backup skipped: insufficient disk space (required: ${required_space_bytes}B, available: ${available_space_bytes}B)"
++		return $OCF_SUCCESS
++	fi
++
++	# Generate timestamp and backup filename
++	local timestamp
++	timestamp=$(date +%Y%m%d-%H%M%S)
++
++	local backup_file
++	backup_file="$backup_dir/snapshot-$timestamp.db"
++
++	ocf_log info "creating etcd database backup: '$backup_file'"
++
++	# Create the backup by copying the database file (enable Copy-on-Write copy).
++	# "||" is used instead of "if !" so that $? still holds cp's exit status.
++	cp --reflink=auto "$etcd_db_path" "$backup_file" || {
++		ocf_log warn "backup creation failed: could not copy '$etcd_db_path' to '$backup_file', error code: $?"
++		# Drop a possibly truncated copy so it never counts as a valid snapshot
++		rm -f "$backup_file"
++		return $OCF_SUCCESS
++	}
++
++	# Validate the backup file exists and has the expected size
++	if [ ! -f "$backup_file" ]; then
++		ocf_log warn "backup validation failed: snapshot file '$backup_file' does not exist"
++		return $OCF_SUCCESS
++	fi
++
++	local backup_size_bytes
++	backup_size_bytes=$(stat -c %s "$backup_file" 2>/dev/null || echo "0")
++	if [ "$backup_size_bytes" -ne "$required_space_bytes" ]; then
++		ocf_log warn "backup validation failed: size mismatch (expected: ${required_space_bytes}B, got: ${backup_size_bytes}B)"
++		rm -f "$backup_file"
++		return $OCF_SUCCESS
++	fi
++
++	ocf_log info "backup created successfully: $backup_file (${backup_size_bytes}B)"
++
++	# Cleanup old backups based on retention policy
++	cleanup_old_backups "$backup_dir"
++
++	return $OCF_SUCCESS
++}
++
++# Remove the oldest snapshot files so that at most max_backup_snapshots remain.
++#
++# Only regular files named snapshot-*.db located directly inside the backup
++# directory are considered. They are ordered by modification time and the
++# oldest ones are deleted first.
++#
++# Arguments:
++#   $1 - Directory holding the snapshot files
++#
++# Returns:
++#   OCF_SUCCESS - Always; an invalid retention value or a failed removal is
++#                 only logged as a warning
++cleanup_old_backups()
++{
++	local backup_dir="$1"
++	local max_snapshots="$OCF_RESKEY_max_backup_snapshots"
++	local backup_count
++	local backups_to_remove
++	local old_backups
++
++	# Validate max_snapshots is a positive integer
++	if ! echo "$max_snapshots" | grep -q '^[1-9][0-9]*$'; then
++		ocf_log warn "invalid max_backup_snapshots value. Positive integer expected, got '$max_snapshots' instead, skipping cleanup"
++		return $OCF_SUCCESS
++	fi
++
++	# Count existing backup files
++	backup_count=$(find "$backup_dir" -maxdepth 1 -name "snapshot-*.db" -type f 2>/dev/null | wc -l)
++
++	if [ "$backup_count" -le "$max_snapshots" ]; then
++		ocf_log info "backup count ($backup_count) is within retention limit ($max_snapshots), no cleanup needed"
++		return $OCF_SUCCESS
++	fi
++
++	# Calculate how many backups to remove
++	backups_to_remove=$((backup_count - max_snapshots))
++	ocf_log info "removing $backups_to_remove old backup(s) to maintain retention limit of $max_snapshots"
++
++	# Find oldest backups sorted by modification time
++	# -t sorts by modification time, -r reverses (oldest first)
++	# -print0 and -0 hand paths containing spaces to ls intact; ls then prints
++	# one path per line, which is the format the removal step below relies on
++	old_backups=$(find "$backup_dir" -maxdepth 1 -name "snapshot-*.db" -type f -print0 2>/dev/null | \
++		xargs -0 -r ls -tr | \
++		head -n "$backups_to_remove")
++
++	if [ -n "$old_backups" ]; then
++		ocf_log info "removing old backups: $old_backups"
++		# Split on newlines only (-d '\n') so a backup directory containing
++		# spaces is not broken into several bogus paths; "||" keeps the
++		# pipeline's real exit status available in $?.
++		printf '%s\n' "$old_backups" | xargs -r -d '\n' rm -f -- ||
++			ocf_log warn "failed to remove some old backups, error code: $?"
++	fi
++
++	return $OCF_SUCCESS
+ }
+ 
+ etcd_pod_container_exists() {
+@@ -1902,6 +2085,9 @@ podman_start()
+ 		fi
+ 
+ 		archive_data_folder
++		if !
wipe_data_folder_for_learner; then ++ return "$OCF_ERR_GENERIC" ++ fi + fi + + ocf_log info "check for changes in pod manifest to decide if the container should be reused or replaced" +@@ -2251,6 +2437,7 @@ CONTAINER=$OCF_RESKEY_name + POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml" + ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml" + ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz" ++ETCD_MEMBER_DIR="/var/lib/etcd/member" + ETCD_REVISION_JSON="/var/lib/etcd/revision.json" + ETCD_REVISION_BUMP_PERCENTAGE=0.2 + ETCD_BUMP_REV_DEFAULT=1000000000 diff --git a/resource-agents.spec b/resource-agents.spec index 7b0f812..1dcc745 100644 --- a/resource-agents.spec +++ b/resource-agents.spec @@ -45,7 +45,7 @@ Name: resource-agents Summary: Open Source HA Reusable Cluster Resource Scripts Version: 4.16.0 -Release: 51%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist} +Release: 52%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist} License: GPL-2.0-or-later AND LGPL-2.1-or-later URL: https://github.com/ClusterLabs/resource-agents Source0: %{upstream_prefix}-%{upstream_version}.tar.gz @@ -116,6 +116,7 @@ Patch63: RHEL-139066-podman-etcd-verify-no-containers-running-or-being-deleted.p Patch64: RHEL-50380-powervs-subnet-wait-for-IP.patch Patch65: RHEL-143524-powervs-move-ip-powervs-subnet-fix-error-logging.patch Patch66: RHEL-116149-RHEL-116152-4-check-correct-binary-during-validate-all.patch +Patch67: RHEL-145622-podman-etcd-enhance-etcd-data-backup-with-snapshots-and-retention.patch # bundled ha-cloud-support libs Patch500: ha-cloud-support-aliyun.patch @@ -353,6 +354,7 @@ exit 1 %patch -p1 -P 64 %patch -p1 -P 65 %patch -p1 -P 66 +%patch -p1 -P 67 # bundled ha-cloud-support libs %patch -p1 -P 500 @@ -685,6 +687,11 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm %changelog +* Wed Feb 4 2026 Oyvind 
Albrigtsen - 4.16.0-52 +- podman-etcd: enhance etcd data backup with snapshots and retention + + Resolves: RHEL-145622 + * Tue Feb 3 2026 Oyvind Albrigtsen - 4.16.0-51 - portblock: add promotable and nftables support, and method and status_check parameters