resource-agents/SOURCES/RHEL-113810-podman-etcd-preserve-containers-for-debugging.patch

From 6e9200dc2ffc89382188794742361985309936b2 Mon Sep 17 00:00:00 2001
From: Carlo Lobrano <c.lobrano@gmail.com>
Date: Wed, 23 Jul 2025 09:34:13 +0200
Subject: [PATCH] podman-etcd: preserve containers for debugging

This change modifies the agent to keep stopped containers around for
log inspection and debugging, with supporting changes to enable this
behavior:

* Conditionally reuse the existing container when the configuration is
  unchanged.
* Move the inline etcd configuration flags to an external file so the
  container can be restarted without being recreated (mainly for the
  force-new-cluster flag).
* Archive the previous container by renaming it to *-previous, and its
  configuration files into the /var/lib/etcd/config-previous.tar.gz
  archive. The tar.gz archive consists of:
  * the pod manifest created by CEO, used to generate the etcd
    configuration file
  * the etcd configuration file
  * the auth JSON file
  Only one copy is maintained to limit disk usage.
* The configuration and backup file locations are configurable via two
  new input parameters.
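
For example, the new parameters can be set on an existing resource
like this (a sketch; the resource name "etcd" is a placeholder, and
the values shown are the defaults):

    pcs resource update etcd config_location=/var/lib/etcd \
        backup_location=/var/lib/etcd
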
Signed-off-by: Carlo Lobrano <c.lobrano@gmail.com>
---
heartbeat/podman-etcd | 438 ++++++++++++++++++++++++++++++++----------
1 file changed, 336 insertions(+), 102 deletions(-)
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index 4969fbaaf..33804414a 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -46,6 +46,8 @@ OCF_RESKEY_authfile_default="/var/lib/kubelet/config.json"
OCF_RESKEY_allow_pull_default="1"
OCF_RESKEY_reuse_default="0"
OCF_RESKEY_oom_default="-997"
+OCF_RESKEY_config_location_default="/var/lib/etcd"
+OCF_RESKEY_backup_location_default="/var/lib/etcd"
: ${OCF_RESKEY_image=${OCF_RESKEY_image_default}}
: ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}}
@@ -55,6 +57,9 @@ OCF_RESKEY_oom_default="-997"
: ${OCF_RESKEY_allow_pull=${OCF_RESKEY_allow_pull_default}}
: ${OCF_RESKEY_reuse=${OCF_RESKEY_reuse_default}}
: ${OCF_RESKEY_oom=${OCF_RESKEY_oom_default}}
+: ${OCF_RESKEY_config_location=${OCF_RESKEY_config_location_default}}
+: ${OCF_RESKEY_backup_location=${OCF_RESKEY_backup_location_default}}
+
#######################################################################
@@ -242,6 +247,23 @@ https://kubernetes.io/docs/concepts/scheduling-eviction/node-pressure-eviction/#
<shortdesc lang="en">OOM for container</shortdesc>
<content type="integer" default="${OCF_RESKEY_oom_default}"/>
</parameter>
+
+<parameter name="config_location" required="0" unique="0">
+<longdesc lang="en">
+The directory where the resource agent stores its state files, such as the generated etcd configuration and a copy of the pod manifest.
+</longdesc>
+<shortdesc lang="en">Resource agent state directory</shortdesc>
+<content type="string" default="${OCF_RESKEY_config_location_default}"/>
+</parameter>
+
+<parameter name="backup_location" required="0" unique="0">
+<longdesc lang="en">
+The directory where the resource agent stores its backups.
+</longdesc>
+<shortdesc lang="en">Resource agent backup directory</shortdesc>
+<content type="string" default="${OCF_RESKEY_backup_location_default}"/>
+</parameter>
+
</parameters>
<actions>
@@ -309,42 +331,52 @@ container_exists()
return 1
}
-remove_container()
+# archive_current_container archives the current
+# podman etcd container and its configuration files.
+archive_current_container()
{
- local rc
- local execids
+ # don't attempt to archive a container that doesn't exist
+ if ! container_exists; then
+ return
+ fi
- if ocf_is_true "$OCF_RESKEY_reuse"; then
- # never remove the container if we have reuse enabled.
- return 0
+ # delete any container named "*-previous", or we won't be able to archive the current container.
+ if podman inspect "${CONTAINER}-previous" >/dev/null 2>&1; then
+ ocf_log info "removing old archived container '$CONTAINER-previous'"
+ if ! ocf_run podman rm --volumes --force "$CONTAINER-previous"; then
+ ocf_log warn "could not remove old archived container (podman rm failed, error code: $?). Won't be able to archive current container"
+ return
+ fi
fi
- if ! container_exists; then
- # don't attempt to remove a container that doesn't exist
- return 0
+ ocf_log info "archiving '$CONTAINER' container as '$CONTAINER-previous' for debugging purposes"
+ if ! ocf_run podman rename "$CONTAINER" "$CONTAINER-previous"; then
+ ocf_log err "could not archive container '$CONTAINER', error code: $?"
+ return
fi
- ocf_log notice "Cleaning up inactive container, ${CONTAINER}."
- ocf_run podman rm -v "$CONTAINER"
- rc=$?
- if [ $rc -ne 0 ]; then
- if [ $rc -eq 2 ]; then
- if podman inspect --format '{{.State.Status}}' "$CONTAINER" | grep -wq "stopping"; then
- ocf_log err "Inactive container ${CONTAINER} is stuck in 'stopping' state. Force-remove it."
- ocf_run podman rm -f "$CONTAINER"
- rc=$?
- fi
- fi
- # due to a podman bug (rhbz#1841485), sometimes a stopped
- # container can still be associated with Exec sessions, in
- # which case the "podman rm" has to be forced
- execids=$(podman inspect "$CONTAINER" --format '{{len .ExecIDs}}')
- if [ "$execids" -ne "0" ]; then
- ocf_log warn "Inactive container ${CONTAINER} has lingering exec sessions. Force-remove it."
- ocf_run podman rm -f "$CONTAINER"
- rc=$?
+
+ # archive corresponding etcd configuration files
+ local files_to_archive=""
+ for file in "$OCF_RESKEY_authfile" "$POD_MANIFEST_COPY" "$ETCD_CONFIGURATION_FILE"; do
+ if [ -f "$file" ]; then
+ files_to_archive="$files_to_archive $file"
+ else
+ ocf_log warn "file '$file' is missing and won't be archived"
fi
+ done
+
+ if [ -z "$files_to_archive" ]; then
+ ocf_log warn "could not find any file to archive."
+ return
+ fi
+
+ # NOTE: tar overwrites any existing archive, which is the intended behavior
+ # shellcheck disable=SC2086
+ if ! ocf_run tar --create --verbose --gzip --file "$ETCD_BACKUP_FILE" $files_to_archive; then
+ ocf_log warn "container archived successfully, but configuration backup failed (error: $?). Container debugging available, but without matching configuration files"
+ else
+ ocf_log info "container configuration also archived in '$ETCD_BACKUP_FILE'"
fi
- return $rc
}
# Correctly wraps an ipv6 in [] for url otherwise use return normal ipv4 address.
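
With this change a failed or replaced container survives for post-mortem
inspection. A minimal sketch of the debugging workflow this enables,
assuming the container is named "etcd" (the real name comes from
OCF_RESKEY_name) and the default backup_location:

    # logs of the archived container
    podman logs etcd-previous
    # configuration files captured alongside it
    tar -tzf /var/lib/etcd/config-previous.tar.gz
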
@@ -365,6 +397,7 @@ attribute_node_ip()
local attribute="node_ip"
local ip_addr name
+ # TODO: We can retrieve both the local and peer IP addresses from this map, which eliminates the need to use CIB to share them between nodes
for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do
name=$(echo "$node" | cut -d: -f1)
# ignore other nodes
@@ -375,7 +408,7 @@ attribute_node_ip()
done
if [ -z "$ip_addr" ]; then
- ocf_log err "ip address was empty when querying (getent ahosts) for hostname: $(hostname -f)"
+ ocf_log err "could not get local ip address from node_ip_map: '$OCF_RESKEY_node_ip_map'"
return 1
fi
@@ -384,9 +417,9 @@ attribute_node_ip()
echo "$ip_addr"
;;
update)
- if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value"; then
+ if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$ip_addr"; then
rc="$?"
- ocf_log err "could not set $attribute to $value, error code: $rc"
+ ocf_log err "could not set $attribute to $ip_addr, error code: $rc"
return "$rc"
fi
;;
@@ -428,6 +461,48 @@ get_env_from_manifest() {
echo "$env_var_value"
}
+# etcd configuration file expects duration to be expressed in nanoseconds
+convert_duration_in_nanoseconds() {
+ local duration=$1
+ local value unit nanoseconds
+
+ if [ -z "$duration" ]; then
+ ocf_log err "convert_duration_in_nanoseconds: no duration provided"
+ return 1
+ fi
+
+ if ! echo "$duration" | grep -qE '^[0-9]+[numµ]?s$'; then
+ ocf_log err "convert_duration_in_nanoseconds: invalid duration format \"$duration\". Expected format: <number><unit> where unit is one of s, ms, us, µs, ns"
+ return 1
+ fi
+
+ # Extract numeric value and unit from duration string
+ value=$(echo "$duration" | sed 's/[^0-9]*$//')
+ unit=$(echo "$duration" | sed 's/^[0-9]*//')
+
+ case "$unit" in
+ ns)
+ nanoseconds=$value
+ ;;
+ us|µs)
+ nanoseconds=$((value * 1000))
+ ;;
+ ms)
+ nanoseconds=$((value * 1000000))
+ ;;
+ s)
+ nanoseconds=$((value * 1000000000))
+ ;;
+ *)
+ # this should not happen as the input is already validated
+ ocf_log err "convert_duration_in_nanoseconds: unknown duration unit \"$unit\""
+ return 1
+ ;;
+ esac
+
+ echo "$nanoseconds"
+}
+
prepare_env() {
local name ip ipurl standalone_node
@@ -457,9 +532,14 @@ prepare_env() {
ETCDCTL_API=$(get_env_from_manifest "ETCDCTL_API")
ETCD_CIPHER_SUITES=$(get_env_from_manifest "ETCD_CIPHER_SUITES")
ETCD_DATA_DIR=$(get_env_from_manifest "ETCD_DATA_DIR")
+ if [ ! -d "$ETCD_DATA_DIR" ]; then
+ ocf_log err "could not find data-dir at path \"$ETCD_DATA_DIR\""
+ return "$OCF_ERR_ARGS"
+ else
+ ocf_log info "using data-dir: $ETCD_DATA_DIR"
+ fi
ETCD_ELECTION_TIMEOUT=$(get_env_from_manifest "ETCD_ELECTION_TIMEOUT")
ETCD_ENABLE_PPROF=$(get_env_from_manifest "ETCD_ENABLE_PPROF")
- ETCD_EXPERIMENTAL_MAX_LEARNERS=$(get_env_from_manifest "ETCD_EXPERIMENTAL_MAX_LEARNERS")
ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION=$(get_env_from_manifest "ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION")
ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL=$(get_env_from_manifest "ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL")
ETCD_HEARTBEAT_INTERVAL=$(get_env_from_manifest "ETCD_HEARTBEAT_INTERVAL")
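
The helper exists because the manifest carries durations with units
(e.g. "200ms") while the generated configuration file expects plain
nanosecond integers. A quick illustration of its behavior:

    convert_duration_in_nanoseconds "200ms"   # prints 200000000
    convert_duration_in_nanoseconds "5s"      # prints 5000000000
    convert_duration_in_nanoseconds "5m"      # rejected: only s, ms, us, µs, ns are accepted
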
@@ -475,6 +555,62 @@ prepare_env() {
LISTEN_METRICS_URLS="0.0.0.0"
}
+
+generate_etcd_configuration() {
+ if is_force_new_cluster; then
+ # The embedded newline is required for correct YAML formatting.
+ FORCE_NEW_CLUSTER_CONFIG="force-new-cluster: true
+force-new-cluster-bump-amount: 1000000000"
+ else
+ FORCE_NEW_CLUSTER_CONFIG="force-new-cluster: false"
+ fi
+
+ cat > "$ETCD_CONFIGURATION_FILE" << EOF
+logger: zap
+log-level: info
+snapshot-count: 10000
+name: $NODENAME
+data-dir: $ETCD_DATA_DIR
+$FORCE_NEW_CLUSTER_CONFIG
+socket-reuse-address: $ETCD_SOCKET_REUSE_ADDRESS
+election-timeout: $ETCD_ELECTION_TIMEOUT
+enable-pprof: $ETCD_ENABLE_PPROF
+heartbeat-interval: $ETCD_HEARTBEAT_INTERVAL
+quota-backend-bytes: $ETCD_QUOTA_BACKEND_BYTES
+initial-advertise-peer-urls: "$NODEIPURL:2380"
+listen-peer-urls: "$(ip_url ${LISTEN_PEER_URLS}):2380"
+listen-client-urls: "$(ip_url ${LISTEN_CLIENT_URLS}):2379,unixs://${NODEIP}:0"
+initial-cluster: $ETCD_INITIAL_CLUSTER
+initial-cluster-state: $ETCD_INITIAL_CLUSTER_STATE
+client-transport-security:
+ cert-file: /etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.crt
+ key-file: /etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.key
+ client-cert-auth: true
+ trusted-ca-file: $SERVER_CACERT
+peer-transport-security:
+ cert-file: $ETCD_PEER_CERT
+ key-file: $ETCD_PEER_KEY
+ client-cert-auth: true
+ trusted-ca-file: $SERVER_CACERT
+advertise-client-urls: "$NODEIPURL:2379"
+listen-metrics-urls: "$(ip_url ${LISTEN_METRICS_URLS}):9978"
+metrics: extensive
+experimental-initial-corrupt-check: true
+experimental-max-learners: 1
+experimental-warning-apply-duration: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION")
+experimental-watch-progress-notify-interval: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL")
+EOF
+
+ {
+ if [ -n "$ETCD_CIPHER_SUITES" ]; then
+ echo "cipher-suites:"
+ echo "$ETCD_CIPHER_SUITES" | tr ',' '\n' | while read -r cipher; do
+ echo " - \"$cipher\""
+ done
+ fi
+ } >> "$ETCD_CONFIGURATION_FILE"
+}
+
archive_data_folder()
{
# TODO: use etcd snapshots
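
Because the flags now live in a file, toggling force-new-cluster becomes
a config rewrite plus a container restart instead of a full container
recreation. The generated stanza can be checked directly, e.g. when the
node is marked to force a new cluster (default config_location assumed):

    grep force-new-cluster /var/lib/etcd/config.yaml
    # force-new-cluster: true
    # force-new-cluster-bump-amount: 1000000000
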
@@ -634,7 +770,7 @@ add_member_as_learner()
local endpoint_url=$(ip_url $(attribute_node_ip get))
local peer_url=$(ip_url $member_ip)
- ocf_log info "add $member_name ($member_ip) to the member list as learner"
+ ocf_log info "add $member_name ($member_ip, $endpoint_url) to the member list as learner"
out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner)
rc=$?
if [ $rc -ne 0 ]; then
@@ -1104,18 +1240,18 @@ compare_revision()
peer_revision=$(attribute_node_revision_peer)
if [ "$revision" = "" ] || [ "$revision" = "null" ] || [ "$peer_revision" = "" ] || [ "$peer_revision" = "null" ]; then
- ocf_log err "could not compare revisions: $NODENAME local revision: $revision, peer revision: $peer_revision"
+ ocf_log err "could not compare revisions: '$NODENAME' local revision='$revision', peer revision='$peer_revision'"
return "$OCF_ERR_GENERIC"
fi
if [ "$revision" -gt "$peer_revision" ]; then
- ocf_log info "$NODENAME revision: $revision is newer than peer revision: $peer_revision"
+ ocf_log info "$NODENAME revision: '$revision' is newer than peer revision: '$peer_revision'"
echo "newer"
elif [ "$revision" -eq "$peer_revision" ]; then
- ocf_log info "$NODENAME revision: $revision is equal to peer revision: $peer_revision"
+ ocf_log info "$NODENAME revision: '$revision' is equal to peer revision: '$peer_revision'"
echo "equal"
else
- ocf_log info "$NODENAME revision: $revision is older than peer revision: $peer_revision"
+ ocf_log info "$NODENAME revision: '$revision' is older than peer revision: '$peer_revision'"
echo "older"
fi
return "$OCF_SUCCESS"
@@ -1144,6 +1280,100 @@ ensure_pod_manifest_exists()
return "$OCF_SUCCESS"
}
+filter_pod_manifest() {
+ # Remove pod-version related fields from POD manifest
+ local pod_manifest="$1"
+ local temporary_file
+ local jq_filter='del(.metadata.labels.revision) | .spec.containers[] |= ( .env |= map(select( .name != "ETCD_STATIC_POD_VERSION" ))) | .spec.volumes |= map( select( .name != "resource-dir" ))'
+
+ if ! temporary_file=$(mktemp); then
+ ocf_log err "could not create temporary file for '$pod_manifest', error code: $?"
+ return $OCF_ERR_GENERIC
+ fi
+ if ! jq "$jq_filter" "$pod_manifest" > "$temporary_file"; then
+ ocf_log err "could not remove pod version related data from '$pod_manifest', error code: $?"
+ return $OCF_ERR_GENERIC
+ fi
+ echo "$temporary_file"
+}
+
+can_reuse_container() {
+ # Decide whether to reuse the existing container or create a new one based on etcd pod manifest changes.
+ # NOTE: explicitly ignore POD version and POD version related data, as the content might be the same even if the revision number has changed.
+ local cp_rc
+ local diff_rc
+ local filtered_original_pod_manifest
+ local filtered_copy_pod_manifest
+
+
+ # If the container does not exist it cannot be reused
+ if ! container_exists; then
+ OCF_RESKEY_reuse=0
+ return "$OCF_SUCCESS"
+ fi
+
+ # If the manifest copy doesn't exist, we need a new container.
+ if [ ! -f "$POD_MANIFEST_COPY" ]; then
+ ocf_log info "a working copy of $OCF_RESKEY_pod_manifest was not found. A new etcd container will be created."
+ OCF_RESKEY_reuse=0
+ return "$OCF_SUCCESS"
+ fi
+
+ if ! filtered_original_pod_manifest=$(filter_pod_manifest "$OCF_RESKEY_pod_manifest"); then
+ return $OCF_ERR_GENERIC
+ fi
+ if ! filtered_copy_pod_manifest=$(filter_pod_manifest "$POD_MANIFEST_COPY"); then
+ return $OCF_ERR_GENERIC
+ fi
+
+ ocf_log info "comparing $OCF_RESKEY_pod_manifest with local copy $POD_MANIFEST_COPY"
+ ocf_run diff -s "$filtered_original_pod_manifest" "$filtered_copy_pod_manifest"
+ diff_rc="$?"
+ # clean up temporary files
+ rm -f "$filtered_original_pod_manifest" "$filtered_copy_pod_manifest"
+ case "$diff_rc" in
+ 0)
+ ocf_log info "Reusing the existing etcd container"
+ OCF_RESKEY_reuse=1
+ ;;
+ 1)
+ ocf_log info "Etcd pod manifest changes detected: creating a new etcd container to apply the changes"
+ if ! ocf_run cp -p "$OCF_RESKEY_pod_manifest" "$POD_MANIFEST_COPY"; then
+ cp_rc="$?"
+ ocf_log err "Could not create a working copy of $OCF_RESKEY_pod_manifest, rc: $cp_rc"
+ return "$OCF_ERR_GENERIC"
+ fi
+ ocf_log info "A working copy of $OCF_RESKEY_pod_manifest was created"
+ OCF_RESKEY_reuse=0
+ ;;
+ *)
+ ocf_log err "Could not check if etcd pod manifest has changed, diff rc: $diff_rc"
+ return "$OCF_ERR_GENERIC"
+ ;;
+ esac
+
+ return "$OCF_SUCCESS"
+}
+
+ensure_pod_manifest_copy_exists() {
+ local cp_rc
+
+ if [ -f "$POD_MANIFEST_COPY" ]; then
+ return "$OCF_SUCCESS"
+ fi
+
+ # If the manifest copy doesn't exist, create it and ensure a new container.
+ if ! ocf_run cp -p "$OCF_RESKEY_pod_manifest" "$POD_MANIFEST_COPY"; then
+ cp_rc="$?"
+ ocf_log err "Could not create a working copy of $OCF_RESKEY_pod_manifest, rc: $cp_rc"
+ return "$OCF_ERR_GENERIC"
+ fi
+
+ ocf_log info "a new working copy of $OCF_RESKEY_pod_manifest was created"
+
+ return "$OCF_SUCCESS"
+}
+
podman_start()
{
local cid
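
In effect, the reuse decision is a revision-insensitive diff of the live
manifest against the stored copy. A standalone bash sketch of the same
comparison, reusing the jq filter from filter_pod_manifest (the file
names are placeholders):

    jq_filter='del(.metadata.labels.revision)
      | .spec.containers[] |= ( .env |= map(select( .name != "ETCD_STATIC_POD_VERSION" )))
      | .spec.volumes |= map( select( .name != "resource-dir" ))'
    if diff <(jq "$jq_filter" pod-current.yaml) <(jq "$jq_filter" pod-copy.yaml); then
        echo "manifests match: reuse the container"
    else
        echo "manifests differ: recreate the container"
    fi
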
@@ -1173,6 +1403,13 @@ podman_start()
return $OCF_ERR_GENERIC
fi
+ # check if the container has already started
+ podman_simple_status
+ if [ $? -eq $OCF_SUCCESS ]; then
+ ocf_log info "the '$CONTAINER' has already started. Nothing to do"
+ return "$OCF_SUCCESS"
+ fi
+
if ! ensure_pod_manifest_exists; then
ocf_exit_reason "could not find etcd pod manifest ($OCF_RESKEY_pod_manifest)"
return "$OCF_ERR_GENERIC"
@@ -1186,8 +1423,9 @@ podman_start()
ocf_log info "static pod was running: start normally"
else
if is_force_new_cluster; then
- ocf_log notice "$NODENAME marked to force-new-cluster"
+ ocf_log notice "'$NODENAME' marked to force-new-cluster"
else
+ ocf_log info "'$NODENAME' is not marked to force-new-cluster"
# When the local agent starts, we can infer the cluster state by counting
# how many agents are starting or already active:
# - 1 active agent: it's the peer (we are just starting)
@@ -1195,6 +1433,7 @@ podman_start()
# - 0 active agents, 2 starting: both agents are starting simultaneously
local active_resources_count
active_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_active_resource" | wc -w)
+ ocf_log info "found '$active_resources_count' active etcd resources (meta notify environment variable: '$OCF_RESKEY_CRM_meta_notify_active_resource')"
case "$active_resources_count" in
1)
if [ "$(attribute_learner_node get)" = "$(get_peer_node_name)" ]; then
@@ -1205,17 +1444,17 @@ podman_start()
fi
;;
0)
+ # count how many agents are starting now
+ local start_resources_count
+ start_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_start_resource" | wc -w)
+ ocf_log info "found '$start_resources_count' starting etcd resources (meta notify environment variable: '$OCF_RESKEY_CRM_meta_notify_start_resource')"
+
# we need to compare the revisions in any of the following branches
# so call the function only once here
if ! revision_compare_result=$(compare_revision); then
ocf_log err "could not compare revisions, error code: $?"
return "$OCF_ERR_GENERIC"
fi
-
- # count how many agents are starting now
- local start_resources_count
- start_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_start_resource" | wc -w)
-
case "$start_resources_count" in
1)
ocf_log debug "peer not starting: ensure we can start a new cluster"
@@ -1231,6 +1470,7 @@ podman_start()
fi
;;
2)
+ # TODO: can we start "normally", regardless of the revisions, if the container-id is the same on both nodes?
ocf_log info "peer starting"
if [ "$revision_compare_result" = "newer" ]; then
set_force_new_cluster
@@ -1263,7 +1503,7 @@ podman_start()
fi
podman_create_mounts
- local run_opts="--detach --name=${CONTAINER}"
+ local run_opts="--detach --name=${CONTAINER} --replace"
run_opts="$run_opts --oom-score-adj=${OCF_RESKEY_oom}"
@@ -1297,61 +1537,59 @@ podman_start()
archive_data_folder
fi
- prepare_env
+ ocf_log info "check for changes in pod manifest to decide if the container should be reused or replaced"
+ if ! can_reuse_container ; then
+ rc="$?"
+ ocf_log err "could not determine etcd container reuse strategy, rc: $rc"
+ return "$rc"
+ fi
+
+ # Archive current container and its configuration before creating
+ # new configuration files.
+ if ! ocf_is_true "$OCF_RESKEY_reuse"; then
+ # Log archive container failures but don't block, as the priority
+ # is ensuring the etcd container starts successfully.
+ archive_current_container
+ fi
+
+ if ! ensure_pod_manifest_copy_exists; then
+ return $OCF_ERR_GENERIC
+ fi
+
+ if ! prepare_env; then
+ ocf_log err "Could not prepare environment for podman, error code: $?"
+ return $OCF_ERR_GENERIC
+ fi
+
+ if ! generate_etcd_configuration; then
+ ocf_log err "Could not generate etcd configuration, error code: $?"
+ return $OCF_ERR_GENERIC
+ fi
- # add etcd-specific opts
run_opts="$run_opts \
- --network=host \
- -v /etc/kubernetes/static-pod-resources/etcd-certs:/etc/kubernetes/static-pod-certs \
- -v /var/lib/etcd:/var/lib/etcd \
- --env ALL_ETCD_ENDPOINTS=$ALL_ETCD_ENDPOINTS \
- --env ETCD_CIPHER_SUITES=$ETCD_CIPHER_SUITES \
- --env ETCD_DATA_DIR=$ETCD_DATA_DIR \
- --env ETCD_ELECTION_TIMEOUT=$ETCD_ELECTION_TIMEOUT \
- --env ETCD_ENABLE_PPROF=$ETCD_ENABLE_PPROF \
- --env ETCD_EXPERIMENTAL_MAX_LEARNERS=$ETCD_EXPERIMENTAL_MAX_LEARNERS \
- --env ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION=$ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION \
- --env ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL=$ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL \
- --env ETCD_HEARTBEAT_INTERVAL=$ETCD_HEARTBEAT_INTERVAL \
- --env ETCD_INITIAL_CLUSTER=$ETCD_INITIAL_CLUSTER \
- --env ETCD_INITIAL_CLUSTER_STATE=$ETCD_INITIAL_CLUSTER_STATE \
- --env ETCD_NAME=$NODENAME \
- --env ETCD_QUOTA_BACKEND_BYTES=$ETCD_QUOTA_BACKEND_BYTES \
- --env ETCD_SOCKET_REUSE_ADDRESS=$ETCD_SOCKET_REUSE_ADDRESS \
- --env ETCDCTL_API=$ETCDCTL_API \
- --env ETCDCTL_CACERT=$SERVER_CACERT \
- --env ETCDCTL_CERT=$ETCD_PEER_CERT \
- --env ETCDCTL_KEY=$ETCD_PEER_KEY \
- --authfile=$OCF_RESKEY_authfile \
- --security-opt label=disable"
+ --network=host \
+ -v /etc/kubernetes/static-pod-resources/etcd-certs:/etc/kubernetes/static-pod-certs \
+ -v /var/lib/etcd:/var/lib/etcd \
+ --env ETCDCTL_API=$ETCDCTL_API \
+ --env ETCDCTL_CACERT=$SERVER_CACERT \
+ --env ETCDCTL_CERT=$ETCD_PEER_CERT \
+ --env ETCDCTL_KEY=$ETCD_PEER_KEY \
+ --authfile=$OCF_RESKEY_authfile \
+ --security-opt label=disable"
if [ -n "$OCF_RESKEY_run_opts" ]; then
run_opts="$run_opts $OCF_RESKEY_run_opts"
fi
- OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd --logger=zap \
- --log-level=info \
- --experimental-initial-corrupt-check=true \
- --snapshot-count=10000 \
- --initial-advertise-peer-urls=$NODEIPURL:2380 \
- --cert-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.crt \
- --key-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.key \
- --trusted-ca-file=$SERVER_CACERT \
- --client-cert-auth=true \
- --peer-cert-file=$ETCD_PEER_CERT \
- --peer-key-file=$ETCD_PEER_KEY \
- --peer-trusted-ca-file=$SERVER_CACERT \
- --peer-client-cert-auth=true \
- --advertise-client-urls=$NODEIPURL:2379 \
- --listen-client-urls=$(ip_url ${LISTEN_CLIENT_URLS}):2379,unixs://${NODEIP}:0 \
- --listen-peer-urls=$(ip_url ${LISTEN_PEER_URLS}):2380 \
- --metrics=extensive \
- --listen-metrics-urls=$(ip_url ${LISTEN_METRICS_URLS}):9978"
- if [ -n "$OCF_RESKEY_run_cmd_opts" ]; then
- OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd $OCF_RESKEY_run_cmd_opts"
+ if [ -f "$ETCD_CONFIGURATION_FILE" ]; then
+ ocf_log info "using etcd configuration file: $ETCD_CONFIGURATION_FILE"
+ else
+ ocf_log err "could not find $ETCD_CONFIGURATION_FILE"
+ return "$OCF_ERR_GENERIC"
fi
- if is_force_new_cluster; then
- OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd --force-new-cluster"
+ OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd --config-file=$ETCD_CONFIGURATION_FILE"
+ if [ -n "$OCF_RESKEY_run_cmd_opts" ]; then
+ OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd $OCF_RESKEY_run_cmd_opts"
fi
if [ "$OCF_RESKEY_image" = "$OCF_RESKEY_image_default" ]; then
@@ -1377,9 +1615,7 @@ podman_start()
ocf_log info "starting existing container $CONTAINER."
ocf_run podman start "$CONTAINER"
else
- # make sure any previous container matching our container name is cleaned up first.
- # we already know at this point it wouldn't be running
- remove_container
+ ocf_log info "starting new container $CONTAINER."
run_new_container "$run_opts" "$OCF_RESKEY_image" "$OCF_RESKEY_run_cmd"
if [ $? -eq 125 ]; then
return $OCF_ERR_GENERIC
@@ -1439,7 +1675,6 @@ podman_stop()
local rc
podman_simple_status
if [ $? -eq $OCF_NOT_RUNNING ]; then
- remove_container
ocf_log info "could not leave members list: etcd container not running"
return $OCF_SUCCESS
fi
@@ -1475,7 +1710,7 @@ podman_stop()
ocf_run podman kill "$CONTAINER"
rc=$?
else
- ocf_log debug "waiting $timeout second[s] before killing container"
+ ocf_log info "waiting $timeout second[s] before killing container"
ocf_run podman stop -t="$timeout" "$CONTAINER"
rc=$?
# on stop, systemd will automatically delete any transient
@@ -1496,11 +1731,6 @@ podman_stop()
fi
fi
- if ! remove_container; then
- ocf_exit_reason "Failed to remove stopped container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}."
- return $OCF_ERR_GENERIC
- fi
-
return $OCF_SUCCESS
}
@@ -1532,6 +1762,7 @@ podman_validate()
check_binary oc
check_binary podman
check_binary jq
+ check_binary tar
if [ -z "$OCF_RESKEY_node_ip_map" ]; then
ocf_exit_reason "'node_ip_map' option is required"
@@ -1589,6 +1820,9 @@ else
fi
CONTAINER=$OCF_RESKEY_name
+POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml"
+ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml"
+ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz"
# Note: we currently monitor podman containers by with the "podman exec"
# command, so make sure that invocation is always valid by enforcing the