687 lines
26 KiB
Diff
687 lines
26 KiB
Diff
From 6e9200dc2ffc89382188794742361985309936b2 Mon Sep 17 00:00:00 2001
|
|
From: Carlo Lobrano <c.lobrano@gmail.com>
|
|
Date: Wed, 23 Jul 2025 09:34:13 +0200
|
|
Subject: [PATCH] podman-etcd: preserve containers for debugging
|
|
|
|
This change modifies the agent to keep stopped containers for log
|
|
inspection and debugging, with supporting changes to enable this
|
|
behavior.
|
|
|
|
* Conditionally reuse existing containers when configuration unchanged
|
|
* Move etcd inline configuration flags to an external file to allow
|
|
restarts without container recreation (mainly for the
|
|
force-new-cluster flag)
|
|
* Archive the previous container by renaming it to *-previous, and its
|
|
configuration files into /var/lib/etcd/config-previous.tar.gz archive.
|
|
The tar.gz archive consists of:
|
|
* the pod manifest created by CEO, used to generate the Etcd
|
|
configuration file
|
|
* the Etcd configuration file
|
|
* the auth json file
|
|
Only one copy is maintained to limit disk usage.
|
|
* The location of both the configuration and backup files is configurable with 2
|
|
new input arguments.
|
|
|
|
Signed-off-by: Carlo Lobrano <c.lobrano@gmail.com>
|
|
---
|
|
heartbeat/podman-etcd | 438 ++++++++++++++++++++++++++++++++----------
|
|
1 file changed, 336 insertions(+), 102 deletions(-)
|
|
|
|
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
|
|
index 4969fbaaf..33804414a 100755
|
|
--- a/heartbeat/podman-etcd
|
|
+++ b/heartbeat/podman-etcd
|
|
@@ -46,6 +46,8 @@ OCF_RESKEY_authfile_default="/var/lib/kubelet/config.json"
|
|
OCF_RESKEY_allow_pull_default="1"
|
|
OCF_RESKEY_reuse_default="0"
|
|
OCF_RESKEY_oom_default="-997"
|
|
+OCF_RESKEY_config_location_default="/var/lib/etcd"
|
|
+OCF_RESKEY_backup_location_default="/var/lib/etcd"
|
|
|
|
: ${OCF_RESKEY_image=${OCF_RESKEY_image_default}}
|
|
: ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}}
|
|
@@ -55,6 +57,9 @@ OCF_RESKEY_oom_default="-997"
|
|
: ${OCF_RESKEY_allow_pull=${OCF_RESKEY_allow_pull_default}}
|
|
: ${OCF_RESKEY_reuse=${OCF_RESKEY_reuse_default}}
|
|
: ${OCF_RESKEY_oom=${OCF_RESKEY_oom_default}}
|
|
+: ${OCF_RESKEY_config_location=${OCF_RESKEY_config_location_default}}
|
|
+: ${OCF_RESKEY_backup_location=${OCF_RESKEY_backup_location_default}}
|
|
+
|
|
|
|
#######################################################################
|
|
|
|
@@ -242,6 +247,23 @@ https://kubernetes.io/docs/concepts/scheduling-eviction/node-pressure-eviction/#
|
|
<shortdesc lang="en">OOM for container</shortdesc>
|
|
<content type="integer" default="${OCF_RESKEY_oom_default}"/>
|
|
</parameter>
|
|
+
|
|
+<parameter name="config_location" required="0" unique="0">
|
|
+<longdesc lang="en">
|
|
+The directory where the resource agent stores its state files, such as the generated etcd configuration and a copy of the pod manifest.
|
|
+</longdesc>
|
|
+<shortdesc lang="en">Resource agent state directory</shortdesc>
|
|
+<content type="string" default="${OCF_RESKEY_config_location_default}"/>
|
|
+</parameter>
|
|
+
|
|
+<parameter name="backup_location" required="0" unique="0">
|
|
+<longdesc lang="en">
|
|
+The directory where the resource agent stores its backups.
|
|
+</longdesc>
|
|
+<shortdesc lang="en">Resource agent backup directory</shortdesc>
|
|
+<content type="string" default="${OCF_RESKEY_backup_location_default}"/>
|
|
+</parameter>
|
|
+
|
|
</parameters>
|
|
|
|
<actions>
|
|
@@ -309,42 +331,52 @@ container_exists()
|
|
return 1
|
|
}
|
|
|
|
-remove_container()
|
|
+# archive_current_container archives the current
|
|
+# podman etcd container and its configuration files.
|
|
+archive_current_container()
|
|
{
|
|
- local rc
|
|
- local execids
|
|
+ # don't attempt to archive a container that doesn't exist
|
|
+ if ! container_exists; then
|
|
+ return
|
|
+ fi
|
|
|
|
- if ocf_is_true "$OCF_RESKEY_reuse"; then
|
|
- # never remove the container if we have reuse enabled.
|
|
- return 0
|
|
+ # delete any container named "*-previous", or we won't be able to archive the current container.
|
|
+ if podman inspect "${CONTAINER}-previous" >/dev/null 2>&1; then
|
|
+ ocf_log info "removing old archived container '$CONTAINER-previous'"
|
|
+ if ! ocf_run podman rm --volumes --force "$CONTAINER-previous"; then
|
|
+ ocf_log warn "could not remove old archived container (podman rm failed, error code: $?). Won't be able to archive current container"
|
|
+ return
|
|
+ fi
|
|
fi
|
|
|
|
- if ! container_exists; then
|
|
- # don't attempt to remove a container that doesn't exist
|
|
- return 0
|
|
+ ocf_log info "archiving '$CONTAINER' container as '$CONTAINER-previous' for debugging purposes"
|
|
+ if ! ocf_run podman rename "$CONTAINER" "$CONTAINER-previous"; then
|
|
+ ocf_log err "could not archive container '$CONTAINER', error code: $?"
|
|
+ return
|
|
fi
|
|
- ocf_log notice "Cleaning up inactive container, ${CONTAINER}."
|
|
- ocf_run podman rm -v "$CONTAINER"
|
|
- rc=$?
|
|
- if [ $rc -ne 0 ]; then
|
|
- if [ $rc -eq 2 ]; then
|
|
- if podman inspect --format '{{.State.Status}}' "$CONTAINER" | grep -wq "stopping"; then
|
|
- ocf_log err "Inactive container ${CONTAINER} is stuck in 'stopping' state. Force-remove it."
|
|
- ocf_run podman rm -f "$CONTAINER"
|
|
- rc=$?
|
|
- fi
|
|
- fi
|
|
- # due to a podman bug (rhbz#1841485), sometimes a stopped
|
|
- # container can still be associated with Exec sessions, in
|
|
- # which case the "podman rm" has to be forced
|
|
- execids=$(podman inspect "$CONTAINER" --format '{{len .ExecIDs}}')
|
|
- if [ "$execids" -ne "0" ]; then
|
|
- ocf_log warn "Inactive container ${CONTAINER} has lingering exec sessions. Force-remove it."
|
|
- ocf_run podman rm -f "$CONTAINER"
|
|
- rc=$?
|
|
+
|
|
+ # archive corresponding etcd configuration files
|
|
+ local files_to_archive=""
|
|
+ for file in "$OCF_RESKEY_authfile" "$POD_MANIFEST_COPY" "$ETCD_CONFIGURATION_FILE"; do
|
|
+ if [ -f "$file" ]; then
|
|
+ files_to_archive="$files_to_archive $file"
|
|
+ else
|
|
+ ocf_log warn "file '$file' is missing and won't be archived"
|
|
fi
|
|
+ done
|
|
+
|
|
+ if [ -z "$files_to_archive" ]; then
|
|
+ ocf_log warn "could not find any file to archive."
|
|
+ return
|
|
+ fi
|
|
+
|
|
+ # NOTE: tar will override any existing archive as wanted
|
|
+ # shellcheck disable=SC2086
|
|
+ if ! ocf_run tar --create --verbose --gzip --file "$ETCD_BACKUP_FILE" $files_to_archive; then
|
|
+ ocf_log warn "container archived successfully, but configuration backup failed (error: $?). Container debugging available, but without matching configuration files"
|
|
+ else
|
|
+ ocf_log info "container configuration also archived in '$ETCD_BACKUP_FILE'"
|
|
fi
|
|
- return $rc
|
|
}
|
|
|
|
# Correctly wraps an ipv6 in [] for url otherwise use return normal ipv4 address.
|
|
@@ -365,6 +397,7 @@ attribute_node_ip()
|
|
local attribute="node_ip"
|
|
local ip_addr name
|
|
|
|
+ # TODO: We can retrieve both the local and peer IP addresses from this map, which eliminates the need to use CIB to share them between nodes
|
|
for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do
|
|
name=$(echo "$node" | cut -d: -f1)
|
|
# ignore other nodes
|
|
@@ -375,7 +408,7 @@ attribute_node_ip()
|
|
done
|
|
|
|
if [ -z "$ip_addr" ]; then
|
|
- ocf_log err "ip address was empty when querying (getent ahosts) for hostname: $(hostname -f)"
|
|
+ ocf_log err "could not get local ip address from node_ip_map: '$OCF_RESKEY_node_ip_map'"
|
|
return 1
|
|
fi
|
|
|
|
@@ -384,9 +417,9 @@ attribute_node_ip()
|
|
echo "$ip_addr"
|
|
;;
|
|
update)
|
|
- if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value"; then
|
|
+ if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$ip_addr"; then
|
|
rc="$?"
|
|
- ocf_log err "could not set $attribute to $value, error code: $rc"
|
|
+ ocf_log err "could not set $attribute to $ip_addr, error code: $rc"
|
|
return "$rc"
|
|
fi
|
|
;;
|
|
@@ -428,6 +461,48 @@ get_env_from_manifest() {
|
|
echo "$env_var_value"
|
|
}
|
|
|
|
+# etcd configuration file expects duration to be expressed in nanoseconds
|
|
+convert_duration_in_nanoseconds() {
|
|
+ local duration=$1
|
|
+ local value unit nanoseconds
|
|
+
|
|
+ if [ -z "$duration" ]; then
|
|
+ ocf_log err "convert_duration_in_nanoseconds: no duration provided"
|
|
+ return 1
|
|
+ fi
|
|
+
|
|
+ if ! echo "$duration" | grep -qE '^[0-9]+[numµ]?s$'; then
|
|
+ ocf_log err "convert_duration_in_nanoseconds: invalid duration format \"$duration\". Expected format: <number><unit> where unit is one of s, ms, us, µs, ns"
|
|
+ return 1
|
|
+ fi
|
|
+
|
|
+ # Extract numeric value and unit from duration string
|
|
+ value=$(echo "$duration" | sed 's/[^0-9]*$//')
|
|
+ unit=$(echo "$duration" | sed 's/^[0-9]*//')
|
|
+
|
|
+ case "$unit" in
|
|
+ ns)
|
|
+ nanoseconds=$value
|
|
+ ;;
|
|
+ us|µs)
|
|
+ nanoseconds=$((value * 1000))
|
|
+ ;;
|
|
+ ms)
|
|
+ nanoseconds=$((value * 1000000))
|
|
+ ;;
|
|
+ s)
|
|
+ nanoseconds=$((value * 1000000000))
|
|
+ ;;
|
|
+ *)
|
|
+ # this should not happen as the input is already validated
|
|
+ ocf_log err "convert_duration_in_nanoseconds: unknown duration unit \"$unit\""
|
|
+ return 1
|
|
+ ;;
|
|
+ esac
|
|
+
|
|
+ echo "$nanoseconds"
|
|
+}
|
|
+
|
|
prepare_env() {
|
|
local name ip ipurl standalone_node
|
|
|
|
@@ -457,9 +532,14 @@ prepare_env() {
|
|
ETCDCTL_API=$(get_env_from_manifest "ETCDCTL_API")
|
|
ETCD_CIPHER_SUITES=$(get_env_from_manifest "ETCD_CIPHER_SUITES")
|
|
ETCD_DATA_DIR=$(get_env_from_manifest "ETCD_DATA_DIR")
|
|
+ if [ ! -d "$ETCD_DATA_DIR" ]; then
|
|
+ ocf_log err "could not find data-dir at path \"$ETCD_DATA_DIR\""
|
|
+ return "$OCF_ERR_ARGS"
|
|
+ else
|
|
+ ocf_log info "using data-dir: $ETCD_DATA_DIR"
|
|
+ fi
|
|
ETCD_ELECTION_TIMEOUT=$(get_env_from_manifest "ETCD_ELECTION_TIMEOUT")
|
|
ETCD_ENABLE_PPROF=$(get_env_from_manifest "ETCD_ENABLE_PPROF")
|
|
- ETCD_EXPERIMENTAL_MAX_LEARNERS=$(get_env_from_manifest "ETCD_EXPERIMENTAL_MAX_LEARNERS")
|
|
ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION=$(get_env_from_manifest "ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION")
|
|
ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL=$(get_env_from_manifest "ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL")
|
|
ETCD_HEARTBEAT_INTERVAL=$(get_env_from_manifest "ETCD_HEARTBEAT_INTERVAL")
|
|
@@ -475,6 +555,62 @@ prepare_env() {
|
|
LISTEN_METRICS_URLS="0.0.0.0"
|
|
}
|
|
|
|
+
|
|
+generate_etcd_configuration() {
|
|
+ if is_force_new_cluster; then
|
|
+ # The embedded newline is required for correct YAML formatting.
|
|
+ FORCE_NEW_CLUSTER_CONFIG="force-new-cluster: true
|
|
+force-new-cluster-bump-amount: 1000000000"
|
|
+ else
|
|
+ FORCE_NEW_CLUSTER_CONFIG="force-new-cluster: false"
|
|
+ fi
|
|
+
|
|
+ cat > "$ETCD_CONFIGURATION_FILE" << EOF
|
|
+logger: zap
|
|
+log-level: info
|
|
+snapshot-count: 10000
|
|
+name: $NODENAME
|
|
+data-dir: $ETCD_DATA_DIR
|
|
+$FORCE_NEW_CLUSTER_CONFIG
|
|
+socket-reuse-address: $ETCD_SOCKET_REUSE_ADDRESS
|
|
+election-timeout: $ETCD_ELECTION_TIMEOUT
|
|
+enable-pprof: $ETCD_ENABLE_PPROF
|
|
+heartbeat-interval: $ETCD_HEARTBEAT_INTERVAL
|
|
+quota-backend-bytes: $ETCD_QUOTA_BACKEND_BYTES
|
|
+initial-advertise-peer-urls: "$NODEIPURL:2380"
|
|
+listen-peer-urls: "$(ip_url ${LISTEN_PEER_URLS}):2380"
|
|
+listen-client-urls: "$(ip_url ${LISTEN_CLIENT_URLS}):2379,unixs://${NODEIP}:0"
|
|
+initial-cluster: $ETCD_INITIAL_CLUSTER
|
|
+initial-cluster-state: $ETCD_INITIAL_CLUSTER_STATE
|
|
+client-transport-security:
|
|
+ cert-file: /etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.crt
|
|
+ key-file: /etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.key
|
|
+ client-cert-auth: true
|
|
+ trusted-ca-file: $SERVER_CACERT
|
|
+peer-transport-security:
|
|
+ cert-file: $ETCD_PEER_CERT
|
|
+ key-file: $ETCD_PEER_KEY
|
|
+ client-cert-auth: true
|
|
+ trusted-ca-file: $SERVER_CACERT
|
|
+advertise-client-urls: "$NODEIPURL:2379"
|
|
+listen-metrics-urls: "$(ip_url ${LISTEN_METRICS_URLS}):9978"
|
|
+metrics: extensive
|
|
+experimental-initial-corrupt-check: true
|
|
+experimental-max-learners: 1
|
|
+experimental-warning-apply-duration: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION")
|
|
+experimental-watch-progress-notify-interval: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL")
|
|
+EOF
|
|
+
|
|
+ {
|
|
+ if [ -n "$ETCD_CIPHER_SUITES" ]; then
|
|
+ echo "cipher-suites:"
|
|
+ echo "$ETCD_CIPHER_SUITES" | tr ',' '\n' | while read -r cipher; do
|
|
+ echo " - \"$cipher\""
|
|
+ done
|
|
+ fi
|
|
+ } >> "$ETCD_CONFIGURATION_FILE"
|
|
+}
|
|
+
|
|
archive_data_folder()
|
|
{
|
|
# TODO: use etcd snapshots
|
|
@@ -634,7 +770,7 @@ add_member_as_learner()
|
|
local endpoint_url=$(ip_url $(attribute_node_ip get))
|
|
local peer_url=$(ip_url $member_ip)
|
|
|
|
- ocf_log info "add $member_name ($member_ip) to the member list as learner"
|
|
+ ocf_log info "add $member_name ($member_ip, $endpoint_url) to the member list as learner"
|
|
out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner)
|
|
rc=$?
|
|
if [ $rc -ne 0 ]; then
|
|
@@ -1104,18 +1240,18 @@ compare_revision()
|
|
peer_revision=$(attribute_node_revision_peer)
|
|
|
|
if [ "$revision" = "" ] || [ "$revision" = "null" ] || [ "$peer_revision" = "" ] || [ "$peer_revision" = "null" ]; then
|
|
- ocf_log err "could not compare revisions: $NODENAME local revision: $revision, peer revision: $peer_revision"
|
|
+ ocf_log err "could not compare revisions: '$NODENAME' local revision='$revision', peer revision='$peer_revision'"
|
|
return "$OCF_ERR_GENERIC"
|
|
fi
|
|
|
|
if [ "$revision" -gt "$peer_revision" ]; then
|
|
- ocf_log info "$NODENAME revision: $revision is newer than peer revision: $peer_revision"
|
|
+ ocf_log info "$NODENAME revision: '$revision' is newer than peer revision: '$peer_revision'"
|
|
echo "newer"
|
|
elif [ "$revision" -eq "$peer_revision" ]; then
|
|
- ocf_log info "$NODENAME revision: $revision is equal to peer revision: $peer_revision"
|
|
+ ocf_log info "$NODENAME revision: '$revision' is equal to peer revision: '$peer_revision'"
|
|
echo "equal"
|
|
else
|
|
- ocf_log info "$NODENAME revision: $revision is older than peer revision: $peer_revision"
|
|
+ ocf_log info "$NODENAME revision: '$revision' is older than peer revision: '$peer_revision'"
|
|
echo "older"
|
|
fi
|
|
return "$OCF_SUCCESS"
|
|
@@ -1144,6 +1280,100 @@ ensure_pod_manifest_exists()
|
|
return "$OCF_SUCCESS"
|
|
}
|
|
|
|
+filter_pod_manifest() {
|
|
+ # Remove pod-version related fields from POD manifest
|
|
+ local pod_manifest="$1"
|
|
+ local temporary_file
|
|
+ local jq_filter='del(.metadata.labels.revision) | .spec.containers[] |= ( .env |= map(select( .name != "ETCD_STATIC_POD_VERSION" ))) | .spec.volumes |= map( select( .name != "resource-dir" ))'
|
|
+
|
|
+ if ! temporary_file=$(mktemp); then
|
|
+ ocf_log err "could not create temporary file for '$pod_manifest', error code: $?"
|
|
+ return $OCF_ERR_GENERIC
|
|
+ fi
|
|
+ if ! jq "$jq_filter" "$pod_manifest" > "$temporary_file"; then
|
|
+ ocf_log err "could not remove pod version related data from '$pod_manifest', error code: $?"
|
|
+ return $OCF_ERR_GENERIC
|
|
+ fi
|
|
+ echo "$temporary_file"
|
|
+}
|
|
+
|
|
+can_reuse_container() {
|
|
+ # Decide whether to reuse the existing container or create a new one based on etcd pod manifest changes.
|
|
+ # NOTE: explicitly ignore POD version and POD version related data, as the content might be the same even if the revision number has changed.
|
|
+ local cp_rc
|
|
+ local diff_rc
|
|
+ local filtered_original_pod_manifest
|
|
+ local filtered_copy_pod_manifest
|
|
+
|
|
+
|
|
+ # If the container does not exist it cannot be reused
|
|
+ if ! container_exists; then
|
|
+ OCF_RESKEY_reuse=0
|
|
+ return "$OCF_SUCCESS"
|
|
+ fi
|
|
+
|
|
+ # If the manifest copy doesn't exist, we need a new container.
|
|
+ if [ ! -f "$POD_MANIFEST_COPY" ]; then
|
|
+ ocf_log info "a working copy of $OCF_RESKEY_pod_manifest was not found. A new etcd container will be created."
|
|
+ OCF_RESKEY_reuse=0
|
|
+ return "$OCF_SUCCESS"
|
|
+ fi
|
|
+
|
|
+ if ! filtered_original_pod_manifest=$(filter_pod_manifest "$OCF_RESKEY_pod_manifest"); then
|
|
+ return $OCF_ERR_GENERIC
|
|
+ fi
|
|
+ if ! filtered_copy_pod_manifest=$(filter_pod_manifest "$POD_MANIFEST_COPY"); then
|
|
+ return $OCF_ERR_GENERIC
|
|
+ fi
|
|
+
|
|
+ ocf_log info "comparing $OCF_RESKEY_pod_manifest with local copy $POD_MANIFEST_COPY"
|
|
+ ocf_run diff -s "$filtered_original_pod_manifest" "$filtered_copy_pod_manifest"
|
|
+ diff_rc="$?"
|
|
+ # clean up temporary files
|
|
+ rm -f "$filtered_original_pod_manifest" "$filtered_copy_pod_manifest"
|
|
+ case "$diff_rc" in
|
|
+ 0)
|
|
+ ocf_log info "Reusing the existing etcd container"
|
|
+ OCF_RESKEY_reuse=1
|
|
+ ;;
|
|
+ 1)
|
|
+ ocf_log info "Etcd pod manifest changes detected: creating a new etcd container to apply the changes"
|
|
+ if ! ocf_run cp -p "$OCF_RESKEY_pod_manifest" "$POD_MANIFEST_COPY"; then
|
|
+ cp_rc="$?"
|
|
+ ocf_log err "Could not create a working copy of $OCF_RESKEY_pod_manifest, rc: $cp_rc"
|
|
+ return "$OCF_ERR_GENERIC"
|
|
+ fi
|
|
+ ocf_log info "A working copy of $OCF_RESKEY_pod_manifest was created"
|
|
+ OCF_RESKEY_reuse=0
|
|
+ ;;
|
|
+ *)
|
|
+ ocf_log err "Could not check if etcd pod manifest has changed, diff rc: $diff_rc"
|
|
+ return "$OCF_ERR_GENERIC"
|
|
+ ;;
|
|
+ esac
|
|
+
|
|
+ return "$OCF_SUCCESS"
|
|
+}
|
|
+
|
|
+ensure_pod_manifest_copy_exists() {
|
|
+ local cp_rc
|
|
+
|
|
+ if [ -f "$POD_MANIFEST_COPY" ]; then
|
|
+ return "$OCF_SUCCESS"
|
|
+ fi
|
|
+
|
|
+ # If the manifest copy doesn't exist, create it and ensure a new container.
|
|
+ if ! ocf_run cp -p "$OCF_RESKEY_pod_manifest" "$POD_MANIFEST_COPY"; then
|
|
+ cp_rc="$?"
|
|
+ ocf_log err "Could not create a working copy of $OCF_RESKEY_pod_manifest, rc: $cp_rc"
|
|
+ return "$OCF_ERR_GENERIC"
|
|
+ fi
|
|
+
|
|
+ ocf_log info "a new working copy of $OCF_RESKEY_pod_manifest was created"
|
|
+
|
|
+ return "$OCF_SUCCESS"
|
|
+}
|
|
+
|
|
podman_start()
|
|
{
|
|
local cid
|
|
@@ -1173,6 +1403,13 @@ podman_start()
|
|
return $OCF_ERR_GENERIC
|
|
fi
|
|
|
|
+ # check if the container has already started
|
|
+ podman_simple_status
|
|
+ if [ $? -eq $OCF_SUCCESS ]; then
|
|
+ ocf_log info "the '$CONTAINER' has already started. Nothing to do"
|
|
+ return "$OCF_SUCCESS"
|
|
+ fi
|
|
+
|
|
if ! ensure_pod_manifest_exists; then
|
|
ocf_exit_reason "could not find etcd pod manifest ($OCF_RESKEY_pod_manifest)"
|
|
return "$OCF_ERR_GENERIC"
|
|
@@ -1186,8 +1423,9 @@ podman_start()
|
|
ocf_log info "static pod was running: start normally"
|
|
else
|
|
if is_force_new_cluster; then
|
|
- ocf_log notice "$NODENAME marked to force-new-cluster"
|
|
+ ocf_log notice "'$NODENAME' marked to force-new-cluster"
|
|
else
|
|
+ ocf_log info "'$NODENAME' is not marked to force-new-cluster"
|
|
# When the local agent starts, we can infer the cluster state by counting
|
|
# how many agents are starting or already active:
|
|
# - 1 active agent: it's the peer (we are just starting)
|
|
@@ -1195,6 +1433,7 @@ podman_start()
|
|
# - 0 active agents, 2 starting: both agents are starting simultaneously
|
|
local active_resources_count
|
|
active_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_active_resource" | wc -w)
|
|
+ ocf_log info "found '$active_resources_count' active etcd resources (meta notify environment variable: '$OCF_RESKEY_CRM_meta_notify_active_resource')"
|
|
case "$active_resources_count" in
|
|
1)
|
|
if [ "$(attribute_learner_node get)" = "$(get_peer_node_name)" ]; then
|
|
@@ -1205,17 +1444,17 @@ podman_start()
|
|
fi
|
|
;;
|
|
0)
|
|
+ # count how many agents are starting now
|
|
+ local start_resources_count
|
|
+ start_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_start_resource" | wc -w)
|
|
+ ocf_log info "found '$start_resources_count' starting etcd resources (meta notify environment variable: '$OCF_RESKEY_CRM_meta_notify_start_resource')"
|
|
+
|
|
# we need to compare the revisions in any of the following branches
|
|
# so call the function only once here
|
|
if ! revision_compare_result=$(compare_revision); then
|
|
ocf_log err "could not compare revisions, error code: $?"
|
|
return "$OCF_ERR_GENERIC"
|
|
fi
|
|
-
|
|
- # count how many agents are starting now
|
|
- local start_resources_count
|
|
- start_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_start_resource" | wc -w)
|
|
-
|
|
case "$start_resources_count" in
|
|
1)
|
|
ocf_log debug "peer not starting: ensure we can start a new cluster"
|
|
@@ -1231,6 +1470,7 @@ podman_start()
|
|
fi
|
|
;;
|
|
2)
|
|
+ # TODO: can we start "normally", regardless the revisions, if the container-id is the same on both nodes?
|
|
ocf_log info "peer starting"
|
|
if [ "$revision_compare_result" = "newer" ]; then
|
|
set_force_new_cluster
|
|
@@ -1263,7 +1503,7 @@ podman_start()
|
|
fi
|
|
|
|
podman_create_mounts
|
|
- local run_opts="--detach --name=${CONTAINER}"
|
|
+ local run_opts="--detach --name=${CONTAINER} --replace"
|
|
|
|
run_opts="$run_opts --oom-score-adj=${OCF_RESKEY_oom}"
|
|
|
|
@@ -1297,61 +1537,59 @@ podman_start()
|
|
archive_data_folder
|
|
fi
|
|
|
|
- prepare_env
|
|
+ ocf_log info "check for changes in pod manifest to decide if the container should be reused or replaced"
|
|
+ if ! can_reuse_container ; then
|
|
+ rc="$?"
|
|
+ ocf_log err "could not determine etcd container reuse strategy, rc: $rc"
|
|
+ return "$rc"
|
|
+ fi
|
|
+
|
|
+ # Archive current container and its configuration before creating
|
|
+ # new configuration files.
|
|
+ if ! ocf_is_true "$OCF_RESKEY_reuse"; then
|
|
+ # Log archive container failures but don't block, as the priority
|
|
+ # is ensuring the etcd container starts successfully.
|
|
+ archive_current_container
|
|
+ fi
|
|
+
|
|
+ if ! ensure_pod_manifest_copy_exists; then
|
|
+ return $OCF_ERR_GENERIC
|
|
+ fi
|
|
+
|
|
+ if ! prepare_env; then
|
|
+ ocf_log err "Could not prepare environment for podman, error code: $?"
|
|
+ return $OCF_ERR_GENERIC
|
|
+ fi
|
|
+
|
|
+ if ! generate_etcd_configuration; then
|
|
+ ocf_log err "Could not generate etcd configuration, error code: $?"
|
|
+ return $OCF_ERR_GENERIC
|
|
+ fi
|
|
|
|
- # add etcd-specific opts
|
|
run_opts="$run_opts \
|
|
- --network=host \
|
|
- -v /etc/kubernetes/static-pod-resources/etcd-certs:/etc/kubernetes/static-pod-certs \
|
|
- -v /var/lib/etcd:/var/lib/etcd \
|
|
- --env ALL_ETCD_ENDPOINTS=$ALL_ETCD_ENDPOINTS \
|
|
- --env ETCD_CIPHER_SUITES=$ETCD_CIPHER_SUITES \
|
|
- --env ETCD_DATA_DIR=$ETCD_DATA_DIR \
|
|
- --env ETCD_ELECTION_TIMEOUT=$ETCD_ELECTION_TIMEOUT \
|
|
- --env ETCD_ENABLE_PPROF=$ETCD_ENABLE_PPROF \
|
|
- --env ETCD_EXPERIMENTAL_MAX_LEARNERS=$ETCD_EXPERIMENTAL_MAX_LEARNERS \
|
|
- --env ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION=$ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION \
|
|
- --env ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL=$ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL \
|
|
- --env ETCD_HEARTBEAT_INTERVAL=$ETCD_HEARTBEAT_INTERVAL \
|
|
- --env ETCD_INITIAL_CLUSTER=$ETCD_INITIAL_CLUSTER \
|
|
- --env ETCD_INITIAL_CLUSTER_STATE=$ETCD_INITIAL_CLUSTER_STATE \
|
|
- --env ETCD_NAME=$NODENAME \
|
|
- --env ETCD_QUOTA_BACKEND_BYTES=$ETCD_QUOTA_BACKEND_BYTES \
|
|
- --env ETCD_SOCKET_REUSE_ADDRESS=$ETCD_SOCKET_REUSE_ADDRESS \
|
|
- --env ETCDCTL_API=$ETCDCTL_API \
|
|
- --env ETCDCTL_CACERT=$SERVER_CACERT \
|
|
- --env ETCDCTL_CERT=$ETCD_PEER_CERT \
|
|
- --env ETCDCTL_KEY=$ETCD_PEER_KEY \
|
|
- --authfile=$OCF_RESKEY_authfile \
|
|
- --security-opt label=disable"
|
|
+ --network=host \
|
|
+ -v /etc/kubernetes/static-pod-resources/etcd-certs:/etc/kubernetes/static-pod-certs \
|
|
+ -v /var/lib/etcd:/var/lib/etcd \
|
|
+ --env ETCDCTL_API=$ETCDCTL_API \
|
|
+ --env ETCDCTL_CACERT=$SERVER_CACERT \
|
|
+ --env ETCDCTL_CERT=$ETCD_PEER_CERT \
|
|
+ --env ETCDCTL_KEY=$ETCD_PEER_KEY \
|
|
+ --authfile=$OCF_RESKEY_authfile \
|
|
+ --security-opt label=disable"
|
|
if [ -n "$OCF_RESKEY_run_opts" ]; then
|
|
run_opts="$run_opts $OCF_RESKEY_run_opts"
|
|
fi
|
|
|
|
- OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd --logger=zap \
|
|
- --log-level=info \
|
|
- --experimental-initial-corrupt-check=true \
|
|
- --snapshot-count=10000 \
|
|
- --initial-advertise-peer-urls=$NODEIPURL:2380 \
|
|
- --cert-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.crt \
|
|
- --key-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.key \
|
|
- --trusted-ca-file=$SERVER_CACERT \
|
|
- --client-cert-auth=true \
|
|
- --peer-cert-file=$ETCD_PEER_CERT \
|
|
- --peer-key-file=$ETCD_PEER_KEY \
|
|
- --peer-trusted-ca-file=$SERVER_CACERT \
|
|
- --peer-client-cert-auth=true \
|
|
- --advertise-client-urls=$NODEIPURL:2379 \
|
|
- --listen-client-urls=$(ip_url ${LISTEN_CLIENT_URLS}):2379,unixs://${NODEIP}:0 \
|
|
- --listen-peer-urls=$(ip_url ${LISTEN_PEER_URLS}):2380 \
|
|
- --metrics=extensive \
|
|
- --listen-metrics-urls=$(ip_url ${LISTEN_METRICS_URLS}):9978"
|
|
- if [ -n "$OCF_RESKEY_run_cmd_opts" ]; then
|
|
- OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd $OCF_RESKEY_run_cmd_opts"
|
|
+ if [ -f "$ETCD_CONFIGURATION_FILE" ]; then
|
|
+ ocf_log info "using etcd configuration file: $ETCD_CONFIGURATION_FILE"
|
|
+ else
|
|
+ ocf_log err "could not find $ETCD_CONFIGURATION_FILE"
|
|
+ return "$OCF_ERR_GENERIC"
|
|
fi
|
|
|
|
- if is_force_new_cluster; then
|
|
- OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd --force-new-cluster"
|
|
+ OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd --config-file=$ETCD_CONFIGURATION_FILE"
|
|
+ if [ -n "$OCF_RESKEY_run_cmd_opts" ]; then
|
|
+ OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd $OCF_RESKEY_run_cmd_opts"
|
|
fi
|
|
|
|
if [ "$OCF_RESKEY_image" = "$OCF_RESKEY_image_default" ]; then
|
|
@@ -1377,9 +1615,7 @@ podman_start()
|
|
ocf_log info "starting existing container $CONTAINER."
|
|
ocf_run podman start "$CONTAINER"
|
|
else
|
|
- # make sure any previous container matching our container name is cleaned up first.
|
|
- # we already know at this point it wouldn't be running
|
|
- remove_container
|
|
+ ocf_log info "starting new container $CONTAINER."
|
|
run_new_container "$run_opts" "$OCF_RESKEY_image" "$OCF_RESKEY_run_cmd"
|
|
if [ $? -eq 125 ]; then
|
|
return $OCF_ERR_GENERIC
|
|
@@ -1439,7 +1675,6 @@ podman_stop()
|
|
local rc
|
|
podman_simple_status
|
|
if [ $? -eq $OCF_NOT_RUNNING ]; then
|
|
- remove_container
|
|
ocf_log info "could not leave members list: etcd container not running"
|
|
return $OCF_SUCCESS
|
|
fi
|
|
@@ -1475,7 +1710,7 @@ podman_stop()
|
|
ocf_run podman kill "$CONTAINER"
|
|
rc=$?
|
|
else
|
|
- ocf_log debug "waiting $timeout second[s] before killing container"
|
|
+ ocf_log info "waiting $timeout second[s] before killing container"
|
|
ocf_run podman stop -t="$timeout" "$CONTAINER"
|
|
rc=$?
|
|
# on stop, systemd will automatically delete any transient
|
|
@@ -1496,11 +1731,6 @@ podman_stop()
|
|
fi
|
|
fi
|
|
|
|
- if ! remove_container; then
|
|
- ocf_exit_reason "Failed to remove stopped container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}."
|
|
- return $OCF_ERR_GENERIC
|
|
- fi
|
|
-
|
|
return $OCF_SUCCESS
|
|
}
|
|
|
|
@@ -1532,6 +1762,7 @@ podman_validate()
|
|
check_binary oc
|
|
check_binary podman
|
|
check_binary jq
|
|
+ check_binary tar
|
|
|
|
if [ -z "$OCF_RESKEY_node_ip_map" ]; then
|
|
ocf_exit_reason "'node_ip_map' option is required"
|
|
@@ -1589,6 +1820,9 @@ else
|
|
fi
|
|
|
|
CONTAINER=$OCF_RESKEY_name
|
|
+POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml"
|
|
+ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml"
|
|
+ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz"
|
|
|
|
# Note: we currently monitor podman containers by with the "podman exec"
|
|
# command, so make sure that invocation is always valid by enforcing the
|