From 6e9200dc2ffc89382188794742361985309936b2 Mon Sep 17 00:00:00 2001
From: Carlo Lobrano
Date: Wed, 23 Jul 2025 09:34:13 +0200
Subject: [PATCH] podman-etcd: preserve containers for debugging

This change modifies the agent to keep stopped containers for log
inspection and debugging, with supporting changes to enable this
behavior.

* Conditionally reuse the existing container when its configuration is
  unchanged
* Move the inline etcd configuration flags to an external file to allow
  restarts without container recreation (mainly for the
  force-new-cluster flag)
* Archive the previous container by renaming it to *-previous, and its
  configuration files into the /var/lib/etcd/config-previous.tar.gz
  archive. The tar.gz archive consists of:
  * the pod manifest created by CEO, used to generate the etcd
    configuration file
  * the etcd configuration file
  * the auth json file
  Only one copy is maintained to limit disk usage.
* The configuration and backup file locations are configurable via two
  new input parameters.

Signed-off-by: Carlo Lobrano
---
 heartbeat/podman-etcd | 438 ++++++++++++++++++++++++++++++++----------
 1 file changed, 336 insertions(+), 102 deletions(-)

diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index 4969fbaaf..33804414a 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -46,6 +46,8 @@ OCF_RESKEY_authfile_default="/var/lib/kubelet/config.json"
 OCF_RESKEY_allow_pull_default="1"
 OCF_RESKEY_reuse_default="0"
 OCF_RESKEY_oom_default="-997"
+OCF_RESKEY_config_location_default="/var/lib/etcd"
+OCF_RESKEY_backup_location_default="/var/lib/etcd"
 
 : ${OCF_RESKEY_image=${OCF_RESKEY_image_default}}
 : ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}}
@@ -55,6 +57,9 @@ OCF_RESKEY_oom_default="-997"
 : ${OCF_RESKEY_allow_pull=${OCF_RESKEY_allow_pull_default}}
 : ${OCF_RESKEY_reuse=${OCF_RESKEY_reuse_default}}
 : ${OCF_RESKEY_oom=${OCF_RESKEY_oom_default}}
+: ${OCF_RESKEY_config_location=${OCF_RESKEY_config_location_default}}
+: ${OCF_RESKEY_backup_location=${OCF_RESKEY_backup_location_default}}
+
 
 #######################################################################
 
@@ -242,6 +247,23 @@ https://kubernetes.io/docs/concepts/scheduling-eviction/node-pressure-eviction/#
 </longdesc>
 <shortdesc lang="en">OOM for container</shortdesc>
 <content type="string"/>
 </parameter>
+
+<parameter name="config_location">
+<longdesc lang="en">
+The directory where the resource agent stores its state files, such as the generated etcd configuration and a copy of the pod manifest.
+</longdesc>
+<shortdesc lang="en">Resource agent state directory</shortdesc>
+<content type="string" default="${OCF_RESKEY_config_location_default}"/>
+</parameter>
+
+<parameter name="backup_location">
+<longdesc lang="en">
+The directory where the resource agent stores its backups.
+</longdesc>
+<shortdesc lang="en">Resource agent backup directory</shortdesc>
+<content type="string" default="${OCF_RESKEY_backup_location_default}"/>
+</parameter>
+
@@ -309,42 +331,52 @@ container_exists()
     return 1
 }
 
-remove_container()
+# archive_current_container archives the current
+# podman etcd container and its configuration files.
+archive_current_container()
 {
-    local rc
-    local execids
+    # don't attempt to archive a container that doesn't exist
+    if ! container_exists; then
+        return
+    fi
 
-    if ocf_is_true "$OCF_RESKEY_reuse"; then
-        # never remove the container if we have reuse enabled.
-        return 0
+    # delete any container named "*-previous", or we won't be able to archive the current container.
+    if podman inspect "${CONTAINER}-previous" >/dev/null 2>&1; then
+        ocf_log info "removing old archived container '$CONTAINER-previous'"
+        if ! ocf_run podman rm --volumes --force "$CONTAINER-previous"; then
+            ocf_log warn "could not remove old archived container (podman rm failed, error code: $?); won't be able to archive the current container"
+            return
+        fi
     fi
 
-    if ! 
container_exists; then - # don't attempt to remove a container that doesn't exist - return 0 + ocf_log info "archiving '$CONTAINER' container as '$CONTAINER-previous' for debugging purposes" + if ! ocf_run podman rename "$CONTAINER" "$CONTAINER-previous"; then + ocf_log err "could not archive container '$CONTAINER', error code: $?" + return fi - ocf_log notice "Cleaning up inactive container, ${CONTAINER}." - ocf_run podman rm -v "$CONTAINER" - rc=$? - if [ $rc -ne 0 ]; then - if [ $rc -eq 2 ]; then - if podman inspect --format '{{.State.Status}}' "$CONTAINER" | grep -wq "stopping"; then - ocf_log err "Inactive container ${CONTAINER} is stuck in 'stopping' state. Force-remove it." - ocf_run podman rm -f "$CONTAINER" - rc=$? - fi - fi - # due to a podman bug (rhbz#1841485), sometimes a stopped - # container can still be associated with Exec sessions, in - # which case the "podman rm" has to be forced - execids=$(podman inspect "$CONTAINER" --format '{{len .ExecIDs}}') - if [ "$execids" -ne "0" ]; then - ocf_log warn "Inactive container ${CONTAINER} has lingering exec sessions. Force-remove it." - ocf_run podman rm -f "$CONTAINER" - rc=$? + + # archive corresponding etcd configuration files + local files_to_archive="" + for file in "$OCF_RESKEY_authfile" "$POD_MANIFEST_COPY" "$ETCD_CONFIGURATION_FILE"; do + if [ -f "$file" ]; then + files_to_archive="$files_to_archive $file" + else + ocf_log warn "file '$file' is missing and won't be archived" fi + done + + if [ -z "$files_to_archive" ]; then + ocf_log warn "could not find any file to archive." + return + fi + + # NOTE: tar will override any existing archive as wanted + # shellcheck disable=SC2086 + if ! ocf_run tar --create --verbose --gzip --file "$ETCD_BACKUP_FILE" $files_to_archive; then + ocf_log warn "container archived successfully, but configuration backup failed (error: $?). Container debugging available, but without matching configuration files" + else + ocf_log info "container configuration also archived in '$ETCD_BACKUP_FILE'" fi - return $rc } # Correctly wraps an ipv6 in [] for url otherwise use return normal ipv4 address. @@ -365,6 +397,7 @@ attribute_node_ip() local attribute="node_ip" local ip_addr name + # TODO: We can retrieve both the local and peer IP addresses from this map, which eliminates the need to use CIB to share them between nodes for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do name=$(echo "$node" | cut -d: -f1) # ignore other nodes @@ -375,7 +408,7 @@ attribute_node_ip() done if [ -z "$ip_addr" ]; then - ocf_log err "ip address was empty when querying (getent ahosts) for hostname: $(hostname -f)" + ocf_log err "could not get local ip address from node_ip_map: '$OCF_RESKEY_node_ip_map'" return 1 fi @@ -384,9 +417,9 @@ attribute_node_ip() echo "$ip_addr" ;; update) - if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value"; then + if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$ip_addr"; then rc="$?" - ocf_log err "could not set $attribute to $value, error code: $rc" + ocf_log err "could not set $attribute to $ip_addr, error code: $rc" return "$rc" fi ;; @@ -428,6 +461,48 @@ get_env_from_manifest() { echo "$env_var_value" } +# etcd configuration file expects duration to be expressed in nanoseconds +convert_duration_in_nanoseconds() { + local duration=$1 + local value unit nanoseconds + + if [ -z "$duration" ]; then + ocf_log err "convert_duration_in_nanoseconds: no duration provided" + return 1 + fi + + if ! 
echo "$duration" | grep -qE '^[0-9]+[numµ]?s$'; then + ocf_log err "convert_duration_in_nanoseconds: invalid duration format \"$duration\". Expected format: where unit is one of s, ms, us, µs, ns" + return 1 + fi + + # Extract numeric value and unit from duration string + value=$(echo "$duration" | sed 's/[^0-9]*$//') + unit=$(echo "$duration" | sed 's/^[0-9]*//') + + case "$unit" in + ns) + nanoseconds=$value + ;; + us|µs) + nanoseconds=$((value * 1000)) + ;; + ms) + nanoseconds=$((value * 1000000)) + ;; + s) + nanoseconds=$((value * 1000000000)) + ;; + *) + # this should not happen as the input is already validated + ocf_log err "convert_duration_in_nanoseconds: unknown duration unit \"$unit\"" + return 1 + ;; + esac + + echo "$nanoseconds" +} + prepare_env() { local name ip ipurl standalone_node @@ -457,9 +532,14 @@ prepare_env() { ETCDCTL_API=$(get_env_from_manifest "ETCDCTL_API") ETCD_CIPHER_SUITES=$(get_env_from_manifest "ETCD_CIPHER_SUITES") ETCD_DATA_DIR=$(get_env_from_manifest "ETCD_DATA_DIR") + if [ ! -d "$ETCD_DATA_DIR" ]; then + ocf_log err "could not find data-dir at path \"$ETCD_DATA_DIR\"" + return "$OCF_ERR_ARGS" + else + ocf_log info "using data-dir: $ETCD_DATA_DIR" + fi ETCD_ELECTION_TIMEOUT=$(get_env_from_manifest "ETCD_ELECTION_TIMEOUT") ETCD_ENABLE_PPROF=$(get_env_from_manifest "ETCD_ENABLE_PPROF") - ETCD_EXPERIMENTAL_MAX_LEARNERS=$(get_env_from_manifest "ETCD_EXPERIMENTAL_MAX_LEARNERS") ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION=$(get_env_from_manifest "ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION") ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL=$(get_env_from_manifest "ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL") ETCD_HEARTBEAT_INTERVAL=$(get_env_from_manifest "ETCD_HEARTBEAT_INTERVAL") @@ -475,6 +555,62 @@ prepare_env() { LISTEN_METRICS_URLS="0.0.0.0" } + +generate_etcd_configuration() { + if is_force_new_cluster; then + # The embedded newline is required for correct YAML formatting. 
+ FORCE_NEW_CLUSTER_CONFIG="force-new-cluster: true +force-new-cluster-bump-amount: 1000000000" + else + FORCE_NEW_CLUSTER_CONFIG="force-new-cluster: false" + fi + + cat > "$ETCD_CONFIGURATION_FILE" << EOF +logger: zap +log-level: info +snapshot-count: 10000 +name: $NODENAME +data-dir: $ETCD_DATA_DIR +$FORCE_NEW_CLUSTER_CONFIG +socket-reuse-address: $ETCD_SOCKET_REUSE_ADDRESS +election-timeout: $ETCD_ELECTION_TIMEOUT +enable-pprof: $ETCD_ENABLE_PPROF +heartbeat-interval: $ETCD_HEARTBEAT_INTERVAL +quota-backend-bytes: $ETCD_QUOTA_BACKEND_BYTES +initial-advertise-peer-urls: "$NODEIPURL:2380" +listen-peer-urls: "$(ip_url ${LISTEN_PEER_URLS}):2380" +listen-client-urls: "$(ip_url ${LISTEN_CLIENT_URLS}):2379,unixs://${NODEIP}:0" +initial-cluster: $ETCD_INITIAL_CLUSTER +initial-cluster-state: $ETCD_INITIAL_CLUSTER_STATE +client-transport-security: + cert-file: /etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.crt + key-file: /etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.key + client-cert-auth: true + trusted-ca-file: $SERVER_CACERT +peer-transport-security: + cert-file: $ETCD_PEER_CERT + key-file: $ETCD_PEER_KEY + client-cert-auth: true + trusted-ca-file: $SERVER_CACERT +advertise-client-urls: "$NODEIPURL:2379" +listen-metrics-urls: "$(ip_url ${LISTEN_METRICS_URLS}):9978" +metrics: extensive +experimental-initial-corrupt-check: true +experimental-max-learners: 1 +experimental-warning-apply-duration: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION") +experimental-watch-progress-notify-interval: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL") +EOF + + { + if [ -n "$ETCD_CIPHER_SUITES" ]; then + echo "cipher-suites:" + echo "$ETCD_CIPHER_SUITES" | tr ',' '\n' | while read -r cipher; do + echo " - \"$cipher\"" + done + fi + } >> "$ETCD_CONFIGURATION_FILE" +} + archive_data_folder() { # TODO: use etcd snapshots @@ -634,7 +770,7 @@ add_member_as_learner() local endpoint_url=$(ip_url $(attribute_node_ip get)) local peer_url=$(ip_url $member_ip) - ocf_log info "add $member_name ($member_ip) to the member list as learner" + ocf_log info "add $member_name ($member_ip, $endpoint_url) to the member list as learner" out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner) rc=$? 
if [ $rc -ne 0 ]; then @@ -1104,18 +1240,18 @@ compare_revision() peer_revision=$(attribute_node_revision_peer) if [ "$revision" = "" ] || [ "$revision" = "null" ] || [ "$peer_revision" = "" ] || [ "$peer_revision" = "null" ]; then - ocf_log err "could not compare revisions: $NODENAME local revision: $revision, peer revision: $peer_revision" + ocf_log err "could not compare revisions: '$NODENAME' local revision='$revision', peer revision='$peer_revision'" return "$OCF_ERR_GENERIC" fi if [ "$revision" -gt "$peer_revision" ]; then - ocf_log info "$NODENAME revision: $revision is newer than peer revision: $peer_revision" + ocf_log info "$NODENAME revision: '$revision' is newer than peer revision: '$peer_revision'" echo "newer" elif [ "$revision" -eq "$peer_revision" ]; then - ocf_log info "$NODENAME revision: $revision is equal to peer revision: $peer_revision" + ocf_log info "$NODENAME revision: '$revision' is equal to peer revision: '$peer_revision'" echo "equal" else - ocf_log info "$NODENAME revision: $revision is older than peer revision: $peer_revision" + ocf_log info "$NODENAME revision: '$revision' is older than peer revision: '$peer_revision'" echo "older" fi return "$OCF_SUCCESS" @@ -1144,6 +1280,100 @@ ensure_pod_manifest_exists() return "$OCF_SUCCESS" } +filter_pod_manifest() { + # Remove pod-version related fields from POD manifest + local pod_manifest="$1" + local temporary_file + local jq_filter='del(.metadata.labels.revision) | .spec.containers[] |= ( .env |= map(select( .name != "ETCD_STATIC_POD_VERSION" ))) | .spec.volumes |= map( select( .name != "resource-dir" ))' + + if ! temporary_file=$(mktemp); then + ocf_log err "could not create temporary file for '$pod_manifest', error code: $?" + return $OCF_ERR_GENERIC + fi + if ! jq "$jq_filter" "$pod_manifest" > "$temporary_file"; then + ocf_log err "could not remove pod version related data from '$pod_manifest', error code: $?" + return $OCF_ERR_GENERIC + fi + echo "$temporary_file" +} + +can_reuse_container() { + # Decide whether to reuse the existing container or create a new one based on etcd pod manifest changes. + # NOTE: explicitly ignore POD version and POD version related data, as the content might be the same even if the revision number has changed. + local cp_rc + local diff_rc + local filtered_original_pod_manifest + local filtered_copy_pod_manifest + + + # If the container does not exist it cannot be reused + if ! container_exists; then + OCF_RESKEY_reuse=0 + return "$OCF_SUCCESS" + fi + + # If the manifest copy doesn't exist, we need a new container. + if [ ! -f "$POD_MANIFEST_COPY" ]; then + ocf_log info "a working copy of $OCF_RESKEY_pod_manifest was not found. A new etcd container will be created." + OCF_RESKEY_reuse=0 + return "$OCF_SUCCESS" + fi + + if ! filtered_original_pod_manifest=$(filter_pod_manifest "$OCF_RESKEY_pod_manifest"); then + return $OCF_ERR_GENERIC + fi + if ! filtered_copy_pod_manifest=$(filter_pod_manifest "$POD_MANIFEST_COPY"); then + return $OCF_ERR_GENERIC + fi + + ocf_log info "comparing $OCF_RESKEY_pod_manifest with local copy $POD_MANIFEST_COPY" + ocf_run diff -s "$filtered_original_pod_manifest" "$filtered_copy_pod_manifest" + diff_rc="$?" + # clean up temporary files + rm -f "$filtered_original_pod_manifest" "$filtered_copy_pod_manifest" + case "$diff_rc" in + 0) + ocf_log info "Reusing the existing etcd container" + OCF_RESKEY_reuse=1 + ;; + 1) + ocf_log info "Etcd pod manifest changes detected: creating a new etcd container to apply the changes" + if ! 
ocf_run cp -p "$OCF_RESKEY_pod_manifest" "$POD_MANIFEST_COPY"; then + cp_rc="$?" + ocf_log err "Could not create a working copy of $OCF_RESKEY_pod_manifest, rc: $cp_rc" + return "$OCF_ERR_GENERIC" + fi + ocf_log info "A working copy of $OCF_RESKEY_pod_manifest was created" + OCF_RESKEY_reuse=0 + ;; + *) + ocf_log err "Could not check if etcd pod manifest has changed, diff rc: $diff_rc" + return "$OCF_ERR_GENERIC" + ;; + esac + + return "$OCF_SUCCESS" +} + +ensure_pod_manifest_copy_exists() { + local cp_rc + + if [ -f "$POD_MANIFEST_COPY" ]; then + return "$OCF_SUCCESS" + fi + + # If the manifest copy doesn't exist, create it and ensure a new container. + if ! ocf_run cp -p "$OCF_RESKEY_pod_manifest" "$POD_MANIFEST_COPY"; then + cp_rc="$?" + ocf_log err "Could not create a working copy of $OCF_RESKEY_pod_manifest, rc: $cp_rc" + return "$OCF_ERR_GENERIC" + fi + + ocf_log info "a new working copy of $OCF_RESKEY_pod_manifest was created" + + return "$OCF_SUCCESS" +} + podman_start() { local cid @@ -1173,6 +1403,13 @@ podman_start() return $OCF_ERR_GENERIC fi + # check if the container has already started + podman_simple_status + if [ $? -eq $OCF_SUCCESS ]; then + ocf_log info "the '$CONTAINER' has already started. Nothing to do" + return "$OCF_SUCCESS" + fi + if ! ensure_pod_manifest_exists; then ocf_exit_reason "could not find etcd pod manifest ($OCF_RESKEY_pod_manifest)" return "$OCF_ERR_GENERIC" @@ -1186,8 +1423,9 @@ podman_start() ocf_log info "static pod was running: start normally" else if is_force_new_cluster; then - ocf_log notice "$NODENAME marked to force-new-cluster" + ocf_log notice "'$NODENAME' marked to force-new-cluster" else + ocf_log info "'$NODENAME' is not marked to force-new-cluster" # When the local agent starts, we can infer the cluster state by counting # how many agents are starting or already active: # - 1 active agent: it's the peer (we are just starting) @@ -1195,6 +1433,7 @@ podman_start() # - 0 active agents, 2 starting: both agents are starting simultaneously local active_resources_count active_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_active_resource" | wc -w) + ocf_log info "found '$active_resources_count' active etcd resources (meta notify environment variable: '$OCF_RESKEY_CRM_meta_notify_active_resource')" case "$active_resources_count" in 1) if [ "$(attribute_learner_node get)" = "$(get_peer_node_name)" ]; then @@ -1205,17 +1444,17 @@ podman_start() fi ;; 0) + # count how many agents are starting now + local start_resources_count + start_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_start_resource" | wc -w) + ocf_log info "found '$start_resources_count' starting etcd resources (meta notify environment variable: '$OCF_RESKEY_CRM_meta_notify_start_resource')" + # we need to compare the revisions in any of the following branches # so call the function only once here if ! revision_compare_result=$(compare_revision); then ocf_log err "could not compare revisions, error code: $?" return "$OCF_ERR_GENERIC" fi - - # count how many agents are starting now - local start_resources_count - start_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_start_resource" | wc -w) - case "$start_resources_count" in 1) ocf_log debug "peer not starting: ensure we can start a new cluster" @@ -1231,6 +1470,7 @@ podman_start() fi ;; 2) + # TODO: can we start "normally", regardless the revisions, if the container-id is the same on both nodes? 
ocf_log info "peer starting" if [ "$revision_compare_result" = "newer" ]; then set_force_new_cluster @@ -1263,7 +1503,7 @@ podman_start() fi podman_create_mounts - local run_opts="--detach --name=${CONTAINER}" + local run_opts="--detach --name=${CONTAINER} --replace" run_opts="$run_opts --oom-score-adj=${OCF_RESKEY_oom}" @@ -1297,61 +1537,59 @@ podman_start() archive_data_folder fi - prepare_env + ocf_log info "check for changes in pod manifest to decide if the container should be reused or replaced" + if ! can_reuse_container ; then + rc="$?" + ocf_log err "could not determine etcd container reuse strategy, rc: $rc" + return "$rc" + fi + + # Archive current container and its configuration before creating + # new configuration files. + if ! ocf_is_true "$OCF_RESKEY_reuse"; then + # Log archive container failures but don't block, as the priority + # is ensuring the etcd container starts successfully. + archive_current_container + fi + + if ! ensure_pod_manifest_copy_exists; then + return $OCF_ERR_GENERIC + fi + + if ! prepare_env; then + ocf_log err "Could not prepare environment for podman, error code: $?" + return $OCF_ERR_GENERIC + fi + + if ! generate_etcd_configuration; then + ocf_log err "Could not generate etcd configuration, error code: $?" + return $OCF_ERR_GENERIC + fi - # add etcd-specific opts run_opts="$run_opts \ - --network=host \ - -v /etc/kubernetes/static-pod-resources/etcd-certs:/etc/kubernetes/static-pod-certs \ - -v /var/lib/etcd:/var/lib/etcd \ - --env ALL_ETCD_ENDPOINTS=$ALL_ETCD_ENDPOINTS \ - --env ETCD_CIPHER_SUITES=$ETCD_CIPHER_SUITES \ - --env ETCD_DATA_DIR=$ETCD_DATA_DIR \ - --env ETCD_ELECTION_TIMEOUT=$ETCD_ELECTION_TIMEOUT \ - --env ETCD_ENABLE_PPROF=$ETCD_ENABLE_PPROF \ - --env ETCD_EXPERIMENTAL_MAX_LEARNERS=$ETCD_EXPERIMENTAL_MAX_LEARNERS \ - --env ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION=$ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION \ - --env ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL=$ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL \ - --env ETCD_HEARTBEAT_INTERVAL=$ETCD_HEARTBEAT_INTERVAL \ - --env ETCD_INITIAL_CLUSTER=$ETCD_INITIAL_CLUSTER \ - --env ETCD_INITIAL_CLUSTER_STATE=$ETCD_INITIAL_CLUSTER_STATE \ - --env ETCD_NAME=$NODENAME \ - --env ETCD_QUOTA_BACKEND_BYTES=$ETCD_QUOTA_BACKEND_BYTES \ - --env ETCD_SOCKET_REUSE_ADDRESS=$ETCD_SOCKET_REUSE_ADDRESS \ - --env ETCDCTL_API=$ETCDCTL_API \ - --env ETCDCTL_CACERT=$SERVER_CACERT \ - --env ETCDCTL_CERT=$ETCD_PEER_CERT \ - --env ETCDCTL_KEY=$ETCD_PEER_KEY \ - --authfile=$OCF_RESKEY_authfile \ - --security-opt label=disable" + --network=host \ + -v /etc/kubernetes/static-pod-resources/etcd-certs:/etc/kubernetes/static-pod-certs \ + -v /var/lib/etcd:/var/lib/etcd \ + --env ETCDCTL_API=$ETCDCTL_API \ + --env ETCDCTL_CACERT=$SERVER_CACERT \ + --env ETCDCTL_CERT=$ETCD_PEER_CERT \ + --env ETCDCTL_KEY=$ETCD_PEER_KEY \ + --authfile=$OCF_RESKEY_authfile \ + --security-opt label=disable" if [ -n "$OCF_RESKEY_run_opts" ]; then run_opts="$run_opts $OCF_RESKEY_run_opts" fi - OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd --logger=zap \ - --log-level=info \ - --experimental-initial-corrupt-check=true \ - --snapshot-count=10000 \ - --initial-advertise-peer-urls=$NODEIPURL:2380 \ - --cert-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.crt \ - --key-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.key \ - --trusted-ca-file=$SERVER_CACERT \ - --client-cert-auth=true \ - --peer-cert-file=$ETCD_PEER_CERT \ - --peer-key-file=$ETCD_PEER_KEY \ - 
--peer-trusted-ca-file=$SERVER_CACERT \ - --peer-client-cert-auth=true \ - --advertise-client-urls=$NODEIPURL:2379 \ - --listen-client-urls=$(ip_url ${LISTEN_CLIENT_URLS}):2379,unixs://${NODEIP}:0 \ - --listen-peer-urls=$(ip_url ${LISTEN_PEER_URLS}):2380 \ - --metrics=extensive \ - --listen-metrics-urls=$(ip_url ${LISTEN_METRICS_URLS}):9978" - if [ -n "$OCF_RESKEY_run_cmd_opts" ]; then - OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd $OCF_RESKEY_run_cmd_opts" + if [ -f "$ETCD_CONFIGURATION_FILE" ]; then + ocf_log info "using etcd configuration file: $ETCD_CONFIGURATION_FILE" + else + ocf_log err "could not find $ETCD_CONFIGURATION_FILE" + return "$OCF_ERR_GENERIC" fi - if is_force_new_cluster; then - OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd --force-new-cluster" + OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd --config-file=$ETCD_CONFIGURATION_FILE" + if [ -n "$OCF_RESKEY_run_cmd_opts" ]; then + OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd $OCF_RESKEY_run_cmd_opts" fi if [ "$OCF_RESKEY_image" = "$OCF_RESKEY_image_default" ]; then @@ -1377,9 +1615,7 @@ podman_start() ocf_log info "starting existing container $CONTAINER." ocf_run podman start "$CONTAINER" else - # make sure any previous container matching our container name is cleaned up first. - # we already know at this point it wouldn't be running - remove_container + ocf_log info "starting new container $CONTAINER." run_new_container "$run_opts" "$OCF_RESKEY_image" "$OCF_RESKEY_run_cmd" if [ $? -eq 125 ]; then return $OCF_ERR_GENERIC @@ -1439,7 +1675,6 @@ podman_stop() local rc podman_simple_status if [ $? -eq $OCF_NOT_RUNNING ]; then - remove_container ocf_log info "could not leave members list: etcd container not running" return $OCF_SUCCESS fi @@ -1475,7 +1710,7 @@ podman_stop() ocf_run podman kill "$CONTAINER" rc=$? else - ocf_log debug "waiting $timeout second[s] before killing container" + ocf_log info "waiting $timeout second[s] before killing container" ocf_run podman stop -t="$timeout" "$CONTAINER" rc=$? # on stop, systemd will automatically delete any transient @@ -1496,11 +1731,6 @@ podman_stop() fi fi - if ! remove_container; then - ocf_exit_reason "Failed to remove stopped container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}." - return $OCF_ERR_GENERIC - fi - return $OCF_SUCCESS } @@ -1532,6 +1762,7 @@ podman_validate() check_binary oc check_binary podman check_binary jq + check_binary tar if [ -z "$OCF_RESKEY_node_ip_map" ]; then ocf_exit_reason "'node_ip_map' option is required" @@ -1589,6 +1820,9 @@ else fi CONTAINER=$OCF_RESKEY_name +POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml" +ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml" +ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz" # Note: we currently monitor podman containers by with the "podman exec" # command, so make sure that invocation is always valid by enforcing the
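
A quick usage sketch of the new duration helper (illustrative values only; the behavior follows directly from the conversion table in the function added above):

    convert_duration_in_nanoseconds "300ms"   # prints 300000000
    convert_duration_in_nanoseconds "5s"      # prints 5000000000
    convert_duration_in_nanoseconds "10us"    # prints 10000
    convert_duration_in_nanoseconds "1h"      # fails the '^[0-9]+[numµ]?s$' check: logs an error, returns 1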
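
And a sketch of the debugging workflow this patch enables, assuming the default backup location and a container named "etcd" (both are deployment-specific assumptions; adjust to the actual resource name and backup_location):

    podman logs etcd-previous                        # logs of the archived, stopped container
    podman inspect etcd-previous                     # its preserved state and settings
    tar -tzf /var/lib/etcd/config-previous.tar.gz    # list the matching configuration snapshot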