From 959b5c88c6a5e6a7a537eb6fc7e5033db8387777 Mon Sep 17 00:00:00 2001
From: Carlo Lobrano <c.lobrano@gmail.com>
Date: Thu, 24 Apr 2025 13:16:59 +0200
Subject: [PATCH] podman-etcd: new resource agent (#2023)

Introduce a resource agent for Podman to manage etcd instances.

This agent enables Pacemaker to control etcd containers, handling
start, stop, monitor, and recovery operations.
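
An illustrative two-node configuration (node names and IPs are
placeholders, and the clone options shown are an assumption, not part
of this patch):

    pcs resource create etcd ocf:heartbeat:podman-etcd \
        node_ip_map="master-0:192.0.2.10;master-1:192.0.2.11" \
        clone notify=true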
---
 doc/man/Makefile.am   |    1 +
 heartbeat/Makefile.am |    1 +
 heartbeat/podman-etcd | 1597 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 1599 insertions(+)
 create mode 100755 heartbeat/podman-etcd

diff --git a/doc/man/Makefile.am b/doc/man/Makefile.am
index bc8935782..0d34c7c65 100644
--- a/doc/man/Makefile.am
+++ b/doc/man/Makefile.am
@@ -187,6 +187,7 @@ man_MANS = ocf_heartbeat_AoEtarget.7 \
 	ocf_heartbeat_pgsqlms.7 \
 	ocf_heartbeat_pingd.7 \
 	ocf_heartbeat_podman.7 \
+	ocf_heartbeat_podman-etcd.7 \
 	ocf_heartbeat_portblock.7 \
 	ocf_heartbeat_postfix.7 \
 	ocf_heartbeat_pound.7 \
diff --git a/heartbeat/Makefile.am b/heartbeat/Makefile.am
index 5c41e0038..839505af9 100644
--- a/heartbeat/Makefile.am
+++ b/heartbeat/Makefile.am
@@ -159,6 +159,7 @@ ocf_SCRIPTS = AoEtarget \
 	pgsqlms \
 	pingd \
 	podman \
+	podman-etcd \
 	portblock \
 	postfix \
 	pound \
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
new file mode 100755
index 000000000..514dd2e5b
--- /dev/null
+++ b/heartbeat/podman-etcd
@@ -0,0 +1,1597 @@
+#!/bin/sh
+#
+# The podman etcd HA resource agent creates and launches an etcd podman
+# container based off a supplied podman image. Containers managed by
+# this agent are both created and removed upon the agent's start and
+# stop actions.
+#
+# Based on the podman resource agent.
+#
+# Copyright (c) 2014 David Vossel <davidvossel@gmail.com>
+#                    Michele Baldessari <michele@acksyn.org>
+# All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# Further, this software is distributed without any warranty that it is
+# free of the rightful claim of any third person regarding infringement
+# or the like. Any license provided herein, whether implied or
+# otherwise, applies only to this software file. Patent licenses, if
+# any, provided herein do not apply to combinations of this program with
+# other software, or any other product whatsoever.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
+#
+
+#######################################################################
+# Initialization:
+
+: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
+. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
+
+# Parameter defaults
+OCF_RESKEY_image_default="default"
+OCF_RESKEY_pod_manifest_default="/etc/kubernetes/static-pod-resources/etcd-certs/configmaps/external-etcd-pod/pod.yaml"
+OCF_RESKEY_name_default="etcd"
+OCF_RESKEY_nic_default="br-ex"
+OCF_RESKEY_authfile_default="/var/lib/kubelet/config.json"
+OCF_RESKEY_allow_pull_default="1"
+OCF_RESKEY_reuse_default="0"
+
+: ${OCF_RESKEY_image=${OCF_RESKEY_image_default}}
+: ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}}
+: ${OCF_RESKEY_name=${OCF_RESKEY_name_default}}
+: ${OCF_RESKEY_nic=${OCF_RESKEY_nic_default}}
+: ${OCF_RESKEY_authfile=${OCF_RESKEY_authfile_default}}
+: ${OCF_RESKEY_allow_pull=${OCF_RESKEY_allow_pull_default}}
+: ${OCF_RESKEY_reuse=${OCF_RESKEY_reuse_default}}
+
+#######################################################################
+
+meta_data()
+{
+    cat <<END
+<?xml version="1.0"?>
+<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
+<resource-agent name="podman-etcd" version="1.0">
+<version>1.0</version>
+
+<longdesc lang="en">
+The podman-etcd HA resource agent creates and launches an etcd podman
+container based off a supplied podman image. Containers managed by
+this agent are both created and removed upon the agent's start and
+stop actions.
+</longdesc>
+<shortdesc lang="en">Podman etcd container resource agent.</shortdesc>
+
+<parameters>
+<parameter name="pod_manifest" required="0" unique="0">
+<longdesc lang="en">
+The Pod manifest with the configuration for etcd.
+</longdesc>
+<shortdesc lang="en">Etcd pod manifest</shortdesc>
+<content type="string" default="${OCF_RESKEY_pod_manifest_default}"/>
+</parameter>
+
+<parameter name="image" required="0" unique="0">
+<longdesc lang="en">
+The podman image to base this container off of.
+</longdesc>
+<shortdesc lang="en">podman image</shortdesc>
+<content type="string" default="${OCF_RESKEY_image_default}"/>
+</parameter>
+
+<parameter name="name" required="0" unique="0">
+<longdesc lang="en">
+The name to give the created container. By default this will
+be the resource's instance name.
+</longdesc>
+<shortdesc lang="en">podman container name</shortdesc>
+<content type="string" default="${OCF_RESKEY_name_default}"/>
+</parameter>
+
+<parameter name="node_ip_map" unique="0" required="1">
+<longdesc lang="en">
+A mapping of node names to IPs.
+
+This takes the form of:
+n1:ip1;n2:ip2
+
+where the etcd container on n1 would have IP ip1
+</longdesc>
+<shortdesc lang="en">Container node name to IP mapping</shortdesc>
+<content type="string"/>
+</parameter>
+
+<parameter name="nic" unique="0">
+<longdesc lang="en">
+Network interface from which to look up the IP address of the host.
+</longdesc>
+<shortdesc lang="en">Network interface</shortdesc>
+<content type="string" default="${OCF_RESKEY_nic_default}"/>
+</parameter>
+
+<parameter name="authfile" required="0" unique="0">
+<longdesc lang="en">
+Path of the authentication file.
+
+The file is created by podman login.
+</longdesc>
+<shortdesc lang="en">Path of the authentication file</shortdesc>
+<content type="string" default="${OCF_RESKEY_authfile_default}"/>
+</parameter>
+
+<parameter name="allow_pull" unique="0">
+<longdesc lang="en">
+Allow the image to be pulled from the configured podman registry when
+the image does not exist locally. NOTE, this can drastically increase
+the time required to start the container if the image repository is
+pulled over the network.
+</longdesc>
+<shortdesc lang="en">Allow pulling non-local images</shortdesc>
+<content type="boolean" default="${OCF_RESKEY_allow_pull_default}"/>
+</parameter>
+
+<parameter name="run_opts" required="0" unique="0">
+<longdesc lang="en">
+Add options to be appended to the 'podman run' command which is used
+when creating the container during the start action. This option allows
+users to do things such as setting a custom entry point and injecting
+environment variables into the newly created container. Note the '-d'
+option is supplied regardless of this value to force containers to run
+in the background.
+
+NOTE: Do not explicitly specify the --name argument in the run_opts. This
+agent will set --name using either the resource's instance name or the name
+provided in the 'name' argument of this agent.
+</longdesc>
+<shortdesc lang="en">run options</shortdesc>
+<content type="string"/>
+</parameter>
+
+<parameter name="run_cmd" required="0" unique="0">
+<longdesc lang="en">
+Specify a command to launch within the container once
+it has initialized.
+</longdesc>
+<shortdesc lang="en">run command</shortdesc>
+<content type="string"/>
+</parameter>
+
+<parameter name="run_cmd_opts" required="0" unique="0">
+<longdesc lang="en">
+Options to be added to the 'run_cmd'.
+</longdesc>
+<shortdesc lang="en">run command options</shortdesc>
+<content type="string"/>
+</parameter>
+
+<parameter name="mount_points" required="0" unique="0">
+<longdesc lang="en">
+A comma-separated list of directories that the container is expecting to use.
+The agent will ensure they exist by running 'mkdir -p'.
+</longdesc>
+<shortdesc lang="en">Required mount points</shortdesc>
+<content type="string"/>
+</parameter>
+
+<parameter name="monitor_cmd" required="0" unique="0">
+<longdesc lang="en">
+Specify the full path of a command to launch within the container to check
+the health of the container. This command must return 0 to indicate that
+the container is healthy. A non-zero return code will indicate that the
+container has failed and should be recovered.
+
+Note: Using this method for monitoring processes inside a container
+is not recommended, as containerd tries to track processes running
+inside the container and does not deal well with many short-lived
+processes being spawned. Ensure that your container monitors its
+own processes and terminates on fatal error rather than invoking
+a command from the outside.
+</longdesc>
+<shortdesc lang="en">monitor command</shortdesc>
+<content type="string"/>
+</parameter>
+
+<parameter name="force_kill" required="0" unique="0">
+<longdesc lang="en">
+Kill a container immediately rather than waiting for it to gracefully
+shut down.
+</longdesc>
+<shortdesc lang="en">force kill</shortdesc>
+<content type="boolean"/>
+</parameter>
+
+<parameter name="reuse" required="0" unique="0">
+<longdesc lang="en">
+Allow the container to be reused once it is stopped. By default,
+containers get removed once they are stopped. Enable this option
+to have the container persist when this happens.
+</longdesc>
+<shortdesc lang="en">reuse container</shortdesc>
+<content type="boolean" default="${OCF_RESKEY_reuse_default}"/>
+</parameter>
+
+<parameter name="drop_in_dependency" required="0" unique="0">
+<longdesc lang="en">
+Use transient drop-in files to add extra dependencies to the systemd
+scopes associated to the container. During reboot, this prevents systemd
+from stopping the container before pacemaker.
+</longdesc>
+<shortdesc lang="en">drop-in dependency</shortdesc>
+<content type="boolean"/>
+</parameter>
+</parameters>
+
+<actions>
+<action name="start" timeout="600s" />
+<action name="stop" timeout="90s" />
+<action name="monitor" timeout="25s" interval="30s" depth="0" />
+<action name="promote" timeout="300s" />
+<action name="demote" timeout="120s" />
+<action name="meta-data" timeout="5s" />
+<action name="validate-all" timeout="30s" />
+</actions>
+</resource-agent>
+END
+}
+
+#######################################################################
+REQUIRE_IMAGE_PULL=0
+
+podman_usage()
+{
+    cat <<END
+usage: $0 {start|stop|monitor|promote|demote|validate-all|meta-data}
+
+Expects to have a fully populated OCF RA-compliant environment set.
+END
+}
+
+
+monitor_cmd_exec()
+{
+    local rc=$OCF_SUCCESS
+    local out
+
+    out=$(podman exec ${CONTAINER} $OCF_RESKEY_monitor_cmd 2>&1)
+    rc=$?
+    # 125: no container with name or ID ${CONTAINER} found
+    # 126: container state improper (not running)
+    # 127: any other error
+    # 255: podman 2+: container not running
+    case "$rc" in
+        125|126|255)
+            rc=$OCF_NOT_RUNNING
+            ;;
+        0)
+            ocf_log debug "monitor cmd passed: exit code = $rc"
+            ;;
+        *)
+            ocf_exit_reason "monitor cmd failed (rc=$rc), output: $out"
+            rc=$OCF_ERR_GENERIC
+            ;;
+    esac
+
+    return $rc
+}
+
+container_exists()
+{
+    local rc
+    local out
+
+    out=$(podman exec ${CONTAINER} $OCF_RESKEY_monitor_cmd 2>&1)
+    rc=$?
+    # 125: no container with name or ID ${CONTAINER} found
+    if [ $rc -ne 125 ]; then
+        return 0
+    fi
+    return 1
+}
+
+remove_container()
+{
+    local rc
+    local execids
+
+    if ocf_is_true "$OCF_RESKEY_reuse"; then
+        # never remove the container if we have reuse enabled.
+        return 0
+    fi
+
+    if ! container_exists; then
+        # don't attempt to remove a container that doesn't exist
+        return 0
+    fi
+    ocf_log notice "Cleaning up inactive container, ${CONTAINER}."
+    ocf_run podman rm -v "$CONTAINER"
+    rc=$?
+    if [ $rc -ne 0 ]; then
+        if [ $rc -eq 2 ]; then
+            if podman inspect --format '{{.State.Status}}' "$CONTAINER" | grep -wq "stopping"; then
+                ocf_log err "Inactive container ${CONTAINER} is stuck in 'stopping' state. Force-remove it."
+                ocf_run podman rm -f "$CONTAINER"
+                rc=$?
+            fi
+        fi
+        # due to a podman bug (rhbz#1841485), sometimes a stopped
+        # container can still be associated with Exec sessions, in
+        # which case the "podman rm" has to be forced
+        execids=$(podman inspect "$CONTAINER" --format '{{len .ExecIDs}}')
+        if [ "$execids" -ne "0" ]; then
+            ocf_log warn "Inactive container ${CONTAINER} has lingering exec sessions. Force-remove it."
+            ocf_run podman rm -f "$CONTAINER"
+            rc=$?
+        fi
+    fi
+    return $rc
+}
+
+attribute_node_ip()
+{
+    local action="$1"
+    local attribute="node_ip"
+    local value
+    local rc
+
+    # capture the pipeline status directly: "$?" inside an "if !" branch
+    # would always report the negated (successful) test result
+    value=$(ip -brief addr show "$OCF_RESKEY_nic" | awk '{gsub("/.*", "", $3); print $3}')
+    rc=$?
+    if [ $rc -ne 0 ]; then
+        ocf_log err "could not get node ip, error code: $rc"
+        return "$rc"
+    fi
+
+    case "$action" in
+        get)
+            echo "$value"
+            ;;
+        update)
+            crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value"
+            rc=$?
+            if [ $rc -ne 0 ]; then
+                ocf_log err "could not set $attribute to $value, error code: $rc"
+                return "$rc"
+            fi
+            ;;
+        clear)
+            crm_attribute --name "$attribute" --delete
+            ;;
+        *)
+            ocf_log err "unsupported $action for $attribute"
+            return $OCF_ERR_GENERIC
+            ;;
+    esac
+}
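+
+# Illustrative example (not part of the original agent): with nic="br-ex",
+# "ip -brief addr show br-ex" prints a line like
+#     br-ex    UP    192.0.2.10/24 fe80::1/64
+# and the awk program above strips the "/24" prefix length from the third
+# field, yielding "192.0.2.10". Interface state and addresses are
+# placeholders.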
+
+attribute_node_ip_peer() {
+    local peer_name
+    peer_name=$(get_peer_node_name)
+    crm_attribute --query --name "node_ip" --node "$peer_name" | awk -F"value=" '{print $2}'
+}
+
+get_env_from_manifest() {
+    local env_var_name="$1"
+    local env_var_value
+
+    # The agent waits for the manifest to exist before starting, so the
+    # file should exist already, but this check is included for robustness.
+    if [ ! -f "$OCF_RESKEY_pod_manifest" ]; then
+        ocf_log err "external etcd pod manifest ($OCF_RESKEY_pod_manifest) not found"
+        exit "$OCF_ERR_INSTALLED"
+    fi
+
+    env_var_value=$(jq -r ".spec.containers[].env[] | select( .name == \"$env_var_name\" ).value" "$OCF_RESKEY_pod_manifest")
+    rc=$?
+    if [ $rc -ne 0 ]; then
+        ocf_log err "could not find environment variable $env_var_name in etcd pod manifest, error code: $rc"
+        exit "$OCF_ERR_INSTALLED"
+    fi
+
+    ocf_log debug "ETCD pod environment variable $env_var_name: $env_var_value"
+
+    echo "$env_var_value"
+}
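+
+# Illustrative example (not part of the original agent): given a manifest
+# fragment such as
+#     {"spec":{"containers":[{"name":"etcd",
+#         "env":[{"name":"ETCD_DATA_DIR","value":"/var/lib/etcd"}]}]}}
+# the jq filter above selects the env entry by name, so
+#     get_env_from_manifest "ETCD_DATA_DIR"
+# would echo "/var/lib/etcd". Values shown are placeholders.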
+
+prepare_env() {
+    local name ip
+
+    NODEIP="$(attribute_node_ip get)"
+
+    if is_force_new_cluster; then
+        ALL_ETCD_ENDPOINTS="https://$NODEIP:2379"
+        ETCD_INITIAL_CLUSTER_STATE="new"
+        ETCD_INITIAL_CLUSTER="$NODENAME=https://$NODEIP:2380"
+    else
+        ETCD_INITIAL_CLUSTER_STATE="existing"
+        for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do
+            name=$(echo "$node" | awk -F":" '{print $1}')
+            ip=$(echo "$node" | awk -F":" '{print $2}')
+            if [ -z "$name" ] || [ -z "$ip" ]; then
+                ocf_exit_reason "name or ip missing for 1 or more nodes"
+                exit $OCF_ERR_CONFIGURED
+            fi
+
+            [ -z "$ALL_ETCD_ENDPOINTS" ] && ALL_ETCD_ENDPOINTS="https://$ip:2379" || ALL_ETCD_ENDPOINTS="$ALL_ETCD_ENDPOINTS,https://$ip:2379"
+            [ -z "$ETCD_INITIAL_CLUSTER" ] && ETCD_INITIAL_CLUSTER="$name=https://$ip:2380" || ETCD_INITIAL_CLUSTER="$ETCD_INITIAL_CLUSTER,$name=https://$ip:2380"
+        done
+    fi
+
+    ETCDCTL_API=$(get_env_from_manifest "ETCDCTL_API")
+    ETCD_CIPHER_SUITES=$(get_env_from_manifest "ETCD_CIPHER_SUITES")
+    ETCD_DATA_DIR=$(get_env_from_manifest "ETCD_DATA_DIR")
+    ETCD_ELECTION_TIMEOUT=$(get_env_from_manifest "ETCD_ELECTION_TIMEOUT")
+    ETCD_ENABLE_PPROF=$(get_env_from_manifest "ETCD_ENABLE_PPROF")
+    ETCD_EXPERIMENTAL_MAX_LEARNERS=$(get_env_from_manifest "ETCD_EXPERIMENTAL_MAX_LEARNERS")
+    ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION=$(get_env_from_manifest "ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION")
+    ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL=$(get_env_from_manifest "ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL")
+    ETCD_HEARTBEAT_INTERVAL=$(get_env_from_manifest "ETCD_HEARTBEAT_INTERVAL")
+    ETCD_QUOTA_BACKEND_BYTES=$(get_env_from_manifest "ETCD_QUOTA_BACKEND_BYTES")
+    ETCD_SOCKET_REUSE_ADDRESS=$(get_env_from_manifest "ETCD_SOCKET_REUSE_ADDRESS")
+
+    SERVER_CACERT=$(get_env_from_manifest "ETCDCTL_CACERT")
+    ETCD_PEER_CERT=$(get_env_from_manifest "ETCDCTL_CERT")
+    ETCD_PEER_KEY=$(get_env_from_manifest "ETCDCTL_KEY")
+
+    if is_learner; then
+        LISTEN_CLIENT_URLS="$NODEIP"
+        LISTEN_PEER_URLS="$NODEIP"
+        LISTEN_METRICS_URLS="$NODEIP"
+    else
+        LISTEN_CLIENT_URLS="0.0.0.0"
+        LISTEN_PEER_URLS="0.0.0.0"
+        LISTEN_METRICS_URLS="0.0.0.0"
+    fi
+}
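+
+# Illustrative example (not part of the original agent): with
+#     node_ip_map="master-0:192.0.2.10;master-1:192.0.2.11"
+# and force_new_cluster unset, the loop above produces
+#     ALL_ETCD_ENDPOINTS="https://192.0.2.10:2379,https://192.0.2.11:2379"
+#     ETCD_INITIAL_CLUSTER="master-0=https://192.0.2.10:2380,master-1=https://192.0.2.11:2380"
+# Node names and IPs are placeholders.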
+
+archive_data_folder()
+{
+    # TODO: use etcd snapshots
+    local dest_dir_name
+    local data_dir="/var/lib/etcd/member"
+
+    # %Y%m%d%H%M%S: year, month, day, hour, minute, second
+    dest_dir_name="members-snapshot-$(date +%Y%m%d%H%M%S)"
+    if [ ! -d "$data_dir" ]; then
+        ocf_log info "no data dir to backup"
+        return $OCF_SUCCESS
+    fi
+    ocf_log info "backing up $data_dir under $HA_RSCTMP/$dest_dir_name"
+    mv "$data_dir" "$HA_RSCTMP/$dest_dir_name"
+    sync
+}
+
+etcd_pod_container_exists() {
+    local count_matches
+    # Check whether the etcd pod exists on the same node
+    count_matches=$(crictl pods --label app=etcd -q | xargs -I {} crictl ps --pod {} -o json | jq -r '.containers[].metadata | select ( .name == "etcd" ).name' | wc -l)
+    if [ "$count_matches" -eq 1 ]; then
+        # etcd pod found
+        return 0
+    fi
+    # etcd pod not found
+    return 1
+}
+
+attribute_node_cluster_id()
+{
+    local action="$1"
+    local value
+
+    value=$(jq -r ".clusterId" /var/lib/etcd/revision.json)
+    rc=$?
+    if [ $rc -ne 0 ]; then
+        ocf_log err "could not get cluster_id, error code: $rc"
+        return "$rc"
+    fi
+
+    case "$action" in
+        get)
+            echo "$value"
+            ;;
+        update)
+            crm_attribute --type nodes --node "$NODENAME" --name "cluster_id" --update "$value"
+            rc=$?
+            if [ $rc -ne 0 ]; then
+                ocf_log err "could not update cluster_id, error code: $rc"
+                return "$rc"
+            fi
+            ;;
+        *)
+            ocf_log err "unsupported $action for attribute_node_cluster_id"
+            return $OCF_ERR_GENERIC
+            ;;
+    esac
+}
+
+attribute_node_cluster_id_peer()
+{
+    local nodename
+
+    nodename=$(get_peer_node_name)
+    crm_attribute --query --type nodes --node "$nodename" --name "cluster_id" | awk -F"value=" '{print $2}'
+}
+
+attribute_node_revision()
+{
+    local action="$1"
+    local value
+    local attribute="revision"
+
+    value=$(jq -r ".maxRaftIndex" /var/lib/etcd/revision.json)
+    rc=$?
+    if [ $rc -ne 0 ]; then
+        ocf_log err "could not get $attribute, error code: $rc"
+        return "$rc"
+    fi
+
+    case "$action" in
+        get)
+            echo "$value"
+            ;;
+        update)
+            crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value"
+            rc=$?
+            if [ $rc -ne 0 ]; then
+                ocf_log err "could not update etcd $attribute, error code: $rc"
+                return "$rc"
+            fi
+            ;;
+        *)
+            ocf_log err "unsupported $action for attribute_node_revision"
+            return "$OCF_ERR_GENERIC"
+            ;;
+    esac
+}
+
+attribute_node_revision_peer()
+{
+    local nodename
+    nodename=$(get_peer_node_name)
+    crm_attribute --query --type nodes --node "$nodename" --name "revision" | awk -F"value=" '{print $2}'
+}
+
+attribute_node_member_id()
+{
+    local action="$1"
+    local attribute="member_id"
+
+    if ! container_exists; then
+        # we need a running container to execute etcdctl.
+        return 0
+    fi
+
+    case "$action" in
+        get)
+            # When we need this value at agent startup we don't have an etcd
+            # container running, so we always get this value from the CIB
+            crm_attribute --query --type nodes --node "$NODENAME" --name "$attribute" | awk -F"value=" '{print $2}'
+            ;;
+        update)
+            local member_list_json
+            member_list_json=$(get_member_list_json)
+            ocf_log info "member list: $member_list_json"
+            if [ -z "$member_list_json" ]; then
+                ocf_log err "could not get $attribute: could not get member list JSON"
+                return "$OCF_ERR_GENERIC"
+            fi
+
+            local value
+            value=$(echo -n "$member_list_json" | jq -r ".header.member_id")
+            rc=$?
+            if [ $rc -ne 0 ]; then
+                ocf_log err "could not get $attribute from member list JSON, error code: $rc"
+                return "$rc"
+            fi
+
+            # JSON member_id is decimal, while the etcdctl command needs the hex version
+            value=$(printf "%x" "$value")
+            crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value"
+            rc=$?
+            if [ $rc -ne 0 ]; then
+                ocf_log err "could not update etcd $attribute, error code: $rc"
+                return "$rc"
+            fi
+            ;;
+        clear)
+            crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --delete
+            ;;
+        *)
+            ocf_log err "unsupported $action for attribute_node_member_id"
+            return "$OCF_ERR_GENERIC"
+            ;;
+    esac
+}
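+
+# Illustrative example (not part of the original agent): etcdctl reports
+# member IDs in decimal in JSON output but expects hex on the command
+# line, e.g.
+#     printf "%x" 11238    # prints "2be6"
+# so a JSON header.member_id of 11238 is stored as member_id="2be6".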
+
+add_member_as_learner()
+{
+    local rc
+    local out
+    local member_name=$1
+    local member_ip=$2
+
+    ocf_log info "add $member_name ($member_ip) to the member list as learner"
+    out=$(podman exec "${CONTAINER}" etcdctl --endpoints="https://$(attribute_node_ip get):2379" member add "$member_name" --peer-urls="https://$member_ip:2380" --learner)
+    rc=$?
+    if [ $rc -ne 0 ]; then
+        ocf_log err "could not add $member_name as learner, error code: $rc"
+        return $rc
+    fi
+    ocf_log info "$out"
+
+    attribute_learner_node update "$member_name"
+    return $?
+}
+
+set_force_new_cluster()
+{
+    local rc
+    crm_attribute --lifetime reboot --node "$NODENAME" --name "force_new_cluster" --update "$NODENAME"
+    rc=$?
+    if [ $rc -ne 0 ]; then
+        ocf_log err "could not set force_new_cluster attribute to $NODENAME"
+    fi
+    return $rc
+}
+
+get_force_new_cluster()
+{
+    crm_attribute --lifetime reboot --query --name "force_new_cluster" | awk -F"value=" '{print $2}'
+}
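+
+# Illustrative example (not part of the original agent): a crm_attribute
+# query prints a line of the form
+#     scope=status  name=force_new_cluster value=master-0
+# so the awk -F"value=" '{print $2}' idiom used throughout this agent
+# extracts "master-0". The node name is a placeholder.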
+
+clear_force_new_cluster()
+{
+    local force_new_cluster_node
+
+    force_new_cluster_node=$(get_force_new_cluster)
+    if [ -z "$force_new_cluster_node" ]; then
+        ocf_log info "$NODENAME: force_new_cluster attribute not set"
+        return $OCF_SUCCESS
+    fi
+
+    # only the holder of the "force_new_cluster" attribute can delete it
+    if [ "$NODENAME" = "$force_new_cluster_node" ]; then
+        crm_attribute --lifetime reboot --name "force_new_cluster" --delete
+        rc=$?
+        if [ $rc -ne 0 ]; then
+            ocf_log err "could not clear force_new_cluster attribute, error code: $rc"
+        else
+            ocf_log info "$NODENAME: force_new_cluster attribute cleared"
+        fi
+        return $rc
+    else
+        ocf_log info "$NODENAME does not hold force_new_cluster ($force_new_cluster_node has it)"
+        return $OCF_SUCCESS
+    fi
+}
+
+is_force_new_cluster()
+{
+    # Return 0 if 'force_new_cluster' is set and the value matches the current node name, 1 otherwise.
+    local value
+
+    value=$(get_force_new_cluster)
+    if [ -z "$value" ]; then
+        ocf_log debug "force_new_cluster attribute is not set"
+        return 1
+    fi
+
+    if [ "$value" = "$NODENAME" ]; then
+        ocf_log debug "$NODENAME has force_new_cluster set"
+        return 0
+    fi
+
+    ocf_log info "force_new_cluster attribute set on peer node $value"
+    return 1
+}
+
+is_standalone()
+{
+    local standalone_node
+
+    standalone_node=$(get_standalone_node)
+    if [ -z "$standalone_node" ]; then
+        ocf_log debug "no node running standalone"
+        return 1
+    fi
+
+    if [ "$NODENAME" = "$standalone_node" ]; then
+        ocf_log debug "$NODENAME is set as standalone"
+        return 0
+    fi
+    ocf_log debug "$NODENAME is set as learner"
+    return 1
+}
+
+set_standalone_node()
+{
+    local rc
+
+    ocf_log info "add $NODENAME as standalone"
+    crm_attribute --name "standalone_node" --update "$NODENAME"
+    rc=$?
+    if [ $rc -ne 0 ]; then
+        ocf_log err "could not set standalone_node attribute to $NODENAME"
+    fi
+    return $rc
+}
+
+get_standalone_node()
+{
+    crm_attribute --query --name "standalone_node" | awk -F"value=" '{print $2}'
+}
+
+clear_standalone_node()
+{
+    crm_attribute --name "standalone_node" --delete
+}
+
+clear_standalone_and_learner_if_not_learners()
+{
+    local rc
+    local member_list_json="$1"
+
+    number_of_members=$(printf "%s" "$member_list_json" | jq -r ".members[].ID" | wc -l)
+    if [ "$number_of_members" -ne 2 ]; then
+        ocf_log info "could not clear standalone_node or learner_node attributes: found $number_of_members members, need 2"
+        return $OCF_SUCCESS
+    fi
+
+    id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .isLearner==true ).ID")
+    rc=$?
+    if [ $rc -ne 0 ]; then
+        ocf_log err "could not get isLearner field from member list, error code: $rc"
+        return $rc
+    fi
+
+    # no learner member left: both attributes can be cleared
+    if [ -z "$id" ]; then
+        clear_standalone_node
+        rc=$?
+        if [ $rc -ne 0 ]; then
+            ocf_log err "could not clear standalone_node attribute, error code: $rc"
+            return $rc
+        fi
+        attribute_learner_node clear
+        rc=$?
+        if [ $rc -ne 0 ]; then
+            ocf_log err "could not clear learner_node attribute, error code: $rc"
+            return $rc
+        fi
+    fi
+
+    return $rc
+}
+
+attribute_learner_node()
+{
+    local action="$1"
+    local value="$2"
+    local attribute="learner_node"
+
+    case "$action" in
+        get)
+            crm_attribute --query --name "$attribute" | awk -F"value=" '{print $2}'
+            ;;
+        update)
+            crm_attribute --name "$attribute" --update "$value"
+            rc=$?
+            if [ $rc -ne 0 ]; then
+                ocf_log err "could not set $attribute to $value, error code: $rc"
+                return "$rc"
+            fi
+            ;;
+        clear)
+            crm_attribute --name "$attribute" --delete
+            ;;
+        *)
+            ocf_log err "unsupported $action for $attribute"
+            return $OCF_ERR_GENERIC
+            ;;
+    esac
+}
+
+is_learner()
+{
+    if [ "$NODENAME" = "$(attribute_learner_node get)" ]; then
+        return 0
+    fi
+    return 1
+}
+
+get_peer_node_name() {
+    crm_node -l | awk '{print $2}' | grep -v "$NODENAME"
+}
+
+get_all_etcd_endpoints() {
+    for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do
+        name=$(echo "$node" | awk -F":" '{print $1}')
+        ip=$(echo "$node" | awk -F":" '{print $2}')
+        if [ -z "$name" ] || [ -z "$ip" ]; then
+            ocf_exit_reason "name or ip missing for 1 or more nodes"
+            exit $OCF_ERR_CONFIGURED
+        fi
+
+        [ -z "$ALL_ETCD_ENDPOINTS" ] && ALL_ETCD_ENDPOINTS="https://$ip:2379" || ALL_ETCD_ENDPOINTS="$ALL_ETCD_ENDPOINTS,https://$ip:2379"
+    done
+    echo "$ALL_ETCD_ENDPOINTS"
+}
+
+get_endpoint_status_json()
+{
+    # Get the status of all endpoints
+    local all_etcd_endpoints
+
+    all_etcd_endpoints=$(get_all_etcd_endpoints)
+    podman exec "${CONTAINER}" etcdctl endpoint status --endpoints="$all_etcd_endpoints" -w json
+}
+
+get_member_list_json() {
+    # Get the list of members visible to the current node
+    local this_node_endpoint
+
+    this_node_endpoint="https://$(attribute_node_ip get):2379"
+    podman exec "${CONTAINER}" etcdctl member list --endpoints="$this_node_endpoint" -w json
+}
+
+check_peers()
+{
+    # Check peers endpoint status and locally accessible member list
+    local member_list_json
+
+    if ! container_exists; then
+        # we need a running container to execute etcdctl.
+        return $OCF_SUCCESS
+    fi
+
+    member_list_json=$(get_member_list_json)
+    rc=$?
+    ocf_log debug "member list: $member_list_json"
+    if [ $rc -ne 0 ]; then
+        ocf_log info "podman failed to get member list, error code: $rc"
+
+        endpoint_status_json=$(get_endpoint_status_json)
+        ocf_log info "endpoint status: $endpoint_status_json"
+
+        count_endpoints=$(printf "%s" "$endpoint_status_json" | jq -r ".[].Endpoint" | wc -l)
+        if [ "$count_endpoints" -eq 1 ]; then
+            ocf_log info "one endpoint only: checking status errors"
+            endpoint_status_errors=$(printf "%s" "$endpoint_status_json" | jq -r ".[0].Status.errors")
+            if echo "$endpoint_status_errors" | grep -q "no leader"; then
+                set_force_new_cluster
+                set_standalone_node
+                ocf_exit_reason "$NODENAME must force a new cluster"
+                return $OCF_ERR_GENERIC
+            fi
+            if [ "$endpoint_status_errors" != "null" ]; then
+                ocf_log err "unmanaged endpoint status error: $endpoint_status_errors"
+            fi
+        fi
+
+        return $OCF_SUCCESS
+    fi
+
+    # Example of .members[] instance fields in member list json format:
+    # NOTE that "name" is present in voting members only, while "isLearner" is
+    # present in learner members only, and its value is always true (not a
+    # string) in that case.
+    # {
+    #     "ID": <member ID>,
+    #     "name": "<node hostname>",
+    #     "peerURLs": [
+    #         "https://<node IP>:2380"
+    #     ],
+    #     "clientURLs": [
+    #         "https://<node IP>:2379"
+    #     ]
+    # }
+    for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do
+        name=$(echo "$node" | awk -F":" '{print $1}')
+        # do not check itself
+        if [ "$name" = "$NODENAME" ]; then
+            continue
+        fi
+
+        # Check by IP instead of Name since "learner" members appear only in peerURLs, not by Name.
+        ip=$(echo "$node" | awk -F":" '{print $2}')
+        id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$ip\")) | any).ID")
+        if [ -z "$id" ]; then
+            ocf_log info "$name is not in the members list"
+            add_member_as_learner "$name" "$ip"
+            set_standalone_node
+        else
+            ocf_log debug "$name is in the members list by IP: $ip"
+            clear_standalone_and_learner_if_not_learners "$member_list_json"
+        fi
+    done
+    return $OCF_SUCCESS
+}
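+
+# Illustrative example (not part of the original agent): with ip=192.0.2.11,
+# the jq filter
+#     .members[] | select( .peerURLs | map(test("192.0.2.11")) | any).ID
+# matches a member entry whose peerURLs contain "https://192.0.2.11:2380"
+# and prints its ID, whether or not the member has a "name" field yet.
+# The IP is a placeholder.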
+
+podman_simple_status()
+{
+    local rc
+
+    # simple status is implemented via podman exec
+    # everything besides success is considered "not running"
+    monitor_cmd_exec
+    rc=$?
+    if [ $rc -ne $OCF_SUCCESS ]; then
+        rc=$OCF_NOT_RUNNING;
+    fi
+    return $rc
+}
+
+podman_monitor()
+{
+    # We rely on running podman exec to monitor the container
+    # state because that command seems to be less prone to
+    # performance issues under IO load.
+    #
+    # For probes to work, we expect cmd_exec to be able to report
+    # when a container is not running. Here, we're not interested
+    # in distinguishing whether it's stopped or non existing
+    # (there's the function container_exists for that)
+    monitor_cmd_exec
+    rc=$?
+    if [ $rc -ne 0 ]; then
+        return $rc
+    fi
+
+    if is_learner; then
+        ocf_log info "$NODENAME is learner. Cannot get member id"
+        return "$OCF_SUCCESS"
+    fi
+    # Failing to cache data and check the member list should not cause the
+    # monitor operation to fail.
+    # TODO: move this inside check_peers, where we already query the member list JSON
+    attribute_node_member_id update
+    if ! check_peers; then
+        return $OCF_ERR_GENERIC
+    fi
+
+    # the node revision comes from the disk, so if it is not available it is a fatal failure
+    attribute_node_revision update
+    return $?
+}
+
+podman_create_mounts() {
+    oldIFS="$IFS"
+    IFS=","
+    for directory in $OCF_RESKEY_mount_points; do
+        mkdir -p "$directory"
+    done
+    IFS="$oldIFS"
+}
+
+podman_container_id()
+{
+    # Retrieve the container ID by doing a "podman ps" rather than
+    # a "podman inspect", because the latter has performance issues
+    # under IO load.
+    # We could have run "podman start $CONTAINER" to get the ID back
+    # but if the container is stopped, the command will return a
+    # name instead of a container ID. This would break us.
+    podman ps --no-trunc --format '{{.ID}} {{.Names}}' | grep -F -w -m1 "$CONTAINER" | cut -d' ' -f1
+}
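+
+# Illustrative example (not part of the original agent):
+# "podman ps --no-trunc --format '{{.ID}} {{.Names}}'" prints lines like
+#     0f5a3b...e2 etcd
+# so grepping for the container name and cutting the first field yields
+# the full container ID. The ID shown is a truncated placeholder.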
+
+
+create_transient_drop_in_dependency()
+{
+    local cid=$1
+    local rc=$OCF_SUCCESS
+
+    if [ -z "$cid" ]; then
+        ocf_exit_reason "Container ID not found for \"$CONTAINER\". Not creating drop-in dependency"
+        return $OCF_ERR_GENERIC
+    fi
+
+    ocf_log info "Creating drop-in dependency for \"$CONTAINER\" ($cid)"
+    for scope in "libpod-$cid.scope.d" "libpod-conmon-$cid.scope.d"; do
+        if [ $rc -eq $OCF_SUCCESS ] && [ ! -d /run/systemd/transient/"$scope" ]; then
+            mkdir -p /run/systemd/transient/"$scope" && \
+                printf "[Unit]\nBefore=pacemaker.service" > /run/systemd/transient/"$scope"/dep.conf && \
+                chmod ago+r /run/systemd/transient/"$scope" /run/systemd/transient/"$scope"/dep.conf
+            rc=$?
+        fi
+    done
+
+    if [ $rc -ne $OCF_SUCCESS ]; then
+        ocf_log err "Could not create drop-in dependency for \"$CONTAINER\" ($cid)"
+    else
+        systemctl daemon-reload
+        rc=$?
+        if [ $rc -ne $OCF_SUCCESS ]; then
+            ocf_log err "Could not refresh service definition after creating drop-in for \"$CONTAINER\""
+        fi
+    fi
+
+    return $rc
+}
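+
+# Illustrative example (not part of the original agent): for a container
+# ID abc123, the loop above creates
+#     /run/systemd/transient/libpod-abc123.scope.d/dep.conf
+#     /run/systemd/transient/libpod-conmon-abc123.scope.d/dep.conf
+# each containing "[Unit]\nBefore=pacemaker.service", so that at shutdown
+# systemd stops pacemaker (and thus this resource) before tearing down
+# the container scopes. The ID is a placeholder.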
+
+
+run_new_container()
+{
+    local opts=$1
+    local image=$2
+    local cmd=$3
+    local rc
+
+    ocf_log info "running container $CONTAINER for the first time"
+    out=$(podman run $opts $image $cmd 2>&1)
+    rc=$?
+
+    if [ -n "$out" ]; then
+        out="$(echo "$out" | tr -s ' \t\r\n' ' ')"
+        if [ $rc -eq 0 ]; then
+            ocf_log info "$out"
+        else
+            ocf_log err "$out"
+        fi
+    fi
+
+    if [ $rc -eq 125 ]; then
+        # If an internal podman error occurred, it might be because
+        # the internal storage layer still references an old container
+        # with the same name, even though podman itself thinks there
+        # is no such container. If so, purge the storage layer to try
+        # to clean the corruption and try again.
+        if echo "$out" | grep -q "unknown.*flag"; then
+            ocf_exit_reason "$out"
+            return $rc
+        fi
+
+        ocf_log warn "Internal podman error while creating new container $CONTAINER. Retrying."
+        ocf_run podman rm --storage "$CONTAINER"
+        ocf_run podman run $opts $image $cmd
+        rc=$?
+    elif [ $rc -eq 127 ]; then
+        # rhbz#1972209: podman 3.0.x seems to be hit by a race
+        # where the cgroup is not yet set up properly when the OCI
+        # runtime configures the container. If that happens, recreate
+        # the container as long as we get the same error code or
+        # until start timeout preempts us.
+        while [ $rc -eq 127 ] && (echo "$out" | grep -q "cgroup.*scope not found"); do
+            ocf_log warn "Internal podman error while assigning cgroup. Retrying."
+            # Arbitrary sleep to prevent consuming all CPU while looping
+            sleep 1
+            podman rm -f "$CONTAINER"
+            out=$(podman run $opts $image $cmd 2>&1)
+            rc=$?
+        done
+        # Log the created container ID if it succeeded
+        if [ $rc -eq 0 ]; then
+            ocf_log info "$out"
+        fi
+    fi
+
+    return $rc
+}
+
+compare_revision()
+{
+    # Compare local revision (from disk) against peer revision (from CIB).
+    # Prints "older", "equal" or "newer".
+    local revision
+    local peer_revision
+
+    revision=$(attribute_node_revision get)
+    peer_revision=$(attribute_node_revision_peer)
+
+    if [ "$revision" = "" ] || [ "$revision" = "null" ] || [ "$peer_revision" = "" ] || [ "$peer_revision" = "null" ]; then
+        ocf_log err "could not compare revisions: $NODENAME local revision: $revision, peer revision: $peer_revision"
+        return "$OCF_ERR_GENERIC"
+    fi
+
+    if [ "$revision" -gt "$peer_revision" ]; then
+        ocf_log info "$NODENAME revision: $revision is newer than peer revision: $peer_revision"
+        echo "newer"
+    elif [ "$revision" -eq "$peer_revision" ]; then
+        ocf_log info "$NODENAME revision: $revision is equal to peer revision: $peer_revision"
+        echo "equal"
+    else
+        ocf_log info "$NODENAME revision: $revision is older than peer revision: $peer_revision"
+        echo "older"
+    fi
+    return "$OCF_SUCCESS"
+}
+
+ensure_pod_manifest_exists()
+{
+    local wait_timeout_sec=$((10 * 60))
+    local poll_interval_sec=5
+    local poll_retries=$((wait_timeout_sec/poll_interval_sec))
+
+    for try in $(seq "$poll_retries"); do
+        if [ -f "$OCF_RESKEY_pod_manifest" ]; then
+            ocf_log info "pod manifest ($OCF_RESKEY_pod_manifest) found"
+            break
+        fi
+        ocf_log debug "pod manifest ($OCF_RESKEY_pod_manifest) does not exist yet: retry in $poll_interval_sec seconds."
+        sleep "$poll_interval_sec"
+    done
+
+    if [ ! -f "$OCF_RESKEY_pod_manifest" ]; then
+        ocf_log err "pod manifest ($OCF_RESKEY_pod_manifest) still missing after $wait_timeout_sec seconds."
+        return "$OCF_ERR_CONFIGURED"
+    fi
+
+    return "$OCF_SUCCESS"
+}
+
+podman_start()
+{
+    local cid
+    local rc
+    local etcd_pod_wait_timeout_sec=$((10 * 60))
+    local etcd_pod_poll_interval_sec=10
+    local etcd_pod_poll_retries=$((etcd_pod_wait_timeout_sec/etcd_pod_poll_interval_sec))
+    local pod_was_running=false
+
+    ocf_log notice "podman-etcd start"
+    attribute_node_ip update
+    attribute_node_cluster_id update
+    attribute_node_revision update
+
+    # ensure the etcd pod is not running before starting the container
+    ocf_log info "ensure etcd pod is not running (retries: $etcd_pod_poll_retries, interval: $etcd_pod_poll_interval_sec)"
+    for try in $(seq $etcd_pod_poll_retries); do
+        if ! etcd_pod_container_exists; then
+            break
+        fi
+        ocf_log info "etcd pod running: retry in $etcd_pod_poll_interval_sec seconds."
+        pod_was_running=true
+        sleep $etcd_pod_poll_interval_sec
+    done
+    if etcd_pod_container_exists; then
+        ocf_exit_reason "etcd pod is still running after $etcd_pod_wait_timeout_sec seconds."
+        return $OCF_ERR_GENERIC
+    fi
+
+    if ! ensure_pod_manifest_exists; then
+        ocf_exit_reason "could not find etcd pod manifest ($OCF_RESKEY_pod_manifest)"
+        return "$OCF_ERR_GENERIC"
+    fi
+
+    # force-new-cluster is a runtime-scoped flag that instructs the agent to force a new cluster-of-1.
+    # Since this attribute is configured with a reboot lifetime, it is automatically cleared when the machine reboots.
+    # If the agent detects during its start that this property is set, it indicates that the flag was explicitly set
+    # during the current node boot session, implying a deliberate request to recover the cluster.
+    if ocf_is_true "$pod_was_running"; then
+        ocf_log info "static pod was running: start normally"
+    else
+        if is_force_new_cluster; then
+            ocf_log notice "$NODENAME marked to force-new-cluster"
+        else
+            # When the local agent starts, we can infer the cluster state by counting
+            # how many agents are starting or already active:
+            # - 1 active agent: it's the peer (we are just starting)
+            # - 0 active agents, 1 starting: we are starting; the peer is not starting
+            # - 0 active agents, 2 starting: both agents are starting simultaneously
+            local active_resources_count
+            active_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_active_resource" | wc -w)
+            case "$active_resources_count" in
+                1)
+                    if [ "$(attribute_learner_node get)" = "$(get_peer_node_name)" ]; then
+                        ocf_log info "peer active but in learner mode: start normally"
+                    else
+                        ocf_log info "peer is active standalone: joining as learner"
+                        JOIN_AS_LEARNER=true
+                    fi
+                    ;;
+                0)
+                    # we need to compare the revisions in any of the following branches,
+                    # so call the function only once here
+                    revision_compare_result=$(compare_revision)
+                    rc=$?
+                    if [ $rc -ne 0 ]; then
+                        ocf_log err "could not compare revisions, error code: $rc"
+                        return "$OCF_ERR_GENERIC"
+                    fi
+
+                    # count how many agents are starting now
+                    local start_resources_count
+                    start_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_start_resource" | wc -w)
+
+                    case "$start_resources_count" in
+                        1)
+                            ocf_log debug "peer not starting: ensure we can start a new cluster"
+                            if [ "$revision_compare_result" != "older" ]; then
+                                # If our revision is the same as or newer than the peer's last saved
+                                # revision, and the peer agent isn't currently starting, we can
+                                # restore etcd quorum by forcing a new cluster.
+                                set_force_new_cluster
+                            else
+                                ocf_log err "local revision is older and peer is not starting: cannot start"
+                                ocf_exit_reason "local revision is older and peer is not starting: cannot start"
+                                return "$OCF_ERR_GENERIC"
+                            fi
+                            ;;
+                        2)
+                            ocf_log info "peer starting"
+                            if [ "$revision_compare_result" = "newer" ]; then
+                                set_force_new_cluster
+                            elif [ "$revision_compare_result" = "older" ]; then
+                                ocf_log info "$NODENAME shall join as learner"
+                                JOIN_AS_LEARNER=true
+                            else
+                                if [ "$(attribute_node_cluster_id get)" = "$(attribute_node_cluster_id_peer)" ]; then
+                                    ocf_log info "same cluster_id and revision: start normally"
+                                else
+                                    ocf_exit_reason "same revision but different cluster id"
+                                    return "$OCF_ERR_GENERIC"
+                                fi
+                            fi
+                            ;;
+                        *)
+                            ocf_log err "Unexpected start resource count: $start_resources_count"
+                            podman_notify
+                            return "$OCF_ERR_GENERIC"
+                            ;;
+                    esac
+                    ;;
+                *)
+                    ocf_log err "Unexpected active resource count: $active_resources_count"
+                    podman_notify
+                    return "$OCF_ERR_GENERIC"
+                    ;;
+            esac
+        fi
+    fi
+
+    podman_create_mounts
+    local run_opts="-d --name=${CONTAINER}"
+    # check to see if the container has already started
+    podman_simple_status
+    if [ $? -eq $OCF_SUCCESS ]; then
+        return "$OCF_SUCCESS"
+    fi
+
+    if ocf_is_true "$JOIN_AS_LEARNER"; then
+        local wait_timeout_sec=$((10*60))
+        local poll_interval_sec=5
+        local retries=$(( wait_timeout_sec / poll_interval_sec ))
+
+        ocf_log info "ensure the leader node added $NODENAME as learner member before continuing (timeout: $wait_timeout_sec seconds)"
+        for try in $(seq $retries); do
+            learner_node=$(attribute_learner_node get)
+            if [ "$NODENAME" != "$learner_node" ]; then
+                ocf_log info "$NODENAME is not in the member list yet. Retry in $poll_interval_sec seconds."
+                sleep $poll_interval_sec
+                continue
+            fi
+            ocf_log info "learner node $learner_node in the member list"
+            break
+        done
+        if [ "$NODENAME" != "$(attribute_learner_node get)" ]; then
+            ocf_log err "wait for $NODENAME to be in the member list timed out"
+            return "$OCF_ERR_GENERIC"
+        fi
+
+        archive_data_folder
+    fi
+
+    prepare_env
+
+    # add etcd-specific opts
+    run_opts="$run_opts \
+        --network=host \
+        -v /etc/kubernetes/static-pod-resources/etcd-certs:/etc/kubernetes/static-pod-certs \
+        -v /var/lib/etcd:/var/lib/etcd \
+        --env ALL_ETCD_ENDPOINTS=$ALL_ETCD_ENDPOINTS \
+        --env ETCD_CIPHER_SUITES=$ETCD_CIPHER_SUITES \
+        --env ETCD_DATA_DIR=$ETCD_DATA_DIR \
+        --env ETCD_ELECTION_TIMEOUT=$ETCD_ELECTION_TIMEOUT \
+        --env ETCD_ENABLE_PPROF=$ETCD_ENABLE_PPROF \
+        --env ETCD_EXPERIMENTAL_MAX_LEARNERS=$ETCD_EXPERIMENTAL_MAX_LEARNERS \
+        --env ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION=$ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION \
+        --env ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL=$ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL \
+        --env ETCD_HEARTBEAT_INTERVAL=$ETCD_HEARTBEAT_INTERVAL \
+        --env ETCD_INITIAL_CLUSTER=$ETCD_INITIAL_CLUSTER \
+        --env ETCD_INITIAL_CLUSTER_STATE=$ETCD_INITIAL_CLUSTER_STATE \
+        --env ETCD_NAME=$NODENAME \
+        --env ETCD_QUOTA_BACKEND_BYTES=$ETCD_QUOTA_BACKEND_BYTES \
+        --env ETCD_SOCKET_REUSE_ADDRESS=$ETCD_SOCKET_REUSE_ADDRESS \
+        --env ETCDCTL_API=$ETCDCTL_API \
+        --env ETCDCTL_CACERT=$SERVER_CACERT \
+        --env ETCDCTL_CERT=$ETCD_PEER_CERT \
+        --env ETCDCTL_KEY=$ETCD_PEER_KEY \
+        --authfile=$OCF_RESKEY_authfile \
+        --security-opt label=disable"
+    if [ -n "$OCF_RESKEY_run_opts" ]; then
+        run_opts="$run_opts $OCF_RESKEY_run_opts"
+    fi
+
+    OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd --logger=zap \
+        --log-level=info \
+        --experimental-initial-corrupt-check=true \
+        --snapshot-count=10000 \
+        --initial-advertise-peer-urls=https://${NODEIP}:2380 \
+        --cert-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.crt \
+        --key-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.key \
+        --trusted-ca-file=$SERVER_CACERT \
+        --client-cert-auth=true \
+        --peer-cert-file=$ETCD_PEER_CERT \
+        --peer-key-file=$ETCD_PEER_KEY \
+        --peer-trusted-ca-file=$SERVER_CACERT \
+        --peer-client-cert-auth=true \
+        --advertise-client-urls=https://${NODEIP}:2379 \
+        --listen-client-urls=https://${LISTEN_CLIENT_URLS}:2379,unixs://${NODEIP}:0 \
+        --listen-peer-urls=https://${LISTEN_PEER_URLS}:2380 \
+        --metrics=extensive \
+        --listen-metrics-urls=https://${LISTEN_METRICS_URLS}:9978"
+    if [ -n "$OCF_RESKEY_run_cmd_opts" ]; then
+        OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd $OCF_RESKEY_run_cmd_opts"
+    fi
+
+    if is_force_new_cluster; then
+        OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd --force-new-cluster"
+    fi
+
+    if [ "$OCF_RESKEY_image" = "$OCF_RESKEY_image_default" ]; then
+        # no container image provided via input parameters. Read it from the pod manifest.
+        OCF_RESKEY_image=$(jq -r '.spec.containers[] | select( .name=="etcd").image' "$OCF_RESKEY_pod_manifest")
+        ocf_log info "using container image ($OCF_RESKEY_image) from Pod manifest ($OCF_RESKEY_pod_manifest)"
+    else
+        # use the container image provided as input parameter
+        ocf_log info "using container image ($OCF_RESKEY_image) via input parameters"
+    fi
+
+    if [ $REQUIRE_IMAGE_PULL -eq 1 ]; then
+        ocf_log notice "Beginning pull of image, ${OCF_RESKEY_image}"
+        if ! podman pull --authfile="$OCF_RESKEY_authfile" "${OCF_RESKEY_image}"; then
+            ocf_exit_reason "failed to pull image ${OCF_RESKEY_image}"
+            return $OCF_ERR_GENERIC
+        fi
+    else
+        ocf_log notice "Pull image not required, ${OCF_RESKEY_image}"
+    fi
+
+    if ocf_is_true "$OCF_RESKEY_reuse" && container_exists; then
+        ocf_log info "starting existing container $CONTAINER."
+        ocf_run podman start "$CONTAINER"
+        rc=$?
+    else
+        # make sure any previous container matching our container name is cleaned up first.
+        # we already know at this point it wouldn't be running
+        remove_container
+        run_new_container "$run_opts" "$OCF_RESKEY_image" "$OCF_RESKEY_run_cmd"
+        rc=$?
+        if [ $rc -eq 125 ]; then
+            return $OCF_ERR_GENERIC
+        fi
+    fi
+
+    # if the container was stopped or didn't exist before, systemd
+    # removed the libpod* scopes. So always try to recreate the drop-ins
+    if [ $rc -eq 0 ] && ocf_is_true "$OCF_RESKEY_drop_in_dependency"; then
+        cid=$(podman_container_id)
+        create_transient_drop_in_dependency "$cid"
+        rc=$?
+    fi
+
+    if [ $rc -ne 0 ]; then
+        ocf_exit_reason "podman failed to launch container (error code: $rc)"
+        return $OCF_ERR_GENERIC
+    fi
+
+    # wait for the monitor to pass before declaring that the container is started
+    while true; do
+        podman_simple_status
+        if [ $? -ne $OCF_SUCCESS ]; then
+            ocf_exit_reason "Newly created podman container exited after start"
+            ocf_run podman logs --tail 20 "${CONTAINER}"
+            return $OCF_ERR_GENERIC
+        fi
+
+        monitor_cmd_exec
+        if [ $? -eq $OCF_SUCCESS ]; then
+            ocf_log notice "Container $CONTAINER started successfully"
+            if is_force_new_cluster; then
+                clear_force_new_cluster
+
+                local peer_node_name
+                local peer_node_ip
+                peer_node_name="$(get_peer_node_name)"
+                peer_node_ip="$(attribute_node_ip_peer)"
+                if [ -n "$peer_node_name" ] && [ -n "$peer_node_ip" ]; then
+                    add_member_as_learner "$peer_node_name" "$peer_node_ip"
+                else
+                    ocf_log err "could not add peer as learner (peer node name: ${peer_node_name:-unknown}, peer ip: ${peer_node_ip:-unknown})"
+                fi
+            fi
+            return $OCF_SUCCESS
+        fi
+
+        ocf_exit_reason "waiting on monitor_cmd to pass after start"
+        sleep 1
+    done
+}
+
+podman_stop()
+{
+    local timeout=60
+    local rc
+    podman_simple_status
+    if [ $? -eq $OCF_NOT_RUNNING ]; then
+        remove_container
+        ocf_log info "could not leave members list: etcd container not running"
+        return $OCF_SUCCESS
+    fi
+
+    attribute_node_revision update
+    attribute_node_cluster_id update
+
+    if ! member_id=$(attribute_node_member_id get); then
+        ocf_log err "error leaving members list: could not get member-id"
+    else
+        # TODO: is it worth/possible to check the current status instead of relying on cached attributes?
+        if is_standalone; then
+            ocf_log info "last member. Not leaving the member list"
+        else
+            ocf_log info "leaving members list as member with ID $member_id"
+            endpoint="https://$(attribute_node_ip get):2379"
+            if ! ocf_run podman exec "$CONTAINER" etcdctl member remove "$member_id" --endpoints="$endpoint"; then
+                ocf_log err "error leaving members list"
+            fi
+        fi
+    fi
+    attribute_node_member_id clear
+
+    if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
+        timeout=$(( (OCF_RESKEY_CRM_meta_timeout/1000) - 10 ))
+        if [ $timeout -lt 10 ]; then
+            timeout=10
+        fi
+    fi
+
+    if ocf_is_true "$OCF_RESKEY_force_kill"; then
+        ocf_run podman kill "$CONTAINER"
+        rc=$?
+    else
+        ocf_log debug "waiting $timeout second[s] before killing container"
+        ocf_run podman stop -t="$timeout" "$CONTAINER"
+        rc=$?
+        # on stop, systemd will automatically delete any transient
+        # drop-in conf that has been created earlier
+    fi
+
+    if [ $rc -ne 0 ]; then
+        # If the stop failed, it could be because the controlling conmon
+        # process died unexpectedly. If so, a generic error code is returned
+        # but the associated container exit code is -1. If that's the case,
+        # assume there's no failure and continue with the rm as usual.
+        if [ $rc -eq 125 ] && \
+            podman inspect --format '{{.State.Status}}:{{.State.ExitCode}}' "$CONTAINER" | grep -Eq '^(exited|stopped):-1$'; then
+            ocf_log err "Container ${CONTAINER} had an unexpected stop outcome. Trying to remove it anyway."
+        else
+            ocf_exit_reason "Failed to stop container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}."
+            return $OCF_ERR_GENERIC
+        fi
+    fi
+
+    if ! remove_container; then
+        ocf_exit_reason "Failed to remove stopped container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}."
+        return $OCF_ERR_GENERIC
+    fi
+
+    return $OCF_SUCCESS
+}
+
+image_exists()
+{
+    if [ "$OCF_RESKEY_image" = "$OCF_RESKEY_image_default" ]; then
+        # the actual container image has not been defined yet, neither by
+        # the user via OCF_RESKEY nor by reading the Pod manifest
+        return 0
+    fi
+    if podman image exists "${OCF_RESKEY_image}"; then
+        # image found
+        return 0
+    fi
+
+    if ocf_is_true "$OCF_RESKEY_allow_pull"; then
+        REQUIRE_IMAGE_PULL=1
+        ocf_log notice "Image (${OCF_RESKEY_image}) does not exist locally but will be pulled during start"
+        return 0
+    fi
+    # image not found.
+    return 1
+}
+
+podman_validate()
+{
+    check_binary curl
+    check_binary crictl
+    check_binary oc
+    check_binary podman
+    check_binary jq
+
+    if [ -z "$OCF_RESKEY_node_ip_map" ]; then
+        ocf_exit_reason "'node_ip_map' option is required"
+        exit $OCF_ERR_CONFIGURED
+    fi
+
+    if [ -z "$OCF_RESKEY_pod_manifest" ]; then
+        ocf_exit_reason "'pod_manifest' option is required"
+        exit $OCF_ERR_CONFIGURED
+    fi
+
+    if [ -z "$OCF_RESKEY_image" ]; then
+        ocf_exit_reason "'image' option is required"
+        exit $OCF_ERR_CONFIGURED
+    fi
+
+    if ! image_exists; then
+        ocf_exit_reason "base image, ${OCF_RESKEY_image}, could not be found."
+        exit $OCF_ERR_CONFIGURED
+    fi
+
+    return $OCF_SUCCESS
+}
+
+podman_notify()
+{
+    ocf_log info "notify: type=${OCF_RESKEY_CRM_meta_notify_type}, operation=${OCF_RESKEY_CRM_meta_notify_operation}, nodes { active=[${OCF_RESKEY_CRM_meta_notify_active_uname}], start=[${OCF_RESKEY_CRM_meta_notify_start_uname}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_uname}] }, resources { active=[${OCF_RESKEY_CRM_meta_notify_active_resource}], start=[${OCF_RESKEY_CRM_meta_notify_start_resource}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_resource}] }"
+}
+
+# TODO:
+# When a user starts multiple clones on a node with globally-unique=true, the same name parameter cannot be used for all of them.
+# When a user enables reuse, the resource agent cannot associate multiple clones with a single container.
+
+if ocf_is_true "$OCF_RESKEY_CRM_meta_globally_unique"; then
+    if [ -n "$OCF_RESKEY_name" ]; then
+        if [ -n "$OCF_RESKEY_CRM_meta_clone_node_max" ] && [ "$OCF_RESKEY_CRM_meta_clone_node_max" -ne 1 ]
+        then
+            ocf_exit_reason "Cannot make plural clones from the same name parameter."
+            exit $OCF_ERR_CONFIGURED
+        fi
+        if [ -n "$OCF_RESKEY_CRM_meta_master_node_max" ] && [ "$OCF_RESKEY_CRM_meta_master_node_max" -ne 1 ]
+        then
+            ocf_exit_reason "Cannot make plural master from the same name parameter."
+            exit $OCF_ERR_CONFIGURED
+        fi
+    fi
+    : ${OCF_RESKEY_name=$(echo ${OCF_RESOURCE_INSTANCE} | tr ':' '-')}
+else
+    : ${OCF_RESKEY_name=${OCF_RESOURCE_INSTANCE}}
+fi
+
+CONTAINER=$OCF_RESKEY_name
+
+# Note: we currently monitor podman containers with the "podman exec"
+# command, so make sure that invocation is always valid by enforcing the
+# exec command to be non-empty
+: ${OCF_RESKEY_monitor_cmd:=/bin/true}
+
+# When OCF_RESKEY_drop_in_dependency is not populated, we
+# look at another file-based way of enabling the option.
+# Otherwise, consider it disabled.
+if [ -z "$OCF_RESKEY_drop_in_dependency" ]; then
+    if [ -f "/etc/sysconfig/podman_drop_in" ] || \
+        [ -f "/etc/default/podman_drop_in" ]; then
+        OCF_RESKEY_drop_in_dependency=yes
+    fi
+fi
+
+
+case $__OCF_ACTION in
+meta-data) meta_data
+    exit $OCF_SUCCESS;;
+usage|help) podman_usage
+    exit $OCF_SUCCESS
+    ;;
+esac
+
+NODENAME=$(ocf_local_nodename)
+JOIN_AS_LEARNER=false
+
+case $__OCF_ACTION in
+start)
+    podman_validate || exit $?
+    podman_start;;
+stop) podman_stop;;
+monitor) podman_monitor;;
+notify) podman_notify;;
+validate-all) podman_validate;;
+*) podman_usage
+    exit $OCF_ERR_UNIMPLEMENTED
+    ;;
+esac
+rc=$?
+ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
+exit $rc