diff --git a/SOURCES/RHEL-40589-azure-events-az-update-API-versions-add-retry-for-metadata.patch b/SOURCES/RHEL-40589-azure-events-az-update-API-versions-add-retry-for-metadata.patch new file mode 100644 index 0000000..6507266 --- /dev/null +++ b/SOURCES/RHEL-40589-azure-events-az-update-API-versions-add-retry-for-metadata.patch @@ -0,0 +1,333 @@ +From 7739c2a802c1dddb6757ff75cf7f6582a89bd518 Mon Sep 17 00:00:00 2001 +From: id +Date: Fri, 31 May 2024 09:00:18 +0200 +Subject: [PATCH] azure-events-az: update to API versions, add retry + functionality for metadata requests, update tests + +--- + heartbeat/azure-events-az.in | 117 ++++++++++++++++++++++++----------- + heartbeat/ocf.py | 50 +++++++++++++-- + 2 files changed, 126 insertions(+), 41 deletions(-) + +diff --git a/heartbeat/azure-events-az.in b/heartbeat/azure-events-az.in +index 46d4d1f3d9..6d31e5abae 100644 +--- a/heartbeat/azure-events-az.in ++++ b/heartbeat/azure-events-az.in +@@ -27,7 +27,7 @@ import ocf + ############################################################################## + + +-VERSION = "0.10" ++VERSION = "0.20" + USER_AGENT = "Pacemaker-ResourceAgent/%s %s" % (VERSION, ocf.distro()) + + attr_globalPullState = "azure-events-az_globalPullState" +@@ -39,9 +39,6 @@ attr_healthstate = "#health-azure" + default_loglevel = ocf.logging.INFO + default_relevantEventTypes = set(["Reboot", "Redeploy"]) + +-global_pullMaxAttempts = 3 +-global_pullDelaySecs = 1 +- + ############################################################################## + + class attrDict(defaultdict): +@@ -71,16 +68,22 @@ class azHelper: + metadata_host = "http://169.254.169.254/metadata" + instance_api = "instance" + events_api = "scheduledevents" +- api_version = "2019-08-01" ++ events_api_version = "2020-07-01" ++ instance_api_version = "2021-12-13" + + @staticmethod +- def _sendMetadataRequest(endpoint, postData=None): ++ def _sendMetadataRequest(endpoint, postData=None, api_version="2019-08-01"): + """ + Send a request to Azure's Azure Metadata Service API + """ +- url = "%s/%s?api-version=%s" % (azHelper.metadata_host, endpoint, azHelper.api_version) ++ ++ retryCount = int(ocf.get_parameter("retry_count",3)) ++ retryWaitTime = int(ocf.get_parameter("retry_wait",20)) ++ requestTimeout = int(ocf.get_parameter("request_timeout",15)) ++ ++ url = "%s/%s?api-version=%s" % (azHelper.metadata_host, endpoint, api_version) + data = "" +- ocf.logger.debug("_sendMetadataRequest: begin; endpoint = %s, postData = %s" % (endpoint, postData)) ++ ocf.logger.debug("_sendMetadataRequest: begin; endpoint = %s, postData = %s, retry_count = %s, retry_wait time = %s, request_timeout = %s" % (endpoint, postData, retryCount, retryWaitTime, requestTimeout)) + ocf.logger.debug("_sendMetadataRequest: url = %s" % url) + + if postData and type(postData) != bytes: +@@ -89,18 +92,37 @@ class azHelper: + req = urllib2.Request(url, postData) + req.add_header("Metadata", "true") + req.add_header("User-Agent", USER_AGENT) +- try: +- resp = urllib2.urlopen(req) +- except URLError as e: +- if hasattr(e, 'reason'): +- ocf.logger.warning("Failed to reach the server: %s" % e.reason) +- clusterHelper.setAttr(attr_globalPullState, "IDLE") +- elif hasattr(e, 'code'): +- ocf.logger.warning("The server couldn\'t fulfill the request. 
Error code: %s" % e.code) +- clusterHelper.setAttr(attr_globalPullState, "IDLE") +- else: +- data = resp.read() +- ocf.logger.debug("_sendMetadataRequest: response = %s" % data) ++ ++ if retryCount > 0: ++ ocf.logger.debug("_sendMetadataRequest: retry enabled") ++ ++ successful = None ++ for retry in range(retryCount+1): ++ try: ++ resp = urllib2.urlopen(req, timeout=requestTimeout) ++ except Exception as e: ++ excType = e.__class__.__name__ ++ if excType == TimeoutError.__name__: ++ ocf.logger.warning("Request timed out after %s seconds Error: %s" % (requestTimeout, e)) ++ if excType == URLError.__name__: ++ if hasattr(e, 'reason'): ++ ocf.logger.warning("Failed to reach the server: %s" % e.reason) ++ elif hasattr(e, 'code'): ++ ocf.logger.warning("The server couldn\'t fulfill the request. Error code: %s" % e.code) ++ ++ if retryCount > 1 and retry != retryCount: ++ ocf.logger.warning("Request failed, retry (%s/%s) wait %s seconds before retry (wait time)" % (retry + 1,retryCount,retryWaitTime)) ++ time.sleep(retryWaitTime) ++ ++ else: ++ data = resp.read() ++ ocf.logger.debug("_sendMetadataRequest: response = %s" % data) ++ successful = 1 ++ break ++ ++ # When no request was successful also with retry enabled, set the cluster to idle ++ if successful is None: ++ clusterHelper.setAttr(attr_globalPullState, "IDLE") + + if data: + data = json.loads(data) +@@ -115,14 +137,15 @@ class azHelper: + """ + ocf.logger.debug("getInstanceInfo: begin") + +- jsondata = azHelper._sendMetadataRequest(azHelper.instance_api) ++ jsondata = azHelper._sendMetadataRequest(azHelper.instance_api, None, azHelper.instance_api_version) + ocf.logger.debug("getInstanceInfo: json = %s" % jsondata) + + if jsondata: + ocf.logger.debug("getInstanceInfo: finished, returning {}".format(jsondata["compute"])) + return attrDict(jsondata["compute"]) + else: +- ocf.ocf_exit_reason("getInstanceInfo: Unable to get instance info") ++ apiCall = "%s/%s?api-version=%s" % (azHelper.metadata_host, azHelper.instance_api, azHelper.instance_api_version) ++ ocf.ocf_exit_reason("getInstanceInfo: Unable to get instance info - call: %s" % apiCall) + sys.exit(ocf.OCF_ERR_GENERIC) + + @staticmethod +@@ -132,11 +155,17 @@ class azHelper: + """ + ocf.logger.debug("pullScheduledEvents: begin") + +- jsondata = azHelper._sendMetadataRequest(azHelper.events_api) ++ jsondata = azHelper._sendMetadataRequest(azHelper.events_api, None, azHelper.events_api_version) + ocf.logger.debug("pullScheduledEvents: json = %s" % jsondata) + +- ocf.logger.debug("pullScheduledEvents: finished") +- return attrDict(jsondata) ++ if jsondata: ++ ocf.logger.debug("pullScheduledEvents: finished") ++ return attrDict(jsondata) ++ else: ++ apiCall = "%s/%s?api-version=%s" % (azHelper.metadata_host, azHelper.events_api, azHelper.events_api_version) ++ ocf.ocf_exit_reason("pullScheduledEvents: Unable to get scheduledevents info - call: %s" % apiCall) ++ sys.exit(ocf.OCF_ERR_GENERIC) ++ + + @staticmethod + def forceEvents(eventIDs): +@@ -534,7 +563,7 @@ class Node: + except ValueError: + # Handle the exception + ocf.logger.warn("Health attribute %s on node %s cannot be converted to an integer value" % (healthAttributeStr, node)) +- ++ + ocf.logger.debug("isNodeInStandby: finished - result %s" % isInStandy) + return isInStandy + +@@ -584,7 +613,7 @@ class raAzEvents: + + def monitor(self): + ocf.logger.debug("monitor: begin") +- ++ + events = azHelper.pullScheduledEvents() + + # get current document version +@@ -600,21 +629,21 @@ class raAzEvents: + ocf.logger.info("monitor: 
already handled curDocVersion, skip") + return ocf.OCF_SUCCESS + +- localAzEventIDs = set() ++ localAzEventIds = dict() + for e in localEvents: +- localAzEventIDs.add(e.EventId) ++ localAzEventIds[e.EventId] = json.dumps(e) + + curState = self.node.getState() + clusterEventIDs = self.node.getEventIDs() + + ocf.logger.debug("monitor: curDocVersion has not been handled yet") +- ++ + if clusterEventIDs: + # there are pending events set, so our state must be STOPPING or IN_EVENT + i = 0; touchedEventIDs = False + while i < len(clusterEventIDs): + # clean up pending events that are already finished according to AZ +- if clusterEventIDs[i] not in localAzEventIDs: ++ if clusterEventIDs[i] not in localAzEventIds.keys(): + ocf.logger.info("monitor: remove finished local clusterEvent %s" % (clusterEventIDs[i])) + clusterEventIDs.pop(i) + touchedEventIDs = True +@@ -644,12 +673,12 @@ class raAzEvents: + ocf.logger.info("monitor: all local events finished, but some resources have not completed startup yet -> wait") + else: + if curState == AVAILABLE: +- if len(localAzEventIDs) > 0: ++ if len(localAzEventIds) > 0: + if clusterHelper.otherNodesAvailable(self.node): +- ocf.logger.info("monitor: can handle local events %s -> set state STOPPING" % (str(localAzEventIDs))) +- curState = self.node.updateNodeStateAndEvents(STOPPING, localAzEventIDs) ++ ocf.logger.info("monitor: can handle local events %s -> set state STOPPING - %s" % (str(list(localAzEventIds.keys())), str(list(localAzEventIds.values())))) ++ curState = self.node.updateNodeStateAndEvents(STOPPING, localAzEventIds.keys()) + else: +- ocf.logger.info("monitor: cannot handle azEvents %s (only node available) -> set state ON_HOLD" % str(localAzEventIDs)) ++ ocf.logger.info("monitor: cannot handle azEvents %s (only node available) -> set state ON_HOLD - %s" % (str(list(localAzEventIds.keys())), str(list(localAzEventIds.values())))) + self.node.setState(ON_HOLD) + else: + ocf.logger.debug("monitor: no local azEvents to handle") +@@ -761,6 +790,24 @@ def main(): + longdesc="Set to true to enable verbose logging", + content_type="boolean", + default="false") ++ agent.add_parameter( ++ "retry_count", ++ shortdesc="Azure IMDS webservice retry count", ++ longdesc="Set to any number bigger than zero to enable retry count", ++ content_type="integer", ++ default="3") ++ agent.add_parameter( ++ "retry_wait", ++ shortdesc="Configure a retry wait time", ++ longdesc="Set retry wait time in seconds", ++ content_type="integer", ++ default="20") ++ agent.add_parameter( ++ "request_timeout", ++ shortdesc="Configure a request timeout", ++ longdesc="Set request timeout in seconds", ++ content_type="integer", ++ default="15") + agent.add_action("start", timeout=10, handler=lambda: ocf.OCF_SUCCESS) + agent.add_action("stop", timeout=10, handler=lambda: ocf.OCF_SUCCESS) + agent.add_action("validate-all", timeout=20, handler=validate_action) +diff --git a/heartbeat/ocf.py b/heartbeat/ocf.py +index dda2fed4bb..571cd19664 100644 +--- a/heartbeat/ocf.py ++++ b/heartbeat/ocf.py +@@ -16,7 +16,7 @@ + # You should have received a copy of the GNU Lesser General Public + # License along with this library; if not, write to the Free Software + # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-# ++# + + import sys, os, logging, syslog + +@@ -42,19 +42,19 @@ + # OCF does not include the concept of master/slave resources so we + # need to extend it so we can discover a resource's complete state. 
+ # +-# OCF_RUNNING_MASTER: ++# OCF_RUNNING_MASTER: + # The resource is in "master" mode and fully operational + # OCF_FAILED_MASTER: + # The resource is in "master" mode but in a failed state +-# ++# + # The extra two values should only be used during a probe. + # + # Probes are used to discover resources that were started outside of + # the CRM and/or left behind if the LRM fails. +-# ++# + # They can be identified in RA scripts by checking for: + # [ "${__OCF_ACTION}" = "monitor" -a "${OCF_RESKEY_CRM_meta_interval}" = "0" ] +-# ++# + # Failed "slaves" should continue to use: OCF_ERR_GENERIC + # Fully operational "slaves" should continue to use: OCF_SUCCESS + # +@@ -451,15 +451,17 @@ def value_for_parameter(param): + sys.exit(OCF_ERR_UNIMPLEMENTED) + + ++ + if __name__ == "__main__": + import unittest ++ import logging + + class TestMetadata(unittest.TestCase): + def test_noparams_noactions(self): + m = Agent("foo", shortdesc="shortdesc", longdesc="longdesc") + self.assertEqual(""" + +- ++ + 1.0 + + longdesc +@@ -483,4 +485,40 @@ def test_params_actions(self): + m.add_action("start") + self.assertEqual(str(m.actions[0]), '\n') + ++ def test_retry_params_actions(self): ++ log= logging.getLogger( "test_retry_params_actions" ) ++ ++ m = Agent("foo", shortdesc="shortdesc", longdesc="longdesc") ++ m.add_parameter( ++ "retry_count", ++ shortdesc="Azure ims webservice retry count", ++ longdesc="Set to any number bigger than zero to enable retry count", ++ content_type="integer", ++ default="0") ++ m.add_parameter( ++ "retry_wait", ++ shortdesc="Configure a retry wait time", ++ longdesc="Set retry wait time in seconds", ++ content_type="integer", ++ default="20") ++ m.add_parameter( ++ "request_timeout", ++ shortdesc="Configure a request timeout", ++ longdesc="Set request timeout in seconds", ++ content_type="integer", ++ default="15") ++ ++ m.add_action("start") ++ ++ log.debug( "actions= %s", str(m.actions[0] )) ++ self.assertEqual(str(m.actions[0]), '\n') ++ ++ log.debug( "parameters= %s", str(m.parameters[0] )) ++ log.debug( "parameters= %s", str(m.parameters[1] )) ++ log.debug( "parameters= %s", str(m.parameters[2] )) ++ self.assertEqual(str(m.parameters[0]), '\nSet to any number bigger than zero to enable retry count\nAzure ims webservice retry count\n\n\n') ++ self.assertEqual(str(m.parameters[1]), '\nSet retry wait time in seconds\nConfigure a retry wait time\n\n\n') ++ self.assertEqual(str(m.parameters[2]), '\nSet request timeout in seconds\nConfigure a request timeout\n\n\n') ++ ++ logging.basicConfig( stream=sys.stderr ) + unittest.main() diff --git a/SOURCES/RHEL-42513-powervs-subnet-new-ra.patch b/SOURCES/RHEL-42513-powervs-subnet-new-ra.patch new file mode 100644 index 0000000..11a0865 --- /dev/null +++ b/SOURCES/RHEL-42513-powervs-subnet-new-ra.patch @@ -0,0 +1,1165 @@ +diff --color -uNr a/configure.ac b/configure.ac +--- a/configure.ac 2024-10-18 12:30:47.834626309 +0200 ++++ b/configure.ac 2024-10-18 12:32:15.620672697 +0200 +@@ -1010,6 +1010,7 @@ + AC_CONFIG_FILES([heartbeat/mpathpersist], [chmod +x heartbeat/mpathpersist]) + AC_CONFIG_FILES([heartbeat/nfsnotify], [chmod +x heartbeat/nfsnotify]) + AC_CONFIG_FILES([heartbeat/openstack-info], [chmod +x heartbeat/openstack-info]) ++AC_CONFIG_FILES([heartbeat/powervs-subnet], [chmod +x heartbeat/powervs-subnet]) + AC_CONFIG_FILES([heartbeat/rabbitmq-cluster], [chmod +x heartbeat/rabbitmq-cluster]) + AC_CONFIG_FILES([heartbeat/redis], [chmod +x heartbeat/redis]) + AC_CONFIG_FILES([heartbeat/rsyslog], [chmod +x heartbeat/rsyslog]) 
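The azure-events-az patch above wraps the IMDS call in bounded retries with a configurable wait and per-request timeout. A minimal sketch of that retry pattern, assuming only Python's standard urllib (the defaults mirror the agent's retry_count/retry_wait/request_timeout parameters; the function name is illustrative, not the agent's API):

    import json
    import time
    import urllib.error
    import urllib.request

    METADATA_URL = "http://169.254.169.254/metadata/instance?api-version=2021-12-13"

    def fetch_metadata(retry_count=3, retry_wait=20, request_timeout=15):
        """Query the Azure IMDS endpoint, retrying on failure as the agent does."""
        req = urllib.request.Request(METADATA_URL, headers={"Metadata": "true"})
        for attempt in range(retry_count + 1):
            try:
                with urllib.request.urlopen(req, timeout=request_timeout) as resp:
                    return json.loads(resp.read())
            except (urllib.error.URLError, TimeoutError) as e:
                if attempt == retry_count:
                    raise  # out of retries; the caller decides how to degrade
                print("attempt %d/%d failed (%s), sleeping %ds"
                      % (attempt + 1, retry_count, e, retry_wait))
                time.sleep(retry_wait)

Only after every attempt has failed does the agent mark the cluster attribute IDLE, which is why the sketch re-raises on the final attempt instead of swallowing the error.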
+diff --color -uNr a/doc/man/Makefile.am b/doc/man/Makefile.am +--- a/doc/man/Makefile.am 2024-10-18 12:30:47.801625540 +0200 ++++ b/doc/man/Makefile.am 2024-10-18 12:33:57.763053742 +0200 +@@ -190,6 +190,7 @@ + ocf_heartbeat_portblock.7 \ + ocf_heartbeat_postfix.7 \ + ocf_heartbeat_pound.7 \ ++ ocf_heartbeat_powervs-subnet.7 \ + ocf_heartbeat_proftpd.7 \ + ocf_heartbeat_rabbitmq-cluster.7 \ + ocf_heartbeat_redis.7 \ +diff --color -uNr a/.gitignore b/.gitignore +--- a/.gitignore 2024-10-18 12:30:47.801625540 +0200 ++++ b/.gitignore 2024-10-18 10:45:57.222895499 +0200 +@@ -22,6 +22,7 @@ + make/stamp-h1 + make/clusterautoconfig.h* + missing ++resource-agents.spec + *.pc + .deps + .libs +@@ -76,6 +77,7 @@ + heartbeat/mpathpersist + heartbeat/nfsnotify + heartbeat/openstack-info ++heartbeat/powervs-subnet + heartbeat/rabbitmq-cluster + heartbeat/redis + heartbeat/rsyslog +diff --color -uNr a/heartbeat/Makefile.am b/heartbeat/Makefile.am +--- a/heartbeat/Makefile.am 2024-10-18 12:30:47.801625540 +0200 ++++ b/heartbeat/Makefile.am 2024-10-18 12:33:02.884774474 +0200 +@@ -165,6 +165,7 @@ + portblock \ + postfix \ + pound \ ++ powervs-subnet \ + proftpd \ + rabbitmq-cluster \ + redis \ +diff --color -uNr a/heartbeat/powervs-subnet.in b/heartbeat/powervs-subnet.in +--- a/heartbeat/powervs-subnet.in 1970-01-01 01:00:00.000000000 +0100 ++++ b/heartbeat/powervs-subnet.in 2024-10-18 12:31:09.071121354 +0200 +@@ -0,0 +1,1109 @@ ++#!@PYTHON@ -tt ++# ------------------------------------------------------------------------ ++# Description: Resource Agent to move a Power Virtual Server subnet ++# and its IP address from one virtual server instance ++# to another. ++# ++# Authors: Edmund Haefele ++# Walter Orb ++# ++# Copyright (c) 2024 International Business Machines, Inc. ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. 
++# ------------------------------------------------------------------------ ++ ++import ipaddress ++import json ++import math ++import os ++import re ++import socket ++import subprocess ++import sys ++import textwrap ++import time ++ ++import requests ++import requests.adapters ++import urllib3.util ++ ++OCF_FUNCTIONS_DIR = os.environ.get( ++ "OCF_FUNCTIONS_DIR", "%s/lib/heartbeat" % os.environ.get("OCF_ROOT") ++) ++ ++sys.path.append(OCF_FUNCTIONS_DIR) ++ ++try: ++ import ocf ++except ImportError: ++ sys.stderr.write("ImportError: ocf module import failed.") ++ sys.exit(5) ++ ++ ++class PowerCloudAPIError(Exception): ++ def __init__(self, message, exit_code): ++ ocf.ocf_exit_reason(message) ++ sys.exit(exit_code) ++ ++ ++class nmcli: ++ """A wrapper class to run nmcli system commands.""" ++ ++ NMCLI_SYSTEM_CMD = ["nmcli", "-t"] ++ CONN_PREFIX = "VIP_" ++ DEV_PREFIX = "env" ++ ROUTING_PRIO = 50 ++ ROUTING_TABLE = ocf.get_parameter("route_table", 500) ++ _WAIT_FOR_NIC_SLEEP = 3 ++ ++ def __init__(self): ++ """Class implements only classmethods or staticmethods, instantiation is not used.""" ++ pass ++ ++ @classmethod ++ def _nmcli_os_cmd(cls, nmcli_args): ++ """run os nmcli command with the specified arguments. ++ ++ Returns the output as a dictionary. ++ """ ++ ++ ocf.logger.debug("_nmcli_os_cmd: args: {}".format(nmcli_args)) ++ output = None ++ try: ++ result = subprocess.run( ++ cls.NMCLI_SYSTEM_CMD + nmcli_args, ++ capture_output=True, ++ text=True, ++ check=True, ++ env={"LANG": "C"}, ++ ) ++ if len(nmcli_args) == 1 or nmcli_args[0] == "-g" or nmcli_args[1] == "show": ++ # return output as dict ++ output = dict( ++ item.split(":", 1) ++ for item in result.stdout.rstrip().splitlines() ++ if ":" in item ++ ) ++ except subprocess.CalledProcessError as e: ++ raise PowerCloudAPIError( ++ f"_nmcli_os_cmd: error executing nmcli: {e.stderr}", ++ ocf.OCF_ERR_GENERIC, ++ ) ++ ++ return output ++ ++ @classmethod ++ def _nmcli_cmd(cls, command, subcommand=None, name=None, **kwargs): ++ """Prepare arguments to call nmcli command.""" ++ ++ ocf.logger.debug( ++ f"_nmcli_cmd: args: command: {command}, subcommand: {subcommand}, name: {name}" ++ ) ++ if command in ["connection", "device"]: ++ nmcli_args = [command] ++ else: ++ raise PowerCloudAPIError( ++ f"_nmcli_cmd: nmcli {command} not implemented", ++ ocf.OCF_ERR_GENERIC, ++ ) ++ if name: ++ if subcommand in ("show", "delete", "down", "up"): ++ nmcli_args += [subcommand, name] ++ elif subcommand == "add": ++ nmcli_args += [subcommand, "type", "ethernet", "con-name", name] ++ else: ++ raise PowerCloudAPIError( ++ f"_nmcli_cmd: nmcli {command} {subcommand} not implemented", ++ ocf.OCF_ERR_GENERIC, ++ ) ++ elif subcommand in ("add", "delete", "down", "up"): ++ raise PowerCloudAPIError( ++ f"_nmcli_cmd: name argument required for nmcli {command} {subcommand}", ++ ocf.OCF_ERR_GENERIC, ++ ) ++ ++ options = kwargs.get("options", {}) ++ for k, v in options.items(): ++ nmcli_args += [k, v] ++ ++ return cls._nmcli_os_cmd(nmcli_args) ++ ++ @classmethod ++ def _nmcli_find(cls, command, match_key, match_value): ++ """Find the network object whose attribute with the specified key matches the specified value.""" ++ ++ ocf.logger.debug( ++ f"_nmcli_find: args: command: {command}, key: {match_key}, value: {match_value}" ++ ) ++ ++ nm_object = None ++ for name in cls._nmcli_cmd(command=command, subcommand="show"): ++ if not re.search(f"({cls.CONN_PREFIX})?{cls.DEV_PREFIX}", name): ++ # check only connections or devices with device prefix in name ++ continue ++ 
obj_attrs = cls._nmcli_cmd(command=command, subcommand="show", name=name) ++ if re.search(match_value, obj_attrs.get(match_key, "")): ++ ocf.logger.debug(f"_nmcli_find: found match: name: {name}") ++ nm_object = obj_attrs ++ break ++ ++ return nm_object ++ ++ @classmethod ++ def cleanup(cls): ++ """Clean up orphaned Network Manager connections.""" ++ ++ connections = cls._nmcli_os_cmd(["-g", "UUID,NAME,ACTIVE", "connection"]) ++ for uuid in connections: ++ name, active = connections[uuid].split(":") ++ if active == "no" and name.startswith(f"{cls.CONN_PREFIX}{cls.DEV_PREFIX}"): ++ ocf.logger.debug(f"nmcli.cleanup: delete orphaned connection {name}") ++ nmcli.connection.delete(uuid) ++ ++ @classmethod ++ def wait_for_nic(cls, mac, timeout=720): ++ """Wait for a NIC with a given MAC address to become available.""" ++ ++ ocf.logger.debug(f"wait_for_nic: args: mac: {mac}, timeout: {timeout} s") ++ mac_address = mac.upper() ++ retries = math.ceil((timeout * 0.95) / cls._WAIT_FOR_NIC_SLEEP) - 1 ++ for attempt in range(1, retries + 1): ++ try: ++ ocf.logger.debug( ++ f"wait_for_nic: waiting for nic with mac address {mac_address} ..." ++ ) ++ nm_object = cls._nmcli_find("device", "GENERAL.HWADDR", mac_address) ++ if nm_object: ++ break ++ finally: ++ time.sleep(cls._WAIT_FOR_NIC_SLEEP) ++ else: # no break ++ raise PowerCloudAPIError( ++ f"wait_for_nic: timeout while waiting for nic with MAC address {mac_address}", ++ ocf.OCF_ERR_GENERIC, ++ ) ++ ++ nic = nm_object.get("GENERAL.DEVICE") ++ wait_time = (attempt - 1) * cls._WAIT_FOR_NIC_SLEEP ++ ++ ocf.logger.info( ++ f"wait_for_nic: found network device {nic} with MAC address {mac_address} after waiting {wait_time} seconds" ++ ) ++ ++ return nic ++ ++ @classmethod ++ def find_gateway(cls, ip): ++ """Find the gateway address for a given IP.""" ++ ++ ocf.logger.debug(f"find_gateway: args: ip: {ip}") ++ ++ gateway = None ++ ip_address = ip.split("/")[0] ++ dev = cls._nmcli_find("device", "IP4.ADDRESS[1]", ip_address) ++ if dev: ++ # Sample IP4.ROUTE[2]: dst = 0.0.0.0/0, nh = 10.10.10.101, mt = 102, table=200 ++ # extract next hop (nh) value ++ ip4_route2 = dict( ++ item.split("=") ++ for item in dev["IP4.ROUTE[2]"].replace(" ", "").split(",") ++ ) ++ gateway = ip4_route2.get("nh", None) ++ ++ return gateway ++ ++ class connection: ++ """Provides methods to run nmcli connection commands.""" ++ ++ @staticmethod ++ def show(name=None, **kwargs): ++ return nmcli._nmcli_cmd("connection", "show", name, **kwargs) ++ ++ @staticmethod ++ def add(name, **kwargs): ++ return nmcli._nmcli_cmd("connection", "add", name, **kwargs) ++ ++ @staticmethod ++ def delete(name, **kwargs): ++ return nmcli._nmcli_cmd("connection", "delete", name, **kwargs) ++ ++ @staticmethod ++ def down(name, **kwargs): ++ return nmcli._nmcli_cmd("connection", "down", name, **kwargs) ++ ++ @staticmethod ++ def up(name, **kwargs): ++ return nmcli._nmcli_cmd("connection", "up", name, **kwargs) ++ ++ @staticmethod ++ def find(match_key, match_value): ++ return nmcli._nmcli_find("connection", match_key, match_value) ++ ++ class device: ++ """Provides methods to run nmcli device commands.""" ++ ++ @staticmethod ++ def show(name=None, **kwargs): ++ return nmcli._nmcli_cmd("device", "show", name, **kwargs) ++ ++ @staticmethod ++ def find(match_key, match_value): ++ return nmcli._nmcli_find("device", match_key, match_value) ++ ++ ++class PowerCloudAPI: ++ """Provides methods to manage Power Virtual Server resources through its REST API.""" ++ ++ _URL_IAM_GLOBAL = 
"https://iam.cloud.ibm.com/identity/token" ++ _URL_IAM_PRIVATE = "https://private.iam.cloud.ibm.com/identity/token" ++ _URL_API_PUBLIC = "https://{}.power-iaas.cloud.ibm.com" ++ _URL_API_PRIVATE = "https://private.{}.power-iaas.cloud.ibm.com" ++ _URL_API_BASE = "/pcloud/v1/cloud-instances/{}" ++ ++ _HTTP_MAX_RETRIES = 10 ++ _HTTP_BACKOFF_FACTOR = 0.4 ++ _HTTP_STATUS_FORCE_RETRIES = (500, 502, 503, 504) ++ _HTTP_RETRY_ALLOWED_METHODS = frozenset({"GET", "POST", "DELETE"}) ++ ++ _START_TIME = time.time() ++ _RESOURCE_ACTION_TIMEOUT = int( ++ int(os.environ.get("OCF_RESKEY_CRM_meta_timeout", 7200000)) / 1000 ++ ) ++ ++ def __init__( ++ self, ++ ip="", ++ cidr="", ++ subnet_name="", ++ api_key="", ++ api_type="", ++ region="", ++ crn_host_map="", ++ vsi_host_map="", ++ proxy="", ++ jumbo="", ++ use_remote_workspace=False, ++ ): ++ """Initialize class variables, including the API token, Cloud Resource Name (CRN), IBM Power Cloud API endpoint URL, and HTTP header.""" ++ ++ self._res_options = locals() ++ ++ self._validate_and_set_options() ++ self._set_api_key() ++ self._set_token() ++ self._set_header() ++ ++ self._instance_check_status() ++ self.network_id = self._subnet_search_by_cidr() ++ ++ def _rest_create_session(self): ++ """Create a request session with a retry strategy.""" ++ ++ # Define the retry strategy ++ retry_strategy = urllib3.util.Retry( ++ total=self._HTTP_MAX_RETRIES, # Maximum number of retries ++ status_forcelist=self._HTTP_STATUS_FORCE_RETRIES, # HTTP status codes to retry on ++ allowed_methods=self._HTTP_RETRY_ALLOWED_METHODS, # Allowed methods for retry operation ++ backoff_factor=self._HTTP_BACKOFF_FACTOR, # Sleep for {backoff factor} * (2 ** ({number of previous retries})) ++ ) ++ ++ # Create an HTTP adapter with the retry strategy and mount it to session ++ adapter = requests.adapters.HTTPAdapter(max_retries=retry_strategy) ++ ++ # Create a new session object ++ session = requests.Session() ++ session.mount("https://", adapter) ++ ++ self._session = session ++ ++ return session ++ ++ def _rest_api_call(self, method, resource, **kwargs): ++ """Perform a REST call to the specified URL.""" ++ ++ url = self._url + self._base + resource ++ method = method.upper() ++ ocf.logger.debug(f"_rest_api_call: {method} {resource}") ++ ++ session = self._session or self._rest_create_session() ++ ++ r = session.request( ++ method, url, headers=self._header, proxies=self._proxy, **kwargs ++ ) ++ if not r.ok: ++ raise PowerCloudAPIError( ++ f"_rest_api_call: {method} call {resource} to {url} failed with reason: {r.reason}, status code: {r.status_code}", ++ ocf.OCF_ERR_GENERIC, ++ ) ++ ++ return r.json() ++ ++ def _set_api_key(self): ++ """Store an API key in a class variable. ++ ++ api_key is a string. If the first character of the string is @, ++ the rest of the string is assumed to be the name of a file containing the API key. 
++ """ ++ ++ api_key = self._res_options["api_key"] ++ if api_key[0] == "@": ++ api_key_file = api_key[1:] ++ try: ++ with open(api_key_file, "r") as f: ++ # read the API key from a file ++ try: ++ keys = json.loads(f.read()) ++ # data seems to be in json format ++ # return the value of the item with the key 'Apikey' ++ # backward compatibility: In the past, the key name was 'apikey' ++ api_key = keys.get("Apikey", "") ++ if not api_key: ++ api_key = keys.get("apikey", "") ++ except ValueError: ++ # data is text, return as is ++ api_key = f.read().strip() ++ except FileNotFoundError: ++ raise PowerCloudAPIError( ++ f"_set_api_key: API key file '{api_key_file}' not found", ++ ocf.OCF_ERR_ARGS, ++ ) ++ ++ self._api_key = api_key ++ ++ def _set_token(self): ++ """Use the stored API key to obtain an IBM Cloud IAM access token.""" ++ ++ url = self._URL_IAM ++ ++ headers = { ++ "content-type": "application/x-www-form-urlencoded", ++ "accept": "application/json", ++ } ++ data = { ++ "grant_type": "urn:ibm:params:oauth:grant-type:apikey", ++ "apikey": f"{self._api_key}", ++ } ++ token_response = requests.post( ++ url, headers=headers, data=data, proxies=self._proxy ++ ) ++ if token_response.status_code != 200: ++ raise PowerCloudAPIError( ++ f"_set_token: failed to obtain token from IBM Cloud IAM: {token_response.status_code}", ++ ocf.OCF_ERR_GENERIC, ++ ) ++ ++ self._token = json.loads(token_response.text)["access_token"] ++ ++ def _set_header(self): ++ """Set the Cloud Resource Name (CRN), IBM Power Cloud API endpoint URL, and HTTP header.""" ++ ++ self._header = { ++ "Authorization": f"Bearer {self._token}", ++ "CRN": f"{self._crn}", ++ "Content-Type": "application/json", ++ } ++ ++ def _instance_check_status(self): ++ """Check if instance exists in workspace and log the current status.""" ++ ++ resource = f"/pvm-instances/{self.instance_id}" ++ instance = self._rest_api_call("GET", resource) ++ ++ server_name = instance["serverName"] ++ status = instance["status"] ++ health = instance["health"]["status"] ++ ++ if status == "SHUTOFF" or (status == "ACTIVE" and health == "OK"): ++ ocf.logger.debug( ++ f"_instance_check_status: OK server_name: {server_name}, status: {status}, health: {health}" ++ ) ++ else: ++ if not (self._ocf_action == "monitor"): ++ raise PowerCloudAPIError( ++ f"_instance_check_status: FAIL server_name: {server_name}, status: {status}, health: {health}", ++ ocf.OCF_ERR_GENERIC, ++ ) ++ ++ def _instance_subnet_is_attached(self): ++ """Check if a virtual server instance is connected to a specific subnet.""" ++ ++ for net in self._instance_subnet_list(): ++ if self.network_id == net["networkID"]: ++ return True ++ return False ++ ++ def _instance_subnet_get(self): ++ """Obtain information about a particular subnet connected to a virtual server instance.""" ++ ++ resource = f"/pvm-instances/{self.instance_id}/networks/{self.network_id}" ++ response = self._rest_api_call("GET", resource) ++ return response["networks"][0] ++ ++ def _instance_subnet_list(self): ++ """List all subnets connected to a virtual server instance.""" ++ ++ resource = f"/pvm-instances/{self.instance_id}/networks" ++ response = self._rest_api_call("GET", resource) ++ return response["networks"] ++ ++ def _instance_subnet_attach(self): ++ """Attach a subnet to a virtual server instance.""" ++ ++ data = ( ++ f'{{"networkID":"{self.network_id}","ipAddress":"{self.ip}"}}' ++ if self.ip ++ else f'{{"networkID":"{self.network_id}"}}' ++ ) ++ ++ resource = f"/pvm-instances/{self.instance_id}/networks/" ++ _ = 
self._rest_api_call("POST", resource, data=data) ++ ++ def _instance_subnet_detach(self): ++ """Detach a subnet from a virtual server instance.""" ++ ++ resource = f"/pvm-instances/{self.instance_id}/networks/{self.network_id}" ++ _ = self._rest_api_call("DELETE", resource) ++ ++ def _subnet_create(self): ++ """Create a subnet in the workspace.""" ++ ++ data = ( ++ f'{{"type":"vlan","cidr":"{self.cidr}","mtu":9000,"name":"{self.subnet_name}"}}' ++ if self.jumbo ++ else f'{{"type":"vlan","cidr":"{self.cidr}","name":"{self.subnet_name}"}}' ++ ) ++ resource = "/networks" ++ response = self._rest_api_call("POST", resource, data=data) ++ self.network_id = response["networkID"] ++ ++ def _subnet_delete(self): ++ """Delete a subnet in the workspace.""" ++ ++ resource = f"/networks/{self.network_id}" ++ _ = self._rest_api_call("DELETE", resource) ++ ++ def _subnet_get(self, network_id): ++ """Get information about a specific subnet in the workspace.""" ++ ++ resource = f"/networks/{network_id}" ++ response = self._rest_api_call("GET", resource) ++ return response ++ ++ def _subnet_list(self): ++ """List all subnets in the workspace.""" ++ ++ resource = "/networks/" ++ response = self._rest_api_call("GET", resource) ++ return response ++ ++ def _subnet_search_by_cidr(self): ++ """Find the subnet for a given CIDR.""" ++ ++ for network in self._subnet_list()["networks"]: ++ network_id = network["networkID"] ++ if self.cidr == self._subnet_get(network_id)["cidr"]: ++ return network_id ++ ++ return None ++ ++ def _subnet_port_get_all(self): ++ """Obtain information about the ports for a specific subnet.""" ++ ++ resource = f"/networks/{self.network_id}/ports" ++ response = self._rest_api_call("GET", resource) ++ return response["ports"] ++ ++ def _subnet_port_delete(self, port_id): ++ """Delete an orphaned port for a particular subnet.""" ++ ++ resource = f"/networks/{self.network_id}/ports/{port_id}" ++ _ = self._rest_api_call("DELETE", resource) ++ ++ def _subnet_port_get_reserved(self): ++ """Check if a port is already reserved on the subnet for the IP address.""" ++ ++ for port in self._subnet_port_get_all(): ++ if self.ip == port["ipAddress"]: ++ return port["portID"] ++ ++ return None ++ ++ def _validate_and_set_options(self): ++ """Validate the options of the resource agent and derive class variables from the options.""" ++ ++ self._ocf_action = os.environ.get("__OCF_ACTION") ++ if self._ocf_action is None and len(sys.argv) == 2: ++ self._ocf_action = sys.argv[1] ++ ++ ip = self._res_options["ip"] ++ try: ++ validated_ip = ipaddress.ip_address(ip) ++ except ValueError: ++ raise PowerCloudAPIError( ++ f"_validate_and_set_options: {ip} is not a valid IP address.", ++ ocf.OCF_ERR_CONFIGURED, ++ ) ++ self.ip = ip ++ ++ cidr = self._res_options["cidr"] ++ try: ++ validated_cidr = ipaddress.ip_network(cidr) ++ except ValueError: ++ raise PowerCloudAPIError( ++ f"_validate_and_set_options: {cidr} is not a valid CIDR notation.", ++ ocf.OCF_ERR_CONFIGURED, ++ ) ++ self.cidr = cidr ++ ++ if validated_ip not in validated_cidr: ++ raise PowerCloudAPIError( ++ f"_validate_and_set_options: {ip} is not in {cidr} range.", ++ ocf.OCF_ERR_CONFIGURED, ++ ) ++ ++ subnet_name = self._res_options["subnet_name"] ++ self.subnet_name = subnet_name if subnet_name else self.cidr ++ ++ crn_host_map = self._res_options["crn_host_map"] ++ try: ++ self._crn_host_map = dict( ++ item.split(":", 1) for item in crn_host_map.split(";") ++ ) ++ except ValueError: ++ raise PowerCloudAPIError( ++ f"_validate_and_set_options: 
crn_host_map: {crn_host_map} has an invalid format.", ++ ocf.OCF_ERR_CONFIGURED, ++ ) ++ ++ self._hostname = os.uname().nodename ++ if self._res_options["use_remote_workspace"]: ++ self._nodename = [k for k in self._crn_host_map if k != self._hostname][0] ++ else: ++ self._nodename = self._hostname ++ ++ if self._nodename not in self._crn_host_map: ++ raise PowerCloudAPIError( ++ f"_validate_and_set_options: {self._nodename} not found in crn_host_map: {crn_host_map}.", ++ ocf.OCF_ERR_ARGS, ++ ) ++ self._crn = self._crn_host_map[self._nodename] ++ ++ try: ++ self._cloud_instance_id = self._crn.split(":")[7] ++ except IndexError: ++ raise PowerCloudAPIError( ++ f"_validate_and_set_options: {self._crn} is not a valid CRN.", ++ ocf.OCF_ERR_CONFIGURED, ++ ) ++ ++ vsi_host_map = self._res_options["vsi_host_map"] ++ try: ++ self._vsi_host_map = dict( ++ item.split(":") for item in vsi_host_map.split(";") ++ ) ++ except ValueError: ++ raise PowerCloudAPIError( ++ f"_validate_and_set_options: Option vsi_host_map: {vsi_host_map} has an invalid format.", ++ ocf.OCF_ERR_CONFIGURED, ++ ) ++ ++ if self._nodename not in self._vsi_host_map: ++ raise PowerCloudAPIError( ++ f"_validate_and_set_options: {self._nodename} not found in vsi_host_map: {vsi_host_map}.", ++ ocf.OCF_ERR_ARGS, ++ ) ++ self.instance_id = self._vsi_host_map[self._nodename] ++ ++ jumbo = self._res_options["jumbo"].lower() ++ if ocf.is_true(jumbo): ++ self.jumbo = True ++ else: ++ if jumbo not in ("no", "false", "0", 0, "nein", "off", False): ++ raise PowerCloudAPIError( ++ f"_validate_and_set_options: option jumbo: {jumbo} does not match True or False.", ++ ocf.OCF_ERR_CONFIGURED, ++ ) ++ self.jumbo = False ++ ++ # Check connect to proxy server ++ self._proxy = "" ++ proxy = self._res_options["proxy"] ++ ++ if proxy: ++ # extract ip address and port ++ match = re.search(r"^https?://([^:]+):(\d+)$", proxy) ++ if match: ++ proxy_ip, proxy_port = match.group(1), match.group(2) ++ ++ try: ++ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: ++ s.settimeout(30) ++ s.connect((proxy_ip, int(proxy_port))) ++ except socket.error: ++ raise PowerCloudAPIError( ++ f"_validate_and_set_options: cannot connect to port {proxy_port} at {proxy_ip}, check option proxy: {proxy}.", ++ ocf.OCF_ERR_CONFIGURED, ++ ) ++ self._proxy = {"https": f"{proxy}"} ++ else: ++ raise PowerCloudAPIError( ++ f"_validate_and_set_options: the option proxy: {proxy} has an invalid format.", ++ ocf.OCF_ERR_CONFIGURED, ++ ) ++ ++ api_type = self._res_options["api_type"] ++ if api_type not in ("public", "private"): ++ raise PowerCloudAPIError( ++ f"_validate_and_set_options: option api_type: {api_type} does not match public or private.", ++ ocf.OCF_ERR_CONFIGURED, ++ ) ++ # Set API endpoint url ++ url_api_fmt = ( ++ self._URL_API_PRIVATE if api_type == "private" else self._URL_API_PUBLIC ++ ) ++ self._url = url_api_fmt.format(self._res_options["region"]) ++ self._URL_IAM = ( ++ self._URL_IAM_PRIVATE if api_type == "private" else self._URL_IAM_GLOBAL ++ ) ++ self._base = self._URL_API_BASE.format(self._cloud_instance_id) ++ self._session = None ++ ++ def subnet_add(self): ++ """Create and attach subnet in local workspace""" ++ ++ ocf.logger.debug( ++ f"subnet_add: options: ip: {self.ip}, cidr: {self.cidr}, name: {self.subnet_name}" ++ ) ++ ++ if self.network_id: ++ ocf.logger.debug( ++ f"subnet_add: subnet cidr: {self.cidr} already exists with network id: {self.network_id}" ++ ) ++ else: ++ ocf.logger.debug( ++ f"subnet_add: create subnet name: {self.subnet_name} with 
cidr: {self.cidr} and jumbo: {self.jumbo}" ++ ) ++ self._subnet_create() ++ ++ if self._instance_subnet_is_attached(): ++ ocf.logger.debug( ++ f"subnet_add: subnet id {self.network_id} is already attached to instance id {self.instance_id}" ++ ) ++ else: ++ ocf.logger.debug( ++ f"subnet_add: attach subnet id: {self.network_id} to instance id: {self.instance_id} (IP address {self.ip})" ++ ) ++ self._instance_subnet_attach() ++ ++ subnet = self._subnet_get(self.network_id) ++ gateway = subnet["gateway"] ++ port = self._instance_subnet_get() ++ mac = port["macAddress"] ++ ip_address = port["ipAddress"] ++ self.jumbo = subnet.get("mtu", "") == 9000 ++ ++ timeout = self._RESOURCE_ACTION_TIMEOUT - int(time.time() - self._START_TIME) ++ nic = nmcli.wait_for_nic(mac, timeout) ++ ++ return nic, ip_address, mac, gateway ++ ++ def subnet_remove(self): ++ """Detach and delete subnet in local or remote workspace""" ++ ++ ocf.logger.debug( ++ f"subnet_remove: options: cidr: {self.cidr}, network id: {self.network_id}, instance id: {self.instance_id}" ++ ) ++ ++ if self.network_id: ++ ocf.logger.debug( ++ f"subnet_remove: subnet id: {self.network_id} with cidr: {self.cidr} exists" ++ ) ++ if self._instance_subnet_is_attached(): ++ ocf.logger.debug( ++ f"subnet_remove: subnet id: {self.network_id} is attached to instance id {self.instance_id}" ++ ) ++ port = self._instance_subnet_get() ++ mac = port["macAddress"] ++ dev = nmcli.device.find("GENERAL.HWADDR", mac.upper()) ++ ++ if dev: ++ nm_object = nmcli.connection.find( ++ "GENERAL.IP-IFACE", dev["GENERAL.DEVICE"] ++ ) ++ if nm_object: ++ conn_name = nm_object["connection.id"] ++ ocf.logger.debug( ++ f"stop_action: unconfigure network connection conn_name: {conn_name} with mac address {mac}" ++ ) ++ nmcli.connection.down(conn_name) ++ nmcli.connection.delete(conn_name) ++ ocf.logger.debug( ++ f"subnet_remove: detach network id: {self.network_id} from instance id: {self.instance_id}" ++ ) ++ self._instance_subnet_detach() ++ ++ port_id = self._subnet_port_get_reserved() ++ if port_id: ++ ocf.logger.debug( ++ f"subnet_remove: delete port port_id: {port_id} for subnet network id: {self.network_id}" ++ ) ++ self._subnet_port_delete(port_id) ++ ++ ocf.logger.debug(f"subnet_remove: delete network id: {self.network_id}") ++ self._subnet_delete() ++ ++ ++def os_ping(ip): ++ """Ping an IP address.""" ++ ++ command = ["ping", "-c", "1", ip] ++ response = subprocess.call(command) ++ return response == 0 ++ ++ ++def start_action( ++ ip="", ++ cidr="", ++ subnet_name="", ++ api_key="", ++ api_type="", ++ region="", ++ crn_host_map="", ++ vsi_host_map="", ++ proxy="", ++ jumbo="", ++): ++ """start_action: assign the service ip. ++ ++ Create a subnet in the workspace, connect it to the virtual server instance, and configure the NIC. 
++ """ ++ ++ res_options = locals() ++ ++ ocf.logger.info(f"start_action: options: {res_options}") ++ ++ # Detach and remove subnet in remote workspace ++ remote_ws = PowerCloudAPI(**res_options, use_remote_workspace=True) ++ ocf.logger.debug( ++ f"start_action: remove subnet from remote workspace: cidr: {remote_ws.cidr}" ++ ) ++ remote_ws.subnet_remove() ++ ++ # Delete orphaned Network Manager connections ++ nmcli.cleanup() ++ ++ # Create and attach subnet in local workspace ++ ws = PowerCloudAPI(**res_options) ++ ++ nic, ip_address, mac, gateway = ws.subnet_add() ++ ++ ocf.logger.debug( ++ f"start_action: add nmcli connection: nic: {nic}, ip: {ip_address}, mac: {mac}, gateway: {gateway}, jumbo: {ws.jumbo}, table {nmcli.ROUTING_TABLE}" ++ ) ++ ++ conn_name = f"{nmcli.CONN_PREFIX}{nic}" ++ conn_options = { ++ "ifname": nic, ++ "autoconnect": "no", ++ "ipv4.method": "manual", ++ "ipv4.addresses": ip_address, ++ "ipv4.routes": f"0.0.0.0/0 {gateway} table={nmcli.ROUTING_TABLE}", ++ "ipv4.routing-rules": f"priority {nmcli.ROUTING_PRIO} from {ws.cidr} table {nmcli.ROUTING_TABLE}", ++ } ++ if ws.jumbo: ++ conn_options.update({"802-3-ethernet.mtu": "9000", "ethtool.feature-tso": "on"}) ++ ++ nmcli.connection.add(conn_name, options=conn_options) ++ nmcli.connection.up(conn_name) ++ ++ if monitor_action(**res_options) != ocf.OCF_SUCCESS: ++ raise PowerCloudAPIError(f"start_action: start subnet: {ws.subnet_name} failed") ++ ++ ocf.logger.info( ++ f"start_action: finished, added connection {conn_name} for subnet {ws.subnet_name}" ++ ) ++ ++ return ocf.OCF_SUCCESS ++ ++ ++def stop_action( ++ ip="", ++ cidr="", ++ subnet_name="", ++ api_key="", ++ api_type="", ++ region="", ++ crn_host_map="", ++ vsi_host_map="", ++ proxy="", ++ jumbo="", ++): ++ """stop_action: unassign the service ip. ++ ++ Delete NIC, detach subnet from virtual server instance, and delete subnet. 
++ """ ++ ++ res_options = locals() ++ ++ ocf.logger.info(f"stop_action: options: {res_options}") ++ ++ ws = PowerCloudAPI(**res_options) ++ ++ ws.subnet_remove() ++ ++ if monitor_action(**res_options) != ocf.OCF_NOT_RUNNING: ++ raise PowerCloudAPIError(f"stop_action: stop subnet {ws.subnet_name} failed") ++ ++ ocf.logger.info( ++ f"stop_action: finished, deleted connection for subnet {ws.subnet_name}" ++ ) ++ ++ return ocf.OCF_SUCCESS ++ ++ ++def monitor_action( ++ ip="", ++ cidr="", ++ subnet_name="", ++ api_key="", ++ api_type="", ++ region="", ++ crn_host_map="", ++ vsi_host_map="", ++ proxy="", ++ jumbo="", ++): ++ """monitor_action: check if service ip and gateway are responding.""" ++ ++ res_options = locals() ++ is_probe = ocf.is_probe() ++ ++ ocf.logger.debug(f"monitor_action: options: {res_options}, is_probe: {is_probe}") ++ ++ gateway = nmcli.find_gateway(ip) ++ if gateway and os_ping(gateway): ++ if os_ping(ip): ++ ocf.logger.debug( ++ f"monitor_action: ping to gateway: {gateway} and ip: {ip} successful" ++ ) ++ return ocf.OCF_SUCCESS ++ else: ++ raise PowerCloudAPIError( ++ f"monitor_action: ping to ip: {ip} failed", ocf.OCF_ERR_GENERIC ++ ) ++ ++ if not is_probe: ++ ocf.logger.error(f"monitor_action: ping to gateway: {gateway} failed") ++ ++ ws = PowerCloudAPI(**res_options) ++ ++ ocf.logger.debug(f"monitor_action: instance id: {ws.instance_id}") ++ ++ if not ws.network_id or is_probe: ++ return ocf.OCF_NOT_RUNNING ++ ++ # monitor should never reach this code, exit with raise ++ raise PowerCloudAPIError( ++ f"monitor_action: unknown problem with subnet id: {ws.network_id}", ++ ocf.OCF_ERR_GENERIC, ++ ) ++ ++ ++def validate_all_action( ++ ip="", ++ cidr="", ++ subnet_name="", ++ api_key="", ++ api_type="", ++ region="", ++ crn_host_map="", ++ vsi_host_map="", ++ proxy="", ++ jumbo="", ++): ++ """validate_all_action: Validate the resource agent parameters.""" ++ ++ res_options = locals() ++ ++ # The class instantiation validates the resource agent options and that the instance exists ++ try: ++ # Check instance in local workspace ++ _ = PowerCloudAPI(**res_options, use_remote_workspace=False) ++ except Exception: ++ ocf.logger.error( ++ "validate_all_action: failed to instantiate class in local workspace." ++ ) ++ raise ++ ++ try: ++ # Check instance in remote workspace ++ _ = PowerCloudAPI(**res_options, use_remote_workspace=True) ++ except Exception: ++ ocf.logger.error( ++ "validate_all_action: failed to instantiate class in remote workspace." ++ ) ++ raise ++ ++ return ocf.OCF_SUCCESS ++ ++ ++def main(): ++ """Instantiate the resource agent.""" ++ ++ agent_description = textwrap.dedent("""\ ++ Resource Agent to move a Power Virtual Server subnet and its IP address ++ from one virtual server instance to another. ++ The prerequisites for the use of this resource agent are as follows: ++ ++ 1. Red Hat Enterprise Linux 9.2 or higher: ++ Install with @server group to ensure that NetworkManager settings are correct. ++ Verify that the NetworkManager-config-server package is installed. ++ ++ 2. A two-node cluster that is distributed across two different Power Virtual Server workspaces in two data centers in a region. ++ ++ 3. IBM Cloud API Key: ++ Create a service API key that is privileged for both Power Virtual Server ++ workspaces. Save the service API key in a file and copy the file to both ++ cluster nodes. Use same filename and directory location on both cluster nodes. ++ Reference the path to the key file in the resource definition. 
++ ++ For comprehensive documentation on implementing high availability for ++ SAP applications on IBM Power Virtual Server, visit https://cloud.ibm.com/docs/sap?topic=sap-ha-overview. ++ """) ++ ++ agent = ocf.Agent( ++ "powervs-subnet", ++ shortdesc="Manages moving a Power Virtual Server subnet", ++ longdesc=agent_description, ++ version=1.04, ++ ) ++ ++ agent.add_parameter( ++ "ip", ++ shortdesc="IP address", ++ longdesc=( ++ "IP address within the subnet. The IP address moves together with the subnet." ++ ), ++ content_type="string", ++ required=True, ++ ) ++ ++ agent.add_parameter( ++ "cidr", ++ shortdesc="CIDR", ++ longdesc="Classless Inter-Domain Routing (CIDR) of the subnet.", ++ content_type="string", ++ required=True, ++ ) ++ ++ agent.add_parameter( ++ "subnet_name", ++ shortdesc="Name of the subnet", ++ longdesc="Name of the subnet. If not specified, CIDR is used as name.", ++ content_type="string", ++ required=False, ++ ) ++ ++ agent.add_parameter( ++ "api_type", ++ shortdesc="API type", ++ longdesc="Connect to Power Virtual Server regional endpoints over a public or private network (public|private).", ++ content_type="string", ++ required=False, ++ default="private", ++ ) ++ ++ agent.add_parameter( ++ "region", ++ shortdesc="Power Virtual Server region", ++ longdesc=( ++ "Region that represents the geographic area where the instance is located. " ++ "The region is used to identify the Cloud API endpoint." ++ ), ++ content_type="string", ++ required=True, ++ ) ++ ++ agent.add_parameter( ++ "api_key", ++ shortdesc="API Key or @API_KEY_FILE_PATH", ++ longdesc=( ++ "API Key or @API_KEY_FILE_PATH for IBM Cloud access. " ++ "The API key content or the path of an API key file that is indicated by the @ symbol." ++ ), ++ content_type="string", ++ required=True, ++ ) ++ ++ agent.add_parameter( ++ "crn_host_map", ++ shortdesc="Mapping of hostnames to IBM Cloud CRN", ++ longdesc=( ++ "Map the hostname of the Power Virtual Server instance to the CRN of the Power Virtual Server workspaces hosting the instance. " ++ "Separate hostname and CRN with a colon ':', separate different hostname and CRN pairs with a semicolon ';'. " ++ "Example: hostname01:CRN-of-Instance01;hostname02:CRN-of-Instance02" ++ ), ++ content_type="string", ++ required=True, ++ ) ++ ++ agent.add_parameter( ++ "vsi_host_map", ++ shortdesc="Mapping of hostnames to PowerVS instance ids", ++ longdesc=( ++ "Map the hostname of the Power Virtual Server instance to its instance id. " ++ "Separate hostname and instance id with a colon ':', separate different hostname and instance id pairs with a semicolon ';'. " ++ "Example: hostname01:instance-id-01;hostname02:instance-id-02" ++ ), ++ content_type="string", ++ required=True, ++ ) ++ ++ agent.add_parameter( ++ "proxy", ++ shortdesc="Proxy", ++ longdesc="Proxy server to access IBM Cloud API endpoints.", ++ content_type="string", ++ required=False, ++ ) ++ ++ agent.add_parameter( ++ "jumbo", ++ shortdesc="Use Jumbo frames", ++ longdesc="Create a Power Virtual Server subnet with an MTU size of 9000 (true|false).", ++ content_type="string", ++ required=False, ++ default="false", ++ ) ++ ++ agent.add_parameter( ++ "route_table", ++ shortdesc="route table ID", ++ longdesc="ID of the route table for the interface. 
Default is 500.", ++ content_type="string", ++ required=False, ++ default="500", ++ ) ++ ++ ++ agent.add_action("start", timeout=900, handler=start_action) ++ agent.add_action("stop", timeout=450, handler=stop_action) ++ agent.add_action( ++ "monitor", depth=0, timeout=60, interval=60, handler=monitor_action ++ ) ++ agent.add_action("validate-all", timeout=300, handler=validate_all_action) ++ agent.run() ++ ++ ++if __name__ == "__main__": ++ main() diff --git a/SOURCES/Patch1000-Podman-Improve-Handling-Of-Stopping-Container-Removal.patch b/SOURCES/RHEL-58008-podman-force-remove-container-if-necessary.patch similarity index 55% rename from SOURCES/Patch1000-Podman-Improve-Handling-Of-Stopping-Container-Removal.patch rename to SOURCES/RHEL-58008-podman-force-remove-container-if-necessary.patch index f6f0da4..fed67ca 100644 --- a/SOURCES/Patch1000-Podman-Improve-Handling-Of-Stopping-Container-Removal.patch +++ b/SOURCES/RHEL-58008-podman-force-remove-container-if-necessary.patch @@ -1,13 +1,22 @@ -podman: Improve handling of "stopping" container removal in remove_container() -Cherry-picked from e3ba7ba866d2d2cd7b1fe37a111445dc3c95182d +From 2ab2c832180dacb2e66d38541beae0957416eb96 Mon Sep 17 00:00:00 2001 +From: Antonio Romito +Date: Mon, 9 Sep 2024 17:30:38 +0200 +Subject: [PATCH] Improve handling of "stopping" container removal in + remove_container() -Signed-off-by: Pooja Senthil Kumar -Reviewed-by: Laurence Rochfort +- Added handling for containers in a stopping state by checking the state and force-removing if necessary. +- Improved log messages to provide clearer information when force removal is needed. + +Related: https://issues.redhat.com/browse/RHEL-58008 --- -diff -uNr a/heartbeat/podman b/heartbeat/podman ---- a/heartbeat/podman 2025-02-05 16:42:14.853652067 -0800 -+++ b/heartbeat/podman 2025-02-05 16:44:31.256704873 -0800 -@@ -254,6 +254,13 @@ + heartbeat/podman | 11 +++++++++-- + 1 file changed, 9 insertions(+), 2 deletions(-) + +diff --git a/heartbeat/podman b/heartbeat/podman +index 53867bff20..643ec4d894 100755 +--- a/heartbeat/podman ++++ b/heartbeat/podman +@@ -254,6 +254,13 @@ remove_container() ocf_run podman rm -v $CONTAINER rc=$? if [ $rc -ne 0 ]; then @@ -21,13 +30,13 @@ diff -uNr a/heartbeat/podman b/heartbeat/podman # due to a podman bug (rhbz#1841485), sometimes a stopped # container can still be associated with Exec sessions, in # which case the "podman rm" has to be forced -@@ -508,8 +515,8 @@ +@@ -517,8 +524,8 @@ podman_stop() # but the associated container exit code is -1. If that's the case, # assume there's no failure and continue with the rm as usual. if [ $rc -eq 125 ] && \ - podman inspect --format '{{.State.Status}}:{{.State.ExitCode}}' $CONTAINER | grep -wq "stopped:-1"; then - ocf_log warn "Container ${CONTAINER} had an unexpected stop outcome. Trying to remove it anyway." -+ podman inspect --format '{{.State.Status}}:{{.State.ExitCode}}' $CONTAINER | grep -Eq '^(exited|stopped):-1$'; then ++ podman inspect --format '{{.State.Status}}:{{.State.ExitCode}}' $CONTAINER | grep -Eq '^(exited|stopped):-1$'; then + ocf_log err "Container ${CONTAINER} had an unexpected stop outcome. Trying to remove it anyway." else ocf_exit_reason "Failed to stop container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}." 
diff --git a/SOURCES/RHEL-58038-Filesystem-dont-sleep-no-processes-only-send-force-net-fs-after-kill.patch b/SOURCES/RHEL-58038-Filesystem-dont-sleep-no-processes-only-send-force-net-fs-after-kill.patch new file mode 100644 index 0000000..1ae87c4 --- /dev/null +++ b/SOURCES/RHEL-58038-Filesystem-dont-sleep-no-processes-only-send-force-net-fs-after-kill.patch @@ -0,0 +1,106 @@ +From d66a52cfb25f5436255ecc65a407c0166a720146 Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Tue, 3 Sep 2024 12:55:28 +0200 +Subject: [PATCH 1/2] Filesystem: dont sleep during stop-action when there are + no processes to kill + +Thanks @SatomiOSAWA for the initial code. +--- + heartbeat/Filesystem | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) + +diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem +index 3eb520e0c..f54969f20 100755 +--- a/heartbeat/Filesystem ++++ b/heartbeat/Filesystem +@@ -685,12 +685,13 @@ signal_processes() { + pids=$(get_pids "$dir") + if [ -z "$pids" ]; then + ocf_log info "No processes on $dir were signalled. force_unmount is set to '$FORCE_UNMOUNT'" +- return ++ return 1 + fi + for pid in $pids; do + ocf_log info "sending signal $sig to: $(ps -f $pid | tail -1)" + kill -s $sig $pid + done ++ return 0 + } + try_umount() { + local SUB="$1" +@@ -717,12 +718,13 @@ timeout_child() { + return $ret + } + fs_stop_loop() { +- local SUB="$1" signals="$2" sig ++ local SUB="$1" signals="$2" sig send_signal + while true; do ++ send_signal=false + for sig in $signals; do +- signal_processes "$SUB" $sig ++ signal_processes "$SUB" $sig && send_signal=true + done +- sleep $OCF_RESKEY_signal_delay ++ $send_signal && sleep $OCF_RESKEY_signal_delay + try_umount "$SUB" && return $OCF_SUCCESS + done + } + +From cb6aaffc260eea0f0fee6fab44393c6cf12b8a83 Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Mon, 9 Sep 2024 10:58:12 +0200 +Subject: [PATCH 2/2] Filesystem: only use $umount_force after sending + kill_signals + +--- + heartbeat/Filesystem | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem +index f54969f20..4dd962fd9 100755 +--- a/heartbeat/Filesystem ++++ b/heartbeat/Filesystem +@@ -694,8 +694,8 @@ signal_processes() { + return 0 + } + try_umount() { +- local SUB="$1" +- $UMOUNT $umount_force "$SUB" ++ local force_arg="$1" SUB="$2" ++ $UMOUNT $force_arg "$SUB" + list_mounts | grep "${TAB}${SUB}${TAB}" >/dev/null 2>&1 || { + ocf_log info "unmounted $SUB successfully" + return $OCF_SUCCESS +@@ -718,14 +718,14 @@ timeout_child() { + return $ret + } + fs_stop_loop() { +- local SUB="$1" signals="$2" sig send_signal ++ local force_arg="$1" SUB="$2" signals="$3" sig send_signal + while true; do + send_signal=false + for sig in $signals; do + signal_processes "$SUB" $sig && send_signal=true + done + $send_signal && sleep $OCF_RESKEY_signal_delay +- try_umount "$SUB" && return $OCF_SUCCESS ++ try_umount "$force_arg" "$SUB" && return $OCF_SUCCESS + done + } + fs_stop() { +@@ -733,13 +733,13 @@ fs_stop() { + grace_time=$((timeout/2)) + + # try gracefully terminating processes for up to half of the configured timeout +- fs_stop_loop "$SUB" "$OCF_RESKEY_term_signals" & ++ fs_stop_loop "" "$SUB" "$OCF_RESKEY_term_signals" & + timeout_child $! $grace_time + ret=$? + [ $ret -eq $OCF_SUCCESS ] && return $ret + + # try killing them for the rest of the timeout +- fs_stop_loop "$SUB" "$OCF_RESKEY_kill_signals" & ++ fs_stop_loop "$umount_force" "$SUB" "$OCF_RESKEY_kill_signals" & + timeout_child $! 
$grace_time + ret=$? + [ $ret -eq $OCF_SUCCESS ] && return $ret diff --git a/SOURCES/RHEL-58632-azure-events-use-node-name-from-cluster.patch b/SOURCES/RHEL-58632-azure-events-use-node-name-from-cluster.patch new file mode 100644 index 0000000..65a16eb --- /dev/null +++ b/SOURCES/RHEL-58632-azure-events-use-node-name-from-cluster.patch @@ -0,0 +1,37 @@ +From c72dc2f2e502486d93aeec26abc12e720b14a0a7 Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Thu, 10 Oct 2024 16:41:03 +0200 +Subject: [PATCH] azure-events*: use node name from cluster instead of hostname + to avoid failing if they're not the same + +--- + heartbeat/azure-events-az.in | 2 +- + heartbeat/azure-events.in | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/heartbeat/azure-events-az.in b/heartbeat/azure-events-az.in +index 6d31e5aba..0ed001037 100644 +--- a/heartbeat/azure-events-az.in ++++ b/heartbeat/azure-events-az.in +@@ -441,7 +441,7 @@ class Node: + self.raOwner = ra + self.azInfo = azHelper.getInstanceInfo() + self.azName = self.azInfo.name +- self.hostName = socket.gethostname() ++ self.hostName = clusterHelper._exec("crm_node", "-n") + self.setAttr("azName", self.azName) + clusterHelper.setAttr("hostName_%s" % self.azName, self.hostName) + +diff --git a/heartbeat/azure-events.in b/heartbeat/azure-events.in +index 90acaba62..32f71ee26 100644 +--- a/heartbeat/azure-events.in ++++ b/heartbeat/azure-events.in +@@ -411,7 +411,7 @@ class Node: + self.raOwner = ra + self.azInfo = azHelper.getInstanceInfo() + self.azName = self.azInfo.name +- self.hostName = socket.gethostname() ++ self.hostName = clusterHelper._exec("crm_node", "-n") + self.setAttr("azName", self.azName) + clusterHelper.setAttr("hostName_%s" % self.azName, self.hostName) + diff --git a/SOURCES/RHEL-59172-nfsserver-also-stop-rpc-statd-for-nfsv4_only.patch b/SOURCES/RHEL-59172-nfsserver-also-stop-rpc-statd-for-nfsv4_only.patch new file mode 100644 index 0000000..73e2324 --- /dev/null +++ b/SOURCES/RHEL-59172-nfsserver-also-stop-rpc-statd-for-nfsv4_only.patch @@ -0,0 +1,38 @@ +From 38eaf00bc81af7530c56eba282918762a47a9326 Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Thu, 19 Sep 2024 13:01:53 +0200 +Subject: [PATCH] nfsserver: also stop rpc-statd for nfsv4_only to avoid stop + failing in some cases + +E.g. nfs_no_notify=true nfsv4_only=true nfs_shared_infodir=/nfsmq/nfsinfo would cause a "Failed to unmount a bind mount" error +--- + heartbeat/nfsserver | 16 +++++++--------- + 1 file changed, 7 insertions(+), 9 deletions(-) + +diff --git a/heartbeat/nfsserver b/heartbeat/nfsserver +index 5793d7a70..fd9268afc 100755 +--- a/heartbeat/nfsserver ++++ b/heartbeat/nfsserver +@@ -947,15 +947,13 @@ nfsserver_stop () + sleep 1 + done + +- if ! ocf_is_true "$OCF_RESKEY_nfsv4_only"; then +- nfs_exec stop rpc-statd > /dev/null 2>&1 +- ocf_log info "Stop: rpc-statd" +- rpcinfo -t localhost 100024 > /dev/null 2>&1 +- rc=$? +- if [ "$rc" -eq "0" ]; then +- ocf_exit_reason "Failed to stop rpc-statd" +- return $OCF_ERR_GENERIC +- fi ++ nfs_exec stop rpc-statd > /dev/null 2>&1 ++ ocf_log info "Stop: rpc-statd" ++ rpcinfo -t localhost 100024 > /dev/null 2>&1 ++ rc=$? 
++ if [ "$rc" -eq "0" ]; then ++ ocf_exit_reason "Failed to stop rpc-statd" ++ return $OCF_ERR_GENERIC + fi + + nfs_exec stop nfs-idmapd > /dev/null 2>&1 diff --git a/SOURCES/RHEL-59576-Filesystem-try-umount-first-avoid-arguments-list-too-long.patch b/SOURCES/RHEL-59576-Filesystem-try-umount-first-avoid-arguments-list-too-long.patch new file mode 100644 index 0000000..561e29a --- /dev/null +++ b/SOURCES/RHEL-59576-Filesystem-try-umount-first-avoid-arguments-list-too-long.patch @@ -0,0 +1,100 @@ +From f02afd0fadb581ca0fc9798beaf28044cf211200 Mon Sep 17 00:00:00 2001 +From: Lars Ellenberg +Date: Wed, 18 Sep 2024 11:53:52 +0200 +Subject: [PATCH 1/2] Filesystem: on stop, try umount directly, before scanning + for users + +48ed6e6d (Filesystem: improve stop-action and allow setting term/kill signals and signal_delay for large filesystems, 2023-07-04) +changed the logic from +"try umount; if that fails, find and kill users; repeat" to +"try to find and kill users; then try umount; repeat" + +But even just walking /proc may take "a long time" on busy systems, +and may still turn up with "no users found". + +It will take even longer for "force_umount=safe" +(observed 8 to 10 seconds just for "get_pids() with "safe" to return nothing) +than for "force_umount=yes" (still ~ 2 to 3 seconds), +but it will take "a long time" in any case. +(BTW, that may be longer than the hardcoded default of 6 seconds for "fast_stop", +which is also the default on many systems now) + +If the dependencies are properly configured, +there should be no users left, +and the umount should just work. + +Revert back to "try umount first", and only then try to find "rogue" users. +--- + heartbeat/Filesystem | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem +index 4dd962fd9..99bddaf62 100755 +--- a/heartbeat/Filesystem ++++ b/heartbeat/Filesystem +@@ -732,6 +732,11 @@ fs_stop() { + local SUB="$1" timeout=$2 grace_time ret + grace_time=$((timeout/2)) + ++ # Just walking /proc may take "a long time", even if we don't find any users of this FS. ++ # If dependencies are properly configured, umount should just work. ++ # Only if that fails, try to find and kill processes that still use it. ++ try_umount "" "$SUB" && return $OCF_SUCCESS ++ + # try gracefully terminating processes for up to half of the configured timeout + fs_stop_loop "" "$SUB" "$OCF_RESKEY_term_signals" & + timeout_child $! $grace_time + +From b42d698f12aaeb871f4cc6a3c0327a27862b4376 Mon Sep 17 00:00:00 2001 +From: Lars Ellenberg +Date: Wed, 18 Sep 2024 13:42:38 +0200 +Subject: [PATCH 2/2] Filesystem: stop/get_pids to be signaled + +The "safe" way to get process ids that may be using a particular filesystem +currently uses shell globs ("find /proc/[0-9]*"). +With a million processes (and/or a less capable shell), +that may result in "Argument list too long". + +Replace with find /proc -path "/proc/[0-9]*" instead. +While at it, also fix the non-posix -or to be -o, +and add explicit grouping parentheses \( \) and explicit -print. + +Add a comment to not include "interesting" characters in mount point names. 
+--- + heartbeat/Filesystem | 23 ++++++++++++++++++++--- + 1 file changed, 20 insertions(+), 3 deletions(-) + +diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem +index 99bddaf62..3405e2c26 100755 +--- a/heartbeat/Filesystem ++++ b/heartbeat/Filesystem +@@ -669,9 +669,26 @@ get_pids() + $FUSER -Mm $dir 2>/dev/null + fi + elif [ "$FORCE_UNMOUNT" = "safe" ]; then +- procs=$(find /proc/[0-9]*/ -type l -lname "${dir}/*" -or -lname "${dir}" 2>/dev/null | awk -F/ '{print $3}') +- mmap_procs=$(grep " ${dir}/" /proc/[0-9]*/maps | awk -F/ '{print $3}') +- printf "${procs}\n${mmap_procs}" | sort | uniq ++ # Yes, in theory, ${dir} could contain "intersting" characters ++ # and would need to be quoted for glob (find) and regex (grep). ++ # Don't do that, then. ++ ++ # Avoid /proc/[0-9]*, it may cause "Argument list too long". ++ # There are several ways to filter for /proc/ ++ # -mindepth 1 -not -path "/proc/[0-9]*" -prune -o ... ++ # -path "/proc/[!0-9]*" -prune -o ... ++ # -path "/proc/[0-9]*" -a ... ++ # the latter seemd to be significantly faster for this one in my naive test. ++ procs=$(exec 2>/dev/null; ++ find /proc -path "/proc/[0-9]*" -type l \( -lname "${dir}/*" -o -lname "${dir}" \) -print | ++ awk -F/ '{print $3}' | uniq) ++ ++ # This finds both /proc//maps and /proc//task//maps; ++ # if you don't want the latter, add -maxdepth. ++ mmap_procs=$(exec 2>/dev/null; ++ find /proc -path "/proc/[0-9]*/maps" -print | ++ xargs -r grep -l " ${dir}/" | awk -F/ '{print $3}' | uniq) ++ printf "${procs}\n${mmap_procs}" | sort -u + fi + } + diff --git a/SOURCES/RHEL-61888-ocf-shellfuncs-only-create-update-reload-systemd-drop-in-if-needed.patch b/SOURCES/RHEL-61888-ocf-shellfuncs-only-create-update-reload-systemd-drop-in-if-needed.patch new file mode 100644 index 0000000..9ff4966 --- /dev/null +++ b/SOURCES/RHEL-61888-ocf-shellfuncs-only-create-update-reload-systemd-drop-in-if-needed.patch @@ -0,0 +1,48 @@ +From 82958dc115c47232ae0468b1ddf64e728ec325e4 Mon Sep 17 00:00:00 2001 +From: Georg Pfuetzenreuter +Date: Wed, 9 Oct 2024 00:16:44 +0200 +Subject: [PATCH] ocf-shellfuncs: systemd_drop_in only if needed + +Avoid dbus overload upon many simultaneous "daemon-reload" invocations +(when a resource agent using systemd_drop_in() is called multiple times +as part of parallel resource operations in Pacemaker) by skipping the +file creation and reload if the expected data already exists. + +Whilst at it, align the indentation of the heredoc with the other parts +of the function. + +Signed-off-by: Georg Pfuetzenreuter +--- + heartbeat/ocf-shellfuncs.in | 19 +++++++++++-------- + 1 file changed, 11 insertions(+), 8 deletions(-) + +diff --git a/heartbeat/ocf-shellfuncs.in b/heartbeat/ocf-shellfuncs.in +index 9335cbf00..5c4bb3264 100644 +--- a/heartbeat/ocf-shellfuncs.in ++++ b/heartbeat/ocf-shellfuncs.in +@@ -662,14 +662,17 @@ systemd_drop_in() + systemdrundir="/run/systemd/system/resource-agents-deps.target.d" + mkdir -p "$systemdrundir" + conf_file="$systemdrundir/$1.conf" +- cat >"$conf_file" < "$conf_file" <<-EOF ++ [Unit] ++ $conf_line ++ EOF ++ # The information is accessible through systemd API and systemd would ++ # complain about improper permissions. 
++ chmod o+r "$conf_file" ++ systemctl daemon-reload ++ fi + } + + # usage: curl_retry RETRIES SLEEP ARGS URL diff --git a/SOURCES/RHEL-62200-IPaddr2-improve-fail-logic-check-ip_status-after-adding-IP.patch b/SOURCES/RHEL-62200-IPaddr2-improve-fail-logic-check-ip_status-after-adding-IP.patch new file mode 100644 index 0000000..96612df --- /dev/null +++ b/SOURCES/RHEL-62200-IPaddr2-improve-fail-logic-check-ip_status-after-adding-IP.patch @@ -0,0 +1,132 @@ +From 6fab544e702a7601714cd017aecc00193f23ae72 Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Fri, 11 Oct 2024 13:13:10 +0200 +Subject: [PATCH] IPaddr2: improve fail logic and check ip_status after adding + IP + +* check that the label got applied +* return OCF_ERR_GENERIC to avoid false-positive when IP was manually added before starting the resource +* check ip_status after adding IP to fail without having to wait for the first monitor-action + +Co-authored-by: Evan J. Felix +--- + heartbeat/IPaddr2 | 35 ++++++++++++++++++++++++++--------- + 1 file changed, 26 insertions(+), 9 deletions(-) + +diff --git a/heartbeat/IPaddr2 b/heartbeat/IPaddr2 +index e325aa574..27cae2d11 100755 +--- a/heartbeat/IPaddr2 ++++ b/heartbeat/IPaddr2 +@@ -586,7 +586,7 @@ ip_init() { + exit $rc + fi + fi +- ++ + SENDARPPIDFILE="$SENDARPPIDDIR/send_arp-$OCF_RESKEY_ip" + + if [ -n "$IFLABEL" ]; then +@@ -985,6 +985,7 @@ run_send_ua() { + # ok = served (for CIP: + hash bucket) + # partial = served and no hash bucket (CIP only) + # partial2 = served and no CIP iptables rule ++# partial3 = served with no label + # no = nothing + # + ip_served() { +@@ -1002,6 +1003,11 @@ ip_served() { + + if [ -z "$IP_CIP" ]; then + for i in $cur_nic; do ++ # check address label ++ if [ -n "$IFLABEL" ] && [ -z "`$IP2UTIL -o -f $FAMILY addr show $nic label $IFLABEL`" ]; then ++ echo partial3 ++ return 0 ++ fi + # only mark as served when on the same interfaces as $NIC + [ "$i" = "$NIC" ] || continue + echo "ok" +@@ -1065,7 +1071,12 @@ ip_start() { + if [ "$ip_status" = "ok" ]; then + exit $OCF_SUCCESS + fi +- ++ ++ if [ "$ip_status" = "partial3" ]; then ++ ocf_exit_reason "IP $OCF_RESKEY_ip available, but label missing" ++ exit $OCF_ERR_GENERIC ++ fi ++ + if [ -n "$IP_CIP" ] && ([ $ip_status = "no" ] || [ $ip_status = "partial2" ]); then + $MODPROBE ip_conntrack + $IPADDR2_CIP_IPTABLES -I INPUT -d $OCF_RESKEY_ip -i $NIC -j CLUSTERIP \ +@@ -1083,7 +1094,7 @@ ip_start() { + if [ -n "$IP_CIP" ] && [ $ip_status = "partial" ]; then + echo "+$IP_INC_NO" >$IP_CIP_FILE + fi +- ++ + if [ "$ip_status" = "no" ]; then + if ocf_is_true ${OCF_RESKEY_lvs_support}; then + for i in `find_interface $OCF_RESKEY_ip 32`; do +@@ -1094,7 +1105,7 @@ ip_start() { + esac + done + fi +- ++ + add_interface "$OCF_RESKEY_ip" "$NETMASK" "${BRDCAST:-none}" "$NIC" "$IFLABEL" "$METRIC" + rc=$? + +@@ -1102,6 +1113,12 @@ ip_start() { + ocf_exit_reason "Failed to add $OCF_RESKEY_ip" + exit $rc + fi ++ ++ ip_status=`ip_served` ++ if [ "$ip_status" != "ok" ]; then ++ ocf_exit_reason "Failed to add $OCF_RESKEY_ip with error $ip_status" ++ exit $OCF_ERR_GENERIC ++ fi + fi + + case $NIC in +@@ -1134,7 +1151,7 @@ ip_stop() { + ocf_take_lock $CIP_lockfile + ocf_release_lock_on_exit $CIP_lockfile + fi +- ++ + if [ -f "$SENDARPPIDFILE" ] ; then + kill `cat "$SENDARPPIDFILE"` + if [ $? -ne 0 ]; then +@@ -1171,17 +1188,17 @@ ip_stop() { + i=`expr $i + 1` + done + else +- ip_del_if="no" ++ ip_del_if="no" + fi + fi +- ++ + if [ "$ip_del_if" = "yes" ]; then + delete_interface $OCF_RESKEY_ip $NIC $NETMASK + if [ $? 
-ne 0 ]; then + ocf_exit_reason "Unable to remove IP [${OCF_RESKEY_ip} from interface [ $NIC ]" + exit $OCF_ERR_GENERIC + fi +- ++ + if ocf_is_true ${OCF_RESKEY_lvs_support}; then + restore_loopback "$OCF_RESKEY_ip" + fi +@@ -1200,7 +1217,7 @@ ip_monitor() { + run_arp_sender refresh + return $OCF_SUCCESS + ;; +- partial|no|partial2) ++ no) + exit $OCF_NOT_RUNNING + ;; + *) diff --git a/SOURCES/RHEL-68739-awsvip-add-interface-parameter.patch b/SOURCES/RHEL-68739-awsvip-add-interface-parameter.patch new file mode 100644 index 0000000..c7bf67f --- /dev/null +++ b/SOURCES/RHEL-68739-awsvip-add-interface-parameter.patch @@ -0,0 +1,184 @@ +From 392d40048a25d7cb73ec5b5e9f7a5862f7a3fd48 Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Mon, 11 Nov 2024 12:22:27 +0100 +Subject: [PATCH 1/2] aws.sh: add get_interface_mac() + +--- + heartbeat/aws.sh | 21 +++++++++++++++++++++ + 1 file changed, 21 insertions(+) + +diff --git a/heartbeat/aws.sh b/heartbeat/aws.sh +index 64f2e13a7..ebb4eb1f4 100644 +--- a/heartbeat/aws.sh ++++ b/heartbeat/aws.sh +@@ -69,3 +69,24 @@ get_instance_id() { + echo "$INSTANCE_ID" + return "$OCF_SUCCESS" + } ++ ++get_interface_mac() { ++ local MAC_FILE MAC_ADDR rc ++ MAC_FILE="/sys/class/net/${OCF_RESKEY_interface}/address" ++ if [ -f "$MAC_FILE" ]; then ++ cmd="cat ${MAC_FILE}" ++ else ++ cmd="ip -br link show dev ${OCF_RESKEY_interface} | tr -s ' ' | cut -d' ' -f3" ++ fi ++ ocf_log debug "executing command: $cmd" ++ MAC_ADDR="$(eval $cmd)" ++ rc=$? ++ if [ $rc != 0 ]; then ++ ocf_log warn "command failed, rc: $rc" ++ return $OCF_ERR_GENERIC ++ fi ++ ocf_log debug "MAC address associated with interface ${OCF_RESKEY_interface}: ${MAC_ADDR}" ++ ++ echo $MAC_ADDR ++ return $OCF_SUCCESS ++} + +From 87337ac4da931d5a53c83d53d4bab17ee123ba9f Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Mon, 11 Nov 2024 12:26:38 +0100 +Subject: [PATCH 2/2] awsvip: let user specify which interface to use, and make + the parameter optional in aws-vpc-move-ip + +--- + heartbeat/aws-vpc-move-ip | 20 ++++---------------- + heartbeat/aws.sh | 4 +++- + heartbeat/awsvip | 24 +++++++++++++++++------- + 3 files changed, 24 insertions(+), 24 deletions(-) + +diff --git a/heartbeat/aws-vpc-move-ip b/heartbeat/aws-vpc-move-ip +index 09ae68b57..2afc0ba53 100755 +--- a/heartbeat/aws-vpc-move-ip ++++ b/heartbeat/aws-vpc-move-ip +@@ -157,7 +157,7 @@ Role to use to query/update the route table + + + +- ++ + + Name of the network interface, i.e. eth0 + +@@ -321,7 +321,7 @@ ec2ip_monitor() { + ocf_log debug "monitor: Enhanced Monitoring disabled - omitting API call" + fi + +- cmd="ip addr show to $OCF_RESKEY_ip up" ++ cmd="ip addr show dev $OCF_RESKEY_interface to $OCF_RESKEY_ip up" + ocf_log debug "executing command: $cmd" + RESULT=$($cmd | grep "$OCF_RESKEY_ip") + if [ -z "$RESULT" ]; then +@@ -331,7 +331,7 @@ ec2ip_monitor() { + level="info" + fi + +- ocf_log "$level" "IP $OCF_RESKEY_ip not assigned to running interface" ++ ocf_log "$level" "IP $OCF_RESKEY_ip not assigned to interface $OCF_RESKEY_interface" + return $OCF_NOT_RUNNING + fi + +@@ -369,19 +369,7 @@ ec2ip_drop() { + } + + ec2ip_get_instance_eni() { +- MAC_FILE="/sys/class/net/${OCF_RESKEY_interface}/address" +- if [ -f $MAC_FILE ]; then +- cmd="cat ${MAC_FILE}" +- else +- cmd="ip -br link show dev ${OCF_RESKEY_interface} | tr -s ' ' | cut -d' ' -f3" +- fi +- ocf_log debug "executing command: $cmd" +- MAC_ADDR="$(eval $cmd)" +- rc=$? 
+- if [ $rc != 0 ]; then +- ocf_log warn "command failed, rc: $rc" +- return $OCF_ERR_GENERIC +- fi ++ MAC_ADDR=$(get_interface_mac) + ocf_log debug "MAC address associated with interface ${OCF_RESKEY_interface}: ${MAC_ADDR}" + + cmd="curl_retry \"$OCF_RESKEY_curl_retries\" \"$OCF_RESKEY_curl_sleep\" \"--show-error -s -H 'X-aws-ec2-metadata-token: $TOKEN'\" \"http://169.254.169.254/latest/meta-data/network/interfaces/macs/${MAC_ADDR}/interface-id\"" +diff --git a/heartbeat/aws.sh b/heartbeat/aws.sh +index ebb4eb1f4..216033afe 100644 +--- a/heartbeat/aws.sh ++++ b/heartbeat/aws.sh +@@ -73,7 +73,9 @@ get_instance_id() { + get_interface_mac() { + local MAC_FILE MAC_ADDR rc + MAC_FILE="/sys/class/net/${OCF_RESKEY_interface}/address" +- if [ -f "$MAC_FILE" ]; then ++ if [ -z "$OCF_RESKEY_interface" ]; then ++ cmd="curl_retry \"$OCF_RESKEY_curl_retries\" \"$OCF_RESKEY_curl_sleep\" \"--show-error -s -H 'X-aws-ec2-metadata-token: $TOKEN'\" \"http://169.254.169.254/latest/meta-data/mac\"" ++ elif [ -f "$MAC_FILE" ]; then + cmd="cat ${MAC_FILE}" + else + cmd="ip -br link show dev ${OCF_RESKEY_interface} | tr -s ' ' | cut -d' ' -f3" +diff --git a/heartbeat/awsvip b/heartbeat/awsvip +index 0856ac5e4..015180d5a 100755 +--- a/heartbeat/awsvip ++++ b/heartbeat/awsvip +@@ -49,12 +49,14 @@ OCF_RESKEY_auth_type_default="key" + OCF_RESKEY_profile_default="default" + OCF_RESKEY_region_default="" + OCF_RESKEY_api_delay_default="3" ++OCF_RESKEY_interface_default="" + + : ${OCF_RESKEY_awscli=${OCF_RESKEY_awscli_default}} + : ${OCF_RESKEY_auth_type=${OCF_RESKEY_auth_type_default}} + : ${OCF_RESKEY_profile=${OCF_RESKEY_profile_default}} + : ${OCF_RESKEY_region=${OCF_RESKEY_region_default}} + : ${OCF_RESKEY_api_delay=${OCF_RESKEY_api_delay_default}} ++: ${OCF_RESKEY_interface=${OCF_RESKEY_interface_default}} + + meta_data() { + cat < + + ++ ++ ++Name of the network interface, i.e. eth0 ++ ++network interface name ++ ++ ++ + + + curl retries before failing +@@ -207,16 +217,16 @@ awsvip_stop() { + } + + awsvip_monitor() { +- $AWSCLI_CMD ec2 describe-instances \ +- --instance-id "${INSTANCE_ID}" \ +- --query 'Reservations[].Instances[].NetworkInterfaces[].PrivateIpAddresses[].PrivateIpAddress[]' \ ++ $AWSCLI_CMD ec2 describe-network-interfaces \ ++ --network-interface-ids "${NETWORK_ID}" \ ++ --query 'NetworkInterfaces[].PrivateIpAddresses[].PrivateIpAddress[]' \ + --output text | \ + grep -qE "(^|\s)${SECONDARY_PRIVATE_IP}(\s|$)" +- RET=$? +- +- if [ $RET -ne 0 ]; then ++ if [ $? -ne 0 ]; then ++ [ "$__OCF_ACTION" = "monitor" ] && ! ocf_is_probe && ocf_log error "IP $SECONDARY_PRIVATE_IP not assigned to interface ${NETWORK_ID}" + return $OCF_NOT_RUNNING + fi ++ + return $OCF_SUCCESS + } + +@@ -267,7 +277,7 @@ TOKEN=$(get_token) + [ $? -ne 0 ] && exit $OCF_ERR_GENERIC + INSTANCE_ID=$(get_instance_id) + [ $? -ne 0 ] && exit $OCF_ERR_GENERIC +-MAC_ADDRESS=$(curl_retry "$OCF_RESKEY_curl_retries" "$OCF_RESKEY_curl_sleep" "--show-error -s -H 'X-aws-ec2-metadata-token: $TOKEN'" "http://169.254.169.254/latest/meta-data/mac") ++MAC_ADDRESS=$(get_interface_mac) + [ $? -ne 0 ] && exit $OCF_ERR_GENERIC + NETWORK_ID=$(curl_retry "$OCF_RESKEY_curl_retries" "$OCF_RESKEY_curl_sleep" "--show-error -s -H 'X-aws-ec2-metadata-token: $TOKEN'" "http://169.254.169.254/latest/meta-data/network/interfaces/macs/${MAC_ADDRESS}/interface-id") + [ $? 
-ne 0 ] && exit $OCF_ERR_GENERIC diff --git a/SOURCES/RHEL-79819-portblock-fix-version-detection.patch b/SOURCES/RHEL-79819-portblock-fix-version-detection.patch new file mode 100644 index 0000000..a8466ee --- /dev/null +++ b/SOURCES/RHEL-79819-portblock-fix-version-detection.patch @@ -0,0 +1,360 @@ +--- a/heartbeat/portblock 2021-11-03 10:12:01.000000000 +0100 ++++ b/heartbeat/portblock 2025-02-20 14:09:44.546869740 +0100 +@@ -25,6 +25,7 @@ + # Defaults + OCF_RESKEY_protocol_default="" + OCF_RESKEY_portno_default="" ++OCF_RESKEY_direction_default="in" + OCF_RESKEY_action_default="" + OCF_RESKEY_ip_default="0.0.0.0/0" + OCF_RESKEY_reset_local_on_unblock_stop_default="false" +@@ -33,6 +34,7 @@ + + : ${OCF_RESKEY_protocol=${OCF_RESKEY_protocol_default}} + : ${OCF_RESKEY_portno=${OCF_RESKEY_portno_default}} ++: ${OCF_RESKEY_direction=${OCF_RESKEY_direction_default}} + : ${OCF_RESKEY_action=${OCF_RESKEY_action_default}} + : ${OCF_RESKEY_ip=${OCF_RESKEY_ip_default}} + : ${OCF_RESKEY_reset_local_on_unblock_stop=${OCF_RESKEY_reset_local_on_unblock_stop_default}} +@@ -217,6 +219,18 @@ + Connection state file synchronization script + + ++ ++ ++ ++Whether to block incoming or outgoing traffic. Can be either "in", ++"out", or "both". ++If "in" is used, the incoming ports are blocked on the INPUT chain. ++If "out" is used, the outgoing ports are blocked on the OUTPUT chain. ++If "both" is used, both the incoming and outgoing ports are blocked. ++ ++Whether to block incoming or outgoing traffic, or both ++ ++ + + + +@@ -240,19 +254,34 @@ + # and disable us -- but we're still in some sense active... + # + +-#active_grep_pat {udp|tcp} portno,portno ++#active_grep_pat {udp|tcp} portno,portno ip {d|s} ++# d = look for destination ports ++# s = look for source ports + active_grep_pat() + { + w="[ ][ ]*" + any="0\\.0\\.0\\.0/0" +- echo "^DROP${w}${1}${w}--${w}${any}${w}${3}${w}multiport${w}dports${w}${2}\>" ++ src=$any dst=$3 ++ if [ "$4" = "s" ]; then ++ local src=$3 ++ local dst=$any ++ fi ++ # iptables 1.8.9 briefly broke the output format, returning the ++ # numeric protocol value instead of a string. Support both variants. 
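++	# Illustrative examples (not from the upstream commit) of the two
++	# "iptables -n -L INPUT" formats the pattern has to match:
++	#   DROP  tcp  --  0.0.0.0/0  192.0.2.10  multiport dports 80,443
++	#   DROP  6    --  0.0.0.0/0  192.0.2.10  multiport dports 80,443
++	# (the second form is what iptables 1.8.9 prints for tcp)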
++ if [ "$1" = "tcp" ]; then ++ local prot="(tcp|6)" ++ else ++ local prot="(udp|17)" ++ fi ++ echo "^DROP${w}${prot}${w}--${w}${src}${w}${dst}${w}multiport${w}${4}ports${w}${2}$" + } + +-#chain_isactive {udp|tcp} portno,portno ip ++#chain_isactive {udp|tcp} portno,portno ip chain + chain_isactive() + { +- PAT=`active_grep_pat "$1" "$2" "$3"` +- $IPTABLES $wait -n -L INPUT | grep "$PAT" >/dev/null ++ [ "$4" = "OUTPUT" ] && ds="s" || ds="d" ++ PAT=$(active_grep_pat "$1" "$2" "$3" "$ds") ++ $IPTABLES $wait -n -L "$4" | grep -qE "$PAT" + } + + # netstat -tn and ss -Htn, split on whitespace and colon, +@@ -299,7 +328,6 @@ + tickle_remote() + { + [ -z "$OCF_RESKEY_tickle_dir" ] && return +- echo 1 > /proc/sys/net/ipv4/tcp_tw_recycle + f=$OCF_RESKEY_tickle_dir/$OCF_RESKEY_ip + [ -r $f ] || return + $TICKLETCP -n 3 < $f +@@ -331,112 +359,140 @@ + + SayActive() + { +- echo "$CMD DROP rule for INPUT chain [$*] is running (OK)" ++ ocf_log debug "$CMD DROP rule [$*] is running (OK)" + } + + SayConsideredActive() + { +- echo "$CMD DROP rule for INPUT chain [$*] considered to be running (OK)" ++ ocf_log debug "$CMD DROP rule [$*] considered to be running (OK)" + } + + SayInactive() + { +- echo "$CMD DROP rule for INPUT chain [$*] is inactive" ++ ocf_log debug "$CMD DROP rule [$*] is inactive" + } + +-#IptablesStatus {udp|tcp} portno,portno ip {block|unblock} ++#IptablesStatus {udp|tcp} portno,portno ip {in|out|both} {block|unblock} + IptablesStatus() { +- local rc +- rc=$OCF_ERR_GENERIC +- activewords="$CMD $1 $2 is running (OK)" +- if chain_isactive "$1" "$2" "$3"; then +- case $4 in +- block) +- SayActive $* +- rc=$OCF_SUCCESS +- ;; +- *) +- SayInactive $* +- rc=$OCF_NOT_RUNNING +- ;; +- esac +- else +- case $4 in +- block) +- if ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" status; then +- SayConsideredActive $* +- rc=$OCF_SUCCESS +- else +- SayInactive $* +- rc=$OCF_NOT_RUNNING +- fi +- ;; +- +- *) +- if ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" status; then +- SayActive $* +- #This is only run on real monitor events. +- save_tcp_connections +- rc=$OCF_SUCCESS +- else +- SayInactive $* +- rc=$OCF_NOT_RUNNING +- fi +- ;; +- esac +- fi +- +- return $rc ++ local rc ++ rc=$OCF_ERR_GENERIC ++ is_active=0 ++ if [ "$4" = "in" ] || [ "$4" = "both" ]; then ++ chain_isactive "$1" "$2" "$3" INPUT ++ is_active=$? ++ fi ++ if [ "$4" = "out" ] || [ "$4" = "both" ]; then ++ chain_isactive "$1" "$2" "$3" OUTPUT ++ r=$? ++ [ $r -gt $is_active ] && is_active=$r ++ fi ++ if [ $is_active -eq 0 ]; then ++ case $5 in ++ block) ++ SayActive $* ++ rc=$OCF_SUCCESS ++ ;; ++ *) ++ SayInactive $* ++ rc=$OCF_NOT_RUNNING ++ ;; ++ esac ++ else ++ case $5 in ++ block) ++ if ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" status; then ++ SayConsideredActive $* ++ rc=$OCF_SUCCESS ++ else ++ SayInactive $* ++ rc=$OCF_NOT_RUNNING ++ fi ++ ;; ++ *) ++ if ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" status; then ++ SayActive $* ++ #This is only run on real monitor events. 
++ save_tcp_connections ++ rc=$OCF_SUCCESS ++ else ++ SayInactive $* ++ rc=$OCF_NOT_RUNNING ++ fi ++ ;; ++ esac ++ fi ++ return $rc + } + +-#IptablesBLOCK {udp|tcp} portno,portno ip +-IptablesBLOCK() ++#DoIptables {-I|-D} {udp|tcp} portno,portno ip chain ++DoIptables() + { +- local rc=0 +- local try_reset=false +- if [ "$1/$4/$__OCF_ACTION" = tcp/unblock/stop ] && +- ocf_is_true $reset_local_on_unblock_stop +- then +- try_reset=true +- fi +- if +- chain_isactive "$1" "$2" "$3" +- then +- : OK -- chain already active ++ op=$1 proto=$2 ports=$3 ip=$4 chain=$5 ++ active=0; chain_isactive "$proto" "$ports" "$ip" "$chain" && active=1 ++ want_active=0; [ "$op" = "-I" ] && want_active=1 ++ ocf_log debug "active: $active want_active: $want_active" ++ if [ $active -eq $want_active ] ; then ++ : Chain already in desired state + else +- if $try_reset ; then +- $IPTABLES $wait -I OUTPUT -p "$1" -s "$3" -m multiport --sports "$2" -j REJECT --reject-with tcp-reset +- tickle_local ++ [ "$chain" = "OUTPUT" ] && ds="s" || ds="d" ++ $IPTABLES $wait "$op" "$chain" -p "$proto" -${ds} "$ip" -m multiport --${ds}ports "$ports" -j DROP ++ fi ++} ++ ++#IptablesBLOCK {udp|tcp} portno,portno ip {in|out|both} {block|unblock} ++IptablesBLOCK() ++{ ++ local rc_in=0 ++ local rc_out=0 ++ if [ "$4" = "in" ] || [ "$4" = "both" ]; then ++ local try_reset=false ++ if [ "$1/$5/$__OCF_ACTION" = tcp/unblock/stop ] && ++ ocf_is_true $reset_local_on_unblock_stop ++ then ++ try_reset=true + fi +- $IPTABLES $wait -I INPUT -p "$1" -d "$3" -m multiport --dports "$2" -j DROP +- rc=$? +- if $try_reset ; then +- $IPTABLES $wait -D OUTPUT -p "$1" -s "$3" -m multiport --sports "$2" -j REJECT --reject-with tcp-reset ++ if ++ chain_isactive "$1" "$2" "$3" INPUT ++ then ++ : OK -- chain already active ++ else ++ if $try_reset ; then ++ $IPTABLES $wait -I OUTPUT -p "$1" -s "$3" -m multiport --sports "$2" -j REJECT --reject-with tcp-reset ++ tickle_local ++ fi ++ $IPTABLES $wait -I INPUT -p "$1" -d "$3" -m multiport --dports "$2" -j DROP ++ rc_in=$? ++ if $try_reset ; then ++ $IPTABLES $wait -D OUTPUT -p "$1" -s "$3" -m multiport --sports "$2" -j REJECT --reject-with tcp-reset ++ fi + fi + fi ++ if [ "$4" = "out" ] || [ "$4" = "both" ]; then ++ DoIptables -I "$1" "$2" "$3" OUTPUT ++ rc_out=$? ++ fi + +- return $rc ++ [ $rc_in -gt $rc_out ] && return $rc_in || return $rc_out + } + +-#IptablesUNBLOCK {udp|tcp} portno,portno ip ++#IptablesUNBLOCK {udp|tcp} portno,portno ip {in|out|both} + IptablesUNBLOCK() + { +- if +- chain_isactive "$1" "$2" "$3" +- then +- $IPTABLES $wait -D INPUT -p "$1" -d "$3" -m multiport --dports "$2" -j DROP +- else +- : Chain Not active ++ if [ "$4" = "in" ] || [ "$4" = "both" ]; then ++ DoIptables -D "$1" "$2" "$3" INPUT ++ fi ++ if [ "$4" = "out" ] || [ "$4" = "both" ]; then ++ DoIptables -D "$1" "$2" "$3" OUTPUT + fi + + return $? + } + +-#IptablesStart {udp|tcp} portno,portno ip {block|unblock} ++#IptablesStart {udp|tcp} portno,portno ip {in|out|both} {block|unblock} + IptablesStart() + { + ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" start +- case $4 in ++ case $5 in + block) IptablesBLOCK "$@";; + unblock) + IptablesUNBLOCK "$@" +@@ -451,11 +507,11 @@ + return $? 
+ } + +-#IptablesStop {udp|tcp} portno,portno ip {block|unblock} ++#IptablesStop {udp|tcp} portno,portno ip {in|out|both} {block|unblock} + IptablesStop() + { + ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" stop +- case $4 in ++ case $5 in + block) IptablesUNBLOCK "$@";; + unblock) + save_tcp_connections +@@ -473,7 +529,7 @@ + CheckPort() { + # Examples of valid port: "1080", "1", "0080" + # Examples of invalid port: "1080bad", "0", "0000", "" +- echo $1 |egrep -qx '[0-9]+(:[0-9]+)?(,[0-9]+(:[0-9]+)?)*' ++ echo $1 | $EGREP -qx '[0-9]+(:[0-9]+)?(,[0-9]+(:[0-9]+)?)*' + } + + IptablesValidateAll() +@@ -562,7 +618,7 @@ + fi + + # iptables v1.4.20+ is required to use -w (wait) +-version=$(iptables -V | awk -F ' v' '{print $NF}') ++version=$(iptables -V | grep -oE '[0-9]+[\.0-9]+') + ocf_version_cmp "$version" "1.4.19.1" + if [ "$?" -eq "2" ]; then + wait="-w" +@@ -572,6 +628,7 @@ + + protocol=$OCF_RESKEY_protocol + portno=$OCF_RESKEY_portno ++direction=$OCF_RESKEY_direction + action=$OCF_RESKEY_action + ip=$OCF_RESKEY_ip + reset_local_on_unblock_stop=$OCF_RESKEY_reset_local_on_unblock_stop +@@ -592,15 +649,15 @@ + + case $1 in + start) +- IptablesStart $protocol $portno $ip $action ++ IptablesStart $protocol $portno $ip $direction $action + ;; + + stop) +- IptablesStop $protocol $portno $ip $action ++ IptablesStop $protocol $portno $ip $direction $action + ;; + + status|monitor) +- IptablesStatus $protocol $portno $ip $action ++ IptablesStatus $protocol $portno $ip $direction $action + ;; + + validate-all) diff --git a/SOURCES/RHEL-88035-Filesystem-add-support-for-aznfs.patch b/SOURCES/RHEL-88035-Filesystem-add-support-for-aznfs.patch new file mode 100644 index 0000000..65466e4 --- /dev/null +++ b/SOURCES/RHEL-88035-Filesystem-add-support-for-aznfs.patch @@ -0,0 +1,171 @@ +From 3bffa541f7bf66e143f14e51551fc91dfebec86c Mon Sep 17 00:00:00 2001 +From: Tobias Schug +Date: Mon, 28 Oct 2024 09:14:41 +0100 +Subject: [PATCH] Add azure aznfs filesystem support + +--- + heartbeat/Filesystem | 37 ++++++++++++++++++++----------------- + 1 file changed, 20 insertions(+), 17 deletions(-) + +diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem +index 3405e2c26..b48bee142 100755 +--- a/heartbeat/Filesystem ++++ b/heartbeat/Filesystem +@@ -2,7 +2,7 @@ + # + # Support: users@clusterlabs.org + # License: GNU General Public License (GPL) +-# ++# + # Filesystem + # Description: Manages a Filesystem on a shared storage medium. + # Original Author: Eric Z. Ayers (eric.ayers@compgen.com) +@@ -142,7 +142,7 @@ meta_data() { + + + Resource script for Filesystem. It manages a Filesystem on a +-shared storage medium. ++shared storage medium. + + The standard monitor operation of depth 0 (also known as probe) + checks if the filesystem is mounted. If you want deeper tests, +@@ -260,7 +260,7 @@ currently accessing the mount directory. + "true" : Kill processes accessing mount point + "safe" : Kill processes accessing mount point using methods that + avoid functions that could potentially block during process +- detection ++ detection + "false" : Do not kill any processes. + + The 'safe' option uses shell logic to walk the /procs/ directory +@@ -373,7 +373,7 @@ determine_blockdevice() { + # Get the current real device name, if possible. + # (specified devname could be -L or -U...) 
+ case "$FSTYPE" in +- nfs4|nfs|efs|smbfs|cifs|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs|zfs|cvfs|none|lustre) ++ nfs4|nfs|aznfs|efs|smbfs|cifs|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs|zfs|cvfs|none|lustre) + : ;; + *) + match_string="${TAB}${CANONICALIZED_MOUNTPOINT}${TAB}" +@@ -455,7 +455,7 @@ is_fsck_needed() { + no) false;; + ""|auto) + case "$FSTYPE" in +- ext4|ext4dev|ext3|reiserfs|reiser4|nss|xfs|jfs|vfat|fat|nfs4|nfs|efs|cifs|smbfs|ocfs2|gfs2|none|lustre|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs|zfs|cvfs) ++ ext4|ext4dev|ext3|reiserfs|reiser4|nss|xfs|jfs|vfat|fat|nfs4|nfs|aznfs|efs|cifs|smbfs|ocfs2|gfs2|none|lustre|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs|zfs|cvfs) + false;; + *) + true;; +@@ -478,7 +478,7 @@ fstype_supported() + fi + + if [ -z "$FSTYPE" -o "$FSTYPE" = none ]; then +- : No FSTYPE specified, rely on the system has the right file-system support already ++ : No FSTYPE specified, rely on the system has the right file-system support already + return $OCF_SUCCESS + fi + +@@ -487,6 +487,7 @@ fstype_supported() + case "$FSTYPE" in + fuse.*|glusterfs|rozofs) support="fuse";; + efs) check_binary "mount.efs"; support="nfs4";; ++ aznfs) check_binary "mount.aznfs"; support="nfs4";; + esac + + if [ "$support" != "$FSTYPE" ]; then +@@ -530,7 +531,7 @@ fstype_supported() + # node on the shared storage, and is not visible yet. Then try + # partprobe to refresh /dev/disk/by-{label,uuid}/* up to date. + # +-# DEVICE can be /dev/xxx, -U, -L ++# DEVICE can be /dev/xxx, -U, -L + # + trigger_udev_rules_if_needed() + { +@@ -545,12 +546,12 @@ trigger_udev_rules_if_needed() + fi + else + tmp="$(echo $DEVICE|awk '{$1=""; print substr($0,2)}')" +- case "$DEVICE" in +- -U*|--uuid*) +- tmp="/dev/disk/by-uuid/$tmp" ++ case "$DEVICE" in ++ -U*|--uuid*) ++ tmp="/dev/disk/by-uuid/$tmp" + ;; + -L*|--label*) +- tmp="/dev/disk/by-label/$tmp" ++ tmp="/dev/disk/by-label/$tmp" + ;; + *) + # bind mount? +@@ -595,7 +596,7 @@ Filesystem_start() + + fstype_supported || exit $OCF_ERR_INSTALLED + +- # Check the filesystem & auto repair. ++ # Check the filesystem & auto repair. + # NOTE: Some filesystem types don't need this step... Please modify + # accordingly + +@@ -697,7 +698,7 @@ signal_processes() { + local sig=$2 + local pids pid + # fuser returns a non-zero return code if none of the +- # specified files is accessed or in case of a fatal ++ # specified files is accessed or in case of a fatal + # error. + pids=$(get_pids "$dir") + if [ -z "$pids" ]; then +@@ -745,6 +746,7 @@ fs_stop_loop() { + try_umount "$force_arg" "$SUB" && return $OCF_SUCCESS + done + } ++ + fs_stop() { + local SUB="$1" timeout=$2 grace_time ret + grace_time=$((timeout/2)) +@@ -797,7 +799,7 @@ Filesystem_stop() + + # For networked filesystems, there's merit in trying -f: + case "$FSTYPE" in +- nfs4|nfs|efs|cifs|smbfs) umount_force="-f" ;; ++ nfs4|nfs|aznfs|efs|cifs|smbfs) umount_force="-f" ;; + esac + + # Umount all sub-filesystems mounted under $MOUNTPOINT/ too. 
+@@ -942,6 +944,7 @@ Filesystem_monitor_20() + fi + return $OCF_SUCCESS + } ++ + Filesystem_monitor() + { + Filesystem_status +@@ -1016,7 +1019,7 @@ set_blockdevice_var() { + + # these are definitely not block devices + case "$FSTYPE" in +- nfs4|nfs|efs|smbfs|cifs|none|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs|zfs|cvfs|lustre) return;; ++ nfs4|nfs|aznfs|efs|smbfs|cifs|none|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs|zfs|cvfs|lustre) return;; + esac + + if $(is_option "loop"); then +@@ -1098,7 +1101,7 @@ set_blockdevice_var + if [ -z "$OCF_RESKEY_directory" ]; then + if [ X$OP = "Xstart" -o $blockdevice = "no" ]; then + ocf_exit_reason "Please specify the directory" +- exit $OCF_ERR_CONFIGURED ++ exit $OCF_ERR_CONFIGURED + fi + else + MOUNTPOINT="$(echo "$OCF_RESKEY_directory" | sed 's/\/*$//')" +@@ -1166,7 +1169,7 @@ is_option "ro" && + CLUSTERSAFE=2 + + case "$FSTYPE" in +-nfs4|nfs|efs|smbfs|cifs|none|gfs2|glusterfs|ceph|ocfs2|overlay|overlayfs|tmpfs|cvfs|lustre) ++nfs4|nfs|aznfs|efs|smbfs|cifs|none|gfs2|glusterfs|ceph|ocfs2|overlay|overlayfs|tmpfs|cvfs|lustre) + CLUSTERSAFE=1 # this is kind of safe too + systemd_drop_in "99-Filesystem-remote" "After" "remote-fs.target" + ;; diff --git a/SOURCES/RHEL-88429-1-podman-etcd-new-ra.patch b/SOURCES/RHEL-88429-1-podman-etcd-new-ra.patch new file mode 100644 index 0000000..f294582 --- /dev/null +++ b/SOURCES/RHEL-88429-1-podman-etcd-new-ra.patch @@ -0,0 +1,1643 @@ +From 959b5c88c6a5e6a7a537eb6fc7e5033db8387777 Mon Sep 17 00:00:00 2001 +From: Carlo Lobrano +Date: Thu, 24 Apr 2025 13:16:59 +0200 +Subject: [PATCH] podman-etcd: new resource agent (#2023) + +Introduce a resource agent for Podman to manage etcd instances. + +This agent enables Pacemaker to control etcd containers, handling +start, stop, monitor, and recovery operations. +--- + doc/man/Makefile.am | 1 + + heartbeat/Makefile.am | 1 + + heartbeat/podman-etcd | 1597 +++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 1599 insertions(+) + create mode 100755 heartbeat/podman-etcd + +diff --git a/doc/man/Makefile.am b/doc/man/Makefile.am +index bc8935782..0d34c7c65 100644 +--- a/doc/man/Makefile.am ++++ b/doc/man/Makefile.am +@@ -187,6 +187,7 @@ man_MANS = ocf_heartbeat_AoEtarget.7 \ + ocf_heartbeat_pgsqlms.7 \ + ocf_heartbeat_pingd.7 \ + ocf_heartbeat_podman.7 \ ++ ocf_heartbeat_podman-etcd.7 \ + ocf_heartbeat_portblock.7 \ + ocf_heartbeat_postfix.7 \ + ocf_heartbeat_pound.7 \ +diff --git a/heartbeat/Makefile.am b/heartbeat/Makefile.am +index 5c41e0038..839505af9 100644 +--- a/heartbeat/Makefile.am ++++ b/heartbeat/Makefile.am +@@ -159,6 +159,7 @@ ocf_SCRIPTS = AoEtarget \ + pgsqlms \ + pingd \ + podman \ ++ podman-etcd \ + portblock \ + postfix \ + pound \ +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +new file mode 100755 +index 000000000..514dd2e5b +--- /dev/null ++++ b/heartbeat/podman-etcd +@@ -0,0 +1,1597 @@ ++#!/bin/sh ++# ++# The podman etcd HA resource agent creates and launches a etcd podman ++# container based off a supplied podman image. Containers managed by ++# this agent are both created and removed upon the agent's start and ++# stop actions. ++# ++# Based on the podman resource agent. ++# ++# Copyright (c) 2014 David Vossel ++# Michele Baldessari ++# All Rights Reserved. ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of version 2 of the GNU General Public License as ++# published by the Free Software Foundation. 
++# ++# This program is distributed in the hope that it would be useful, but ++# WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. ++# ++# Further, this software is distributed without any warranty that it is ++# free of the rightful claim of any third person regarding infringement ++# or the like. Any license provided herein, whether implied or ++# otherwise, applies only to this software file. Patent licenses, if ++# any, provided herein do not apply to combinations of this program with ++# other software, or any other product whatsoever. ++# ++# You should have received a copy of the GNU General Public License ++# along with this program; if not, write the Free Software Foundation, ++# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. ++# ++ ++####################################################################### ++# Initialization: ++ ++: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} ++. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ++ ++# Parameter defaults ++OCF_RESKEY_image_default="default" ++OCF_RESKEY_pod_manifest_default="/etc/kubernetes/static-pod-resources/etcd-certs/configmaps/external-etcd-pod/pod.yaml" ++OCF_RESKEY_name_default="etcd" ++OCF_RESKEY_nic_default="br-ex" ++OCF_RESKEY_authfile_default="/var/lib/kubelet/config.json" ++OCF_RESKEY_allow_pull_default="1" ++OCF_RESKEY_reuse_default="0" ++ ++: ${OCF_RESKEY_image=${OCF_RESKEY_image_default}} ++: ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}} ++: ${OCF_RESKEY_name=${OCF_RESKEY_name_default}} ++: ${OCF_RESKEY_nic=${OCF_RESKEY_nic_default}} ++: ${OCF_RESKEY_authfile=${OCF_RESKEY_authfile_default}} ++: ${OCF_RESKEY_allow_pull=${OCF_RESKEY_allow_pull_default}} ++: ${OCF_RESKEY_reuse=${OCF_RESKEY_reuse_default}} ++ ++####################################################################### ++ ++meta_data() ++{ ++ cat < ++ ++ ++1.0 ++ ++ ++The podman-etcd HA resource agent creates and launches a etcd podman ++container based off a supplied podman image. Containers managed by ++this agent are both created and removed upon the agent's start and ++stop actions. ++ ++Podman etcd container resource agent. ++ ++ ++ ++ ++The Pod manifest with the configuration for Etcd. ++ ++Etcd pod manifest ++ ++ ++ ++ ++ ++The podman image to base this container off of. ++ ++podman image ++ ++ ++ ++ ++ ++The name to give the created container. By default this will ++be that resource's instance name. ++ ++podman container name ++ ++ ++ ++ ++ ++A mapping of node names to IPs. ++ ++This takes the form of: ++n1:ip1;n2:ip2 ++ ++where the etcd container on n1 would have IP ip1 ++ ++Container node name to IP mapping ++ ++ ++ ++ ++ ++Network interface to lookup interface for host. ++ ++Network interface ++ ++ ++ ++ ++ ++Path of the authentication file. ++ ++The file is created by podman login. ++ ++Path of the authentication file ++ ++ ++ ++ ++ ++Allow the image to be pulled from the configured podman registry when ++the image does not exist locally. NOTE, this can drastically increase ++the time required to start the container if the image repository is ++pulled over the network. ++ ++Allow pulling non-local images ++ ++ ++ ++ ++ ++Add options to be appended to the 'podman run' command which is used ++when creating the container during the start action. This option allows ++users to do things such as setting a custom entry point and injecting ++environment variables into the newly created container. 
Note the '-d' ++option is supplied regardless of this value to force containers to run ++in the background. ++ ++NOTE: Do not explicitly specify the --name argument in the run_opts. This ++agent will set --name using either the resource's instance or the name ++provided in the 'name' argument of this agent. ++ ++ ++run options ++ ++ ++ ++ ++ ++Specify a command to launch within the container once ++it has initialized. ++ ++run command ++ ++ ++ ++ ++ ++Options to be added to the 'run_cmd'. ++ ++run command options ++ ++ ++ ++ ++ ++A comma separated list of directories that the container is expecting to use. ++The agent will ensure they exist by running 'mkdir -p' ++ ++Required mount points ++ ++ ++ ++ ++ ++Specify the full path of a command to launch within the container to check ++the health of the container. This command must return 0 to indicate that ++the container is healthy. A non-zero return code will indicate that the ++container has failed and should be recovered. ++ ++Note: Using this method for monitoring processes inside a container ++is not recommended, as containerd tries to track processes running ++inside the container and does not deal well with many short-lived ++processes being spawned. Ensure that your container monitors its ++own processes and terminates on fatal error rather than invoking ++a command from the outside. ++ ++monitor command ++ ++ ++ ++ ++ ++Kill a container immediately rather than waiting for it to gracefully ++shutdown ++ ++force kill ++ ++ ++ ++ ++ ++Allow the container to be reused once it is stopped. By default, ++containers get removed once they are stopped. Enable this option ++to have the particular one persist when this happens. ++ ++reuse container ++ ++ ++ ++ ++ ++Use transient drop-in files to add extra dependencies to the systemd ++scopes associated to the container. During reboot, this prevents systemd ++to stop the container before pacemaker. ++ ++drop-in dependency ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++ ++END ++} ++ ++####################################################################### ++REQUIRE_IMAGE_PULL=0 ++ ++podman_usage() ++{ ++ cat <&1) ++ rc=$? ++ # 125: no container with name or ID ${CONTAINER} found ++ # 126: container state improper (not running) ++ # 127: any other error ++ # 255: podman 2+: container not running ++ case "$rc" in ++ 125|126|255) ++ rc=$OCF_NOT_RUNNING ++ ;; ++ 0) ++ ocf_log debug "monitor cmd passed: exit code = $rc" ++ ;; ++ *) ++ ocf_exit_reason "monitor cmd failed (rc=$rc), output: $out" ++ rc=$OCF_ERR_GENERIC ++ ;; ++ esac ++ ++ return $rc ++} ++ ++container_exists() ++{ ++ local rc ++ local out ++ ++ out=$(podman exec ${CONTAINER} $OCF_RESKEY_monitor_cmd 2>&1) ++ rc=$? ++ # 125: no container with name or ID ${CONTAINER} found ++ if [ $rc -ne 125 ]; then ++ return 0 ++ fi ++ return 1 ++} ++ ++remove_container() ++{ ++ local rc ++ local execids ++ ++ if ocf_is_true "$OCF_RESKEY_reuse"; then ++ # never remove the container if we have reuse enabled. ++ return 0 ++ fi ++ ++ if ! container_exists; then ++ # don't attempt to remove a container that doesn't exist ++ return 0 ++ fi ++ ocf_log notice "Cleaning up inactive container, ${CONTAINER}." ++ ocf_run podman rm -v "$CONTAINER" ++ rc=$? ++ if [ $rc -ne 0 ]; then ++ if [ $rc -eq 2 ]; then ++ if podman inspect --format '{{.State.Status}}' "$CONTAINER" | grep -wq "stopping"; then ++ ocf_log err "Inactive container ${CONTAINER} is stuck in 'stopping' state. Force-remove it." ++ ocf_run podman rm -f "$CONTAINER" ++ rc=$? 
++ fi ++ fi ++ # due to a podman bug (rhbz#1841485), sometimes a stopped ++ # container can still be associated with Exec sessions, in ++ # which case the "podman rm" has to be forced ++ execids=$(podman inspect "$CONTAINER" --format '{{len .ExecIDs}}') ++ if [ "$execids" -ne "0" ]; then ++ ocf_log warn "Inactive container ${CONTAINER} has lingering exec sessions. Force-remove it." ++ ocf_run podman rm -f "$CONTAINER" ++ rc=$? ++ fi ++ fi ++ return $rc ++} ++ ++attribute_node_ip() ++{ ++ local action="$1" ++ local attribute="node_ip" ++ local value ++ ++ if ! value=$(ip -brief addr show "$OCF_RESKEY_nic" | awk '{gsub("/.*", "", $3); print $3}'); then ++ rc=$? ++ ocf_log err "could not get node ip, error code: $rc" ++ return "$rc" ++ fi ++ ++ case "$action" in ++ get) ++ echo "$value" ++ ;; ++ update) ++ if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value"; then ++ rc="$?" ++ ocf_log err "could not set $attribute to $value, error code: $rc" ++ return "$rc" ++ fi ++ ;; ++ clear) ++ crm_attribute --name "$attribute" --delete ++ ;; ++ *) ++ ocf_log err "unsupported $action for $attribute" ++ return $OCF_ERR_GENERIC ++ ;; ++ esac ++} ++ ++attribute_node_ip_peer() { ++ local peer_name ++ peer_name=$(get_peer_node_name) ++ crm_attribute --query --name "node_ip" --node "$peer_name" | awk -F"value=" '{print $2}' ++} ++ ++get_env_from_manifest() { ++ local env_var_name="$1" ++ local env_var_value ++ ++ # The agent waits for the manifest to exist before starting, so the ++ # file should exist already, but this check is included for robustness. ++ if [ ! -f "$OCF_RESKEY_pod_manifest" ]; then ++ ocf_log err "external etcd pod manifest ($OCF_RESKEY_pod_manifest) not found" ++ exit "$OCF_ERR_INSTALLED" ++ fi ++ ++ if ! env_var_value=$(jq -r ".spec.containers[].env[] | select( .name == \"$env_var_name\" ).value" "$OCF_RESKEY_pod_manifest"); then ++ rc=$? 
++ ocf_log err "could not find environment variable $env_var_name in etcd pod manifest, error code: $rc" ++ exit "$OCF_ERR_INSTALLED" ++ fi ++ ++ ocf_log debug "ETCD pod environment variable $env_var_name: $env_var_value" ++ ++ echo "$env_var_value" ++} ++ ++prepare_env() { ++ local name ip standalone_node ++ ++ NODEIP="$(attribute_node_ip get)" ++ ++ if is_force_new_cluster; then ++ ALL_ETCD_ENDPOINTS="https://$NODEIP:2379" ++ ETCD_INITIAL_CLUSTER_STATE="new" ++ ETCD_INITIAL_CLUSTER="$NODENAME=https://$NODEIP:2380" ++ else ++ ETCD_INITIAL_CLUSTER_STATE="existing" ++ for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do ++ name=$(echo "$node" | awk -F":" '{print $1}') ++ ip=$(echo "$node" | awk -F":" '{print $2}') ++ if [ -z "$name" ] || [ -z "$ip" ]; then ++ ocf_exit_reason "name or ip missing for 1 or more nodes" ++ exit $OCF_ERR_CONFIGURED ++ fi ++ ++ [ -z "$ALL_ETCD_ENDPOINTS" ] && ALL_ETCD_ENDPOINTS="https://$ip:2379" || ALL_ETCD_ENDPOINTS="$ALL_ETCD_ENDPOINTS,https://$ip:2379" ++ [ -z "$ETCD_INITIAL_CLUSTER" ] && ETCD_INITIAL_CLUSTER="$name=https://$ip:2380" || ETCD_INITIAL_CLUSTER="$ETCD_INITIAL_CLUSTER,$name=https://$ip:2380" ++ done ++ fi ++ ++ ETCDCTL_API=$(get_env_from_manifest "ETCDCTL_API") ++ ETCD_CIPHER_SUITES=$(get_env_from_manifest "ETCD_CIPHER_SUITES") ++ ETCD_DATA_DIR=$(get_env_from_manifest "ETCD_DATA_DIR") ++ ETCD_ELECTION_TIMEOUT=$(get_env_from_manifest "ETCD_ELECTION_TIMEOUT") ++ ETCD_ENABLE_PPROF=$(get_env_from_manifest "ETCD_ENABLE_PPROF") ++ ETCD_EXPERIMENTAL_MAX_LEARNERS=$(get_env_from_manifest "ETCD_EXPERIMENTAL_MAX_LEARNERS") ++ ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION=$(get_env_from_manifest "ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION") ++ ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL=$(get_env_from_manifest "ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL") ++ ETCD_HEARTBEAT_INTERVAL=$(get_env_from_manifest "ETCD_HEARTBEAT_INTERVAL") ++ ETCD_QUOTA_BACKEND_BYTES=$(get_env_from_manifest "ETCD_QUOTA_BACKEND_BYTES") ++ ETCD_SOCKET_REUSE_ADDRESS=$(get_env_from_manifest "ETCD_SOCKET_REUSE_ADDRESS") ++ ++ SERVER_CACERT=$(get_env_from_manifest "ETCDCTL_CACERT") ++ ETCD_PEER_CERT=$(get_env_from_manifest "ETCDCTL_CERT") ++ ETCD_PEER_KEY=$(get_env_from_manifest "ETCDCTL_KEY") ++ ++ if is_learner; then ++ LISTEN_CLIENT_URLS="$NODEIP" ++ LISTEN_PEER_URLS="$NODEIP" ++ LISTEN_METRICS_URLS="$NODEIP" ++ else ++ LISTEN_CLIENT_URLS="0.0.0.0" ++ LISTEN_PEER_URLS="0.0.0.0" ++ LISTEN_METRICS_URLS="0.0.0.0" ++ fi ++} ++ ++archive_data_folder() ++{ ++ # TODO: use etcd snapshots ++ local dest_dir_name ++ local data_dir="/var/lib/etcd/member" ++ ++ dest_dir_name="members-snapshot-$(date +%Y%M%d%H%M%S)" ++ if [ ! -d $data_dir ]; then ++ ocf_log info "no data dir to backup" ++ return $OCF_SUCCESS ++ fi ++ ocf_log info "backing up $data_dir under $HA_RSCTMP/$dest_dir_name" ++ mv "$data_dir" "$HA_RSCTMP/$dest_dir_name" ++ sync ++} ++ ++etcd_pod_container_exists() { ++ local count_matches ++ # Check whether the etcd pod exists on the same node (header line included) ++ count_matches=$(crictl pods --label app=etcd -q | xargs -I {} crictl ps --pod {} -o json | jq -r '.containers[].metadata | select ( .name == "etcd" ).name' | wc -l) ++ if [ "$count_matches" -eq 1 ]; then ++ # etcd pod found ++ return 0 ++ fi ++ # etcd pod not found ++ return 1 ++} ++ ++attribute_node_cluster_id() ++{ ++ local action="$1" ++ local value ++ if ! value=$(jq -r ".clusterId" /var/lib/etcd/revision.json); then ++ rc=$? 
++ ocf_log err "could not get cluster_id, error code: $rc" ++ return "$rc" ++ fi ++ ++ case "$action" in ++ get) ++ echo "$value" ++ ;; ++ update) ++ if ! crm_attribute --type nodes --node "$NODENAME" --name "cluster_id" --update "$value"; then ++ rc=$? ++ ocf_log err "could not update cluster_id, error code: $rc" ++ return "$rc" ++ fi ++ ;; ++ *) ++ ocf_log err "unsupported $action for attribute_node_cluster_id" ++ return $OCF_ERR_GENERIC ++ ;; ++ esac ++} ++ ++attribute_node_cluster_id_peer() ++{ ++ local nodename ++ ++ nodename=$(get_peer_node_name) ++ crm_attribute --query --type nodes --node "$nodename" --name "cluster_id" | awk -F"value=" '{print $2}' ++} ++ ++attribute_node_revision() ++{ ++ local action="$1" ++ local value ++ local attribute="revision" ++ ++ if ! value=$(jq -r ".maxRaftIndex" /var/lib/etcd/revision.json); then ++ rc=$? ++ ocf_log err "could not get $attribute, error code: $rc" ++ return "$rc" ++ fi ++ ++ case "$action" in ++ get) ++ echo "$value" ++ ;; ++ update) ++ if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value"; then ++ rc=$? ++ ocf_log err "could not update etcd $revision, error code: $rc" ++ return "$rc" ++ fi ++ ;; ++ *) ++ ocf_log err "unsupported $action for attribute_node_revision" ++ return "$OCF_ERR_GENERIC" ++ ;; ++ esac ++} ++ ++attribute_node_revision_peer() ++{ ++ local nodename ++ nodename=$(get_peer_node_name) ++ crm_attribute --query --type nodes --node "$nodename" --name "revision" | awk -F"value=" '{print $2}' ++} ++ ++attribute_node_member_id() ++{ ++ local action="$1" ++ local attribute="member_id" ++ ++ if ! container_exists; then ++ # we need a running container to execute etcdctl. ++ return 0 ++ fi ++ ++ case "$action" in ++ get) ++ # When we need this value at the agent startup we don't have a etcd ++ # container running, so we always get this value from CIB ++ crm_attribute --query --type nodes --node "$NODENAME" --name "$attribute" | awk -F"value=" '{print $2}' ++ ;; ++ update) ++ local member_list_json ++ member_list_json=$(get_member_list_json) ++ ocf_log info "member list: $member_list_json" ++ if [ -z "$member_list_json" ] ; then ++ ocf_log err "could not get $attribute: could not get member list JSON" ++ return "$rc" ++ fi ++ ++ local value ++ if ! value=$(echo -n "$member_list_json" | jq -r ".header.member_id"); then ++ rc=$? ++ ocf_log err "could not get $attribute from member list JSON, error code: $rc" ++ return "$rc" ++ fi ++ ++ # JSON member_id is decimal, while etcdctl command needs the hex version ++ value=$(printf "%x" "$value") ++ if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value"; then ++ rc=$? ++ ocf_log err "could not update etcd $attribute, error code: $rc" ++ return "$rc" ++ fi ++ ;; ++ clear) ++ crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --delete ++ ;; ++ *) ++ ocf_log err "unsupported $action for attribute_node_member_id" ++ return "$OCF_ERR_GENERIC" ++ ;; ++ esac ++} ++ ++add_member_as_learner() ++{ ++ local rc ++ local member_name=$1 ++ local member_ip=$2 ++ ++ ocf_log info "add $member_name ($member_ip) to the member list as learner" ++ out=$(podman exec "${CONTAINER}" etcdctl --endpoints="https://$(attribute_node_ip get):2379" member add "$member_name" --peer-urls="https://$member_ip:2380" --learner) ++ rc=$? ++ if [ $rc -ne 0 ]; then ++ ocf_log err "could not add $member_name as learner, error code: $rc" ++ return $rc ++ fi ++ ocf_log info "$out" ++ ++ attribute_learner_node update "$member_name" ++ return $? 
++} ++ ++set_force_new_cluster() ++{ ++ local rc ++ crm_attribute --lifetime reboot --node "$NODENAME" --name "force_new_cluster" --update "$NODENAME" ++ rc=$? ++ if [ $rc -ne 0 ]; then ++ ocf_log err "could not set force_new_cluster attribute to $NODENAME" ++ fi ++ return $rc ++} ++ ++get_force_new_cluster() ++{ ++ crm_attribute --lifetime reboot --query --name "force_new_cluster" | awk -F"value=" '{print $2}' ++} ++ ++clear_force_new_cluster() ++{ ++ local force_new_cluster_node ++ ++ force_new_cluster_node=$(get_force_new_cluster) ++ if [ -z "$force_new_cluster_node" ]; then ++ ocf_log info "$NODENAME: force_new_cluster attribute not set" ++ return $OCF_SUCCESS ++ fi ++ ++ # only the holder of "force_new_cluster" attribute can delete it ++ if [ "$NODENAME" = "$force_new_cluster_node" ]; then ++ crm_attribute --lifetime reboot --name "force_new_cluster" --delete ++ rc=$? ++ if [ $rc -ne 0 ]; then ++ ocf_log err "could not clear force_new_cluster attribute, error code: $rc" ++ else ++ ocf_log info "$NODENAME: force_new_cluster attribute cleared" ++ fi ++ return $rc ++ else ++ ocf_log info "$NODENAME does not hold force_new_cluster ($force_new_cluster_node has it)" ++ return $OCF_SUCCESS ++ fi ++} ++ ++is_force_new_cluster() ++{ ++ # Return 0 if 'force_new_cluster' is set and the value matches the current node name, 1 otherwise. ++ local value ++ ++ value=$(get_force_new_cluster) ++ if [ -z "$value" ]; then ++ ocf_log debug "force_new_cluster attribute is not set" ++ return 1 ++ fi ++ ++ if [ "$value" = "$NODENAME" ]; then ++ ocf_log debug "$NODENAME has force_new_cluster set" ++ return 0 ++ fi ++ ++ ocf_log info "force_new_cluster attribute set on peer node $value" ++ return 1 ++} ++ ++is_standalone() ++{ ++ local standalone_node ++ ++ standalone_node=$(get_standalone_node) ++ if [ -z "$standalone_node" ]; then ++ ocf_log debug "no node running standalone" ++ return 1 ++ fi ++ ++ if [ "$NODENAME" = "$standalone_node" ]; then ++ ocf_log debug "$NODENAME is set as standalone" ++ return 0 ++ fi ++ ocf_log debug "$NODENAME is set as learner" ++ return 1 ++ ++} ++ ++set_standalone_node() ++{ ++ local rc ++ ++ ocf_log info "add $NODENAME as standalone" ++ crm_attribute --name "standalone_node" --update "$NODENAME" ++ rc=$? ++ if [ $rc -ne 0 ]; then ++ ocf_log err "could not set standalone_node attribute to $NODENAME" ++ fi ++ return $rc ++} ++ ++get_standalone_node() ++{ ++ crm_attribute --query --name "standalone_node" | awk -F"value=" '{print $2}' ++} ++ ++clear_standalone_node() ++{ ++ crm_attribute --name "standalone_node" --delete ++} ++ ++clear_standalone_and_learner_if_not_learners() ++{ ++ local rc ++ local member_list_json="$1" ++ ++ number_of_members=$(printf "%s" "$member_list_json" | jq -r ".members[].ID" | wc -l) ++ if [ "$number_of_members" -ne 2 ]; then ++ ocf_log info "could not clear standalone_node, nor learner_node properties: found $number_of_members members, need 2" ++ return $OCF_SUCCESS ++ fi ++ ++ id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .isLearner==true ).ID") ++ rc=$? ++ if [ $rc -ne 0 ]; then ++ ocf_log err "could not get isLearner field from member list, error code: $rc" ++ return $rc ++ fi ++ ++ if [ -z "$id" ]; then ++ clear_standalone_node ++ rc=$? ++ if [ $rc -ne 0 ]; then ++ ocf_og error "could not clear standalone_node attribute, error code: $rc" ++ return $rc ++ fi ++ fi ++ if [ -z "$id" ]; then ++ attribute_learner_node clear ++ rc=$? 
++ if [ $rc -ne 0 ]; then ++ ocf_og error "could not clear learner_node attribute, error code: $rc" ++ return $rc ++ fi ++ fi ++ ++ return $rc ++} ++ ++attribute_learner_node() ++{ ++ local action="$1" ++ local value="$2" ++ local attribute="learner_node" ++ ++ case "$action" in ++ get) ++ crm_attribute --query --name "$attribute" | awk -F"value=" '{print $2}' ++ ;; ++ update) ++ if ! crm_attribute --name "$attribute" --update "$value"; then ++ rc="$?" ++ ocf_log err "could not set $attribute to $value, error code: $rc" ++ return "$rc" ++ fi ++ ;; ++ clear) ++ crm_attribute --name "$attribute" --delete ++ ;; ++ *) ++ ocf_log err "unsupported $action for $attribute" ++ return $OCF_ERR_GENERIC ++ ;; ++ esac ++} ++ ++is_learner() ++{ ++ if [ "$NODENAME" = "$(attribute_learner_node get)" ]; then ++ return 0 ++ fi ++ return 1 ++} ++ ++get_peer_node_name() { ++ crm_node -l | awk '{print $2}' | grep -v "$NODENAME" ++} ++ ++get_all_etcd_endpoints() { ++ for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do ++ name=$(echo "$node" | awk -F":" '{print $1}') ++ ip=$(echo "$node" | awk -F":" '{print $2}') ++ if [ -z "$name" ] || [ -z "$ip" ]; then ++ ocf_exit_reason "name or ip missing for 1 or more nodes" ++ exit $OCF_ERR_CONFIGURED ++ fi ++ ++ [ -z "$ALL_ETCD_ENDPOINTS" ] && ALL_ETCD_ENDPOINTS="https://$ip:2379" || ALL_ETCD_ENDPOINTS="$ALL_ETCD_ENDPOINTS,https://$ip:2379" ++ done ++ echo "$ALL_ETCD_ENDPOINTS" ++} ++ ++get_endpoint_status_json() ++{ ++ # Get the status of all endpoints ++ local all_etcd_endpoints ++ ++ all_etcd_endpoints=$(get_all_etcd_endpoints) ++ podman exec "${CONTAINER}" etcdctl endpoint status --endpoints="$all_etcd_endpoints" -w json ++} ++ ++get_member_list_json() { ++ # Get the list of members visible to the current node ++ local this_node_endpoint ++ ++ this_node_endpoint="https://$(attribute_node_ip get):2379" ++ podman exec "${CONTAINER}" etcdctl member list --endpoints="$this_node_endpoint" -w json ++} ++ ++check_peers() ++{ ++ # Check peers endpoint status and locally accessible member list ++ local member_list_json ++ ++ if ! container_exists; then ++ # we need a running container to execute etcdctl. ++ return $OCF_SUCCESS ++ fi ++ ++ member_list_json=$(get_member_list_json) ++ rc=$? ++ ocf_log debug "member list: $member_list_json" ++ if [ $rc -ne 0 ]; then ++ ocf_log info "podman failed to get member list, error code: $rc" ++ ++ endpoint_status_json=$(get_endpoint_status_json) ++ ocf_log info "endpoint status: $endpoint_status_json" ++ ++ count_endpoints=$(printf "%s" "$endpoint_status_json" | jq -r ".[].Endpoint" | wc -l) ++ if [ "$count_endpoints" -eq 1 ]; then ++ ocf_log info "one endpoint only: checking status errors" ++ endpoint_status_errors=$(printf "%s" "$endpoint_status_json" | jq -r ".[0].Status.errors") ++ if echo "$endpoint_status_errors" | grep -q "no leader"; then ++ set_force_new_cluster ++ set_standalone_node ++ ocf_exit_reason "$NODENAME must force a new cluster" ++ return $OCF_ERR_GENERIC ++ fi ++ if [ "$endpoint_status_errors" != "null" ]; then ++ ocf_log err "unmanaged endpoint status error: $endpoint_status_errors" ++ fi ++ fi ++ ++ return $OCF_SUCCESS ++ fi ++ ++ # Example of .members[] instance fields in member list json format: ++ # NOTE that "name" is present in voting members only, while "isLearner" in learner members only ++ # and the value is always true (not a string) in that case. 
++ # {
++ # "ID": <id>,
++ # "name": "<name>",
++ # "peerURLs": [
++ # "https://<ip>:2380"
++ # ],
++ # "clientURLs": [
++ # "https://<ip>:2379"
++ # ]
++ # }
++ for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do
++ name=$(echo "$node" | awk -F":" '{print $1}')
++ # do not check this node itself
++ if [ "$name" = "$NODENAME" ]; then
++ continue
++ fi
++
++ # Check by IP instead of Name since "learner" members appear only in peerURLs, not by Name.
++ ip=$(echo "$node" | awk -F":" '{print $2}')
++ id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$ip\")) | any).ID")
++ if [ -z "$id" ]; then
++ ocf_log info "$name is not in the members list"
++ add_member_as_learner "$name" "$ip"
++ set_standalone_node
++ else
++ ocf_log debug "$name is in the members list by IP: $ip"
++ clear_standalone_and_learner_if_not_learners "$member_list_json"
++ fi
++ done
++ return $OCF_SUCCESS
++}
++
++podman_simple_status()
++{
++ local rc
++
++ # simple status is implemented via podman exec
++ # everything besides success is considered "not running"
++ monitor_cmd_exec
++ rc=$?
++ if [ $rc -ne $OCF_SUCCESS ]; then
++ rc=$OCF_NOT_RUNNING;
++ fi
++ return $rc
++}
++
++podman_monitor()
++{
++ # We rely on running podman exec to monitor the container
++ # state because that command seems to be less prone to
++ # performance issues under IO load.
++ #
++ # For probes to work, we expect cmd_exec to be able to report
++ # when a container is not running. Here, we're not interested
++ # in distinguishing whether it's stopped or non existing
++ # (there's the container_exists function for that)
++ monitor_cmd_exec
++ rc=$?
++ if [ $rc -ne 0 ]; then
++ return $rc
++ fi
++
++ if is_learner; then
++ ocf_log info "$NODENAME is learner. Cannot get member id"
++ return "$OCF_SUCCESS"
++ fi
++ # Failing to cache data and check member list should not cause the
++ # monitor operation to fail.
++ # TODO: move this inside check_peers where we already query member list json
++ attribute_node_member_id update
++ if ! check_peers; then
++ return $OCF_ERR_GENERIC
++ fi
++
++ # node revision comes from the disk, so if it is not available, it is a fatal failure
++ attribute_node_revision update
++ return $?
++}
++
++podman_create_mounts() {
++ oldIFS="$IFS"
++ IFS=","
++ for directory in $OCF_RESKEY_mount_points; do
++ mkdir -p "$directory"
++ done
++ IFS="$oldIFS"
++}
++
++podman_container_id()
++{
++ # Retrieve the container ID by doing a "podman ps" rather than
++ # a "podman inspect", because the latter has performance issues
++ # under IO load.
++ # We could have run "podman start $CONTAINER" to get the ID back
++ # but if the container is stopped, the command will return a
++ # name instead of a container ID. This would break us.
++ podman ps --no-trunc --format '{{.ID}} {{.Names}}' | grep -F -w -m1 "$CONTAINER" | cut -d' ' -f1
++}
++
++
++create_transient_drop_in_dependency()
++{
++ local cid=$1
++ local rc=$OCF_SUCCESS
++
++ if [ -z "$cid" ]; then
++ ocf_exit_reason "Container ID not found for \"$CONTAINER\". Not creating drop-in dependency"
++ return $OCF_ERR_GENERIC
++ fi
++
++ ocf_log info "Creating drop-in dependency for \"$CONTAINER\" ($cid)"
++ for scope in "libpod-$cid.scope.d" "libpod-conmon-$cid.scope.d"; do
++ if [ $rc -eq $OCF_SUCCESS ] && [ !
-d /run/systemd/transient/"$scope" ]; then ++ mkdir -p /run/systemd/transient/"$scope" && \ ++ printf "[Unit]\nBefore=pacemaker.service" > /run/systemd/transient/"$scope"/dep.conf && \ ++ chmod ago+r /run/systemd/transient/"$scope" /run/systemd/transient/"$scope"/dep.conf ++ rc=$? ++ fi ++ done ++ ++ if [ $rc -ne $OCF_SUCCESS ]; then ++ ocf_log err "Could not create drop-in dependency for \"$CONTAINER\" ($cid)" ++ else ++ systemctl daemon-reload ++ rc=$? ++ if [ $rc -ne $OCF_SUCCESS ]; then ++ ocf_log err "Could not refresh service definition after creating drop-in for \"$CONTAINER\"" ++ fi ++ fi ++ ++ return $rc ++} ++ ++ ++run_new_container() ++{ ++ local opts=$1 ++ local image=$2 ++ local cmd=$3 ++ local rc ++ ++ ocf_log info "running container $CONTAINER for the first time" ++ out=$(podman run $opts $image $cmd 2>&1) ++ rc=$? ++ ++ if [ -n "$out" ]; then ++ out="$(echo "$out" | tr -s ' \t\r\n' ' ')" ++ if [ $rc -eq 0 ]; then ++ ocf_log info "$out" ++ else ++ ocf_log err "$out" ++ fi ++ fi ++ ++ if [ $rc -eq 125 ]; then ++ # If an internal podman error occurred, it might be because ++ # the internal storage layer still references an old container ++ # with the same name, even though podman itself thinks there ++ # is no such container. If so, purge the storage layer to try ++ # to clean the corruption and try again. ++ if echo "$out" | grep -q "unknown.*flag"; then ++ ocf_exit_reason "$out" ++ return $rc ++ fi ++ ++ ocf_log warn "Internal podman error while creating new container $CONTAINER. Retrying." ++ ocf_run podman rm --storage "$CONTAINER" ++ ocf_run podman run $opts $image $cmd ++ rc=$? ++ elif [ $rc -eq 127 ]; then ++ # rhbz#1972209: podman 3.0.x seems to be hit by a race ++ # where the cgroup is not yet set up properly when the OCI ++ # runtime configures the container. If that happens, recreate ++ # the container as long as we get the same error code or ++ # until start timeout preempts us. ++ while [ $rc -eq 127 ] && (echo "$out" | grep -q "cgroup.*scope not found") ; do ++ ocf_log warn "Internal podman error while assigning cgroup. Retrying." ++ # Arbitrary sleep to prevent consuming all CPU while looping ++ sleep 1 ++ podman rm -f "$CONTAINER" ++ out=$(podman run $opts $image $cmd 2>&1) ++ rc=$? ++ done ++ # Log the created container ID if it succeeded ++ if [ $rc -eq 0 ]; then ++ ocf_log info "$out" ++ fi ++ fi ++ ++ return $rc ++} ++ ++compare_revision() ++{ ++ # Compare local revision (from disk) against peer revision (from CIB). 
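++ # (Assumption based on etcd semantics: the revision is etcd's monotonically
++ # increasing counter of keyspace writes, so the higher cached revision
++ # identifies the node holding the more recent data.)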
++ # returns "older", "equal" or "newer" ++ local revision ++ local peer_node_name ++ local peer_revision ++ ++ revision=$(attribute_node_revision get) ++ peer_revision=$(attribute_node_revision_peer) ++ ++ if [ "$revision" = "" ] || [ "$revision" = "null" ] || [ "$peer_revision" = "" ] || [ "$peer_revision" = "null" ]; then ++ ocf_log err "could not compare revisions: $NODENAME local revision: $revision, peer revision: $peer_revision" ++ return "$OCF_ERR_GENERIC" ++ fi ++ ++ if [ "$revision" -gt "$peer_revision" ]; then ++ ocf_log info "$NODENAME revision: $revision is newer than peer revision: $peer_revision" ++ echo "newer" ++ elif [ "$revision" -eq "$peer_revision" ]; then ++ ocf_log info "$NODENAME revision: $revision is equal to peer revision: $peer_revision" ++ echo "equal" ++ else ++ ocf_log info "$NODENAME revision: $revision is older than peer revision: $peer_revision" ++ echo "older" ++ fi ++ return "$OCF_SUCCESS" ++} ++ ++ensure_pod_manifest_exists() ++{ ++ local wait_timeout_sec=$((10 * 60)) ++ local poll_interval_sec=5 ++ local poll_retries=$((wait_timeout_sec/poll_interval_sec)) ++ ++ for try in $(seq "$poll_retries"); do ++ if [ -f "$OCF_RESKEY_pod_manifest" ]; then ++ ocf_log info "pod manifest ($OCF_RESKEY_pod_manifest) found" ++ break ++ fi ++ ocf_log debug "pod manifest ($OCF_RESKEY_pod_manifest) does not exist yet: retry in $poll_interval_sec seconds." ++ sleep "$poll_interval_sec" ++ done ++ ++ if [ ! -f "$OCF_RESKEY_pod_manifest" ]; then ++ ocf_log err "pod manifest ($OCF_RESKEY_pod_manifest) still missing after $wait_timeout_sec seconds." ++ return "$OCF_ERR_CONFIGURED" ++ fi ++ ++ return "$OCF_SUCCESS" ++} ++ ++podman_start() ++{ ++ local cid ++ local rc ++ local etcd_pod_wait_timeout_sec=$((10 * 60)) ++ local etcd_pod_poll_interval_sec=10 ++ local etcd_pod_poll_retries=$((etcd_pod_wait_timeout_sec/etcd_pod_poll_interval_sec)) ++ local pod_was_running=false ++ ++ ocf_log notice "podman-etcd start" ++ attribute_node_ip update ++ attribute_node_cluster_id update ++ attribute_node_revision update ++ ++ # ensure the etcd pod is not running before starting the container ++ ocf_log info "ensure etcd pod is not running (retries: $etcd_pod_poll_retries, interval: $etcd_pod_poll_interval_sec)" ++ for try in $(seq $etcd_pod_poll_retries); do ++ if ! etcd_pod_container_exists; then ++ break ++ fi ++ ocf_log info "etcd pod running: retry in $etcd_pod_poll_interval_sec seconds." ++ pod_was_running=true ++ sleep $etcd_pod_poll_interval_sec ++ done ++ if etcd_pod_container_exists; then ++ ocf_exit_reason "etcd pod is still running after $etcd_pod_wait_timeout_sec seconds." ++ return $OCF_ERR_GENERIC ++ fi ++ ++ if ! ensure_pod_manifest_exists; then ++ ocf_exit_reason "could not find etcd pod manifest ($OCF_RESKEY_pod_manifest)" ++ return "$OCF_ERR_GENERIC" ++ fi ++ ++ # force-new-cluster property is a runtime-scoped flag that instructs the agent to force a new cluster-of-1. ++ # Since this attribute is configured with a reboot-lifetime, it is automatically cleared when the machine reboots. ++ # If the agent detects during its start that this property is set, it indicates that the flag was explicitly set ++ # during the current node boot session, implying a deliberate request to recover the cluster. 
++ if ocf_is_true "$pod_was_running"; then ++ ocf_log info "static pod was running: start normally" ++ else ++ if is_force_new_cluster; then ++ ocf_log notice "$NODENAME marked to force-new-cluster" ++ else ++ # When the local agent starts, we can infer the cluster state by counting ++ # how many agents are starting or already active: ++ # - 1 active agent: it's the peer (we are just starting) ++ # - 0 active agents, 1 starting: we are starting; the peer is not starting ++ # - 0 active agents, 2 starting: both agents are starting simultaneously ++ local active_resources_count ++ active_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_active_resource" | wc -w) ++ case "$active_resources_count" in ++ 1) ++ if [ "$(attribute_learner_node get)" = "$(get_peer_node_name)" ]; then ++ ocf_log info "peer active but in learner mode: start normally" ++ else ++ ocf_log info "peer is active standalone: joining as learner" ++ JOIN_AS_LEARNER=true ++ fi ++ ;; ++ 0) ++ # we need to compare the revisions in any of the following branches ++ # so call the function only once here ++ if ! revision_compare_result=$(compare_revision); then ++ ocf_log err "could not compare revisions, error code: $?" ++ return "$OCF_ERR_GENERIC" ++ fi ++ ++ # count how many agents are starting now ++ local start_resources_count ++ start_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_start_resource" | wc -w) ++ ++ case "$start_resources_count" in ++ 1) ++ ocf_log debug "peer not starting: ensure we can start a new cluster" ++ if [ "$revision_compare_result" != "older" ]; then ++ # If our revision is the same as or newer than the peer's last saved ++ # revision, and the peer agent isn't currently starting, we can ++ # restore e-quorum by forcing a new cluster. ++ set_force_new_cluster ++ else ++ ocf_log err "local revision is older and peer is not starting: cannot start" ++ ocf_exit_reason "local revision is older and peer is not starting: cannot start" ++ return "$OCF_ERR_GENERIC" ++ fi ++ ;; ++ 2) ++ ocf_log info "peer starting" ++ if [ "$revision_compare_result" = "newer" ]; then ++ set_force_new_cluster ++ elif [ "$revision_compare_result" = "older" ]; then ++ ocf_log info "$NODENAME shall join as learner" ++ JOIN_AS_LEARNER=true ++ else ++ if [ "$(attribute_node_cluster_id get)" = "$(attribute_node_cluster_id_peer)" ]; then ++ ocf_log info "same cluster_id and revision: start normal" ++ else ++ ocf_exit_reason "same revision but different cluster id" ++ return "$OCF_ERR_GENERIC" ++ fi ++ fi ++ ;; ++ *) ++ ocf_log err "Unexpected start resource count: $start_resources_count" ++ podman_notify ++ return "$OCF_ERR_GENERIC" ++ ;; ++ esac ++ ;; ++ *) ++ ocf_log err "Unexpected active resource count: $active_resources_count" ++ podman_notify ++ return "$OCF_ERR_GENERIC" ++ ;; ++ esac ++ fi ++ fi ++ ++ podman_create_mounts ++ local run_opts="-d --name=${CONTAINER}" ++ # check to see if the container has already started ++ podman_simple_status ++ if [ $? -eq $OCF_SUCCESS ]; then ++ return "$OCF_SUCCESS" ++ fi ++ ++ if ocf_is_true "$JOIN_AS_LEARNER"; then ++ local wait_timeout_sec=$((10*60)) ++ local poll_interval_sec=5 ++ local retries=$(( wait_timeout_sec / poll_interval_sec )) ++ ++ ocf_log info "ensure the leader node added $NODENAME as learner member before continuing (timeout: $wait_timeout_sec seconds)" ++ for try in $(seq $retries); do ++ learner_node=$(attribute_learner_node get) ++ if [ "$NODENAME" != "$learner_node" ]; then ++ ocf_log info "$learner_node is not in the member list yet. 
Retry in $poll_interval_sec seconds." ++ sleep $poll_interval_sec ++ continue ++ fi ++ ocf_log info "learner node $learner_node in the member list" ++ break ++ done ++ if [ "$NODENAME" != "$(attribute_learner_node get)" ]; then ++ ocf_log err "wait for $NODENAME to be in the member list timed out" ++ return "$OCF_ERR_GENERIC" ++ fi ++ ++ archive_data_folder ++ fi ++ ++ prepare_env ++ ++ # add etcd-specific opts ++ run_opts="$run_opts \ ++ --network=host \ ++ -v /etc/kubernetes/static-pod-resources/etcd-certs:/etc/kubernetes/static-pod-certs \ ++ -v /var/lib/etcd:/var/lib/etcd \ ++ --env ALL_ETCD_ENDPOINTS=$ALL_ETCD_ENDPOINTS \ ++ --env ETCD_CIPHER_SUITES=$ETCD_CIPHER_SUITES \ ++ --env ETCD_DATA_DIR=$ETCD_DATA_DIR \ ++ --env ETCD_ELECTION_TIMEOUT=$ETCD_ELECTION_TIMEOUT \ ++ --env ETCD_ENABLE_PPROF=$ETCD_ENABLE_PPROF \ ++ --env ETCD_EXPERIMENTAL_MAX_LEARNERS=$ETCD_EXPERIMENTAL_MAX_LEARNERS \ ++ --env ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION=$ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION \ ++ --env ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL=$ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL \ ++ --env ETCD_HEARTBEAT_INTERVAL=$ETCD_HEARTBEAT_INTERVAL \ ++ --env ETCD_INITIAL_CLUSTER=$ETCD_INITIAL_CLUSTER \ ++ --env ETCD_INITIAL_CLUSTER_STATE=$ETCD_INITIAL_CLUSTER_STATE \ ++ --env ETCD_NAME=$NODENAME \ ++ --env ETCD_QUOTA_BACKEND_BYTES=$ETCD_QUOTA_BACKEND_BYTES \ ++ --env ETCD_SOCKET_REUSE_ADDRESS=$ETCD_SOCKET_REUSE_ADDRESS \ ++ --env ETCDCTL_API=$ETCDCTL_API \ ++ --env ETCDCTL_CACERT=$SERVER_CACERT \ ++ --env ETCDCTL_CERT=$ETCD_PEER_CERT \ ++ --env ETCDCTL_KEY=$ETCD_PEER_KEY \ ++ --authfile=$OCF_RESKEY_authfile \ ++ --security-opt label=disable" ++ if [ -n "$OCF_RESKEY_run_opts" ]; then ++ run_opts="$run_opts $OCF_RESKEY_run_opts" ++ fi ++ ++ OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd --logger=zap \ ++ --log-level=info \ ++ --experimental-initial-corrupt-check=true \ ++ --snapshot-count=10000 \ ++ --initial-advertise-peer-urls=https://${NODEIP}:2380 \ ++ --cert-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.crt \ ++ --key-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.key \ ++ --trusted-ca-file=$SERVER_CACERT \ ++ --client-cert-auth=true \ ++ --peer-cert-file=$ETCD_PEER_CERT \ ++ --peer-key-file=$ETCD_PEER_KEY \ ++ --peer-trusted-ca-file=$SERVER_CACERT \ ++ --peer-client-cert-auth=true \ ++ --advertise-client-urls=https://${NODEIP}:2379 \ ++ --listen-client-urls=https://${LISTEN_CLIENT_URLS}:2379,unixs://${NODEIP}:0 \ ++ --listen-peer-urls=https://${LISTEN_PEER_URLS}:2380 \ ++ --metrics=extensive \ ++ --listen-metrics-urls=https://${LISTEN_METRICS_URLS}:9978" ++ if [ -n "$OCF_RESKEY_run_cmd_opts" ]; then ++ OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd $OCF_RESKEY_run_cmd_opts" ++ fi ++ ++ if is_force_new_cluster; then ++ OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd --force-new-cluster" ++ fi ++ ++ if [ "$OCF_RESKEY_image" = "$OCF_RESKEY_image_default" ]; then ++ # no container image provided via input parameters. Read it from the pod manifest. ++ OCF_RESKEY_image=$(jq -r '.spec.containers[] | select( .name=="etcd").image' "$OCF_RESKEY_pod_manifest") ++ ocf_log info "using container image ($OCF_RESKEY_image) from Pod manifest ($OCF_RESKEY_pod_manifest)" ++ else ++ # use the container image provided as input parameter ++ ocf_log info "using container image ($OCF_RESKEY_image) via input parameters" ++ fi ++ ++ if [ $REQUIRE_IMAGE_PULL -eq 1 ]; then ++ ocf_log notice "Beginning pull of image, ${OCF_RESKEY_image}" ++ if ! 
podman pull --authfile="$OCF_RESKEY_authfile" "${OCF_RESKEY_image}"; then
++ ocf_exit_reason "failed to pull image ${OCF_RESKEY_image}"
++ return $OCF_ERR_GENERIC
++ fi
++ else
++ ocf_log notice "Pull image not required, ${OCF_RESKEY_image}"
++ fi
++
++ if ocf_is_true "$OCF_RESKEY_reuse" && container_exists; then
++ ocf_log info "starting existing container $CONTAINER."
++ ocf_run podman start "$CONTAINER"
++ else
++ # make sure any previous container matching our container name is cleaned up first.
++ # we already know at this point it wouldn't be running
++ remove_container
++ run_new_container "$run_opts" "$OCF_RESKEY_image" "$OCF_RESKEY_run_cmd"
++ if [ $? -eq 125 ]; then
++ return $OCF_ERR_GENERIC
++ fi
++ fi
++ rc=$?
++
++ # if the container was stopped or didn't exist before, systemd
++ # removed the libpod* scopes. So always try to recreate the drop-ins
++ if [ $rc -eq 0 ] && ocf_is_true "$OCF_RESKEY_drop_in_dependency"; then
++ cid=$(podman_container_id)
++ create_transient_drop_in_dependency "$cid"
++ rc=$?
++ fi
++
++ if [ $rc -ne 0 ]; then
++ ocf_exit_reason "podman failed to launch container (error code: $rc)"
++ return $OCF_ERR_GENERIC
++ fi
++
++ # wait for monitor to pass before declaring that the container is started
++ while true; do
++ podman_simple_status
++ if [ $? -ne $OCF_SUCCESS ]; then
++ ocf_exit_reason "Newly created podman container exited after start"
++ ocf_run podman logs --tail 20 "${CONTAINER}"
++ return $OCF_ERR_GENERIC
++ fi
++
++ monitor_cmd_exec
++ if [ $? -eq $OCF_SUCCESS ]; then
++ ocf_log notice "Container $CONTAINER started successfully"
++ if is_force_new_cluster; then
++ clear_force_new_cluster
++
++ local peer_node_name
++ local peer_node_ip
++ peer_node_name="$(get_peer_node_name)"
++ peer_node_ip="$(attribute_node_ip_peer)"
++ if [ -n "$peer_node_name" ] && [ -n "$peer_node_ip" ]; then
++ add_member_as_learner "$peer_node_name" "$peer_node_ip"
++ else
++ ocf_log err "could not add peer as learner (peer node name: ${peer_node_name:-unknown}, peer ip: ${peer_node_ip:-unknown})"
++ fi
++ fi
++ return $OCF_SUCCESS
++ fi
++
++ ocf_exit_reason "waiting on monitor_cmd to pass after start"
++ sleep 1
++ done
++}
++
++podman_stop()
++{
++ local timeout=60
++ local rc
++ podman_simple_status
++ if [ $? -eq $OCF_NOT_RUNNING ]; then
++ remove_container
++ ocf_log info "etcd container not running: skipping member list removal"
++ return $OCF_SUCCESS
++ fi
++
++ attribute_node_revision update
++ attribute_node_cluster_id update
++
++ if ! member_id=$(attribute_node_member_id get); then
++ ocf_log err "error leaving members list: could not get member-id"
++ else
++ # TODO: is it worth/possible to check the current status instead of relying on cached attributes?
++ if is_standalone; then
++ ocf_log info "last member. Not leaving the member list"
++ else
++ ocf_log info "leaving members list as member with ID $member_id"
++ endpoint="https://$(attribute_node_ip get):2379"
++ ocf_run podman exec "$CONTAINER" etcdctl member remove "$member_id" --endpoints="$endpoint"
++ rc=$?
++ if [ $rc -ne 0 ]; then
++ ocf_log err "error leaving members list, error code: $rc"
++ fi
++ fi
++ fi
++ attribute_node_member_id clear
++
++ if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
++ timeout=$(((OCF_RESKEY_CRM_meta_timeout/1000) -10 ))
++ if [ $timeout -lt 10 ]; then
++ timeout=10
++ fi
++ fi
++
++ if ocf_is_true "$OCF_RESKEY_force_kill"; then
++ ocf_run podman kill "$CONTAINER"
++ rc=$?
++ else
++ ocf_log debug "waiting $timeout second[s] before killing container"
++ ocf_run podman stop -t="$timeout" "$CONTAINER"
++ rc=$?
++ # on stop, systemd will automatically delete any transient
++ # drop-in conf that has been created earlier
++ fi
++
++ if [ $rc -ne 0 ]; then
++ # If the stop failed, it could be because the controlling conmon
++ # process died unexpectedly. If so, a generic error code is returned
++ # but the associated container exit code is -1. If that's the case,
++ # assume there's no failure and continue with the rm as usual.
++ if [ $rc -eq 125 ] && \
++ podman inspect --format '{{.State.Status}}:{{.State.ExitCode}}' "$CONTAINER" | grep -Eq '^(exited|stopped):-1$'; then
++ ocf_log err "Container ${CONTAINER} had an unexpected stop outcome. Trying to remove it anyway."
++ else
++ ocf_exit_reason "Failed to stop container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}."
++ return $OCF_ERR_GENERIC
++ fi
++ fi
++
++ if ! remove_container; then
++ ocf_exit_reason "Failed to remove stopped container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}."
++ return $OCF_ERR_GENERIC
++ fi
++
++ return $OCF_SUCCESS
++}
++
++image_exists()
++{
++ if [ "$OCF_RESKEY_image" = "$OCF_RESKEY_image_default" ]; then
++ # the actual container image has not been determined yet, neither by
++ # the user via OCF_RESKEY nor by reading the Pod manifest
++ return 0
++ fi
++ if podman image exists "${OCF_RESKEY_image}"; then
++ # image found
++ return 0
++ fi
++
++ if ocf_is_true "$OCF_RESKEY_allow_pull"; then
++ REQUIRE_IMAGE_PULL=1
++ ocf_log notice "Image (${OCF_RESKEY_image}) does not exist locally but will be pulled during start"
++ return 0
++ fi
++ # image not found.
++ return 1
++}
++
++podman_validate()
++{
++ check_binary curl
++ check_binary crictl
++ check_binary oc
++ check_binary podman
++ check_binary jq
++
++ if [ -z "$OCF_RESKEY_node_ip_map" ]; then
++ ocf_exit_reason "'node_ip_map' option is required"
++ exit $OCF_ERR_CONFIGURED
++ fi
++
++ if [ -z "$OCF_RESKEY_pod_manifest" ]; then
++ ocf_exit_reason "'pod_manifest' option is required"
++ exit $OCF_ERR_CONFIGURED
++ fi
++
++ if [ -z "$OCF_RESKEY_image" ]; then
++ ocf_exit_reason "'image' option is required"
++ exit $OCF_ERR_CONFIGURED
++ fi
++
++ if ! image_exists; then
++ ocf_exit_reason "base image, ${OCF_RESKEY_image}, could not be found."
++ exit $OCF_ERR_CONFIGURED
++ fi
++
++ return $OCF_SUCCESS
++}
++
++podman_notify()
++{
++ ocf_log info "notify: type=${OCF_RESKEY_CRM_meta_notify_type}, operation=${OCF_RESKEY_CRM_meta_notify_operation}, nodes { active=[${OCF_RESKEY_CRM_meta_notify_active_uname}], start=[${OCF_RESKEY_CRM_meta_notify_start_uname}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_uname}] }, resources { active=[${OCF_RESKEY_CRM_meta_notify_active_resource}], start=[${OCF_RESKEY_CRM_meta_notify_start_resource}], stop=[${OCF_RESKEY_CRM_meta_notify_stop_resource}] }"
++}
++
++# TODO:
++# When a user starts multiple globally-unique clones on a node, they cannot
++# specify multiple name parameters.
++# When a user specifies reuse, the resource agent cannot associate multiple
++# clones with a single container.
++
++if ocf_is_true "$OCF_RESKEY_CRM_meta_globally_unique"; then
++ if [ -n "$OCF_RESKEY_name" ]; then
++ if [ -n "$OCF_RESKEY_CRM_meta_clone_node_max" ] && [ "$OCF_RESKEY_CRM_meta_clone_node_max" -ne 1 ]
++ then
++ ocf_exit_reason "Cannot run multiple clones with the same name parameter."
++ exit $OCF_ERR_CONFIGURED
++ fi
++ if [ -n "$OCF_RESKEY_CRM_meta_master_node_max" ] && [ "$OCF_RESKEY_CRM_meta_master_node_max" -ne 1 ]
++ then
++ ocf_exit_reason "Cannot run multiple masters with the same name parameter."
++ exit $OCF_ERR_CONFIGURED
++ fi
++ fi
++ : ${OCF_RESKEY_name=$(echo ${OCF_RESOURCE_INSTANCE} | tr ':' '-')}
++else
++ : ${OCF_RESKEY_name=${OCF_RESOURCE_INSTANCE}}
++fi
++
++CONTAINER=$OCF_RESKEY_name
++
++# Note: we currently monitor podman containers with the "podman exec"
++# command, so make sure that invocation is always valid by enforcing the
++# exec command to be non-empty
++: ${OCF_RESKEY_monitor_cmd:=/bin/true}
++
++# When OCF_RESKEY_drop_in_dependency is not populated, we
++# look at another file-based way of enabling the option.
++# Otherwise, consider it disabled.
++if [ -z "$OCF_RESKEY_drop_in_dependency" ]; then
++ if [ -f "/etc/sysconfig/podman_drop_in" ] || \
++ [ -f "/etc/default/podman_drop_in" ]; then
++ OCF_RESKEY_drop_in_dependency=yes
++ fi
++fi
++
++
++case $__OCF_ACTION in
++meta-data) meta_data
++ exit $OCF_SUCCESS;;
++usage|help) podman_usage
++ exit $OCF_SUCCESS
++ ;;
++esac
++
++NODENAME=$(ocf_local_nodename)
++JOIN_AS_LEARNER=false
++
++case $__OCF_ACTION in
++start)
++ podman_validate || exit $?
++ podman_start;;
++stop) podman_stop;;
++monitor) podman_monitor;;
++notify) podman_notify;;
++validate-all) podman_validate;;
++*) podman_usage
++ exit $OCF_ERR_UNIMPLEMENTED
++ ;;
++esac
++rc=$?
++ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
++exit $rc
diff --git a/SOURCES/RHEL-88429-2-podman-etcd-remove-unused-actions-from-metadata.patch b/SOURCES/RHEL-88429-2-podman-etcd-remove-unused-actions-from-metadata.patch
new file mode 100644
index 0000000..99d3744
--- /dev/null
+++ b/SOURCES/RHEL-88429-2-podman-etcd-remove-unused-actions-from-metadata.patch
@@ -0,0 +1,31 @@
+From 6a3249aae260c081ccbcfd09444d5d85ebc4e3b3 Mon Sep 17 00:00:00 2001
+From: Oyvind Albrigtsen
+Date: Mon, 28 Apr 2025 15:48:29 +0200
+Subject: [PATCH] podman-etcd: remove unused actions from metadata
+
+---
+ heartbeat/podman-etcd | 4 +---
+ 1 file changed, 1 insertion(+), 3 deletions(-)
+
+diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
+index 514dd2e5b..3a2323260 100755
+--- a/heartbeat/podman-etcd
++++ b/heartbeat/podman-etcd
+@@ -236,8 +236,6 @@ to stop the container before pacemaker.
+ 
+ 
+ 
+-
+-
+ 
+ 
+ 
+@@ -251,7 +249,7 @@ REQUIRE_IMAGE_PULL=0
+ podman_usage()
+ {
+ cat <
+Date: Tue, 20 May 2025 09:34:03 +0200
+Subject: [PATCH] podman-etcd: fix listen-peer-urls binding (#2049)
+
+This change ensures learner etcd listens on all interfaces for peer connections, resolving accessibility issues.
+ +Fix: OCPBUGS-56447 +--- + heartbeat/podman-etcd | 12 +++--------- + 1 file changed, 3 insertions(+), 9 deletions(-) + +diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd +index 3a2323260..6762112ec 100755 +--- a/heartbeat/podman-etcd ++++ b/heartbeat/podman-etcd +@@ -436,15 +436,9 @@ prepare_env() { + ETCD_PEER_CERT=$(get_env_from_manifest "ETCDCTL_CERT") + ETCD_PEER_KEY=$(get_env_from_manifest "ETCDCTL_KEY") + +- if is_learner; then +- LISTEN_CLIENT_URLS="$NODEIP" +- LISTEN_PEER_URLS="$NODEIP" +- LISTEN_METRICS_URLS="$NODEIP" +- else +- LISTEN_CLIENT_URLS="0.0.0.0" +- LISTEN_PEER_URLS="0.0.0.0" +- LISTEN_METRICS_URLS="0.0.0.0" +- fi ++ LISTEN_CLIENT_URLS="0.0.0.0" ++ LISTEN_PEER_URLS="0.0.0.0" ++ LISTEN_METRICS_URLS="0.0.0.0" + } + + archive_data_folder() diff --git a/SOURCES/ha-cloud-support-aws.patch b/SOURCES/ha-cloud-support-aws.patch deleted file mode 100644 index 2f6f4db..0000000 --- a/SOURCES/ha-cloud-support-aws.patch +++ /dev/null @@ -1,49 +0,0 @@ -diff --color -uNr a/heartbeat/awseip b/heartbeat/awseip ---- a/heartbeat/awseip 2020-12-03 14:31:17.000000000 +0100 -+++ b/heartbeat/awseip 2021-02-15 16:47:36.624610378 +0100 -@@ -43,7 +43,7 @@ - # - # Defaults - # --OCF_RESKEY_awscli_default="/usr/bin/aws" -+OCF_RESKEY_awscli_default="/usr/lib/fence-agents/support/awscli/bin/aws" - OCF_RESKEY_auth_type_default="key" - OCF_RESKEY_profile_default="default" - OCF_RESKEY_region_default="" - OCF_RESKEY_api_delay_default="3" -diff --color -uNr a/heartbeat/awsvip b/heartbeat/awsvip ---- a/heartbeat/awsvip 2020-12-03 14:31:17.000000000 +0100 -+++ b/heartbeat/awsvip 2021-02-15 16:47:48.960632484 +0100 -@@ -42,7 +42,7 @@ - # - # Defaults - # --OCF_RESKEY_awscli_default="/usr/bin/aws" -+OCF_RESKEY_awscli_default="/usr/lib/fence-agents/support/awscli/bin/aws" - OCF_RESKEY_auth_type_default="key" - OCF_RESKEY_profile_default="default" - OCF_RESKEY_region_default="" -diff --color -uNr a/heartbeat/aws-vpc-move-ip b/heartbeat/aws-vpc-move-ip ---- a/heartbeat/aws-vpc-move-ip 2020-12-03 14:31:17.000000000 +0100 -+++ b/heartbeat/aws-vpc-move-ip 2021-02-15 16:47:55.484644118 +0100 -@@ -35,7 +35,7 @@ - . ${OCF_FUNCTIONS_DIR}/aws.sh - - # Defaults --OCF_RESKEY_awscli_default="/usr/bin/aws" -+OCF_RESKEY_awscli_default="/usr/lib/fence-agents/support/awscli/bin/aws" - OCF_RESKEY_auth_type_default="key" - OCF_RESKEY_profile_default="default" - OCF_RESKEY_region_default="" -diff --color -uNr a/heartbeat/aws-vpc-route53.in b/heartbeat/aws-vpc-route53.in ---- a/heartbeat/aws-vpc-route53.in 2020-12-03 14:31:17.000000000 +0100 -+++ b/heartbeat/aws-vpc-route53.in 2021-02-15 16:47:59.808651828 +0100 -@@ -45,7 +45,7 @@ - . 
${OCF_FUNCTIONS_DIR}/aws.sh - - # Defaults --OCF_RESKEY_awscli_default="/usr/bin/aws" -+OCF_RESKEY_awscli_default="/usr/lib/fence-agents/support/awscli/bin/aws" - OCF_RESKEY_auth_type_default="key" - OCF_RESKEY_profile_default="default" - OCF_RESKEY_region_default="" diff --git a/SOURCES/ha-cloud-support-ibm.patch b/SOURCES/ha-cloud-support-ibm.patch new file mode 100644 index 0000000..7b5ee4a --- /dev/null +++ b/SOURCES/ha-cloud-support-ibm.patch @@ -0,0 +1,19 @@ +--- a/heartbeat/powervs-subnet.in 2024-10-18 10:59:30.418142172 +0200 ++++ b/heartbeat/powervs-subnet.in 2024-10-18 12:30:15.954883160 +0200 +@@ -33,9 +33,13 @@ + import textwrap + import time + +-import requests +-import requests.adapters +-import urllib3.util ++try: ++ sys.path.insert(0, '/usr/lib/fence-agents/support/ibm') ++ import requests ++ import requests.adapters ++ import urllib3.util ++except ImportError: ++ pass + + OCF_FUNCTIONS_DIR = os.environ.get( + "OCF_FUNCTIONS_DIR", "%s/lib/heartbeat" % os.environ.get("OCF_ROOT") diff --git a/SPECS/resource-agents.spec b/SPECS/resource-agents.spec index 1added9..b08aa8a 100644 --- a/SPECS/resource-agents.spec +++ b/SPECS/resource-agents.spec @@ -45,7 +45,7 @@ Name: resource-agents Summary: Open Source HA Reusable Cluster Resource Scripts Version: 4.10.0 -Release: 64%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}.4 +Release: 71%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}.5 License: GPLv2+ and LGPLv2+ URL: https://github.com/ClusterLabs/resource-agents Source0: %{upstream_prefix}-%{upstream_version}.tar.gz @@ -134,17 +134,30 @@ Patch81: RHEL-40393-Filesystem-2-update-bsd-logic.patch Patch82: RHEL-32829-db2-fix-OCF_SUCESS-typo.patch Patch83: RHEL-43579-galera-mysql-redis-remove-Unpromoted-monitor-action.patch Patch84: RHEL-22715-LVM-activate-fix-false-positive.patch -Patch94: RHEL-66292-1-aws-agents-reuse-imds-token-until-it-expires.patch -Patch95: RHEL-66292-2-aws-agents-reuse-imds-token-improvements.patch +Patch85: RHEL-58038-Filesystem-dont-sleep-no-processes-only-send-force-net-fs-after-kill.patch +Patch86: RHEL-59576-Filesystem-try-umount-first-avoid-arguments-list-too-long.patch +Patch87: RHEL-59172-nfsserver-also-stop-rpc-statd-for-nfsv4_only.patch +Patch88: RHEL-58008-podman-force-remove-container-if-necessary.patch +Patch89: RHEL-61888-ocf-shellfuncs-only-create-update-reload-systemd-drop-in-if-needed.patch +Patch90: RHEL-62200-IPaddr2-improve-fail-logic-check-ip_status-after-adding-IP.patch +Patch91: RHEL-40589-azure-events-az-update-API-versions-add-retry-for-metadata.patch +Patch92: RHEL-58632-azure-events-use-node-name-from-cluster.patch +Patch93: RHEL-42513-powervs-subnet-new-ra.patch +Patch94: RHEL-66292-1-aws-agents-reuse-imds-token-until-it-expires.patch +Patch95: RHEL-66292-2-aws-agents-reuse-imds-token-improvements.patch +Patch96: RHEL-68739-awsvip-add-interface-parameter.patch Patch97: RHEL-69734-1-openstack-cinder-volume-wait-for-volume-to-be-available.patch Patch98: RHEL-69734-2-openstack-cinder-volume-fix-detach-not-working-during-start-action.patch +Patch105: RHEL-79819-portblock-fix-version-detection.patch +Patch106: RHEL-88035-Filesystem-add-support-for-aznfs.patch +Patch107: RHEL-88429-1-podman-etcd-new-ra.patch +Patch108: RHEL-88429-2-podman-etcd-remove-unused-actions-from-metadata.patch +Patch109: RHEL-88429-3-podman-etcd-fix-listen-peer-urls-binding.patch # bundled ha-cloud-support libs -Patch500: ha-cloud-support-aws.patch -Patch501: 
ha-cloud-support-aliyun.patch
-Patch502: ha-cloud-support-gcloud.patch
-
-Patch1000: Patch1000-Podman-Improve-Handling-Of-Stopping-Container-Removal.patch
+Patch500: ha-cloud-support-aliyun.patch
+Patch501: ha-cloud-support-gcloud.patch
+Patch502: ha-cloud-support-ibm.patch
 
 Obsoletes: heartbeat-resources <= %{version}
 Provides: heartbeat-resources = %{version}
@@ -230,7 +243,7 @@ A set of scripts to interface with several services to operate in a
 High Availability environment for both Pacemaker and rgmanager
 service managers.
 
-%ifarch x86_64
+%ifarch x86_64 ppc64le
 %package cloud
 License: GPLv2+ and LGPLv2+
 Summary: Cloud resource agents
@@ -348,18 +361,31 @@ exit 1
 %patch -p1 -P 82
 %patch -p1 -P 83
 %patch -p1 -P 84
+%patch -p1 -P 85
+%patch -p1 -P 86
+%patch -p1 -P 87
+%patch -p1 -P 88
+%patch -p1 -P 89
+%patch -p1 -P 90
+%patch -p1 -P 91
+%patch -p1 -P 92
+%patch -p1 -P 93
 %patch -p1 -P 94
 %patch -p1 -P 95
+%patch -p1 -P 96
 %patch -p1 -P 97
 %patch -p1 -P 98
+%patch -p1 -P 105
+%patch -p1 -P 106
+%patch -p1 -P 107
+%patch -p1 -P 108
+%patch -p1 -P 109
 
 # bundled ha-cloud-support libs
 %patch -p1 -P 500
 %patch -p1 -P 501
 %patch -p1 -P 502
-%patch -p1 -P 1000
-
 chmod 755 heartbeat/nova-compute-wait
 chmod 755 heartbeat/NovaEvacuate
 chmod 755 heartbeat/pgsqlms
@@ -489,6 +515,8 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
 %exclude %{_mandir}/man7/*aliyun-vpc-move-ip*
 %exclude /usr/lib/ocf/resource.d/heartbeat/gcp*
 %exclude %{_mandir}/man7/*gcp*
+%exclude /usr/lib/ocf/resource.d/heartbeat/powervs-*
+%exclude %{_mandir}/man7/*powervs-*
 %exclude /usr/lib/ocf/resource.d/heartbeat/pgsqlms
 %exclude %{_mandir}/man7/*pgsqlms*
 %exclude %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm
@@ -652,8 +680,9 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
 %{_libexecdir}/heartbeat
 %endif
 
-%ifarch x86_64
+%ifarch x86_64 ppc64le
 %files cloud
+%ifarch x86_64
 /usr/lib/ocf/resource.d/heartbeat/aliyun-*
 %{_mandir}/man7/*aliyun-*
 /usr/lib/ocf/resource.d/heartbeat/aws*
 %{_mandir}/man7/*aws*
 /usr/lib/ocf/resource.d/heartbeat/gcp*
 %{_mandir}/man7/*gcp*
 %exclude /usr/lib/ocf/resource.d/heartbeat/gcp-vpc-move-ip
 %exclude %{_mandir}/man7/*gcp-vpc-move-ip*
 %endif
+%ifarch ppc64le
+/usr/lib/ocf/resource.d/heartbeat/powervs-*
+%{_mandir}/man7/*powervs-*
+%endif
+%endif
 
 %files paf
 %doc paf_README.md
@@ -675,14 +709,65 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
 %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm
 
 %changelog
-* Tue May 06 2025 Alan Steinberg - 4.10.0-64.4
+* Mon Apr 28 2025 Oyvind Albrigtsen - 4.10.0-71.5
+- podman-etcd: new resource agent
+- Filesystem: add support for aznfs
+- portblock: fix iptables version detection
+
+ Resolves: RHEL-88429
+ Resolves: RHEL-88035
+ Resolves: RHEL-79819
+
+* Fri Jan 10 2025 Oyvind Albrigtsen - 4.10.0-71
+- openstack-cinder-volume: wait for volume to be available
+
+ Resolves: RHEL-69734
+
+* Wed Nov 27 2024 Oyvind Albrigtsen - 4.10.0-69
 - AWS agents: reuse IMDS token until it expires
+- awsvip: add interface parameter
 
  Resolves: RHEL-66292
+ Resolves: RHEL-68739
 
-* Wed Feb 05 2025 Pooja Senthil Kumar - 4.10.0-64.3
-- [RHOSP17.1] Podman resource failed to stop after common process failure.
[rhel-9.5.z] (JIRA:RHEL-61165)
-- The "openstack-cinder-volume" resource agent's start operation can report "started" before it's actually ready to provide storage [rhel-9.5.z] (JIRA:RHEL-72960)
+* Wed Oct 23 2024 Oyvind Albrigtsen - 4.10.0-68
+- powervs-subnet: new resource agent
+
+ Resolves: RHEL-42513
+
+* Mon Oct 14 2024 Oyvind Albrigtsen - 4.10.0-67
+- ocf-shellfuncs: only create/update and reload systemd drop-in if
+ needed
+- IPaddr2: improve fail logic and check ip_status after adding IP
+- azure-events-az: update API versions, and add retry functionality
+ for metadata requests
+- azure-events*: use node name from cluster instead of hostname to
+ avoid failing if they're not the same
+
+ Resolves: RHEL-61888
+ Resolves: RHEL-62200
+ Resolves: RHEL-40589
+ Resolves: RHEL-58632
+
+* Wed Oct 2 2024 Oyvind Albrigtsen - 4.10.0-66
+- nfsserver: also stop rpc-statd for nfsv4_only to avoid stop failing
+ in some cases
+- podman: force-remove containers in stopping state if necessary
+
+ Resolves: RHEL-59172
+ Resolves: RHEL-58008
+
+* Wed Sep 25 2024 Oyvind Albrigtsen - 4.10.0-65
+- Filesystem: don't sleep during stop-action when there are no
+ processes to kill, and only use force argument for network
+ filesystems after sending kill_signals
+- Filesystem: try umount first during stop-action, and avoid potential
+ "Argument list too long" for force_unmount=safe
+- AWS agents: use awscli2
+
+ Resolves: RHEL-58038
+ Resolves: RHEL-59576
+ Resolves: RHEL-46233
 
 * Thu Aug 29 2024 Oyvind Albrigtsen - 4.10.0-64
 - IPsrcaddr: add IPv6 support