Compare commits
No commits in common. "c8" and "c9-beta" have entirely different histories.
13  .gitignore  vendored
@@ -1,12 +1 @@
-SOURCES/ClusterLabs-resource-agents-55a4e2c9.tar.gz
-SOURCES/aliyun-cli-2.1.10.tar.gz
-SOURCES/aliyun-python-sdk-core-2.13.1.tar.gz
-SOURCES/aliyun-python-sdk-ecs-4.9.3.tar.gz
-SOURCES/aliyun-python-sdk-vpc-3.0.2.tar.gz
-SOURCES/colorama-0.3.3.tar.gz
-SOURCES/google-cloud-sdk-360.0.0-linux-x86_64.tar.gz
-SOURCES/httplib2-0.20.4.tar.gz
-SOURCES/pycryptodome-3.20.0.tar.gz
-SOURCES/pyparsing-2.4.7-py2.py3-none-any.whl
-SOURCES/pyroute2-0.4.13.tar.gz
-SOURCES/urllib3-1.26.18.tar.gz
+SOURCES/ClusterLabs-resource-agents-fd0720f7.tar.gz
13  .resource-agents.metadata
@@ -1,12 +1 @@
-dfc65f4cac3f95026b2f5674019814a527333004 SOURCES/ClusterLabs-resource-agents-55a4e2c9.tar.gz
-306e131d8908ca794276bfe3a0b55ccc3bbd482f SOURCES/aliyun-cli-2.1.10.tar.gz
-0a56f6d9ed2014a363486d33b63eca094379be06 SOURCES/aliyun-python-sdk-core-2.13.1.tar.gz
-c2a98b9a1562d223a76514f05028488ca000c395 SOURCES/aliyun-python-sdk-ecs-4.9.3.tar.gz
-f14647a4d37a9a254c4e711b95a7654fc418e41e SOURCES/aliyun-python-sdk-vpc-3.0.2.tar.gz
-0fe5bd8bca54dd71223778a1e0bcca9af324abb1 SOURCES/colorama-0.3.3.tar.gz
-81f039cf075e9c8b70d5af99c189296a9e031de3 SOURCES/google-cloud-sdk-360.0.0-linux-x86_64.tar.gz
-7caf4412d9473bf17352316249a8133fa70b7e37 SOURCES/httplib2-0.20.4.tar.gz
-c55d177e9484d974c95078d4ae945f89ba2c7251 SOURCES/pycryptodome-3.20.0.tar.gz
-c8307f47e3b75a2d02af72982a2dfefa3f56e407 SOURCES/pyparsing-2.4.7-py2.py3-none-any.whl
-147149db11104c06d405fd077dcd2aa1c345f109 SOURCES/pyroute2-0.4.13.tar.gz
-84e2852d8da1655373f7ce5e7d5d3e256b62b4e4 SOURCES/urllib3-1.26.18.tar.gz
+3b517ecdbe2103df77813050e5c998e102c5de7e SOURCES/ClusterLabs-resource-agents-fd0720f7.tar.gz
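Each entry above pairs a SHA-1 digest with a tarball under SOURCES/; dist-git tooling uses these to fetch the sources from the lookaside cache. A rough verification sketch for a local checkout (the rewrite to a two-space separator is an assumption, since GNU sha1sum -c expects that layout):

sed 's| SOURCES/|  SOURCES/|' .resource-agents.metadata | sha1sum -c -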
@@ -1,25 +0,0 @@
diff -uNr a/bundled/gcp/google-cloud-sdk/lib/googlecloudsdk/calliope/usage_text.py b/bundled/gcp/google-cloud-sdk/lib/googlecloudsdk/calliope/usage_text.py
--- a/bundled/gcp/google-cloud-sdk/lib/googlecloudsdk/calliope/usage_text.py 1980-01-01 09:00:00.000000000 +0100
+++ b/bundled/gcp/google-cloud-sdk/lib/googlecloudsdk/calliope/usage_text.py 2019-04-04 11:59:47.592768577 +0200
@@ -900,6 +900,9 @@
return """\
For detailed information on this command and its flags, run:
{command_path} --help
+
+WARNING: {command_path} is only supported for "{command_path} init" and for use
+with the agents in resource-agents.
""".format(command_path=' '.join(command.GetPath()))


diff -uNr a/bundled/gcp/google-cloud-sdk/lib/googlecloudsdk/gcloud_main.py b/bundled/gcp/google-cloud-sdk/lib/googlecloudsdk/gcloud_main.py
--- a/bundled/gcp/google-cloud-sdk/lib/googlecloudsdk/gcloud_main.py 1980-01-01 09:00:00.000000000 +0100
+++ b/bundled/gcp/google-cloud-sdk/lib/googlecloudsdk/gcloud_main.py 2019-04-04 12:00:23.991142694 +0200
@@ -84,7 +84,7 @@

pkg_root = os.path.dirname(os.path.dirname(surface.__file__))
loader = cli.CLILoader(
- name='gcloud',
+ name='gcloud-ra',
command_root_directory=os.path.join(pkg_root, 'surface'),
allow_non_existing_modules=True,
version_func=VersionFunc,
@@ -1,45 +0,0 @@
diff --color -uNr a/heartbeat/gcp-pd-move.in b/heartbeat/gcp-pd-move.in
--- a/heartbeat/gcp-pd-move.in 2024-07-22 10:59:42.170483160 +0200
+++ b/heartbeat/gcp-pd-move.in 2024-07-22 11:01:51.455543850 +0200
@@ -32,6 +32,7 @@
from ocf import logger

try:
+ sys.path.insert(0, '/usr/lib/resource-agents/bundled/gcp')
import googleapiclient.discovery
except ImportError:
pass
diff --color -uNr a/heartbeat/gcp-vpc-move-ip.in b/heartbeat/gcp-vpc-move-ip.in
--- a/heartbeat/gcp-vpc-move-ip.in 2024-07-22 10:59:42.170483160 +0200
+++ b/heartbeat/gcp-vpc-move-ip.in 2024-07-22 11:01:18.010752081 +0200
@@ -36,7 +36,7 @@
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

# Defaults
-OCF_RESKEY_gcloud_default="/usr/bin/gcloud"
+OCF_RESKEY_gcloud_default="/usr/bin/gcloud-ra"
OCF_RESKEY_configuration_default="default"
OCF_RESKEY_vpc_network_default="default"
OCF_RESKEY_interface_default="eth0"
diff --color -uNr a/heartbeat/gcp-vpc-move-route.in b/heartbeat/gcp-vpc-move-route.in
--- a/heartbeat/gcp-vpc-move-route.in 2024-07-22 10:59:42.170483160 +0200
+++ b/heartbeat/gcp-vpc-move-route.in 2024-07-22 11:01:18.011752105 +0200
@@ -45,6 +45,7 @@
from ocf import *

try:
+ sys.path.insert(0, '/usr/lib/resource-agents/bundled/gcp')
import googleapiclient.discovery
import pyroute2
try:
diff --color -uNr a/heartbeat/gcp-vpc-move-vip.in b/heartbeat/gcp-vpc-move-vip.in
--- a/heartbeat/gcp-vpc-move-vip.in 2024-07-22 10:59:42.170483160 +0200
+++ b/heartbeat/gcp-vpc-move-vip.in 2024-07-22 11:01:18.012752128 +0200
@@ -29,6 +29,7 @@
from ocf import *

try:
+ sys.path.insert(0, '/usr/lib/resource-agents/bundled/gcp')
import googleapiclient.discovery
try:
from google.oauth2.service_account import Credentials as ServiceAccountCredentials
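The line this removed patch added to each agent simply makes the bundled google-cloud-sdk tree win import resolution. A standalone Python sketch of the effect, with the path taken from the patch (it only runs where that bundled tree is installed):

import sys

# position 0 is searched first, so the bundled copy shadows any
# system-wide googleapiclient installation
sys.path.insert(0, '/usr/lib/resource-agents/bundled/gcp')

import googleapiclient.discovery
print(googleapiclient.discovery.__file__)  # resolves under the bundled tree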
@@ -1,129 +0,0 @@
diff -uNr a/bundled/gcp/google-cloud-sdk/lib/third_party/oauth2client/_pure_python_crypt.py b/bundled/gcp/google-cloud-sdk/lib/third_party/oauth2client/_pure_python_crypt.py
--- a/bundled/gcp/google-cloud-sdk/lib/third_party/oauth2client/_pure_python_crypt.py 1980-01-01 09:00:00.000000000 +0100
+++ b/bundled/gcp/google-cloud-sdk/lib/third_party/oauth2client/_pure_python_crypt.py 2019-04-04 11:56:00.292677044 +0200
@@ -19,8 +19,14 @@
certificates.
"""

+from pyasn1.codec.der import decoder
from pyasn1_modules import pem
-import rsa
+from pyasn1_modules.rfc2459 import Certificate
+from pyasn1_modules.rfc5208 import PrivateKeyInfo
+from cryptography.hazmat.primitives import serialization, hashes
+from cryptography.hazmat.primitives.asymmetric import padding
+from cryptography import x509
+from cryptography.hazmat.backends import default_backend
import six

from oauth2client import _helpers
@@ -40,7 +46,7 @@
'-----END RSA PRIVATE KEY-----')
_PKCS8_MARKER = ('-----BEGIN PRIVATE KEY-----',
'-----END PRIVATE KEY-----')
-_PKCS8_SPEC = None
+_PKCS8_SPEC = PrivateKeyInfo()


def _bit_list_to_bytes(bit_list):
@@ -67,7 +73,8 @@
"""

def __init__(self, pubkey):
- self._pubkey = pubkey
+ self._pubkey = serialization.load_pem_public_key(pubkey,
+ backend=default_backend())

def verify(self, message, signature):
"""Verifies a message against a signature.
@@ -84,8 +91,9 @@
"""
message = _helpers._to_bytes(message, encoding='utf-8')
try:
- return rsa.pkcs1.verify(message, signature, self._pubkey)
- except (ValueError, rsa.pkcs1.VerificationError):
+ return self._pubkey.verify(signature, message, padding.PKCS1v15(),
+ hashes.SHA256())
+ except (ValueError, TypeError, InvalidSignature):
return False

@classmethod
@@ -109,19 +117,18 @@
"""
key_pem = _helpers._to_bytes(key_pem)
if is_x509_cert:
- from pyasn1.codec.der import decoder
- from pyasn1_modules import rfc2459
-
- der = rsa.pem.load_pem(key_pem, 'CERTIFICATE')
- asn1_cert, remaining = decoder.decode(der, asn1Spec=rfc2459.Certificate())
+ der = x509.load_pem_x509_certificate(pem_data, default_backend())
+ asn1_cert, remaining = decoder.decode(der, asn1Spec=Certificate())
if remaining != b'':
raise ValueError('Unused bytes', remaining)

cert_info = asn1_cert['tbsCertificate']['subjectPublicKeyInfo']
key_bytes = _bit_list_to_bytes(cert_info['subjectPublicKey'])
- pubkey = rsa.PublicKey.load_pkcs1(key_bytes, 'DER')
+ pubkey = serialization.load_der_public_key(decoded_key,
+ backend=default_backend())
else:
- pubkey = rsa.PublicKey.load_pkcs1(key_pem, 'PEM')
+ pubkey = serialization.load_pem_public_key(decoded_key,
+ backend=default_backend())
return cls(pubkey)


@@ -134,6 +141,8 @@

def __init__(self, pkey):
self._key = pkey
+ self._pubkey = serialization.load_pem_private_key(pkey,
+ backend=default_backend())

def sign(self, message):
"""Signs a message.
@@ -145,7 +154,7 @@
string, The signature of the message for the given key.
"""
message = _helpers._to_bytes(message, encoding='utf-8')
- return rsa.pkcs1.sign(message, self._key, 'SHA-256')
+ return self._key.sign(message, padding.PKCS1v15(), hashes.SHA256())

@classmethod
def from_string(cls, key, password='notasecret'):
@@ -163,27 +172,24 @@
ValueError if the key cannot be parsed as PKCS#1 or PKCS#8 in
PEM format.
"""
- global _PKCS8_SPEC
key = _helpers._from_bytes(key) # pem expects str in Py3
marker_id, key_bytes = pem.readPemBlocksFromFile(
six.StringIO(key), _PKCS1_MARKER, _PKCS8_MARKER)

if marker_id == 0:
- pkey = rsa.key.PrivateKey.load_pkcs1(key_bytes,
- format='DER')
- elif marker_id == 1:
- from pyasn1.codec.der import decoder
- from pyasn1_modules import rfc5208
+ pkey = serialization.load_der_private_key(
+ key_bytes, password=None,
+ backend=default_backend())

- if _PKCS8_SPEC is None:
- _PKCS8_SPEC = rfc5208.PrivateKeyInfo()
+ elif marker_id == 1:
key_info, remaining = decoder.decode(
key_bytes, asn1Spec=_PKCS8_SPEC)
if remaining != b'':
raise ValueError('Unused bytes', remaining)
pkey_info = key_info.getComponentByName('privateKey')
- pkey = rsa.key.PrivateKey.load_pkcs1(pkey_info.asOctets(),
- format='DER')
+ pkey = serialization.load_der_private_key(
+ pkey_info.asOctets(), password=None,
+ backend=default_backend())
else:
raise ValueError('No key could be detected.')
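The removed patch ported oauth2client's pure-Python crypto from the rsa package to cryptography. For orientation, a minimal self-contained sketch of the PKCS#1 v1.5 + SHA-256 sign/verify pattern it switches to (the throwaway key generation is added here only so the snippet runs on its own):

from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.asymmetric import padding, rsa
from cryptography.exceptions import InvalidSignature

# throwaway key, only to make the example end-to-end runnable
private_key = rsa.generate_private_key(public_exponent=65537, key_size=2048)
message = b"payload to authenticate"

# sign and verify exactly as the patched module does
signature = private_key.sign(message, padding.PKCS1v15(), hashes.SHA256())
try:
    private_key.public_key().verify(signature, message,
                                    padding.PKCS1v15(), hashes.SHA256())
    print("signature ok")
except InvalidSignature:
    print("signature mismatch")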
85  SOURCES/RHEL-102610-podman-etcd-add-oom-parameter.patch  Normal file
@@ -0,0 +1,85 @@
From d08a7f74427ea2cf7d355a0f7f6d8f583e2d0cba Mon Sep 17 00:00:00 2001
From: Carlo Lobrano <c.lobrano@gmail.com>
Date: Thu, 3 Jul 2025 12:22:12 +0200
Subject: [PATCH] OCPBUGS-58324: podman-etcd Add OOM score adjustment for etcd
 containers

This change introduces a new `oom` parameter to the `podman-etcd` OCF
agent. This allows tuning the Out-Of-Memory (OOM) score adjustment for
the etcd container.

The `oom` parameter accepts integer values from -1000 to 1000,
defaulting to -997 (system-node-critical equivalent).

see https://kubernetes.io/docs/concepts/scheduling-eviction/node-pressure-eviction/#node-out-of-memory-behavior

Key changes:
- Added `OCF_RESKEY_oom` parameter to agent definition (`content type="integer"`).
- Integrated `--oom-score-adj` option into `podman_start()`.
- Implemented input validation for `oom` in `podman_validate()`,
  ensuring values are within the [-1000:1000] range.
---
 heartbeat/podman-etcd | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index 6762112ec..884b7c579 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -45,6 +45,7 @@ OCF_RESKEY_nic_default="br-ex"
OCF_RESKEY_authfile_default="/var/lib/kubelet/config.json"
OCF_RESKEY_allow_pull_default="1"
OCF_RESKEY_reuse_default="0"
+OCF_RESKEY_oom_default="-997"

: ${OCF_RESKEY_image=${OCF_RESKEY_image_default}}
: ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}}
@@ -53,6 +54,7 @@ OCF_RESKEY_reuse_default="0"
: ${OCF_RESKEY_authfile=${OCF_RESKEY_authfile_default}}
: ${OCF_RESKEY_allow_pull=${OCF_RESKEY_allow_pull_default}}
: ${OCF_RESKEY_reuse=${OCF_RESKEY_reuse_default}}
+: ${OCF_RESKEY_oom=${OCF_RESKEY_oom_default}}

#######################################################################

@@ -230,6 +232,16 @@ to stop the container before pacemaker.
<shortdesc lang="en">drop-in dependency</shortdesc>
<content type="boolean"/>
</parameter>
+
+<parameter name="oom" required="0" unique="0">
+<longdesc lang="en">
+Tune the host's Out-Of-Memory (OOM) preferences for containers (accepts values from -1000 to 1000).
+Default to same OOM score as system-node-critical
+https://kubernetes.io/docs/concepts/scheduling-eviction/node-pressure-eviction/#node-out-of-memory-behavior
+</longdesc>
+<shortdesc lang="en">OOM for container</shortdesc>
+<content type="integer" default="${OCF_RESKEY_oom_default}"/>
+</parameter>
</parameters>

<actions>
@@ -1226,7 +1238,10 @@ podman_start()
fi

podman_create_mounts
- local run_opts="-d --name=${CONTAINER}"
+ local run_opts="--detach --name=${CONTAINER}"
+
+ run_opts="$run_opts --oom-score-adj=${OCF_RESKEY_oom}"
+
# check to see if the container has already started
podman_simple_status
if [ $? -eq $OCF_SUCCESS ]; then
@@ -1513,6 +1528,11 @@ podman_validate()
exit $OCF_ERR_CONFIGURED
fi

+ if [ "$OCF_RESKEY_oom" -lt -1000 ] || [ "$OCF_RESKEY_oom" -gt 1000 ]; then
+ ocf_exit_reason "'oom' value ${OCF_RESKEY_oom} is out of range [-1000:1000]"
+ exit $OCF_ERR_CONFIGURED
+ fi
+
return $OCF_SUCCESS
}
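As a usage sketch, the new option is set like any other resource parameter; the resource name etcd below is an assumption:

# hypothetical: relax the OOM protection on an existing podman-etcd resource
pcs resource update etcd oom=-500

# podman_start() then passes the value through to the container, roughly:
#   podman run --detach --name=etcd --oom-score-adj=-500 ...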
@@ -1,47 +0,0 @@
From 57acb7c26d809cf864ec439b8bcd6364702022d5 Mon Sep 17 00:00:00 2001
From: Nate Prewitt <nate.prewitt@gmail.com>
Date: Wed, 25 Sep 2024 08:03:20 -0700
Subject: [PATCH] Only use hostname to do netrc lookup instead of netloc

---
 src/requests/utils.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/aliyun/aliyunsdkcore/vendored/requests/utils.py b/aliyun/aliyunsdkcore/vendored/requests/utils.py
index 699683e5d9..8a307ca8a0 100644
--- a/aliyun/aliyunsdkcore/vendored/requests/utils.py
+++ b/aliyun/aliyunsdkcore/vendored/requests/utils.py
@@ -182,13 +182,7 @@
return

ri = urlparse(url)
-
- # Strip port numbers from netloc. This weird `if...encode`` dance is
- # used for Python 3.2, which doesn't support unicode literals.
- splitstr = b':'
- if isinstance(url, str):
- splitstr = splitstr.decode('ascii')
- host = ri.netloc.split(splitstr)[0]
+ host = ri.hostname

try:
_netrc = netrc(netrc_path).authenticators(host)
diff --git a/gcp/google-cloud-sdk/lib/third_party/requests/utils.py b/gcp/google-cloud-sdk/lib/third_party/requests/utils.py
index 699683e5d9..8a307ca8a0 100644
--- a/gcp/google-cloud-sdk/lib/third_party/requests/utils.py
+++ b/gcp/google-cloud-sdk/lib/third_party/requests/utils.py
@@ -236,13 +236,7 @@ def get_netrc_auth(url, raise_errors=False):
return

ri = urlparse(url)
-
- # Strip port numbers from netloc. This weird `if...encode`` dance is
- # used for Python 3.2, which doesn't support unicode literals.
- splitstr = b':'
- if isinstance(url, str):
- splitstr = splitstr.decode('ascii')
- host = ri.netloc.split(splitstr)[0]
+ host = ri.hostname

try:
_netrc = netrc(netrc_path).authenticators(host)
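The motivation shows up directly in urlparse: splitting netloc on ':' mangles bracketed IPv6 literals, while hostname is already stripped of the port and brackets and is the correct netrc lookup key.

>>> from urllib.parse import urlparse
>>> ri = urlparse("https://[2001:db8::1]:8443/path")
>>> ri.netloc.split(":")[0]   # old netloc-based lookup key
'[2001'
>>> ri.hostname               # new lookup key
'2001:db8::1'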
@@ -0,0 +1,686 @@
From 6e9200dc2ffc89382188794742361985309936b2 Mon Sep 17 00:00:00 2001
From: Carlo Lobrano <c.lobrano@gmail.com>
Date: Wed, 23 Jul 2025 09:34:13 +0200
Subject: [PATCH] podman-etcd: preserve containers for debugging

This change modifies the agent to keep stopped containers for log
inspection and debugging, with supporting changes to enable this
behavior.

* Conditionally reuse existing containers when configuration unchanged
* Move etcd inline configuration flags to external file to allow
  restarts without container recreation (mainly for the
  force-new-cluster flag)
* Archive previous container, renaming it to *-previous, and its
  configuration files into the /var/lib/etcd/config-previous.tar.gz archive.
  The tar.gz archive consists of:
  * the pod manifest created by CEO, used to generate the Etcd
    configuration file
  * the Etcd configuration file
  * the auth json file
  Only one copy is maintained to limit disk usage.
* Both the configuration and backup file locations are configurable with 2
  new input arguments.

Signed-off-by: Carlo Lobrano <c.lobrano@gmail.com>
---
 heartbeat/podman-etcd | 438 ++++++++++++++++++++++++++++----------
 1 file changed, 336 insertions(+), 102 deletions(-)

diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index 4969fbaaf..33804414a 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -46,6 +46,8 @@ OCF_RESKEY_authfile_default="/var/lib/kubelet/config.json"
OCF_RESKEY_allow_pull_default="1"
OCF_RESKEY_reuse_default="0"
OCF_RESKEY_oom_default="-997"
+OCF_RESKEY_config_location_default="/var/lib/etcd"
+OCF_RESKEY_backup_location_default="/var/lib/etcd"

: ${OCF_RESKEY_image=${OCF_RESKEY_image_default}}
: ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}}
@@ -55,6 +57,9 @@ OCF_RESKEY_oom_default="-997"
: ${OCF_RESKEY_allow_pull=${OCF_RESKEY_allow_pull_default}}
: ${OCF_RESKEY_reuse=${OCF_RESKEY_reuse_default}}
: ${OCF_RESKEY_oom=${OCF_RESKEY_oom_default}}
+: ${OCF_RESKEY_config_location=${OCF_RESKEY_config_location_default}}
+: ${OCF_RESKEY_backup_location=${OCF_RESKEY_backup_location_default}}
+

#######################################################################

@@ -242,6 +247,23 @@ https://kubernetes.io/docs/concepts/scheduling-eviction/node-pressure-eviction/#
<shortdesc lang="en">OOM for container</shortdesc>
<content type="integer" default="${OCF_RESKEY_oom_default}"/>
</parameter>
+
+<parameter name="config_location" required="0" unique="0">
+<longdesc lang="en">
+The directory where the resource agent stores its state files, such as the generated etcd configuration and a copy of the pod manifest.
+</longdesc>
+<shortdesc lang="en">Resource agent state directory</shortdesc>
+<content type="string" default="${OCF_RESKEY_config_location_default}"/>
+</parameter>
+
+<parameter name="backup_location" required="0" unique="0">
+<longdesc lang="en">
+The directory where the resource agent stores its backups.
+</longdesc>
+<shortdesc lang="en">Resource agent backup directory</shortdesc>
+<content type="string" default="${OCF_RESKEY_backup_location_default}"/>
+</parameter>
+
</parameters>

<actions>
@@ -309,42 +331,52 @@ container_exists()
return 1
}

-remove_container()
+# archive_current_container archives the current
+# podman etcd container and its configuration files.
+archive_current_container()
{
- local rc
- local execids
+ # don't attempt to archive a container that doesn't exist
+ if ! container_exists; then
+ return
+ fi

- if ocf_is_true "$OCF_RESKEY_reuse"; then
- # never remove the container if we have reuse enabled.
- return 0
+ # delete any container named "*-previous", or we won't be able to archive the current container.
+ if podman inspect "${CONTAINER}-previous" >/dev/null 2>&1; then
+ ocf_log info "removing old archived container '$CONTAINER-previous'"
+ if ! ocf_run podman rm --volumes --force "$CONTAINER-previous"; then
+ ocf_log warn "could not remove old archived container (podman rm failed, error code: $?). Won't be able to archive current container"
+ return
+ fi
fi

- if ! container_exists; then
- # don't attempt to remove a container that doesn't exist
- return 0
+ ocf_log info "archiving '$CONTAINER' container as '$CONTAINER-previous' for debugging purposes"
+ if ! ocf_run podman rename "$CONTAINER" "$CONTAINER-previous"; then
+ ocf_log err "could not archive container '$CONTAINER', error code: $?"
+ return
fi
- ocf_log notice "Cleaning up inactive container, ${CONTAINER}."
- ocf_run podman rm -v "$CONTAINER"
- rc=$?
- if [ $rc -ne 0 ]; then
- if [ $rc -eq 2 ]; then
- if podman inspect --format '{{.State.Status}}' "$CONTAINER" | grep -wq "stopping"; then
- ocf_log err "Inactive container ${CONTAINER} is stuck in 'stopping' state. Force-remove it."
- ocf_run podman rm -f "$CONTAINER"
- rc=$?
- fi
- fi
- # due to a podman bug (rhbz#1841485), sometimes a stopped
- # container can still be associated with Exec sessions, in
- # which case the "podman rm" has to be forced
- execids=$(podman inspect "$CONTAINER" --format '{{len .ExecIDs}}')
- if [ "$execids" -ne "0" ]; then
- ocf_log warn "Inactive container ${CONTAINER} has lingering exec sessions. Force-remove it."
- ocf_run podman rm -f "$CONTAINER"
- rc=$?
+
+ # archive corresponding etcd configuration files
+ local files_to_archive=""
+ for file in "$OCF_RESKEY_authfile" "$POD_MANIFEST_COPY" "$ETCD_CONFIGURATION_FILE"; do
+ if [ -f "$file" ]; then
+ files_to_archive="$files_to_archive $file"
+ else
+ ocf_log warn "file '$file' is missing and won't be archived"
fi
+ done
+
+ if [ -z "$files_to_archive" ]; then
+ ocf_log warn "could not find any file to archive."
+ return
+ fi
+
+ # NOTE: tar will override any existing archive as wanted
+ # shellcheck disable=SC2086
+ if ! ocf_run tar --create --verbose --gzip --file "$ETCD_BACKUP_FILE" $files_to_archive; then
+ ocf_log warn "container archived successfully, but configuration backup failed (error: $?). Container debugging available, but without matching configuration files"
+ else
+ ocf_log info "container configuration also archived in '$ETCD_BACKUP_FILE'"
fi
- return $rc
}

# Correctly wraps an ipv6 in [] for url otherwise use return normal ipv4 address.
@@ -365,6 +397,7 @@ attribute_node_ip()
local attribute="node_ip"
local ip_addr name

+ # TODO: We can retrieve both the local and peer IP addresses from this map, which eliminates the need to use CIB to share them between nodes
for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do
name=$(echo "$node" | cut -d: -f1)
# ignore other nodes
@@ -375,7 +408,7 @@ attribute_node_ip()
done

if [ -z "$ip_addr" ]; then
- ocf_log err "ip address was empty when querying (getent ahosts) for hostname: $(hostname -f)"
+ ocf_log err "could not get local ip address from node_ip_map: '$OCF_RESKEY_node_ip_map'"
return 1
fi

@@ -384,9 +417,9 @@ attribute_node_ip()
echo "$ip_addr"
;;
update)
- if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value"; then
+ if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$ip_addr"; then
rc="$?"
- ocf_log err "could not set $attribute to $value, error code: $rc"
+ ocf_log err "could not set $attribute to $ip_addr, error code: $rc"
return "$rc"
fi
;;
@@ -428,6 +461,48 @@ get_env_from_manifest() {
echo "$env_var_value"
}

+# etcd configuration file expects duration to be expressed in nanoseconds
+convert_duration_in_nanoseconds() {
+ local duration=$1
+ local value unit nanoseconds
+
+ if [ -z "$duration" ]; then
+ ocf_log err "convert_duration_in_nanoseconds: no duration provided"
+ return 1
+ fi
+
+ if ! echo "$duration" | grep -qE '^[0-9]+[numµ]?s$'; then
+ ocf_log err "convert_duration_in_nanoseconds: invalid duration format \"$duration\". Expected format: <number><unit> where unit is one of s, ms, us, µs, ns"
+ return 1
+ fi
+
+ # Extract numeric value and unit from duration string
+ value=$(echo "$duration" | sed 's/[^0-9]*$//')
+ unit=$(echo "$duration" | sed 's/^[0-9]*//')
+
+ case "$unit" in
+ ns)
+ nanoseconds=$value
+ ;;
+ us|µs)
+ nanoseconds=$((value * 1000))
+ ;;
+ ms)
+ nanoseconds=$((value * 1000000))
+ ;;
+ s)
+ nanoseconds=$((value * 1000000000))
+ ;;
+ *)
+ # this should not happen as the input is already validated
+ ocf_log err "convert_duration_in_nanoseconds: unknown duration unit \"$unit\""
+ return 1
+ ;;
+ esac
+
+ echo "$nanoseconds"
+}
+
prepare_env() {
local name ip ipurl standalone_node

@@ -457,9 +532,14 @@ prepare_env() {
ETCDCTL_API=$(get_env_from_manifest "ETCDCTL_API")
ETCD_CIPHER_SUITES=$(get_env_from_manifest "ETCD_CIPHER_SUITES")
ETCD_DATA_DIR=$(get_env_from_manifest "ETCD_DATA_DIR")
+ if [ ! -d "$ETCD_DATA_DIR" ]; then
+ ocf_log err "could not find data-dir at path \"$ETCD_DATA_DIR\""
+ return "$OCF_ERR_ARGS"
+ else
+ ocf_log info "using data-dir: $ETCD_DATA_DIR"
+ fi
ETCD_ELECTION_TIMEOUT=$(get_env_from_manifest "ETCD_ELECTION_TIMEOUT")
ETCD_ENABLE_PPROF=$(get_env_from_manifest "ETCD_ENABLE_PPROF")
- ETCD_EXPERIMENTAL_MAX_LEARNERS=$(get_env_from_manifest "ETCD_EXPERIMENTAL_MAX_LEARNERS")
ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION=$(get_env_from_manifest "ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION")
ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL=$(get_env_from_manifest "ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL")
ETCD_HEARTBEAT_INTERVAL=$(get_env_from_manifest "ETCD_HEARTBEAT_INTERVAL")
@@ -475,6 +555,62 @@ prepare_env() {
LISTEN_METRICS_URLS="0.0.0.0"
}

+
+generate_etcd_configuration() {
+ if is_force_new_cluster; then
+ # The embedded newline is required for correct YAML formatting.
+ FORCE_NEW_CLUSTER_CONFIG="force-new-cluster: true
+force-new-cluster-bump-amount: 1000000000"
+ else
+ FORCE_NEW_CLUSTER_CONFIG="force-new-cluster: false"
+ fi
+
+ cat > "$ETCD_CONFIGURATION_FILE" << EOF
+logger: zap
+log-level: info
+snapshot-count: 10000
+name: $NODENAME
+data-dir: $ETCD_DATA_DIR
+$FORCE_NEW_CLUSTER_CONFIG
+socket-reuse-address: $ETCD_SOCKET_REUSE_ADDRESS
+election-timeout: $ETCD_ELECTION_TIMEOUT
+enable-pprof: $ETCD_ENABLE_PPROF
+heartbeat-interval: $ETCD_HEARTBEAT_INTERVAL
+quota-backend-bytes: $ETCD_QUOTA_BACKEND_BYTES
+initial-advertise-peer-urls: "$NODEIPURL:2380"
+listen-peer-urls: "$(ip_url ${LISTEN_PEER_URLS}):2380"
+listen-client-urls: "$(ip_url ${LISTEN_CLIENT_URLS}):2379,unixs://${NODEIP}:0"
+initial-cluster: $ETCD_INITIAL_CLUSTER
+initial-cluster-state: $ETCD_INITIAL_CLUSTER_STATE
+client-transport-security:
+  cert-file: /etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.crt
+  key-file: /etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.key
+  client-cert-auth: true
+  trusted-ca-file: $SERVER_CACERT
+peer-transport-security:
+  cert-file: $ETCD_PEER_CERT
+  key-file: $ETCD_PEER_KEY
+  client-cert-auth: true
+  trusted-ca-file: $SERVER_CACERT
+advertise-client-urls: "$NODEIPURL:2379"
+listen-metrics-urls: "$(ip_url ${LISTEN_METRICS_URLS}):9978"
+metrics: extensive
+experimental-initial-corrupt-check: true
+experimental-max-learners: 1
+experimental-warning-apply-duration: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION")
+experimental-watch-progress-notify-interval: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL")
+EOF
+
+ {
+ if [ -n "$ETCD_CIPHER_SUITES" ]; then
+ echo "cipher-suites:"
+ echo "$ETCD_CIPHER_SUITES" | tr ',' '\n' | while read -r cipher; do
+ echo "  - \"$cipher\""
+ done
+ fi
+ } >> "$ETCD_CONFIGURATION_FILE"
+}
+
archive_data_folder()
{
# TODO: use etcd snapshots
@@ -634,7 +770,7 @@ add_member_as_learner()
local endpoint_url=$(ip_url $(attribute_node_ip get))
local peer_url=$(ip_url $member_ip)

- ocf_log info "add $member_name ($member_ip) to the member list as learner"
+ ocf_log info "add $member_name ($member_ip, $endpoint_url) to the member list as learner"
out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner)
rc=$?
if [ $rc -ne 0 ]; then
@@ -1104,18 +1240,18 @@ compare_revision()
peer_revision=$(attribute_node_revision_peer)

if [ "$revision" = "" ] || [ "$revision" = "null" ] || [ "$peer_revision" = "" ] || [ "$peer_revision" = "null" ]; then
- ocf_log err "could not compare revisions: $NODENAME local revision: $revision, peer revision: $peer_revision"
+ ocf_log err "could not compare revisions: '$NODENAME' local revision='$revision', peer revision='$peer_revision'"
return "$OCF_ERR_GENERIC"
fi

if [ "$revision" -gt "$peer_revision" ]; then
- ocf_log info "$NODENAME revision: $revision is newer than peer revision: $peer_revision"
+ ocf_log info "$NODENAME revision: '$revision' is newer than peer revision: '$peer_revision'"
echo "newer"
elif [ "$revision" -eq "$peer_revision" ]; then
- ocf_log info "$NODENAME revision: $revision is equal to peer revision: $peer_revision"
+ ocf_log info "$NODENAME revision: '$revision' is equal to peer revision: '$peer_revision'"
echo "equal"
else
- ocf_log info "$NODENAME revision: $revision is older than peer revision: $peer_revision"
+ ocf_log info "$NODENAME revision: '$revision' is older than peer revision: '$peer_revision'"
echo "older"
fi
return "$OCF_SUCCESS"
@@ -1144,6 +1280,100 @@ ensure_pod_manifest_exists()
return "$OCF_SUCCESS"
}

+filter_pod_manifest() {
+ # Remove pod-version related fields from POD manifest
+ local pod_manifest="$1"
+ local temporary_file
+ local jq_filter='del(.metadata.labels.revision) | .spec.containers[] |= ( .env |= map(select( .name != "ETCD_STATIC_POD_VERSION" ))) | .spec.volumes |= map( select( .name != "resource-dir" ))'
+
+ if ! temporary_file=$(mktemp); then
+ ocf_log err "could not create temporary file for '$pod_manifest', error code: $?"
+ return $OCF_ERR_GENERIC
+ fi
+ if ! jq "$jq_filter" "$pod_manifest" > "$temporary_file"; then
+ ocf_log err "could not remove pod version related data from '$pod_manifest', error code: $?"
+ return $OCF_ERR_GENERIC
+ fi
+ echo "$temporary_file"
+}
+
+can_reuse_container() {
+ # Decide whether to reuse the existing container or create a new one based on etcd pod manifest changes.
+ # NOTE: explicitly ignore POD version and POD version related data, as the content might be the same even if the revision number has changed.
+ local cp_rc
+ local diff_rc
+ local filtered_original_pod_manifest
+ local filtered_copy_pod_manifest
+
+
+ # If the container does not exist it cannot be reused
+ if ! container_exists; then
+ OCF_RESKEY_reuse=0
+ return "$OCF_SUCCESS"
+ fi
+
+ # If the manifest copy doesn't exist, we need a new container.
+ if [ ! -f "$POD_MANIFEST_COPY" ]; then
+ ocf_log info "a working copy of $OCF_RESKEY_pod_manifest was not found. A new etcd container will be created."
+ OCF_RESKEY_reuse=0
+ return "$OCF_SUCCESS"
+ fi
+
+ if ! filtered_original_pod_manifest=$(filter_pod_manifest "$OCF_RESKEY_pod_manifest"); then
+ return $OCF_ERR_GENERIC
+ fi
+ if ! filtered_copy_pod_manifest=$(filter_pod_manifest "$POD_MANIFEST_COPY"); then
+ return $OCF_ERR_GENERIC
+ fi
+
+ ocf_log info "comparing $OCF_RESKEY_pod_manifest with local copy $POD_MANIFEST_COPY"
+ ocf_run diff -s "$filtered_original_pod_manifest" "$filtered_copy_pod_manifest"
+ diff_rc="$?"
+ # clean up temporary files
+ rm -f "$filtered_original_pod_manifest" "$filtered_copy_pod_manifest"
+ case "$diff_rc" in
+ 0)
+ ocf_log info "Reusing the existing etcd container"
+ OCF_RESKEY_reuse=1
+ ;;
+ 1)
+ ocf_log info "Etcd pod manifest changes detected: creating a new etcd container to apply the changes"
+ if ! ocf_run cp -p "$OCF_RESKEY_pod_manifest" "$POD_MANIFEST_COPY"; then
+ cp_rc="$?"
+ ocf_log err "Could not create a working copy of $OCF_RESKEY_pod_manifest, rc: $cp_rc"
+ return "$OCF_ERR_GENERIC"
+ fi
+ ocf_log info "A working copy of $OCF_RESKEY_pod_manifest was created"
+ OCF_RESKEY_reuse=0
+ ;;
+ *)
+ ocf_log err "Could not check if etcd pod manifest has changed, diff rc: $diff_rc"
+ return "$OCF_ERR_GENERIC"
+ ;;
+ esac
+
+ return "$OCF_SUCCESS"
+}
+
+ensure_pod_manifest_copy_exists() {
+ local cp_rc
+
+ if [ -f "$POD_MANIFEST_COPY" ]; then
+ return "$OCF_SUCCESS"
+ fi
+
+ # If the manifest copy doesn't exist, create it and ensure a new container.
+ if ! ocf_run cp -p "$OCF_RESKEY_pod_manifest" "$POD_MANIFEST_COPY"; then
+ cp_rc="$?"
+ ocf_log err "Could not create a working copy of $OCF_RESKEY_pod_manifest, rc: $cp_rc"
+ return "$OCF_ERR_GENERIC"
+ fi
+
+ ocf_log info "a new working copy of $OCF_RESKEY_pod_manifest was created"
+
+ return "$OCF_SUCCESS"
+}
+
podman_start()
{
local cid
@@ -1173,6 +1403,13 @@ podman_start()
return $OCF_ERR_GENERIC
fi

+ # check if the container has already started
+ podman_simple_status
+ if [ $? -eq $OCF_SUCCESS ]; then
+ ocf_log info "the '$CONTAINER' has already started. Nothing to do"
+ return "$OCF_SUCCESS"
+ fi
+
if ! ensure_pod_manifest_exists; then
ocf_exit_reason "could not find etcd pod manifest ($OCF_RESKEY_pod_manifest)"
return "$OCF_ERR_GENERIC"
@@ -1186,8 +1423,9 @@ podman_start()
ocf_log info "static pod was running: start normally"
else
if is_force_new_cluster; then
- ocf_log notice "$NODENAME marked to force-new-cluster"
+ ocf_log notice "'$NODENAME' marked to force-new-cluster"
else
+ ocf_log info "'$NODENAME' is not marked to force-new-cluster"
# When the local agent starts, we can infer the cluster state by counting
# how many agents are starting or already active:
# - 1 active agent: it's the peer (we are just starting)
@@ -1195,6 +1433,7 @@ podman_start()
# - 0 active agents, 2 starting: both agents are starting simultaneously
local active_resources_count
active_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_active_resource" | wc -w)
+ ocf_log info "found '$active_resources_count' active etcd resources (meta notify environment variable: '$OCF_RESKEY_CRM_meta_notify_active_resource')"
case "$active_resources_count" in
1)
if [ "$(attribute_learner_node get)" = "$(get_peer_node_name)" ]; then
@@ -1205,17 +1444,17 @@ podman_start()
fi
;;
0)
+ # count how many agents are starting now
+ local start_resources_count
+ start_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_start_resource" | wc -w)
+ ocf_log info "found '$start_resources_count' starting etcd resources (meta notify environment variable: '$OCF_RESKEY_CRM_meta_notify_start_resource')"
+
# we need to compare the revisions in any of the following branches
# so call the function only once here
if ! revision_compare_result=$(compare_revision); then
ocf_log err "could not compare revisions, error code: $?"
return "$OCF_ERR_GENERIC"
fi
-
- # count how many agents are starting now
- local start_resources_count
- start_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_start_resource" | wc -w)
-
case "$start_resources_count" in
1)
ocf_log debug "peer not starting: ensure we can start a new cluster"
@@ -1231,6 +1470,7 @@ podman_start()
fi
;;
2)
+ # TODO: can we start "normally", regardless the revisions, if the container-id is the same on both nodes?
ocf_log info "peer starting"
if [ "$revision_compare_result" = "newer" ]; then
set_force_new_cluster
@@ -1263,7 +1503,7 @@ podman_start()
fi

podman_create_mounts
- local run_opts="--detach --name=${CONTAINER}"
+ local run_opts="--detach --name=${CONTAINER} --replace"

run_opts="$run_opts --oom-score-adj=${OCF_RESKEY_oom}"

@@ -1297,61 +1537,59 @@ podman_start()
archive_data_folder
fi

- prepare_env
+ ocf_log info "check for changes in pod manifest to decide if the container should be reused or replaced"
+ if ! can_reuse_container ; then
+ rc="$?"
+ ocf_log err "could not determine etcd container reuse strategy, rc: $rc"
+ return "$rc"
+ fi
+
+ # Archive current container and its configuration before creating
+ # new configuration files.
+ if ! ocf_is_true "$OCF_RESKEY_reuse"; then
+ # Log archive container failures but don't block, as the priority
+ # is ensuring the etcd container starts successfully.
+ archive_current_container
+ fi
+
+ if ! ensure_pod_manifest_copy_exists; then
+ return $OCF_ERR_GENERIC
+ fi
+
+ if ! prepare_env; then
+ ocf_log err "Could not prepare environment for podman, error code: $?"
+ return $OCF_ERR_GENERIC
+ fi
+
+ if ! generate_etcd_configuration; then
+ ocf_log err "Could not generate etcd configuration, error code: $?"
+ return $OCF_ERR_GENERIC
+ fi

- # add etcd-specific opts
run_opts="$run_opts \
- --network=host \
- -v /etc/kubernetes/static-pod-resources/etcd-certs:/etc/kubernetes/static-pod-certs \
- -v /var/lib/etcd:/var/lib/etcd \
- --env ALL_ETCD_ENDPOINTS=$ALL_ETCD_ENDPOINTS \
- --env ETCD_CIPHER_SUITES=$ETCD_CIPHER_SUITES \
- --env ETCD_DATA_DIR=$ETCD_DATA_DIR \
- --env ETCD_ELECTION_TIMEOUT=$ETCD_ELECTION_TIMEOUT \
- --env ETCD_ENABLE_PPROF=$ETCD_ENABLE_PPROF \
- --env ETCD_EXPERIMENTAL_MAX_LEARNERS=$ETCD_EXPERIMENTAL_MAX_LEARNERS \
- --env ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION=$ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION \
- --env ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL=$ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL \
- --env ETCD_HEARTBEAT_INTERVAL=$ETCD_HEARTBEAT_INTERVAL \
- --env ETCD_INITIAL_CLUSTER=$ETCD_INITIAL_CLUSTER \
- --env ETCD_INITIAL_CLUSTER_STATE=$ETCD_INITIAL_CLUSTER_STATE \
- --env ETCD_NAME=$NODENAME \
- --env ETCD_QUOTA_BACKEND_BYTES=$ETCD_QUOTA_BACKEND_BYTES \
- --env ETCD_SOCKET_REUSE_ADDRESS=$ETCD_SOCKET_REUSE_ADDRESS \
- --env ETCDCTL_API=$ETCDCTL_API \
- --env ETCDCTL_CACERT=$SERVER_CACERT \
- --env ETCDCTL_CERT=$ETCD_PEER_CERT \
- --env ETCDCTL_KEY=$ETCD_PEER_KEY \
- --authfile=$OCF_RESKEY_authfile \
- --security-opt label=disable"
+ --network=host \
+ -v /etc/kubernetes/static-pod-resources/etcd-certs:/etc/kubernetes/static-pod-certs \
+ -v /var/lib/etcd:/var/lib/etcd \
+ --env ETCDCTL_API=$ETCDCTL_API \
+ --env ETCDCTL_CACERT=$SERVER_CACERT \
+ --env ETCDCTL_CERT=$ETCD_PEER_CERT \
+ --env ETCDCTL_KEY=$ETCD_PEER_KEY \
+ --authfile=$OCF_RESKEY_authfile \
+ --security-opt label=disable"
if [ -n "$OCF_RESKEY_run_opts" ]; then
run_opts="$run_opts $OCF_RESKEY_run_opts"
fi

- OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd --logger=zap \
- --log-level=info \
- --experimental-initial-corrupt-check=true \
- --snapshot-count=10000 \
- --initial-advertise-peer-urls=$NODEIPURL:2380 \
- --cert-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.crt \
- --key-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.key \
- --trusted-ca-file=$SERVER_CACERT \
- --client-cert-auth=true \
- --peer-cert-file=$ETCD_PEER_CERT \
- --peer-key-file=$ETCD_PEER_KEY \
- --peer-trusted-ca-file=$SERVER_CACERT \
- --peer-client-cert-auth=true \
- --advertise-client-urls=$NODEIPURL:2379 \
- --listen-client-urls=$(ip_url ${LISTEN_CLIENT_URLS}):2379,unixs://${NODEIP}:0 \
- --listen-peer-urls=$(ip_url ${LISTEN_PEER_URLS}):2380 \
- --metrics=extensive \
- --listen-metrics-urls=$(ip_url ${LISTEN_METRICS_URLS}):9978"
- if [ -n "$OCF_RESKEY_run_cmd_opts" ]; then
- OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd $OCF_RESKEY_run_cmd_opts"
+ if [ -f "$ETCD_CONFIGURATION_FILE" ]; then
+ ocf_log info "using etcd configuration file: $ETCD_CONFIGURATION_FILE"
+ else
+ ocf_log err "could not find $ETCD_CONFIGURATION_FILE"
+ return "$OCF_ERR_GENERIC"
fi

- if is_force_new_cluster; then
- OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd --force-new-cluster"
+ OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd --config-file=$ETCD_CONFIGURATION_FILE"
+ if [ -n "$OCF_RESKEY_run_cmd_opts" ]; then
+ OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd $OCF_RESKEY_run_cmd_opts"
fi

if [ "$OCF_RESKEY_image" = "$OCF_RESKEY_image_default" ]; then
@@ -1377,9 +1615,7 @@ podman_start()
ocf_log info "starting existing container $CONTAINER."
ocf_run podman start "$CONTAINER"
else
- # make sure any previous container matching our container name is cleaned up first.
- # we already know at this point it wouldn't be running
- remove_container
+ ocf_log info "starting new container $CONTAINER."
run_new_container "$run_opts" "$OCF_RESKEY_image" "$OCF_RESKEY_run_cmd"
if [ $? -eq 125 ]; then
return $OCF_ERR_GENERIC
@@ -1439,7 +1675,6 @@ podman_stop()
local rc
podman_simple_status
if [ $? -eq $OCF_NOT_RUNNING ]; then
- remove_container
ocf_log info "could not leave members list: etcd container not running"
return $OCF_SUCCESS
fi
@@ -1475,7 +1710,7 @@ podman_stop()
ocf_run podman kill "$CONTAINER"
rc=$?
else
- ocf_log debug "waiting $timeout second[s] before killing container"
+ ocf_log info "waiting $timeout second[s] before killing container"
ocf_run podman stop -t="$timeout" "$CONTAINER"
rc=$?
# on stop, systemd will automatically delete any transient
@@ -1496,11 +1731,6 @@ podman_stop()
fi
fi

- if ! remove_container; then
- ocf_exit_reason "Failed to remove stopped container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}."
- return $OCF_ERR_GENERIC
- fi
-
return $OCF_SUCCESS
}

@@ -1532,6 +1762,7 @@ podman_validate()
check_binary oc
check_binary podman
check_binary jq
+ check_binary tar

if [ -z "$OCF_RESKEY_node_ip_map" ]; then
ocf_exit_reason "'node_ip_map' option is required"
@@ -1589,6 +1820,9 @@ else
fi

CONTAINER=$OCF_RESKEY_name
+POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml"
+ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml"
+ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz"

# Note: we currently monitor podman containers by with the "podman exec"
# command, so make sure that invocation is always valid by enforcing the
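In practice the archived artifacts give a post-mortem view after a container replacement. A short sketch using the agent's default locations (the container name etcd is an assumption; it comes from OCF_RESKEY_name):

# logs of the previous, now-archived etcd container
podman logs etcd-previous

# matching configuration snapshot: pod manifest copy, generated etcd
# config, and auth file, as packed by archive_current_container()
tar -tzf /var/lib/etcd/config-previous.tar.gz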
@@ -0,0 +1,193 @@
From 11cdff8c886c72c83c26e48e46a8620c06e4c2f0 Mon Sep 17 00:00:00 2001
From: E Hila <ehila@redhat.com>
Date: Tue, 9 Sep 2025 06:06:12 -0400
Subject: [PATCH] OCPBUGS-60977: podman-etcd: wrap ipv6 address in brackets for
 attribute_node_ip (#2068)

When trying to determine the node ip address we need to make sure we account for ipv6 and dualstack deployments, and accordingly wrap ipv6 in brackets so it correctly resolves. Since the node ip mapping is provided by the controller, we parse out the IP address of the node from there and use a helper function for building URLs with ports to correctly use brackets for ipv6 ip addresses.

Signed-off-by: ehila <ehila@redhat.com>
---
 heartbeat/podman-etcd | 77 ++++++++++++++++++++++++++++---------------
 1 file changed, 51 insertions(+), 26 deletions(-)

diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index 884b7c579..4969fbaaf 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -347,21 +347,41 @@ remove_container()
return $rc
}

+# Correctly wraps an ipv6 in [] for url otherwise use return normal ipv4 address.
+ip_url() {
+ local ip_addr=$1
+ local value
+ if echo "$ip_addr" | grep -q ":" ; then
+ value="[$ip_addr]"
+ else
+ value="$ip_addr"
+ fi
+ echo "https://$value"
+}
+
attribute_node_ip()
{
local action="$1"
local attribute="node_ip"
- local value
+ local ip_addr name

- if ! value=$(ip -brief addr show "$OCF_RESKEY_nic" | awk '{gsub("/.*", "", $3); print $3}'); then
- rc=$?
- ocf_log err "could not get node ip, error code: $rc"
- return "$rc"
+ for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do
+ name=$(echo "$node" | cut -d: -f1)
+ # ignore other nodes
+ if [ "$name" != "$NODENAME" ]; then
+ continue
+ fi
+ ip_addr=$(echo "$node" | cut -d: -f2-) # Grab everything after the first : this covers ipv4/ipv6
+ done
+
+ if [ -z "$ip_addr" ]; then
+ ocf_log err "ip address was empty when querying (getent ahosts) for hostname: $(hostname -f)"
+ return 1
fi

case "$action" in
get)
- echo "$value"
+ echo "$ip_addr"
;;
update)
if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value"; then
@@ -409,26 +429,28 @@ get_env_from_manifest() {
}

prepare_env() {
- local name ip standalone_node
+ local name ip ipurl standalone_node

NODEIP="$(attribute_node_ip get)"
+ NODEIPURL=$(ip_url $NODEIP)

if is_force_new_cluster; then
- ALL_ETCD_ENDPOINTS="https://$NODEIP:2379"
+ ALL_ETCD_ENDPOINTS="$NODEIPURL:2379"
ETCD_INITIAL_CLUSTER_STATE="new"
- ETCD_INITIAL_CLUSTER="$NODENAME=https://$NODEIP:2380"
+ ETCD_INITIAL_CLUSTER="$NODENAME=$NODEIPURL:2380"
else
ETCD_INITIAL_CLUSTER_STATE="existing"
for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do
- name=$(echo "$node" | awk -F":" '{print $1}')
- ip=$(echo "$node" | awk -F":" '{print $2}')
+ name=$(echo "$node" | cut -d: -f1)
+ ip=$(echo "$node" | cut -d: -f2-) # Grab everything after the first : this covers ipv4/ipv6
+ ipurl="$(ip_url $ip)"
if [ -z "$name" ] || [ -z "$ip" ]; then
ocf_exit_reason "name or ip missing for 1 or more nodes"
exit $OCF_ERR_CONFIGURED
fi

- [ -z "$ALL_ETCD_ENDPOINTS" ] && ALL_ETCD_ENDPOINTS="https://$ip:2379" || ALL_ETCD_ENDPOINTS="$ALL_ETCD_ENDPOINTS,https://$ip:2379"
- [ -z "$ETCD_INITIAL_CLUSTER" ] && ETCD_INITIAL_CLUSTER="$name=https://$ip:2380" || ETCD_INITIAL_CLUSTER="$ETCD_INITIAL_CLUSTER,$name=https://$ip:2380"
+ [ -z "$ALL_ETCD_ENDPOINTS" ] && ALL_ETCD_ENDPOINTS="$ipurl:2379" || ALL_ETCD_ENDPOINTS="$ALL_ETCD_ENDPOINTS,$ipurl:2379"
+ [ -z "$ETCD_INITIAL_CLUSTER" ] && ETCD_INITIAL_CLUSTER="$name=$ipurl:2380" || ETCD_INITIAL_CLUSTER="$ETCD_INITIAL_CLUSTER,$name=$ipurl:2380"
done
fi

@@ -609,9 +631,11 @@ add_member_as_learner()
local rc
local member_name=$1
local member_ip=$2
+ local endpoint_url=$(ip_url $(attribute_node_ip get))
+ local peer_url=$(ip_url $member_ip)

ocf_log info "add $member_name ($member_ip) to the member list as learner"
- out=$(podman exec "${CONTAINER}" etcdctl --endpoints="https://$(attribute_node_ip get):2379" member add "$member_name" --peer-urls="https://$member_ip:2380" --learner)
+ out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner)
rc=$?
if [ $rc -ne 0 ]; then
ocf_log err "could not add $member_name as learner, error code: $rc"
@@ -806,14 +830,15 @@ get_peer_node_name() {

get_all_etcd_endpoints() {
for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do
- name=$(echo "$node" | awk -F":" '{print $1}')
- ip=$(echo "$node" | awk -F":" '{print $2}')
+ name=$(echo "$node" | cut -d: -f1)
+ ip=$(echo "$node" | cut -d: -f2-) # Grab everything after the first : this covers ipv4/ipv6
+ ipurl="$(ip_url $ip)"
if [ -z "$name" ] || [ -z "$ip" ]; then
ocf_exit_reason "name or ip missing for 1 or more nodes"
exit $OCF_ERR_CONFIGURED
fi

- [ -z "$ALL_ETCD_ENDPOINTS" ] && ALL_ETCD_ENDPOINTS="https://$ip:2379" || ALL_ETCD_ENDPOINTS="$ALL_ETCD_ENDPOINTS,https://$ip:2379"
+ [ -z "$ALL_ETCD_ENDPOINTS" ] && ALL_ETCD_ENDPOINTS="$ipurl:2379" || ALL_ETCD_ENDPOINTS="$ALL_ETCD_ENDPOINTS,$ipurl:2379"
done
echo "$ALL_ETCD_ENDPOINTS"
}
@@ -831,7 +856,7 @@ get_member_list_json() {
# Get the list of members visible to the current node
local this_node_endpoint

- this_node_endpoint="https://$(attribute_node_ip get):2379"
+ this_node_endpoint="$(ip_url $(attribute_node_ip get)):2379"
podman exec "${CONTAINER}" etcdctl member list --endpoints="$this_node_endpoint" -w json
}

@@ -886,14 +911,14 @@ check_peers()
# ]
# }
for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do
- name=$(echo "$node" | awk -F":" '{print $1}')
+ name=$(echo "$node" | cut -d: -f1)
# do not check itself
if [ "$name" = "$NODENAME" ]; then
continue
fi

# Check by IP instead of Name since "learner" members appear only in peerURLs, not by Name.
- ip=$(echo "$node" | awk -F":" '{print $2}')
+ ip=$(echo "$node" | cut -d: -f2-) # Grab everything after the first : this covers ipv4/ipv6
id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$ip\")) | any).ID")
if [ -z "$id" ]; then
ocf_log info "$name is not in the members list"
@@ -1307,7 +1332,7 @@ podman_start()
--log-level=info \
--experimental-initial-corrupt-check=true \
--snapshot-count=10000 \
- --initial-advertise-peer-urls=https://${NODEIP}:2380 \
+ --initial-advertise-peer-urls=$NODEIPURL:2380 \
--cert-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.crt \
--key-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-certs/etcd-serving-${NODENAME}.key \
--trusted-ca-file=$SERVER_CACERT \
@@ -1316,11 +1341,11 @@ podman_start()
--peer-key-file=$ETCD_PEER_KEY \
--peer-trusted-ca-file=$SERVER_CACERT \
--peer-client-cert-auth=true \
- --advertise-client-urls=https://${NODEIP}:2379 \
- --listen-client-urls=https://${LISTEN_CLIENT_URLS}:2379,unixs://${NODEIP}:0 \
- --listen-peer-urls=https://${LISTEN_PEER_URLS}:2380 \
+ --advertise-client-urls=$NODEIPURL:2379 \
+ --listen-client-urls=$(ip_url ${LISTEN_CLIENT_URLS}):2379,unixs://${NODEIP}:0 \
+ --listen-peer-urls=$(ip_url ${LISTEN_PEER_URLS}):2380 \
--metrics=extensive \
- --listen-metrics-urls=https://${LISTEN_METRICS_URLS}:9978"
+ --listen-metrics-urls=$(ip_url ${LISTEN_METRICS_URLS}):9978"
if [ -n "$OCF_RESKEY_run_cmd_opts" ]; then
OCF_RESKEY_run_cmd="$OCF_RESKEY_run_cmd $OCF_RESKEY_run_cmd_opts"
fi
@@ -1430,7 +1455,7 @@ podman_stop()
ocf_log info "last member. Not leaving the member list"
else
ocf_log info "leaving members list as member with ID $member_id"
- endpoint="https://$(attribute_node_ip get):2379"
|
||||||
|
+ endpoint="$(ip_url $(attribute_node_ip get)):2379"
|
||||||
|
if ! ocf_run podman exec "$CONTAINER" etcdctl member remove "$member_id" --endpoints="$endpoint"; then
|
||||||
|
rc=$?
|
||||||
|
ocf_log err "error leaving members list, error code: $rc"
|
||||||
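Note: the hunks above call an ip_url helper introduced elsewhere in this patch but not shown in this excerpt. A minimal sketch of what such a helper plausibly looks like, assuming its only job is to prepend the scheme and bracket IPv6 literals so a ":port" suffix stays parseable; the actual upstream definition may differ:

    ip_url() {
        # An address containing ':' is taken to be an IPv6 literal and
        # must be bracketed so the trailing ":<port>" stays unambiguous.
        case "$1" in
            *:*) echo "https://[$1]" ;;
            *)   echo "https://$1" ;;
        esac
    }

    ip_url 192.0.2.10     # -> https://192.0.2.10
    ip_url 2001:db8::10   # -> https://[2001:db8::10]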
1127
SOURCES/RHEL-114489-1-powervs-move-ip-new-ra.patch
Normal file
File diff suppressed because it is too large
19
SOURCES/RHEL-114489-2-powervs-move-ip-set-bundled-path.patch
Normal file
@ -0,0 +1,19 @@
--- a/heartbeat/powervs-move-ip.in	2025-09-15 16:13:34.225046827 +0200
+++ b/heartbeat/powervs-move-ip.in	2025-09-15 17:39:02.746258434 +0200
@@ -33,9 +33,13 @@
from pathlib import Path
from urllib.parse import urlparse

-import requests
-import requests.adapters
-import urllib3.util
+try:
+    sys.path.insert(0, '/usr/lib/fence-agents/support/ibm')
+    import requests
+    import requests.adapters
+    import urllib3.util
+except ImportError:
+    pass

# Constants
OCF_FUNCTIONS_DIR = os.environ.get(
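The try/except above lets the agent use the Python modules bundled with fence-agents when they are not installed system-wide, and degrade gracefully otherwise. A quick, hedged way to confirm that the bundled path actually provides the modules on a given host (a manual check, not part of the patch):

    python3 -c '
    import sys
    sys.path.insert(0, "/usr/lib/fence-agents/support/ibm")
    import requests, urllib3
    print("requests", requests.__version__, "loaded from", requests.__file__)
    '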
@ -0,0 +1,197 @@
From a4e496e5e6d9abde1b071fa2dfa1c6e7ba899cf1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Edmund=20H=C3=A4fele?= <edmund.haefele@de.ibm.com>
Date: Thu, 30 Oct 2025 13:03:22 +0100
Subject: [PATCH] Update powervs-move-ip

- Add `iflabel` argument.
- Increase maximum number of retries for HTTP requests to four.
---
 heartbeat/powervs-move-ip.in | 66 +++++++++++++++++++++++++-----------
 1 file changed, 47 insertions(+), 19 deletions(-)

diff --git a/heartbeat/powervs-move-ip.in b/heartbeat/powervs-move-ip.in
index d55979e52..e2250c998 100755
--- a/heartbeat/powervs-move-ip.in
+++ b/heartbeat/powervs-move-ip.in
@@ -50,11 +50,13 @@ RESOURCE_OPTIONS = (
"use_token_cache",
"monitor_api",
"device",
+ "iflabel",
"proxy",
)
IP_CMD = "/usr/sbin/ip"
+IFLABEL_MAX_LEN = 15 # Maximum character limit for interface labels
REQUESTS_TIMEOUT = 5 # Timeout for requests calls
-HTTP_MAX_RETRIES = 3 # Maximum number of retries for HTTP requests
+HTTP_MAX_RETRIES = 4 # Maximum number of retries for HTTP requests
HTTP_BACKOFF_FACTOR = 0.3 # Sleep (factor * (2^number of previous retries)) secs
HTTP_STATUS_FORCE_RETRIES = (500, 502, 503, 504) # HTTP status codes to retry on
HTTP_RETRY_ALLOWED_METHODS = frozenset({"GET", "POST", "PUT", "DELETE"})
@@ -154,13 +156,13 @@ def ip_check_device(device):
return False


-def ip_alias_add(ip, device):
+def ip_alias_add(ip, device, label=None):
"""Add an IP alias to the given device."""
ip_cidr = f"{ip}/{CIDR_NETMASK}"
ocf.logger.debug(
- f"[ip_alias_add]: adding IP alias '{ip_cidr}' to interface '{device}'"
+ f"[ip_alias_add]: adding IP alias '{ip_cidr}' with label '{label}' to interface '{device}'"
)
- _ = ip_address_add(ip_cidr, device)
+ _ = ip_address_add(ip_cidr, device, label)


def ip_alias_remove(ip):
@@ -522,6 +524,7 @@ class PowerCloudRoute(PowerCloudAPI):
region="",
route_host_map="",
device="",
+ iflabel="",
proxy="",
monitor_api="",
use_token_cache="",
@@ -543,6 +546,7 @@ class PowerCloudRoute(PowerCloudAPI):
self.route_info = self._get_route_info()
self.route_name = self.route_info["name"]
self.device = self._get_device_name(device)
+ self.iflabel = self._make_iflabel(iflabel)

def _get_ip_info(self, ip):
"""Validate the given IP address and return its standard form."""
@@ -588,7 +592,7 @@ class PowerCloudRoute(PowerCloudAPI):
nodename = (
hostname
if not self._is_remote_route
- else next((h for h in route_map if h != hostname), None)
+ else next((host for host in route_map if host != hostname), None)
)

if not nodename or nodename not in route_map:
@@ -646,6 +650,21 @@ class PowerCloudRoute(PowerCloudAPI):
ocf.OCF_ERR_CONFIGURED,
)

+ def _make_iflabel(self, label=None):
+ """Constructs an interface label in the format 'device:label' if both are provided."""
+ if not label or self._is_remote_route:
+ return None
+
+ iflabel = f"{self.device}:{label}"
+
+ if len(iflabel) > IFLABEL_MAX_LEN:
+ raise PowerCloudRouteError(
+ f"_make_iflabel: interface label '{iflabel}' exceeds limit of {IFLABEL_MAX_LEN} characters",
+ ocf.OCF_ERR_CONFIGURED,
+ )
+
+ return iflabel
+
def _set_route_enabled(self, enabled: bool):
"""Enable or disable the PowerVS network route."""
resource = f"/v1/routes/{self.route_id}"
@@ -706,6 +725,7 @@ def start_action(
use_token_cache="",
monitor_api="",
device="",
+ iflabel="",
proxy="",
):
"""Assign the service IP.
@@ -730,7 +750,7 @@ def start_action(
local_route = create_route_instance(resource_options)

# Add IP alias
- ip_alias_add(ip, local_route.device)
+ ip_alias_add(ip, local_route.device, local_route.iflabel)

# Enable local route
ocf.logger.debug(f"[start_action]: enabling local route '{local_route.route_name}'")
@@ -758,6 +778,7 @@ def stop_action(
use_token_cache="",
monitor_api="",
device="",
+ iflabel="",
proxy="",
):
"""Remove the service IP.
@@ -810,6 +831,7 @@ def monitor_action(
use_token_cache="",
monitor_api="",
device="",
+ iflabel="",
proxy="",
):
"""Monitor the service IP.
@@ -829,15 +851,11 @@ def monitor_action(
interface_name = ip_find_device(ip)

if not use_extended_monitor:
- if interface_name:
- ocf.logger.debug(
- f"[monitor_action]: IP alias '{ip}' is active'"
- )
+ if interface_name:
+ ocf.logger.debug(f"[monitor_action]: IP alias '{ip}' is active'")
return ocf.OCF_SUCCESS
- else:
- ocf.logger.debug(
- f"[monitor_action]: IP alias '{ip}' is not active"
- )
+ else:
+ ocf.logger.debug(f"[monitor_action]: IP alias '{ip}' is not active")
return ocf.OCF_NOT_RUNNING

remote_route = create_route_instance(
@@ -893,6 +911,7 @@ def validate_all_action(
use_token_cache="",
monitor_api="",
device="",
+ iflabel="",
proxy="",
):
"""Validate resource agent parameters.
@@ -914,12 +933,10 @@ def main():
Resource Agent to move an IP address from one Power Virtual Server instance to another.

Prerequisites:
- 1. Red Hat Enterprise Linux 9.4 or higher
-
- 2. Two-node cluster
+ 1. Two-node cluster
- Distributed across two PowerVS workspaces in separate data centers within the same region.

- 3. IBM Cloud API Key:
+ 2. IBM Cloud API Key:
- Create a service API key with privileges for both workspaces.
- Save the key in a file and copy it to both cluster nodes using the same path and filename.
- Reference the key file path in the resource definition.
@@ -932,7 +949,7 @@ def main():
"powervs-move-ip",
shortdesc="Manages Power Virtual Server overlay IP routes.",
longdesc=agent_description,
- version=1.00,
+ version=1.01,
)

agent.add_parameter(
@@ -1011,6 +1028,17 @@ def main():
default="",
required=False,
)
+ agent.add_parameter(
+ "iflabel",
+ shortdesc="Network interface label",
+ longdesc=(
+ "A custom suffix for the IP address label. "
+ "It is appended to the interface name in the format device:label. "
+ "The full label must not exceed 15 characters. "
+ ),
+ content_type="string",
+ required=False,
+ )
agent.add_parameter(
"proxy",
shortdesc="Proxy",
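For context on the 15-character limit enforced by _make_iflabel: Linux stores an alias label in the same IFNAMSIZ-sized buffer as the device name, so the combined device:label string must fit in 15 characters. A hedged shell sketch of the resulting ip invocation (the agent drives this through its ip_address_add helper; exact flags assumed):

    # "eth0:ha" is 7 characters, comfortably under the 15-character limit
    ip address add 192.0.2.10/32 dev eth0 label eth0:ha

    # list only the labelled alias
    ip -o address show label eth0:ha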
@ -0,0 +1,186 @@
From 1afdd91b2961061937fc802c575304ede8d79286 Mon Sep 17 00:00:00 2001
From: Carlo Lobrano <c.lobrano@gmail.com>
Date: Wed, 10 Sep 2025 16:56:56 +0200
Subject: [PATCH] podman-etcd: Add cluster-wide force_new_cluster attribute
 checking

Implement cluster-wide validation of force_new_cluster attribute to resolve
race conditions during automated cluster recovery. The enhancement ensures
agents check for the cluster-wide attribute before falling back to local
etcd revision comparison.

Key changes:
- Enhanced get_force_new_cluster() to query all cluster nodes
- Ensure force_new_cluster is not set on both nodes to prevent
conflicting recovery attempts
- Updated startup logic to prioritize cluster-wide attribute checking

fixes OCPBUGS-61117
---
 heartbeat/podman-etcd | 107 ++++++++++++++++++++++++++++--------------
 1 file changed, 72 insertions(+), 35 deletions(-)

diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index 33804414a..f3a6da5e2 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -794,54 +794,72 @@ set_force_new_cluster()
return $rc
}

+# get_force_new_cluster returns a space-separated list of nodes that have the force_new_cluster attribute set.
+# Return values:
+# - Exit code 0 with non-empty output: One or more nodes have the force_new_cluster attribute set
+# - Exit code 0 with empty output: No nodes have the force_new_cluster attribute set
+# - Exit code 1 with empty output: Error occurred while querying the cluster nodes
get_force_new_cluster()
{
- crm_attribute --lifetime reboot --query --name "force_new_cluster" | awk -F"value=" '{print $2}'
+ local node nodes value
+ local holders=""
+
+ if ! nodes=$(crm_node -l | awk '{print $2}'); then
+ ocf_log err "could not get force_new_cluster attribute, crm_node error code: $?"
+ return 1
+ fi
+ if [ -z "$nodes" ]; then
+ ocf_log err "could not get force_new_cluster attribute, the list of nodes is empty"
+ return 1
+ fi
+
+ for node in $nodes; do
+ if ! value=$(crm_attribute --query --lifetime reboot --name "force_new_cluster" --node "$node" 2>/dev/null | awk -F'value=' '{print $2}' | tr -d "'"); then
+ ocf_log err "could not get force_new_cluster attribute, crm_attribut error code: $?"
+ return 1
+ fi
+ if [ -n "$value" ]; then
+ holders="$holders$node "
+ fi
+ done
+ echo "$holders"
}

+
clear_force_new_cluster()
{
- local force_new_cluster_node
-
- force_new_cluster_node=$(get_force_new_cluster)
- if [ -z "$force_new_cluster_node" ]; then
- ocf_log info "$NODENAME: force_new_cluster attribute not set"
+ # only the holder of "force_new_cluster" attribute can delete it
+ if ! is_force_new_cluster; then
+ ocf_log info "force_new_cluster unset or not owned by $NODENAME"
return $OCF_SUCCESS
fi

- # only the holder of "force_new_cluster" attribute can delete it
- if [ "$NODENAME" = "$force_new_cluster_node" ]; then
- crm_attribute --lifetime reboot --name "force_new_cluster" --delete
- rc=$?
- if [ $rc -ne 0 ]; then
- ocf_log err "could not clear force_new_cluster attribute, error code: $rc"
- else
- ocf_log info "$NODENAME: force_new_cluster attribute cleared"
- fi
- return $rc
- else
- ocf_log info "$NODENAME does not hold force_new_cluster ($force_new_cluster_node has it)"
- return $OCF_SUCCESS
+ if ! crm_attribute --delete --lifetime reboot --node "$NODENAME" --name "force_new_cluster"; then
+ ocf_log err "could not clear force_new_cluster attribute, error code: $?"
+ return $OCF_ERR_GENERIC
fi
+
+ ocf_log info "$NODENAME: force_new_cluster attribute cleared"
+ return $OCF_SUCCESS
}

+
is_force_new_cluster()
{
- # Return 0 if 'force_new_cluster' is set and the value matches the current node name, 1 otherwise.
- local value
+ # Return 0 if 'force_new_cluster' is set on the current node, 1 otherwise.
+ local fnc_holders

- value=$(get_force_new_cluster)
- if [ -z "$value" ]; then
- ocf_log debug "force_new_cluster attribute is not set"
- return 1
+ if ! fnc_holders=$(get_force_new_cluster); then
+ ocf_exit_reason "is_force_new_cluster: Failed to get force_new_cluster node holders"
+ exit $OCF_ERR_GENERIC
fi

- if [ "$value" = "$NODENAME" ]; then
+ if echo "$fnc_holders" | grep -q -w "$NODENAME"; then
ocf_log debug "$NODENAME has force_new_cluster set"
return 0
fi

- ocf_log info "force_new_cluster attribute set on peer node $value"
+ ocf_log debug "force_new_cluster attribute is not set on $NODENAME"
return 1
}

@@ -1415,17 +1433,34 @@ podman_start()
return "$OCF_ERR_GENERIC"
fi

- # force-new-cluster property is a runtime-scoped flag that instructs the agent to force a new cluster-of-1.
- # Since this attribute is configured with a reboot-lifetime, it is automatically cleared when the machine reboots.
- # If the agent detects during its start that this property is set, it indicates that the flag was explicitly set
- # during the current node boot session, implying a deliberate request to recover the cluster.
if ocf_is_true "$pod_was_running"; then
ocf_log info "static pod was running: start normally"
else
- if is_force_new_cluster; then
- ocf_log notice "'$NODENAME' marked to force-new-cluster"
+ local fnc_holders
+ if ! fnc_holders=$(get_force_new_cluster); then
+ ocf_exit_reason "Failed to get force_new_cluster node holders"
+ return "$OCF_ERR_GENERIC"
+ fi
+
+ local fnc_holder_count
+ fnc_holder_count=$(echo "$fnc_holders" | wc -w)
+ if [ "$fnc_holder_count" -gt 1 ]; then
+ ocf_exit_reason "force_new_cluster attribute is set on multiple nodes ($fnc_holders)"
+ return "$OCF_ERR_GENERIC"
+ fi
+
+ if [ "$fnc_holder_count" -eq 1 ]; then
+ if echo "$fnc_holders" | grep -q -w "$NODENAME"; then
+ # Attribute is set on the local node.
+ ocf_log notice "$NODENAME marked to force-new-cluster"
+ JOIN_AS_LEARNER=false
+ else
+ # Attribute is set on a peer node.
+ ocf_log info "$NODENAME shall join as learner because force_new_cluster is set on peer $fnc_holders"
+ JOIN_AS_LEARNER=true
+ fi
else
- ocf_log info "'$NODENAME' is not marked to force-new-cluster"
+ ocf_log info "no node is marked to force-new-cluster"
# When the local agent starts, we can infer the cluster state by counting
# how many agents are starting or already active:
# - 1 active agent: it's the peer (we are just starting)
@@ -1522,7 +1557,7 @@ podman_start()
for try in $(seq $retries); do
learner_node=$(attribute_learner_node get)
if [ "$NODENAME" != "$learner_node" ]; then
- ocf_log info "$learner_node is not in the member list yet. Retry in $poll_interval_sec seconds."
+ ocf_log info "$NODENAME is not in the member list yet. Retry in $poll_interval_sec seconds."
sleep $poll_interval_sec
continue
fi
@@ -1673,6 +1708,8 @@ podman_stop()
{
local timeout=60
local rc
+
+ ocf_log notice "podman-etcd stop"
podman_simple_status
if [ $? -eq $OCF_NOT_RUNNING ]; then
ocf_log info "could not leave members list: etcd container not running"
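When debugging this logic, the holder of the attribute can be found by hand with the same per-node query the new get_force_new_cluster() issues; a sketch assuming a two-node cluster whose nodes are named node-a and node-b:

    for n in node-a node-b; do
        v=$(crm_attribute --query --lifetime reboot \
            --name force_new_cluster --node "$n" 2>/dev/null \
            | awk -F'value=' '{print $2}')
        [ -n "$v" ] && echo "$n holds force_new_cluster=$v"
    done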
@ -0,0 +1,36 @@
From 1e546b85010e5fdbf7a0f31207dce144c14c50ec Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Wed, 29 Oct 2025 15:17:30 +0100
Subject: [PATCH] MailTo: add s-nail support for multiple recipients

---
 heartbeat/MailTo | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/heartbeat/MailTo b/heartbeat/MailTo
index 56940bafaa..a3ee6a04c8 100755
--- a/heartbeat/MailTo
+++ b/heartbeat/MailTo
@@ -92,12 +92,16 @@ END
}

MailProgram() {
- $MAILCMD -s "$1" "$email" <<EOF
- $Subject
-
- Command line was:
- $ARGS
-EOF
+ local body="\
+$Subject
+
+Command line was:
+$ARGS"
+ if $MAILCMD -V | grep -q "^s-nail"; then
+ printf "$body" | $MAILCMD -s "$1" $(echo $email | sed "s/,\s*/ /g")
+ else
+ printf "$body" | $MAILCMD -s "$1" "$email"
+ fi
return $?
}

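The branching is needed because, as the patch implies, s-nail wants each recipient passed as a separate argument rather than as one comma-separated string. The deliberately unquoted command substitution performs that word splitting; its effect can be checked in isolation:

    email="alice@example.com, bob@example.com"
    printf '%s\n' $(echo $email | sed "s/,\s*/ /g")
    # alice@example.com
    # bob@example.com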
@ -0,0 +1,321 @@
From a31f15104fc712cd25f8a59d49f1bbcdbbbc5434 Mon Sep 17 00:00:00 2001
From: Carlo Lobrano <c.lobrano@gmail.com>
Date: Tue, 30 Sep 2025 11:54:44 +0200
Subject: [PATCH 1/2] Refactor(podman-etcd): improve peer checking and
 leadership loss detection

The check_peers function is broken up into smaller, more manageable
functions. This refactoring separates the logic for detecting a loss of
cluster leadership from the logic for managing peer membership.

The main function is renamed to check_peer as there is only 1 peer to
check (it was check_peers).
---
 heartbeat/podman-etcd | 78 +++++++++++++++++++++++++------------------
 1 file changed, 45 insertions(+), 33 deletions(-)

diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index f3a6da5e2..3d1e4c520 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -1014,42 +1014,35 @@ get_member_list_json() {
podman exec "${CONTAINER}" etcdctl member list --endpoints="$this_node_endpoint" -w json
}

-check_peers()
+detect_cluster_leadership_loss()
{
- # Check peers endpoint status and locally accessible member list
- local member_list_json
-
- if ! container_exists; then
- # we need a running container to execute etcdctl.
- return $OCF_SUCCESS
+ endpoint_status_json=$(get_endpoint_status_json)
+ ocf_log info "endpoint status: $endpoint_status_json"
+
+ count_endpoints=$(printf "%s" "$endpoint_status_json" | jq -r ".[].Endpoint" | wc -l)
+ if [ "$count_endpoints" -eq 1 ]; then
+ ocf_log info "one endpoint only: checking status errors"
+ endpoint_status_errors=$(printf "%s" "$endpoint_status_json" | jq -r ".[0].Status.errors")
+ if echo "$endpoint_status_errors" | grep -q "no leader"; then
+ set_force_new_cluster
+ set_standalone_node
+ ocf_exit_reason "$NODENAME must force a new cluster"
+ return $OCF_ERR_GENERIC
+ fi
+ if [ "$endpoint_status_errors" != "null" ]; then
+ ocf_log err "unmanaged endpoint status error: $endpoint_status_errors"
+ fi
fi

- member_list_json=$(get_member_list_json)
- rc=$?
- ocf_log debug "member list: $member_list_json"
- if [ $rc -ne 0 ]; then
- ocf_log info "podman failed to get member list, error code: $rc"
-
- endpoint_status_json=$(get_endpoint_status_json)
- ocf_log info "endpoint status: $endpoint_status_json"
-
- count_endpoints=$(printf "%s" "$endpoint_status_json" | jq -r ".[].Endpoint" | wc -l)
- if [ "$count_endpoints" -eq 1 ]; then
- ocf_log info "one endpoint only: checking status errors"
- endpoint_status_errors=$(printf "%s" "$endpoint_status_json" | jq -r ".[0].Status.errors")
- if echo "$endpoint_status_errors" | grep -q "no leader"; then
- set_force_new_cluster
- set_standalone_node
- ocf_exit_reason "$NODENAME must force a new cluster"
- return $OCF_ERR_GENERIC
- fi
- if [ "$endpoint_status_errors" != "null" ]; then
- ocf_log err "unmanaged endpoint status error: $endpoint_status_errors"
- fi
- fi
+ return $OCF_SUCCESS
+}

- return $OCF_SUCCESS
- fi
+manage_peer_membership()
+{
+ # Read etcd member list to detect the status of the peer member.
+ # If the peer is missing from the member list, it will be added back as learner
+ # If the peer is back in the member list, we ensure that the related CIB attributes (standalone and learner_node) are reset
+ local member_list_json="$1"

# Example of .members[] instance fields in member list json format:
# NOTE that "name" is present in voting members only, while "isLearner" in learner members only
@@ -1083,6 +1076,25 @@ check_peers()
clear_standalone_and_learner_if_not_learners "$member_list_json"
fi
done
+}
+
+check_peer()
+{
+ # Check peers endpoint status and locally accessible member list
+ local member_list_json
+
+ # we need a running container to execute etcdctl.
+ if ! container_exists; then
+ return $OCF_SUCCESS
+ fi
+
+ if ! member_list_json=$(get_member_list_json); then
+ ocf_log info "podman failed to get member list, error code: $?"
+ detect_cluster_leadership_loss
+ return $?
+ fi
+
+ manage_peer_membership "$member_list_json"
return $OCF_SUCCESS
}

@@ -1124,7 +1136,7 @@ podman_monitor()
# monitor operation to fail.
# TODO: move this inside check_peers where we already query member list json
attribute_node_member_id update
- if ! check_peers; then
+ if ! check_peer; then
return $OCF_ERR_GENERIC
fi


From de7c73a933cefb8f7b9e810bd23c3d12f6d6f29a Mon Sep 17 00:00:00 2001
From: Carlo Lobrano <c.lobrano@gmail.com>
Date: Tue, 30 Sep 2025 18:38:06 +0200
Subject: [PATCH 2/2] OCPBUGS-42808: podman-etcd: add automatic learner member
 promotion

Automatically promote etcd learner members to voting members when detected.
Includes refactored member management functions and improved validation.
---
 heartbeat/podman-etcd | 108 ++++++++++++++++++++++++++++++------------
 1 file changed, 79 insertions(+), 29 deletions(-)

diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index 3d1e4c520..e1425ec02 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -712,6 +712,22 @@ attribute_node_revision_peer()
crm_attribute --query --type nodes --node "$nodename" --name "revision" | awk -F"value=" '{print $2}'
}

+# Converts a decimal number to hexadecimal format with validation
+# Args: $1 - decimal number (test for non-negative integer too)
+# Returns: 0 on success, OCF_ERR_GENERIC on invalid input
+# Outputs: hexadecimal representation to stdout
+decimal_to_hex() {
+ local dec=$1
+
+ if ! echo "$dec" | grep -q "^[1-9][0-9]*$"; then
+ ocf_log err "Invalid member ID format: '$dec' (expected decimal number)"
+ return $OCF_ERR_GENERIC
+ fi
+
+ printf "%x" "$dec"
+ return $OCF_SUCCESS
+}
+
attribute_node_member_id()
{
local action="$1"
@@ -737,7 +753,7 @@ attribute_node_member_id()
return "$rc"
fi

- local value
+ local value value_hex
if ! value=$(echo -n "$member_list_json" | jq -r ".header.member_id"); then
rc=$?
ocf_log err "could not get $attribute from member list JSON, error code: $rc"
@@ -745,8 +761,11 @@ attribute_node_member_id()
fi

# JSON member_id is decimal, while etcdctl command needs the hex version
- value=$(printf "%x" "$value")
- if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value"; then
+ if ! value_hex=$(decimal_to_hex "$value"); then
+ ocf_log err "could not convert decimal member_id '$value' to hex, error code: $?"
+ return $OCF_ERR_GENERIC
+ fi
+ if ! crm_attribute --type nodes --node "$NODENAME" --name "$attribute" --update "$value_hex"; then
rc=$?
ocf_log err "could not update etcd $attribute, error code: $rc"
return "$rc"
@@ -905,42 +924,70 @@ clear_standalone_node()
crm_attribute --name "standalone_node" --delete
}

-clear_standalone_and_learner_if_not_learners()
+
+# Promotes an etcd learner member to a voting member
+# Args: $1 - learner member ID in decimal format
+# Returns: OCF_SUCCESS (even on expected promotion failures), OCF_ERR_GENERIC on conversion errors
+# Note: Promotion failures are expected and logged as info (peer may not be up-to-date)
+promote_learner_member()
+{
+ local learner_member_id=$1
+
+ # JSON member_id is decimal, while etcdctl command needs the hex version
+ if ! learner_member_id_hex=$(decimal_to_hex "$learner_member_id"); then
+ ocf_log err "could not convert decimal member_id '$learner_member_id' to hex, error code: $?"
+ return $OCF_ERR_GENERIC
+ fi
+ if ! ocf_run podman exec "${CONTAINER}" etcdctl member promote "$learner_member_id_hex" 2>&1; then
+ # promotion is expected to fail if the peer is not yet up-to-date
+ ocf_log info "could not promote member $learner_member_id_hex, error code: $?"
+ return $OCF_SUCCESS
+ fi
+ ocf_log info "successfully promoted member '$learner_member_id_hex'"
+ return $OCF_SUCCESS
+}
+
+# Reconciles etcd cluster member states
+# Promotes learner members or clears standalone/learner attributes as needed
+# Args: $1 - member list JSON from etcdctl
+# Returns: OCF_SUCCESS on completion, OCF_ERR_GENERIC on errors
+# Note: Only operates when exactly 2 started members are present
+reconcile_member_state()
{
local rc
local member_list_json="$1"

- number_of_members=$(printf "%s" "$member_list_json" | jq -r ".members[].ID" | wc -l)
- if [ "$number_of_members" -ne 2 ]; then
- ocf_log info "could not clear standalone_node, nor learner_node properties: found $number_of_members members, need 2"
+ # count only the started members, which have the ".name" JSON field
+ number_of_started_members=$(printf "%s" "$member_list_json" | jq -r ".members[].name | select(. != null)" | wc -l)
+ if [ "$number_of_started_members" -ne 2 ]; then
+ ocf_log info "could not clear standalone_node, nor learner_node properties: found $number_of_started_members members, need 2"
return $OCF_SUCCESS
fi

- id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .isLearner==true ).ID")
+ learner_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .isLearner==true ).ID")
rc=$?
if [ $rc -ne 0 ]; then
ocf_log err "could not get isLearner field from member list, error code: $rc"
return $rc
fi

- if [ -z "$id" ]; then
- clear_standalone_node
- rc=$?
- if [ $rc -ne 0 ]; then
- ocf_og error "could not clear standalone_node attribute, error code: $rc"
- return $rc
- fi
+ if [ -n "$learner_member_id" ]; then
+ promote_learner_member "$learner_member_id"
+ return $?
fi
- if [ -z "$id" ]; then
- attribute_learner_node clear
- rc=$?
- if [ $rc -ne 0 ]; then
- ocf_og error "could not clear learner_node attribute, error code: $rc"
- return $rc
+
+ if [ -z "$learner_member_id" ]; then
+ if ! clear_standalone_node; then
+ ocf_log error "could not clear standalone_node attribute, error code: $?"
+ return $OCF_ERR_GENERIC
+ fi
+ if ! attribute_learner_node clear; then
+ ocf_log error "could not clear learner_node attribute, error code: $?"
+ return $OCF_ERR_GENERIC
fi
fi

- return $rc
+ return $OCF_SUCCESS
}

attribute_learner_node()
@@ -1019,7 +1066,7 @@ detect_cluster_leadership_loss()
endpoint_status_json=$(get_endpoint_status_json)
ocf_log info "endpoint status: $endpoint_status_json"

- count_endpoints=$(printf "%s" "$endpoint_status_json" | jq -r ".[].Endpoint" | wc -l)
+ count_endpoints=$(printf "%s" "$endpoint_status_json" | jq -r ".[].Endpoint" | wc -l)
if [ "$count_endpoints" -eq 1 ]; then
ocf_log info "one endpoint only: checking status errors"
endpoint_status_errors=$(printf "%s" "$endpoint_status_json" | jq -r ".[0].Status.errors")
@@ -1037,11 +1084,14 @@ detect_cluster_leadership_loss()
return $OCF_SUCCESS
}

+
+# Manages etcd peer membership by detecting and handling missing or rejoining peers
+# Adds missing peers as learners and reconciles member states when peers rejoin
+# Args: $1 - member list JSON from etcdctl
+# Returns: OCF_SUCCESS on completion, OCF_ERR_GENERIC on errors
+# Note: Iterates through all peer nodes to ensure proper cluster membership
manage_peer_membership()
{
- # Read etcd member list to detect the status of the peer member.
- # If the peer is missing from the member list, it will be added back as learner
- # If the peer is back in the member list, we ensure that the related CIB attributes (standalone and learner_node) are reset
local member_list_json="$1"

# Example of .members[] instance fields in member list json format:
@@ -1066,14 +1116,14 @@ manage_peer_membership()

# Check by IP instead of Name since "learner" members appear only in peerURLs, not by Name.
ip=$(echo "$node" | cut -d: -f2-) # Grab everything after the first : this covers ipv4/ipv6
- id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$ip\")) | any).ID")
- if [ -z "$id" ]; then
+ peer_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$ip\")) | any).ID")
+ if [ -z "$peer_member_id" ]; then
ocf_log info "$name is not in the members list"
add_member_as_learner "$name" "$ip"
set_standalone_node
else
ocf_log debug "$name is in the members list by IP: $ip"
- clear_standalone_and_learner_if_not_learners "$member_list_json"
+ reconcile_member_state "$member_list_json"
fi
done
}
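decimal_to_hex exists because etcd's JSON output reports member IDs in decimal while etcdctl subcommands expect them in hex. The conversion itself is a plain printf; for example, extracting the local member ID from the member-list JSON (a sketch, assuming the default container name and a reachable endpoint):

    member_id_dec=$(podman exec etcd etcdctl member list -w json \
        | jq -r '.header.member_id')
    member_id_hex=$(printf '%x' "$member_id_dec")
    echo "local member: $member_id_dec (decimal) = $member_id_hex (hex)"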
135
SOURCES/RHEL-121986-Filesystem-speed-up-get-PIDs.patch
Normal file
@ -0,0 +1,135 @@
From 93729d83fa5bf15f4ec694e08e9777bde858fb41 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Thu, 16 Oct 2025 10:58:37 +0200
Subject: [PATCH 1/2] Filesystem: speed up get_pids

With force_umount=safe, we "manually" scan the /proc/ file system.

We look for symlinks pointing into the path we are interested in.
Specifically, we are interested in
/proc/<pid>/{root,exe,cwd}
/proc/<pid>/fd/<fd>
We also look for relevant memory mappings in /proc/<pid>/maps

All these are per process, not per "task" or "thread".
see procfs(5) and pthreads(7).
Still, we currently also scan /proc/<pid>/task/<tid>/
for all the same things.

With a large system with many heavily threaded processes,
this can significantly slow down this scanning,
without gaining new information.

Adding -maxdepth to the find command line avoids this useless work,
potentially reducing the scanning time by orders of magnitude
on systems with many heavily threaded processes.

We could also write a dedicated helper in C to do the very same thing,
with the option to "short circuit" and proceed with the next pid
as soon as the first "match" is found for the currently inspected pid.

That could further reduce the scanning time
by about an additional factor of 10.
---
 heartbeat/Filesystem | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
index 6d3960162..f76339fd6 100755
--- a/heartbeat/Filesystem
+++ b/heartbeat/Filesystem
@@ -680,14 +680,31 @@ get_pids()
# -path "/proc/[!0-9]*" -prune -o ...
# -path "/proc/[0-9]*" -a ...
# the latter seemd to be significantly faster for this one in my naive test.
+
+ # root, cwd, exe, maps, fd: all per process, not per task ("thread").
+ # -maxdepth to avoid repeatedly scanning the same thing
+ # for all threads of a heavily threaded process.
+ #
+ # Adding -maxdepth reduced scanning from > 16 seconds to < 2 seconds
+ # on a mostly idle system that happened to run a few java processes.
+ #
+ # We can also add a dedicated helper in C do twhat is done below,
+ # which would reduce the scanning time by an
+ # additional factor of 10 again.
+ #
+ # Or trust that fuser (above) learned something in the last 15 years
+ # and avoids blocking operations meanwhile?
procs=$(exec 2>/dev/null;
- find /proc -path "/proc/[0-9]*" -type l \( -lname "${dir}/*" -o -lname "${dir}" \) -print |
+ find /proc -mindepth 1 -maxdepth 3 \
+ -path "/proc/[0-9]*" \
+ -type l \( -lname "${dir}/*" -o -lname "${dir}" \) -print |
awk -F/ '{print $3}' | uniq)

- # This finds both /proc/<pid>/maps and /proc/<pid>/task/<tid>/maps;
- # if you don't want the latter, add -maxdepth.
+ # memory mappings are also per process, not per task.
+ # This finds only /proc/<pid>/maps, and not /proc/<pid>/task/<tid>/maps;
+ # if you also want the latter, drop -maxdepth.
mmap_procs=$(exec 2>/dev/null;
- find /proc -path "/proc/[0-9]*/maps" -print |
+ find /proc -mindepth 2 -maxdepth 2 -path "/proc/[0-9]*/maps" -print |
xargs -r grep -l " ${dir}/" | awk -F/ '{print $3}' | uniq)
printf "${procs}\n${mmap_procs}" | sort -u
fi

From 3d34db0c60a125126361b45ff8303358b6275298 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Thu, 16 Oct 2025 11:31:00 +0200
Subject: [PATCH 2/2] Filesystem: further speed up get_pids

If we have /proc/<pid>/map_files/* symlinks,
we don't need to additionally grep /proc/<pid>/maps.

Also don't first collect output of commands into variables
just to pipe them to sort -u later,
just pipe the output of the commands through sort -u directly.
---
 heartbeat/Filesystem | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
index f76339fd6..7021f13da 100755
--- a/heartbeat/Filesystem
+++ b/heartbeat/Filesystem
@@ -694,19 +694,26 @@ get_pids()
#
# Or trust that fuser (above) learned something in the last 15 years
# and avoids blocking operations meanwhile?
- procs=$(exec 2>/dev/null;
- find /proc -mindepth 1 -maxdepth 3 \
- -path "/proc/[0-9]*" \
- -type l \( -lname "${dir}/*" -o -lname "${dir}" \) -print |
- awk -F/ '{print $3}' | uniq)
-
- # memory mappings are also per process, not per task.
- # This finds only /proc/<pid>/maps, and not /proc/<pid>/task/<tid>/maps;
- # if you also want the latter, drop -maxdepth.
- mmap_procs=$(exec 2>/dev/null;
+ (
+ # If you want to debug this, drop this redirection.
+ # But it producess too much "No such file" noise for kernel
+ # threads or due to races with exiting processes or closing fds.
+ exec 2>/dev/null;
+ find /proc -mindepth 1 -maxdepth 3 \
+ -path "/proc/[0-9]*" \
+ -type l \( -lname "${dir}/*" -o -lname "${dir}" \) -print |
+ awk -F/ '{print $3}' | uniq
+
+ # If we have "map_files/", "find" above already found the
+ # relevant symlinks, and we don't need to grep "maps" below.
+ # Available since kernel 3.3, respectively 4.3.
+ test -d /proc/$$/map_files ||
+ # memory mappings are also per process, not per task.
+ # This finds only /proc/<pid>/maps, and not /proc/<pid>/task/<tid>/maps;
+ # if you also want the latter, drop -maxdepth.
find /proc -mindepth 2 -maxdepth 2 -path "/proc/[0-9]*/maps" -print |
- xargs -r grep -l " ${dir}/" | awk -F/ '{print $3}' | uniq)
- printf "${procs}\n${mmap_procs}" | sort -u
+ xargs -r grep -l " ${dir}/" | awk -F/ '{print $3}' | uniq
+ ) | sort -u
fi
}

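The effect of the -mindepth/-maxdepth bounds is easy to measure on any running system; a hedged comparison (timings will vary with process and thread counts):

    # unbounded: also descends into every /proc/<pid>/task/<tid>/ subtree
    time find /proc -path "/proc/[0-9]*" -type l >/dev/null 2>&1

    # bounded: stops at /proc/<pid>/{root,exe,cwd} and /proc/<pid>/fd/<fd>
    time find /proc -mindepth 1 -maxdepth 3 -path "/proc/[0-9]*" -type l >/dev/null 2>&1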
166
SOURCES/RHEL-123887-podman-etcd-certificate-rotation.patch
Normal file
@ -0,0 +1,166 @@
From 6bfbe1dc3a0dad234decd77330ca6189e932bb89 Mon Sep 17 00:00:00 2001
From: ehila <ehila@redhat.com>
Date: Thu, 16 Oct 2025 23:39:32 -0400
Subject: [PATCH] feat: add support for podman-etcd cert rotation

added a cert check function to the monitor call to force a restart of etcd when the certs have been changed

Signed-off-by: ehila <ehila@redhat.com>
---
 heartbeat/podman-etcd | 87 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 86 insertions(+), 1 deletion(-)

diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index e1425ec02..b8dfb2f9e 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -40,6 +40,7 @@
# Parameter defaults
OCF_RESKEY_image_default="default"
OCF_RESKEY_pod_manifest_default="/etc/kubernetes/static-pod-resources/etcd-certs/configmaps/external-etcd-pod/pod.yaml"
+OCF_RESKEY_etcd_certs_dir_default="/etc/kubernetes/static-pod-resources/etcd-certs"
OCF_RESKEY_name_default="etcd"
OCF_RESKEY_nic_default="br-ex"
OCF_RESKEY_authfile_default="/var/lib/kubelet/config.json"
@@ -51,6 +52,7 @@ OCF_RESKEY_backup_location_default="/var/lib/etcd"

: ${OCF_RESKEY_image=${OCF_RESKEY_image_default}}
: ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}}
+: ${OCF_RESKEY_etcd_certs_dir=${OCF_RESKEY_etcd_certs_dir_default}}
: ${OCF_RESKEY_name=${OCF_RESKEY_name_default}}
: ${OCF_RESKEY_nic=${OCF_RESKEY_nic_default}}
: ${OCF_RESKEY_authfile=${OCF_RESKEY_authfile_default}}
@@ -88,6 +90,15 @@ The Pod manifest with the configuration for Etcd.
<content type="string" default="${OCF_RESKEY_pod_manifest_default}"/>
</parameter>

+<parameter name="etcd_certs_dir" required="0" unique="0">
+<longdesc lang="en">
+The Etcd certificates directory mounted into the etcd container.
+The agent will monitor this directory for changes and restart the etcd container if the certificates have changed.
+</longdesc>
+<shortdesc lang="en">Etcd certificates directory</shortdesc>
+<content type="string" default="${OCF_RESKEY_etcd_certs_dir_default}"/>
+</parameter>
+
<parameter name="image" required="0" unique="0">
<longdesc lang="en">
The podman image to base this container off of.
@@ -289,6 +300,59 @@ Expects to have a fully populated OCF RA-compliant environment set.
END
}

+etcd_certificates_hash_manager()
+{
+ local action="$1"
+ local current_hash
+ local stored_hash
+
+ # If the certs directory doesn't exist, consider it unchanged
+ if [ ! -d "$OCF_RESKEY_etcd_certs_dir" ]; then
+ ocf_log warn "certificates directory $OCF_RESKEY_etcd_certs_dir does not exist, skipping certificate monitoring"
+ return $OCF_SUCCESS
+ fi
+
+ # Calculate hash of all certificate files, ignore key files to avoid accidental disclosure of sensitive information
+ # we only need to monitor the certificate files to detect changes.
+ if ! current_hash=$(find "$OCF_RESKEY_etcd_certs_dir" -type f \( -name "*.crt" \) -exec sha256sum {} \; | sort | sha256sum | cut -d' ' -f1); then
+ ocf_log err "failed to calculate certificate files hash"
+ return $OCF_ERR_GENERIC
+ fi
+
+ # If no stored hash exists, create one and return success
+ if [ ! -f "$ETCD_CERTS_HASH_FILE" ]; then
+ echo "$current_hash" > "$ETCD_CERTS_HASH_FILE"
+ ocf_log info "created initial certificate hash: $current_hash"
+ return $OCF_SUCCESS
+ fi
+
+ case "$action" in
+ "update")
+ if ! echo "$current_hash" > "$ETCD_CERTS_HASH_FILE"; then
+ ocf_log err "failed to update certificate hash file $ETCD_CERTS_HASH_FILE"
+ fi
+ ocf_log info "updated certificate hash: $current_hash"
+ ;;
+ "check")
+ if ! stored_hash=$(cat "$ETCD_CERTS_HASH_FILE"); then
+ ocf_log err "failed to read stored certificate hash from $ETCD_CERTS_HASH_FILE"
+ # This should not happen but if for some reason we can not read the stored hash,
+ # use the current hash and log the error but allow etcd to run as long as possible.
+ stored_hash="$current_hash"
+ fi
+ if [ "$current_hash" != "$stored_hash" ]; then
+ ocf_exit_reason "$NODENAME etcd certificate files have changed (stored: $stored_hash, current: $current_hash)"
+ return $OCF_ERR_GENERIC
+ fi
+ ;;
+ *)
+ ocf_log err "unsupported action: $action"
+ return $OCF_ERR_GENERIC
+ ;;
+ esac
+
+ return $OCF_SUCCESS
+}

monitor_cmd_exec()
{
@@ -357,7 +421,7 @@ archive_current_container()

# archive corresponding etcd configuration files
local files_to_archive=""
- for file in "$OCF_RESKEY_authfile" "$POD_MANIFEST_COPY" "$ETCD_CONFIGURATION_FILE"; do
+ for file in "$OCF_RESKEY_authfile" "$POD_MANIFEST_COPY" "$ETCD_CONFIGURATION_FILE" "$ETCD_CERTS_HASH_FILE"; do
if [ -f "$file" ]; then
files_to_archive="$files_to_archive $file"
else
@@ -1178,6 +1242,11 @@ podman_monitor()
return $rc
fi

+ # Check if certificate files have changed, if they have, etcd needs to be restarted
+ if ! etcd_certificates_hash_manager "check"; then
+ return $OCF_ERR_GENERIC
+ fi
+
if is_learner; then
ocf_log info "$NODENAME is learner. Cannot get member id"
return "$OCF_SUCCESS"
@@ -1483,6 +1552,14 @@ podman_start()
return $OCF_ERR_GENERIC
fi

+ # Update the certificate hash after the container has started successfully
+ # this is to ensure that the certificate hash is updated after a restart is initiated
+ # by a cert rotation event from the monitor command.
+ if ! etcd_certificates_hash_manager "update"; then
+ ocf_exit_reason "etcd certificate hash manager failed to update the certificate hash"
+ return $OCF_ERR_GENERIC
+ fi
+
# check if the container has already started
podman_simple_status
if [ $? -eq $OCF_SUCCESS ]; then
@@ -1888,6 +1965,13 @@ podman_validate()
exit $OCF_ERR_CONFIGURED
fi

+ if ! echo "validation test" > "$ETCD_CERTS_HASH_FILE" \
+ || ! cat "$ETCD_CERTS_HASH_FILE" >/dev/null 2>&1 \
+ || ! rm "$ETCD_CERTS_HASH_FILE"; then
+ ocf_exit_reason "cannot read/write to certificate hash file $ETCD_CERTS_HASH_FILE"
+ exit $OCF_ERR_GENERIC
+ fi
+
return $OCF_SUCCESS
}

@@ -1922,6 +2006,7 @@ CONTAINER=$OCF_RESKEY_name
POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml"
ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml"
ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz"
+ETCD_CERTS_HASH_FILE="${OCF_RESKEY_config_location}/certs.hash"

# Note: we currently monitor podman containers by with the "podman exec"
# command, so make sure that invocation is always valid by enforcing the
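The hash scheme above is order-independent because the per-file sha256sum lines are sorted before the outer hash is taken. The same pipeline can be run by hand to predict whether the monitor would flag a rotation (directory taken from the parameter default; the stored-hash location depends on the agent's config_location parameter):

    certs=/etc/kubernetes/static-pod-resources/etcd-certs
    find "$certs" -type f -name "*.crt" -exec sha256sum {} \; \
        | sort | sha256sum | cut -d' ' -f1
    # compare against the stored value in <config_location>/certs.hash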
@ -0,0 +1,115 @@
From 6a5608f02a657cf006b6d44d31200342c4bd19b9 Mon Sep 17 00:00:00 2001
From: Carlo Lobrano <c.lobrano@gmail.com>
Date: Tue, 28 Oct 2025 12:47:10 +0100
Subject: [PATCH] podman-etcd: compute dynamic revision bump from maxRaftIndex
 (#2087)

Replace hardcoded 1 billion revision bump with dynamic calculation based
on 20% of the last known maxRaftIndex from revision.json.

This aligns with the logic used by cluster-etcd-operator's
quorum-restore-pod utility and ensures the bump amount is proportional
to the cluster's actual revision state.

The implementation:
- Adds compute_bump_revision() function with safe fallback to 1bn
  default
- Extracts magic values to named constants
  (ETCD_REVISION_BUMP_PERCENTAGE, ETCD_BUMP_REV_DEFAULT,
  ETCD_REVISION_JSON)
- Validates computed values (non-zero, not exceeding default)
- Logs computation results for debugging

Reference:
https://github.com/openshift/cluster-etcd-operator/blob/215998939f5223da916622c71fd07d17656faf6b/bindata/etcd/quorum-restore-pod.yaml#L26-L34
---
 heartbeat/podman-etcd | 38 ++++++++++++++++++++++++++++++++++----
 1 file changed, 34 insertions(+), 4 deletions(-)

diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index b8dfb2f9e..551d37a20 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -619,16 +619,43 @@ prepare_env() {
     LISTEN_METRICS_URLS="0.0.0.0"
 }
 
+compute_bump_revision() {
+    # Same logic used by cluster-etcd-operator quorum-restore-pod utility.
+    # see https://github.com/openshift/cluster-etcd-operator/blob/215998939f5223da916622c71fd07d17656faf6b/bindata/etcd/quorum-restore-pod.yaml#L26-L34
+    # set a default value: 1bn would be an etcd running at 1000 writes/s for about eleven days.
+    BUMP_REV=$ETCD_BUMP_REV_DEFAULT
+    if [ ! -f "${ETCD_REVISION_JSON}" ]; then
+        ocf_log err "could not compute bump revision: ${ETCD_REVISION_JSON} not found. Defaulting to ${ETCD_BUMP_REV_DEFAULT} revision bump"
+        return
+    fi
+
+    # this will bump by the amount of 20% of the last known live revision.
+    if ! COMPUTED_BUMP=$(jq -r "(.maxRaftIndex*${ETCD_REVISION_BUMP_PERCENTAGE}|floor)" "${ETCD_REVISION_JSON}"); then
+        ocf_log err "could not compute maxRaftIndex for bump revision, jq error code: $?. Defaulting to ${ETCD_BUMP_REV_DEFAULT} revision bump"
+        return
+    fi
+
+    if [ -z "${COMPUTED_BUMP}" ] || [ "${COMPUTED_BUMP}" -le 0 ] || [ "${COMPUTED_BUMP}" -gt "${ETCD_BUMP_REV_DEFAULT}" ]; then
+        ocf_log err "computed bump revision (${COMPUTED_BUMP}) is invalid. Defaulting to ${ETCD_BUMP_REV_DEFAULT} revision bump"
+        return
+    fi
+
+    BUMP_REV="${COMPUTED_BUMP}"
+    ocf_log info "bumping etcd revisions by ${BUMP_REV}"
+}
 
 generate_etcd_configuration() {
     if is_force_new_cluster; then
+        compute_bump_revision
         # The embedded newline is required for correct YAML formatting.
         FORCE_NEW_CLUSTER_CONFIG="force-new-cluster: true
-force-new-cluster-bump-amount: 1000000000"
+force-new-cluster-bump-amount: $BUMP_REV"
     else
         FORCE_NEW_CLUSTER_CONFIG="force-new-cluster: false"
     fi
 
+    # the space indentation for client-transport-security and peer-transport-security
+    # is required for correct YAML formatting.
     cat > "$ETCD_CONFIGURATION_FILE" << EOF
 logger: zap
 log-level: info
@@ -707,7 +734,7 @@ attribute_node_cluster_id()
 {
     local action="$1"
     local value
-    if ! value=$(jq -r ".clusterId" /var/lib/etcd/revision.json); then
+    if ! value=$(jq -r ".clusterId" "$ETCD_REVISION_JSON"); then
         rc=$?
         ocf_log err "could not get cluster_id, error code: $rc"
         return "$rc"
@@ -745,7 +772,7 @@ attribute_node_revision()
     local value
     local attribute="revision"
 
-    if ! value=$(jq -r ".maxRaftIndex" /var/lib/etcd/revision.json); then
+    if ! value=$(jq -r ".maxRaftIndex" "$ETCD_REVISION_JSON"); then
         rc=$?
         ocf_log err "could not get $attribute, error code: $rc"
         return "$rc"
@@ -1456,7 +1483,7 @@ can_reuse_container() {
 
 
     # If the container does not exist it cannot be reused
-    if ! container_exists; then
+    if ! container_exists; then
         OCF_RESKEY_reuse=0
         return "$OCF_SUCCESS"
     fi
@@ -2006,6 +2033,9 @@ CONTAINER=$OCF_RESKEY_name
 POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml"
 ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml"
 ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz"
+ETCD_REVISION_JSON="/var/lib/etcd/revision.json"
+ETCD_REVISION_BUMP_PERCENTAGE=0.2
+ETCD_BUMP_REV_DEFAULT=1000000000
 ETCD_CERTS_HASH_FILE="${OCF_RESKEY_config_location}/certs.hash"
 
 # Note: we currently monitor podman containers by with the "podman exec"
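As a rough sanity check of the computation above (the revision.json content below is invented for illustration), the same jq filter can be run by hand to see the 20% bump and how it compares with the 1bn fallback:

    # Hypothetical revision.json, for illustration only.
    cat > /tmp/revision.json << 'EOF'
    {"clusterId": 1234, "maxRaftIndex": 750000}
    EOF

    # Same filter as compute_bump_revision(): floor(maxRaftIndex * 0.2)
    jq -r '(.maxRaftIndex*0.2|floor)' /tmp/revision.json
    # prints 150000, which is non-zero and below the 1000000000 default,
    # so it would be used as the bump amount
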
@ -0,0 +1,222 @@
From e8fb2ad9cc14e91b74b5cde1e012d92afcddb1a5 Mon Sep 17 00:00:00 2001
From: Carlo Lobrano <c.lobrano@gmail.com>
Date: Sat, 25 Oct 2025 17:27:42 +0200
Subject: [PATCH] podman-etcd: add container crash detection with coordinated
 recovery

This change prevents the agent from starting prematurely when the etcd
container has failed. Previously, an early start would cause the agent
to block while waiting for peer-initiated recovery. This blocking
prevented Pacemaker from allowing the surviving agent to stop and
properly recover the cluster.

The change introduces `container_health_check` function to monitor the
container's state and catch etcd failures. This check uses a state file
to distinguish between a planned shutdown and an unexpected failure:

* Container Running: The state file is created or updated with the
  current epoch (timestamp). Returns: "healthy".
* Container Not Running + No State File: It's the first check. Returns:
  "not-running".
* Container Not Running + State File: An unexpected failure is detected.
  * If force_new_cluster is set, the status is: "failed-restart-now".
  * Otherwise, the status is: "failed-wait-for-peer".

The state file is written in a temporary directory (HA_RSCTMP) to ensure
automatic cleanup on reboot. It is also explicitly removed in
`podman_start` and `podman_stop` to mark planned transitions.

A new helper function `get_time_since_last_heartbeat()` calculates
elapsed time since the last healthy check for diagnostic logging.

Monitor behavior changes:
* failed-wait-for-peer: Returns OCF_SUCCESS to keep resource running
  while waiting for peer-initiated recovery, as the agent is not able
  to recover the cluster from a failed state.
* failed-restart-now: Returns OCF_ERR_GENERIC to trigger restart once
  peer has set force_new_cluster
---
 heartbeat/podman-etcd | 133 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 124 insertions(+), 9 deletions(-)

diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index b8dfb2f9e..d596c6f2a 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -1226,22 +1226,122 @@ podman_simple_status()
     return $rc
 }
 
-podman_monitor()
+# get_time_since_last_heartbeat returns the time in seconds since the heartbeat file was last updated.
+#
+# Returns: time in seconds since last heartbeat, or empty string if file doesn't exist
+get_time_since_last_heartbeat()
 {
+    local last_heartbeat
+
+    if [ ! -f "$CONTAINER_HEARTBEAT_FILE" ]; then
+        return
+    fi
+
+    last_heartbeat=$(cat "$CONTAINER_HEARTBEAT_FILE")
+    echo $(($(date +%s) - last_heartbeat))
+}
+
+# container_health_check performs comprehensive health monitoring for the container.
+# This function allows coordinated failure handling where the agent waits for
+# peer-initiated cluster recovery in case of container failure.
+#
+# Uses a state file to track container state:
+# - Container running: Update state file with current epoch, return "healthy"
+# - Container not running + no state file: Return "not-running" (never checked before)
+# - Container not running + state file: Failure detected, check force_new_cluster
+#   - If force_new_cluster set: Return "failed-restart-now"
+#   - Otherwise: Return "failed-wait-for-peer"
+#
+# Returns: healthy, not-running, failed-restart-now, failed-wait-for-peer
+
+container_health_check()
+{
+    local rc
+
     # We rely on running podman exec to monitor the container
     # state because that command seems to be less prone to
     # performance issue under IO load.
     #
     # For probes to work, we expect cmd_exec to be able to report
-    # when a container is not running. Here, we're not interested
-    # in distinguishing whether it's stopped or non existing
-    # (there's function container_exists for that)
+    # when a container is not running. Here, we're not interested
+    # in distinguishing whether it's stopped or non existing
+    # (there's function container_exists for that)
+    # For monitor, however, we still need to know if it has stopped
+    # recently (i.e. a failure), or not (fresh start)
     monitor_cmd_exec
     rc=$?
-    if [ $rc -ne 0 ]; then
-        return $rc
+    if [ "$rc" -eq 0 ]; then
+        # Container is running - update state file with current epoch
+        local current_epoch
+        current_epoch=$(date +%s)
+        if ! echo "$current_epoch" > "$CONTAINER_HEARTBEAT_FILE"; then
+            ocf_log warn "Failed to update container heartbeat file, error code: $?"
+            # wait for peer to detect any real issue with the etcd cluster or wait for the
+            # next monitor interval
+            echo "failed-wait-for-peer"
+            return
+        fi
+        echo "healthy"
+        return
     fi
 
+    # Check if state file exists (was container running on last check?)
+    if [ ! -f "$CONTAINER_HEARTBEAT_FILE" ]; then
+        # No state file - container was never checked before
+        ocf_log debug "Container ${CONTAINER} has no previous state"
+        echo "not-running"
+        # NOTE: this is where the probe is expected to exit, keeping the logic
+        # quick and less prone to performance issue under IO load.
+        return
+    fi
+
+    # State file exists - the container failed, check recovery status in this lifecycle
+    local time_since_heartbeat
+    time_since_heartbeat=$(get_time_since_last_heartbeat)
+    ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago)"
+
+    # Check if peer has set force_new_cluster for recovery
+    local fnc_holders
+    if ! fnc_holders=$(get_force_new_cluster); then
+        ocf_log err "Could not detect peer-initiated recovery. Checking again in the next monitor cycle"
+        echo "failed-wait-for-peer"
+        return
+    fi
+
+    if [ -n "$fnc_holders" ]; then
+        ocf_log debug "force_new_cluster detected (set by: $fnc_holders), triggering restart"
+        echo "failed-restart-now"
+        return
+    fi
+
+    echo "failed-wait-for-peer"
+}
+
+podman_monitor()
+{
+    local container_health_state
+
+    container_health_state=$(container_health_check)
+    case "$container_health_state" in
+        healthy)
+            # Continue with normal monitoring
+            ;;
+        not-running)
+            return $OCF_NOT_RUNNING
+            ;;
+        failed-restart-now)
+            return $OCF_ERR_GENERIC
+            ;;
+        failed-wait-for-peer)
+            # Continue running, waiting for peer recovery
+            return $OCF_SUCCESS
+            ;;
+        *)
+            ocf_log err "Unknown health state: $container_health_state"
+            return $OCF_ERR_GENERIC
+            ;;
+    esac
+
     # Check if certificate files have changed, if they have, etcd needs to be restarted
     if ! etcd_certificates_hash_manager "check"; then
         return $OCF_ERR_GENERIC
@@ -1533,6 +1633,12 @@ podman_start()
     local pod_was_running=false
 
     ocf_log notice "podman-etcd start"
+
+    # Clear container health check state file
+    if ! rm -f "$CONTAINER_HEARTBEAT_FILE"; then
+        ocf_log err "could not delete container health check state file"
+    fi
+
     attribute_node_ip update
     attribute_node_cluster_id update
     attribute_node_revision update
@@ -1849,15 +1955,21 @@ podman_stop()
     local rc
 
     ocf_log notice "podman-etcd stop"
+
+    # Clear container health check state file
+    if ! rm -f "$CONTAINER_HEARTBEAT_FILE"; then
+        ocf_log err "could not delete container health check state file"
+    fi
+
+    attribute_node_revision update
+    attribute_node_cluster_id update
+
     podman_simple_status
     if [ $? -eq $OCF_NOT_RUNNING ]; then
         ocf_log info "could not leave members list: etcd container not running"
         return $OCF_SUCCESS
     fi
 
-    attribute_node_revision update
-    attribute_node_cluster_id update
-
    if ! member_id=$(attribute_node_member_id get); then
        ocf_log err "error leaving members list: could not get member-id"
    else
@@ -2007,6 +2119,9 @@ POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml"
 ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml"
 ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz"
 ETCD_CERTS_HASH_FILE="${OCF_RESKEY_config_location}/certs.hash"
+# State file location: Uses HA_RSCTMP to ensure automatic cleanup on reboot.
+# This is intentional - reboots are controlled stops, not failures requiring detection.
+CONTAINER_HEARTBEAT_FILE=${HA_RSCTMP}/podman-container-last-running
 
 # Note: we currently monitor podman containers by with the "podman exec"
 # command, so make sure that invocation is always valid by enforcing the
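A minimal sketch of the state-file mechanism described in this patch (paths and values are illustrative, not the agent's real environment): a healthy monitor pass records the current epoch, and a later failed pass can report how long ago the container was last seen running:

    STATE_FILE=/tmp/podman-container-last-running   # stand-in for ${HA_RSCTMP}/...

    # Healthy monitor pass: record "now".
    date +%s > "$STATE_FILE"

    # Failed pass: no state file means a fresh start, otherwise a crash.
    if [ -f "$STATE_FILE" ]; then
        last=$(cat "$STATE_FILE")
        echo "failure detected, last healthy $(( $(date +%s) - last ))s ago"
    else
        echo "not-running (no previous state)"
    fi
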
@ -0,0 +1,47 @@
From a155018f6d65edf99493804dad99412b50d13e6c Mon Sep 17 00:00:00 2001
From: Carlo Lobrano <c.lobrano@gmail.com>
Date: Wed, 5 Nov 2025 13:48:38 +0100
Subject: [PATCH] podman-etcd: fix count of fnc holders in
 container_health_check

The variable `fnc_holders` (a list of nodes that have the force_new_cluster
CIB attribute set) can contain only whitespace. Because of this, the
shell's simple `-n` test is not enough to establish if there are no
`fnc_holders`.

Fixed by counting the number of words inside the variable.

Moreover:
* Enhanced comment for clarity.
* Log level changed to `info`. We want visibility when the monitor
  detects the peer node is ready for recovery, and this is rare enough
  not to flood the logs.
---
 heartbeat/podman-etcd | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index 5bdc6d184..7795130a6 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -1366,7 +1366,7 @@ container_health_check()
         return
     fi
 
-    # State file exists - the container failed, check recovery status in this lifecycle
+    # Could not execute monitor check command and state file exists - the container failed, check recovery status in this lifecycle
     local time_since_heartbeat
     time_since_heartbeat=$(get_time_since_last_heartbeat)
     ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago)"
@@ -1379,8 +1379,9 @@ container_health_check()
         return
     fi
 
-    if [ -n "$fnc_holders" ]; then
-        ocf_log debug "force_new_cluster detected (set by: $fnc_holders), triggering restart"
+    local fnc_holder_count=$(echo "$fnc_holders" | wc -w)
+    if [ "$fnc_holder_count" -gt 0 ]; then
+        ocf_log info "force_new_cluster detected (set by: $fnc_holders), triggering restart"
         echo "failed-restart-now"
         return
     fi
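The failure mode this fix targets is easy to reproduce in a plain shell (the value below is illustrative): a whitespace-only string still passes the -n test, while the word count correctly reports zero holders:

    fnc_holders="   "            # attribute query returned only spaces

    [ -n "$fnc_holders" ] && echo "-n says non-empty (wrong conclusion)"

    fnc_holder_count=$(echo "$fnc_holders" | wc -w)
    echo "word count: $fnc_holder_count"   # prints 0, so no restart is triggered
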
@ -0,0 +1,158 @@
From 48455cb6cef9c5b849045bc838bc2b5ccd01b0fe Mon Sep 17 00:00:00 2001
From: Klaus Wenninger <klaus.wenninger@aon.at>
Date: Fri, 7 Nov 2025 17:06:57 +0100
Subject: [PATCH 1/3] storage_mon: refactor removing basically duplicate code

---
 tools/storage_mon.c | 45 ++++++++++++++++-----------------------------
 1 file changed, 16 insertions(+), 29 deletions(-)

diff --git a/tools/storage_mon.c b/tools/storage_mon.c
index 27d2ff1d1..fa9bd0cbc 100644
--- a/tools/storage_mon.c
+++ b/tools/storage_mon.c
@@ -119,6 +119,8 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
 	int device_fd;
 	int res;
 	off_t seek_spot;
+	int sec_size = 512;
+	void *buffer;
 
 	if (verbose) {
 		printf("Testing device %s\n", device);
@@ -164,9 +166,6 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
 	}
 
 	if (flags & O_DIRECT) {
-		int sec_size = 0;
-		void *buffer;
-
 #ifdef __FreeBSD__
 		res = ioctl(device_fd, DIOCGSECTORSIZE, &sec_size);
 #else
@@ -176,33 +175,21 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
 			PRINT_STORAGE_MON_ERR("Failed to get block device sector size for %s: %s", device, strerror(errno));
 			goto error;
 		}
+	}
 
-		if (posix_memalign(&buffer, sysconf(_SC_PAGESIZE), sec_size) != 0) {
-			PRINT_STORAGE_MON_ERR("Failed to allocate aligned memory: %s", strerror(errno));
-			goto error;
-		}
-		res = read(device_fd, buffer, sec_size);
-		free(buffer);
-		if (res < 0) {
-			PRINT_STORAGE_MON_ERR("Failed to read %s: %s", device, strerror(errno));
-			goto error;
-		}
-		if (res < sec_size) {
-			PRINT_STORAGE_MON_ERR("Failed to read %d bytes from %s, got %d", sec_size, device, res);
-			goto error;
-		}
-	} else {
-		char buffer[512];
-
-		res = read(device_fd, buffer, sizeof(buffer));
-		if (res < 0) {
-			PRINT_STORAGE_MON_ERR("Failed to read %s: %s", device, strerror(errno));
-			goto error;
-		}
-		if (res < (int)sizeof(buffer)) {
-			PRINT_STORAGE_MON_ERR("Failed to read %ld bytes from %s, got %d", sizeof(buffer), device, res);
-			goto error;
-		}
+	if (posix_memalign(&buffer, sysconf(_SC_PAGESIZE), sec_size) != 0) {
+		PRINT_STORAGE_MON_ERR("Failed to allocate aligned memory: %s", strerror(errno));
+		goto error;
+	}
+	res = read(device_fd, buffer, sec_size);
+	free(buffer);
+	if (res < 0) {
+		PRINT_STORAGE_MON_ERR("Failed to read %s: %s", device, strerror(errno));
+		goto error;
+	}
+	if (res < sec_size) {
+		PRINT_STORAGE_MON_ERR("Failed to read %d bytes from %s, got %d", sec_size, device, res);
+		goto error;
 	}
 
 	/* Fake an error */

From 310f224fc7d9a6f4fca234f10696e6049c8f2666 Mon Sep 17 00:00:00 2001
From: Klaus Wenninger <klaus.wenninger@aon.at>
Date: Fri, 7 Nov 2025 17:14:06 +0100
Subject: [PATCH 2/3] storage_mon.c: refactor moving up getting blocksize

if that fails we can bail out without unnecessary seek
---
 tools/storage_mon.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/tools/storage_mon.c b/tools/storage_mon.c
index fa9bd0cbc..960266a74 100644
--- a/tools/storage_mon.c
+++ b/tools/storage_mon.c
@@ -152,6 +152,18 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
 		PRINT_STORAGE_MON_INFO("%s: opened %s O_DIRECT, size=%zu", device, (flags & O_DIRECT)?"with":"without", devsize);
 	}
 
+	if (flags & O_DIRECT) {
+#ifdef __FreeBSD__
+		res = ioctl(device_fd, DIOCGSECTORSIZE, &sec_size);
+#else
+		res = ioctl(device_fd, BLKSSZGET, &sec_size);
+#endif
+		if (res < 0) {
+			PRINT_STORAGE_MON_ERR("Failed to get block device sector size for %s: %s", device, strerror(errno));
+			goto error;
+		}
+	}
+
 	/* Don't fret about real randomness */
 	srand(time(NULL) + getpid());
 	/* Pick a random place on the device - sector aligned */
@@ -165,18 +177,6 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
 		PRINT_STORAGE_MON_INFO("%s: reading from pos %ld", device, seek_spot);
 	}
 
-	if (flags & O_DIRECT) {
-#ifdef __FreeBSD__
-		res = ioctl(device_fd, DIOCGSECTORSIZE, &sec_size);
-#else
-		res = ioctl(device_fd, BLKSSZGET, &sec_size);
-#endif
-		if (res < 0) {
-			PRINT_STORAGE_MON_ERR("Failed to get block device sector size for %s: %s", device, strerror(errno));
-			goto error;
-		}
-	}
-
 	if (posix_memalign(&buffer, sysconf(_SC_PAGESIZE), sec_size) != 0) {
 		PRINT_STORAGE_MON_ERR("Failed to allocate aligned memory: %s", strerror(errno));
 		goto error;

From ac19911ce550d5eca42be6cb44632384bdf8e1c9 Mon Sep 17 00:00:00 2001
From: Klaus Wenninger <klaus.wenninger@aon.at>
Date: Fri, 7 Nov 2025 17:18:45 +0100
Subject: [PATCH 3/3] storage_mon.c: fix block-seek mask deriving it from the
 block-size

now this also works for e.g. 4K block-devices
---
 tools/storage_mon.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/storage_mon.c b/tools/storage_mon.c
index 960266a74..6c4555f04 100644
--- a/tools/storage_mon.c
+++ b/tools/storage_mon.c
@@ -167,7 +167,7 @@ static void *test_device(const char *device, int verbose, int inject_error_perce
 	/* Don't fret about real randomness */
 	srand(time(NULL) + getpid());
 	/* Pick a random place on the device - sector aligned */
-	seek_spot = (rand() % (devsize-1024)) & 0xFFFFFFFFFFFFFE00;
+	seek_spot = (rand() % (devsize-sec_size)) & ~(((off_t) sec_size)-1);
 	res = lseek(device_fd, seek_spot, SEEK_SET);
 	if (res < 0) {
 		PRINT_STORAGE_MON_ERR("Failed to seek %s: %s", device, strerror(errno));
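The mask change in the last patch can be checked with plain shell arithmetic (the offset is arbitrary; this assumes, as the patch does, that the sector size is a power of two). The old constant 0xFFFFFFFFFFFFFE00 only clears the low 9 bits, i.e. always aligns to 512 bytes, while ~(sec_size - 1) aligns to whatever size the device reports:

    pos=1234567
    for sec_size in 512 4096; do
        echo "sec_size=$sec_size aligned=$(( pos & ~(sec_size - 1) ))"
    done
    # sec_size=512  -> 1234432, a multiple of 512
    # sec_size=4096 -> 1232896, a multiple of 4096; the old mask would
    #                  have produced 1234432, which is not 4K-aligned
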
@ -0,0 +1,106 @@
From d5b4428e6cd66fd47680531ff0244d9b56e4e4c2 Mon Sep 17 00:00:00 2001
From: Pablo Fontanilla <pfontani@redhat.com>
Date: Tue, 14 Oct 2025 11:57:09 +0200
Subject: [PATCH 1/2] Redo counting of active_resources

---
 heartbeat/podman-etcd | 46 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 44 insertions(+), 2 deletions(-)

diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index e1425ec02..dbf16918d 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -1029,6 +1029,48 @@ get_peer_node_name() {
     crm_node -l | awk '{print $2}' | grep -v "$NODENAME"
 }
 
+# Calculate the count of truly active resources by excluding those being stopped.
+# According to Pacemaker documentation, during "Post-notification (stop) /
+# Pre-notification (start)" transitions, the true active resource count should be:
+# Active resources = $OCF_RESKEY_CRM_meta_notify_active_resource
+# minus $OCF_RESKEY_CRM_meta_notify_stop_resource
+# This handles the case where a resource appears in both the active and stop lists
+# during rapid restart scenarios (e.g., process crash recovery).
+get_truly_active_resources_count() {
+    local active_list="$OCF_RESKEY_CRM_meta_notify_active_resource"
+    local stop_list="$OCF_RESKEY_CRM_meta_notify_stop_resource"
+    local truly_active=""
+
+    # If no active resources, return 0
+    if [ -z "$active_list" ]; then
+        echo "0"
+        return
+    fi
+
+    # If no resources being stopped, return count of active resources
+    if [ -z "$stop_list" ]; then
+        echo "$active_list" | wc -w
+        return
+    fi
+
+    # Filter out resources that are being stopped from the active list
+    for resource in $active_list; do
+        local is_stopping=0
+        for stop_resource in $stop_list; do
+            if [ "$resource" = "$stop_resource" ]; then
+                is_stopping=1
+                break
+            fi
+        done
+        if [ $is_stopping -eq 0 ]; then
+            truly_active="$truly_active $resource"
+        fi
+    done
+
+    # Count the truly active resources (trim leading space and count words)
+    echo "$truly_active" | wc -w
+}
+
 get_all_etcd_endpoints() {
     for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do
         name=$(echo "$node" | cut -d: -f1)
@@ -1529,8 +1571,8 @@ podman_start()
     # - 0 active agents, 1 starting: we are starting; the peer is not starting
     # - 0 active agents, 2 starting: both agents are starting simultaneously
     local active_resources_count
-    active_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_active_resource" | wc -w)
-    ocf_log info "found '$active_resources_count' active etcd resources (meta notify environment variable: '$OCF_RESKEY_CRM_meta_notify_active_resource')"
+    active_resources_count=$(get_truly_active_resources_count)
+    ocf_log info "found '$active_resources_count' active etcd resources (active: '$OCF_RESKEY_CRM_meta_notify_active_resource', stop: '$OCF_RESKEY_CRM_meta_notify_stop_resource')"
     case "$active_resources_count" in
         1)
             if [ "$(attribute_learner_node get)" = "$(get_peer_node_name)" ]; then

From 0114ddf83c95122a7f9fe9f704f864242cdb284a Mon Sep 17 00:00:00 2001
From: Pablo Fontanilla <pfontani@redhat.com>
Date: Wed, 29 Oct 2025 12:49:17 +0100
Subject: [PATCH 2/2] Update truly active resources count with safer empty
 calculation

---
 heartbeat/podman-etcd | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index dbf16918d..8fc92a537 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -1042,13 +1042,15 @@ get_truly_active_resources_count() {
     local truly_active=""
 
     # If no active resources, return 0
-    if [ -z "$active_list" ]; then
+    # Use word count to handle whitespace-only values
+    if [ "$(echo "$active_list" | wc -w)" -eq 0 ]; then
        echo "0"
        return
     fi
 
     # If no resources being stopped, return count of active resources
-    if [ -z "$stop_list" ]; then
+    # Use word count to handle whitespace-only values
+    if [ "$(echo "$stop_list" | wc -w)" -eq 0 ]; then
        echo "$active_list" | wc -w
        return
     fi
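The counting rule from these two patches can be exercised outside the agent (the resource lists below are made-up stand-ins for the Pacemaker notify variables): everything in the stop list is filtered out of the active list before counting:

    active_list="etcd:0 etcd:1"   # stand-in for OCF_RESKEY_CRM_meta_notify_active_resource
    stop_list="etcd:1"            # stand-in for OCF_RESKEY_CRM_meta_notify_stop_resource

    truly_active=""
    for resource in $active_list; do
        case " $stop_list " in
            *" $resource "*) ;;                            # being stopped: skip
            *) truly_active="$truly_active $resource" ;;   # keep
        esac
    done
    echo "$truly_active" | wc -w   # prints 1: only etcd:0 is truly active
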
@ -0,0 +1,161 @@
From 578e6d982e5ab705dac216cecf85c50fe3842af5 Mon Sep 17 00:00:00 2001
From: Carlo Lobrano <c.lobrano@gmail.com>
Date: Sun, 16 Nov 2025 19:40:30 +0100
Subject: [PATCH] OCPBUGS-60098: podman-etcd: prevent last active member from
 leaving the etcd member list

When stopping etcd instances, simultaneous member removal from both
nodes can corrupt the etcd Write-Ahead Log (WAL). This change implements
a two-part solution:

1. Concurrent stop protection: When multiple nodes are stopping, the
   alphabetically second node delays its member removal by 10
   seconds. This prevents simultaneous member list updates that can
   corrupt WAL.

2. Last member detection: Checks active resource count after any
   delay. If this is the last active member, skips member removal to
   avoid leaving an empty cluster.

Additionally, reorders podman_stop() to clear the member_id attribute
after leaving the member list, ensuring the attribute reflects actual
cluster state during shutdown.
---
 heartbeat/podman-etcd | 86 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 69 insertions(+), 17 deletions(-)

diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index 7795130a6..7b6e08f11 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -1341,6 +1341,11 @@ container_health_check()
     # recently (i.e. a failure), or not (fresh start)
     monitor_cmd_exec
     rc=$?
+    if [ "$rc" -ne 0 ]; then
+        ocf_log info "Container ${CONTAINER} not-running"
+        echo "not-running"
+        return
+    fi
     if [ "$rc" -eq 0 ]; then
         # Container is running - update state file with current epoch
         local current_epoch
@@ -1639,7 +1644,7 @@ can_reuse_container() {
         OCF_RESKEY_reuse=0
         return "$OCF_SUCCESS"
     fi
-
+
     if ! filtered_original_pod_manifest=$(filter_pod_manifest "$OCF_RESKEY_pod_manifest"); then
         return $OCF_ERR_GENERIC
     fi
@@ -1866,7 +1871,7 @@ podman_start()
     fi
 
     if ocf_is_true "$JOIN_AS_LEARNER"; then
-        local wait_timeout_sec=$((10*60))
+        local wait_timeout_sec=60
         local poll_interval_sec=5
         local retries=$(( wait_timeout_sec / poll_interval_sec ))
 
@@ -2021,6 +2026,64 @@ podman_start()
     done
 }
 
+# leave_etcd_member_list removes the current node from the etcd member list during
+# shutdown to ensure clean cluster state.
+#
+# Skips removal if this is the standalone (last) node. When both nodes are stopping
+# concurrently, delays the second node to prevent simultaneous member removal that
+# could corrupt the etcd WAL.
+leave_etcd_member_list()
+{
+    if ! member_id=$(attribute_node_member_id get); then
+        ocf_log err "error leaving members list: could not get member-id"
+        return
+    fi
+
+    if is_standalone; then
+        ocf_log info "last member. Not leaving the member list"
+        return
+    fi
+
+    local stopping_resources_count
+    stopping_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_stop_resource" | wc -w)
+    ocf_log info "found '$stopping_resources_count' stopping etcd resources (stop: '$OCF_RESKEY_CRM_meta_notify_stop_resource')"
+    if [ "$stopping_resources_count" -gt 1 ]; then
+        # Prevent WAL corruption by delaying the alphabetically second node's member
+        # removal when both nodes are stopping concurrently.
+        local delayed_node
+
+        node_names_sorted=$(echo "$OCF_RESKEY_node_ip_map" | sed 's/:[^;]*//g; s/;/ /g' | tr ' ' '\n' | sort | tr '\n' ' ')
+        delayed_node="$(echo "$node_names_sorted" | cut -d' ' -f2)"
+
+        if [ -z "$delayed_node" ]; then
+            ocf_log warn "could not determine node to be delayed: not leaving the member list"
+            return
+        fi
+
+        if [ "$NODENAME" = "$delayed_node" ]; then
+            ocf_log info "delaying stop for ${DELAY_SECOND_NODE_LEAVE_SEC}s to prevent simultaneous etcd member removal"
+            sleep $DELAY_SECOND_NODE_LEAVE_SEC
+        fi
+    fi
+
+    # Ensure we're not the last active resource before leaving. The `standalone_node` property
+    # may not be set if stop was called before monitor check, or after the delayed node waited.
+    local active_resources_count
+    active_resources_count=$(get_truly_active_resources_count)
+    if [ "$active_resources_count" -lt 1 ]; then
+        ocf_log info "last member. Not leaving the member list"
+        return
+    fi
+
+    ocf_log info "leaving members list as member with ID $member_id"
+    local endpoint
+    endpoint="$(ip_url $(attribute_node_ip get)):2379"
+    if ! ocf_run podman exec "$CONTAINER" etcdctl member remove "$member_id" --endpoints="$endpoint"; then
+        rc=$?
+        ocf_log err "error leaving members list, error code: $rc"
+    fi
+}
+
 podman_stop()
 {
     local timeout=60
@@ -2039,24 +2102,12 @@ podman_stop()
     podman_simple_status
     if [ $? -eq $OCF_NOT_RUNNING ]; then
         ocf_log info "could not leave members list: etcd container not running"
+        attribute_node_member_id clear
         return $OCF_SUCCESS
     fi
 
-    if ! member_id=$(attribute_node_member_id get); then
-        ocf_log err "error leaving members list: could not get member-id"
-    else
-        # TODO: is it worth/possible to check the current status instead than relying on cached attributes?
-        if is_standalone; then
-            ocf_log info "last member. Not leaving the member list"
-        else
-            ocf_log info "leaving members list as member with ID $member_id"
-            endpoint="$(ip_url $(attribute_node_ip get)):2379"
-            if ! ocf_run podman exec "$CONTAINER" etcdctl member remove "$member_id" --endpoints="$endpoint"; then
-                rc=$?
-                ocf_log err "error leaving members list, error code: $rc"
-            fi
-        fi
-    fi
+    leave_etcd_member_list
+    # clear node_member_id CIB attribute only after leaving the member list
     attribute_node_member_id clear
 
     if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
@@ -2197,6 +2248,7 @@ ETCD_CERTS_HASH_FILE="${OCF_RESKEY_config_location}/certs.hash"
 # State file location: Uses HA_RSCTMP to ensure automatic cleanup on reboot.
 # This is intentional - reboots are controlled stops, not failures requiring detection.
 CONTAINER_HEARTBEAT_FILE=${HA_RSCTMP}/podman-container-last-running
+DELAY_SECOND_NODE_LEAVE_SEC=10
 
 # Note: we currently monitor podman containers by with the "podman exec"
 # command, so make sure that invocation is always valid by enforcing the
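The "alphabetically second node" selection used above boils down to sorting the node names parsed out of node_ip_map; a standalone run with an invented map shows which node gets delayed:

    OCF_RESKEY_node_ip_map="master-1:192.0.2.11;master-0:192.0.2.10"   # hypothetical map

    node_names_sorted=$(echo "$OCF_RESKEY_node_ip_map" | sed 's/:[^;]*//g; s/;/ /g' | tr ' ' '\n' | sort | tr '\n' ' ')
    delayed_node=$(echo "$node_names_sorted" | cut -d' ' -f2)
    echo "$delayed_node"   # master-1: this node sleeps before leaving the member list
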
42
SOURCES/RHEL-130580-2-podman-etcd-remove-test-code.patch
Normal file
@ -0,0 +1,42 @@
From 29df4255c5f65ea94fb6de997805dca65e31071c Mon Sep 17 00:00:00 2001
From: Carlo Lobrano <c.lobrano@gmail.com>
Date: Mon, 24 Nov 2025 12:21:55 +0100
Subject: [PATCH] podman-etcd: remove test code (#2103)

---
 heartbeat/podman-etcd | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index 7b6e08f11..b1f52cd5c 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -1341,11 +1341,6 @@ container_health_check()
     # recently (i.e. a failure), or not (fresh start)
     monitor_cmd_exec
     rc=$?
-    if [ "$rc" -ne 0 ]; then
-        ocf_log info "Container ${CONTAINER} not-running"
-        echo "not-running"
-        return
-    fi
     if [ "$rc" -eq 0 ]; then
         # Container is running - update state file with current epoch
         local current_epoch
@@ -1644,7 +1639,6 @@ can_reuse_container() {
         OCF_RESKEY_reuse=0
         return "$OCF_SUCCESS"
     fi
-
     if ! filtered_original_pod_manifest=$(filter_pod_manifest "$OCF_RESKEY_pod_manifest"); then
         return $OCF_ERR_GENERIC
     fi
@@ -1871,7 +1865,7 @@ podman_start()
     fi
 
     if ocf_is_true "$JOIN_AS_LEARNER"; then
-        local wait_timeout_sec=60
+        local wait_timeout_sec=$((10*60))
        local poll_interval_sec=5
        local retries=$(( wait_timeout_sec / poll_interval_sec ))
 
@ -0,0 +1,107 @@
From 5cc74acd67c294da36b3f40e44842a82aa7d0957 Mon Sep 17 00:00:00 2001
From: Carlo Lobrano <c.lobrano@gmail.com>
Date: Wed, 26 Nov 2025 11:43:25 +0100
Subject: [PATCH] OCPEDGE-2213: podman-etcd: fix to prevent learner from
 starting before cluster is ready (#2098)

* OCPEDGE-2213: fix(podman-etcd): prevent learner from starting before cluster is ready

Clear stale learner_node attribute during stop and on restart when no
active resources exist, ensuring learner always waits for peer
availability.

* fix: podman-etcd should cleanup standalone/learner attributes when promotion succeeds

* fix: remove misleading endpoint IP from log
---
 heartbeat/podman-etcd | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index b1f52cd5c..3e3f1d60e 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -880,7 +880,7 @@ add_member_as_learner()
     local endpoint_url=$(ip_url $(attribute_node_ip get))
     local peer_url=$(ip_url $member_ip)
 
-    ocf_log info "add $member_name ($member_ip, $endpoint_url) to the member list as learner"
+    ocf_log info "add $member_name ($member_ip) to the member list as learner"
     out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner)
     rc=$?
     if [ $rc -ne 0 ]; then
@@ -1032,7 +1032,7 @@ promote_learner_member()
     if ! ocf_run podman exec "${CONTAINER}" etcdctl member promote "$learner_member_id_hex" 2>&1; then
         # promotion is expected to fail if the peer is not yet up-to-date
         ocf_log info "could not promote member $learner_member_id_hex, error code: $?"
-        return $OCF_SUCCESS
+        return $OCF_ERR_GENERIC
     fi
     ocf_log info "successfully promoted member '$learner_member_id_hex'"
     return $OCF_SUCCESS
@@ -1063,19 +1063,19 @@ reconcile_member_state()
     fi
 
     if [ -n "$learner_member_id" ]; then
-        promote_learner_member "$learner_member_id"
-        return $?
-    fi
-
-    if [ -z "$learner_member_id" ]; then
-        if ! clear_standalone_node; then
-            ocf_log error "could not clear standalone_node attribute, error code: $?"
-            return $OCF_ERR_GENERIC
-        fi
-        if ! attribute_learner_node clear; then
-            ocf_log error "could not clear learner_node attribute, error code: $?"
+        if ! promote_learner_member "$learner_member_id"; then
             return $OCF_ERR_GENERIC
         fi
+        # promotion succeded: continue to clear standalone_node and learner_node
+    fi
+
+    if ! clear_standalone_node; then
+        ocf_log error "could not clear standalone_node attribute, error code: $?"
+        return $OCF_ERR_GENERIC
+    fi
+    if ! attribute_learner_node clear; then
+        ocf_log error "could not clear learner_node attribute, error code: $?"
+        return $OCF_ERR_GENERIC
     fi
 
     return $OCF_SUCCESS
@@ -1258,6 +1258,7 @@ manage_peer_membership()
             set_standalone_node
         else
             ocf_log debug "$name is in the members list by IP: $ip"
+            # Errors from reconcile_member_state are logged internally. Ignoring them here prevents stopping a healthy voter agent; critical local failures are caught by detect_cluster_leadership_loss.
             reconcile_member_state "$member_list_json"
         fi
     done
@@ -1369,7 +1370,7 @@ container_health_check()
     # Could not execute monitor check command and state file exists - the container failed, check recovery status in this lifecycle
     local time_since_heartbeat
     time_since_heartbeat=$(get_time_since_last_heartbeat)
-    ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago)"
+    ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago, error code: $rc)"
 
     # Check if peer has set force_new_cluster for recovery
     local fnc_holders
@@ -1795,6 +1796,9 @@ podman_start()
             fi
             ;;
         0)
+            # No active resources: clear any stale learner_node attribute from previous failed session
+            ocf_log debug "clearing stale learner_node attribute (safe when active_resources_count=0)"
+            attribute_learner_node clear
             # count how many agents are starting now
             local start_resources_count
             start_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_start_resource" | wc -w)
@@ -2090,6 +2094,7 @@ podman_stop()
         ocf_log err "could not delete container health check state file"
     fi
 
+    attribute_learner_node clear
     attribute_node_revision update
     attribute_node_cluster_id update
 
@ -0,0 +1,146 @@
From 192b0ecbe015e8b8a4d32f8b066ead3a6dba0589 Mon Sep 17 00:00:00 2001
From: Carlo Lobrano <c.lobrano@gmail.com>
Date: Tue, 2 Dec 2025 10:01:01 +0100
Subject: [PATCH] OCPEDGE-2231: podman-etcd: improve error handling to support
 retry on start errors (#2105)

* podman-etcd: improve add_member_as_learner error log

Improving add_member_as_learner error log to better debug rare issue
when the podman exec command returns error, but the etcd member is added
to the list anyway. This is critical as the `learner_node` attribute
won't be cleaned up anymore.

Signed-off-by: Carlo Lobrano <c.lobrano@gmail.com>

* podman-etcd: remove duplicated check for container already started

* podman-etcd: improve error return codes to support start retries

Improved and/or changed some returns code to allow or forbid retry in
case of start errors.

see: OCPEDGE-2231

---------

Signed-off-by: Carlo Lobrano <c.lobrano@gmail.com>
---
 heartbeat/podman-etcd | 40 +++++++++++++++++++++++++---------------
 1 file changed, 25 insertions(+), 15 deletions(-)

diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index 3e3f1d60e..242226bb1 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -617,9 +617,13 @@ prepare_env() {
     LISTEN_CLIENT_URLS="0.0.0.0"
     LISTEN_PEER_URLS="0.0.0.0"
     LISTEN_METRICS_URLS="0.0.0.0"
+
+    return $OCF_SUCCESS
 }
 
 compute_bump_revision() {
+    local rc
+
     # Same logic used by cluster-etcd-operator quorum-restore-pod utility.
     # see https://github.com/openshift/cluster-etcd-operator/blob/215998939f5223da916622c71fd07d17656faf6b/bindata/etcd/quorum-restore-pod.yaml#L26-L34
     # set a default value: 1bn would be an etcd running at 1000 writes/s for about eleven days.
@@ -691,7 +695,13 @@ experimental-max-learners: 1
 experimental-warning-apply-duration: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION")
 experimental-watch-progress-notify-interval: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL")
 EOF
+    rc=$?
+    if [ $rc -ne 0 ]; then
+        ocf_log err "could not create etcd configuration, 'cat' error code: $rc"
+        return $OCF_ERR_CONFIGURED
+    fi
 
+    # Append cipher suites from the env variable where the entries are comma separated.
     {
         if [ -n "$ETCD_CIPHER_SUITES" ]; then
             echo "cipher-suites:"
@@ -700,6 +710,13 @@ EOF
         done
         fi
     } >> "$ETCD_CONFIGURATION_FILE"
+    rc=$?
+    if [ $rc -ne 0 ]; then
+        ocf_log err "could not append cipher suites to etcd configuration, error code: $rc"
+        return $OCF_ERR_CONFIGURED
+    fi
+
+    return $OCF_SUCCESS
 }
 
 archive_data_folder()
@@ -884,7 +901,7 @@ add_member_as_learner()
     out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner)
     rc=$?
     if [ $rc -ne 0 ]; then
-        ocf_log err "could not add $member_name as learner, error code: $rc"
+        ocf_log err "could not add $member_name as learner, error code $rc, etcdctl output: $out"
         return $rc
     fi
     ocf_log info "$out"
@@ -1763,7 +1780,7 @@ podman_start()
     fnc_holder_count=$(echo "$fnc_holders" | wc -w)
     if [ "$fnc_holder_count" -gt 1 ]; then
         ocf_exit_reason "force_new_cluster attribute is set on multiple nodes ($fnc_holders)"
-        return "$OCF_ERR_GENERIC"
+        return "$OCF_ERR_CONFIGURED"
     fi
 
     if [ "$fnc_holder_count" -eq 1 ]; then
@@ -1837,7 +1854,7 @@ podman_start()
             ocf_log info "same cluster_id and revision: start normal"
         else
             ocf_exit_reason "same revision but different cluster id"
-            return "$OCF_ERR_GENERIC"
+            return "$OCF_ERR_CONFIGURED"
         fi
     fi
     ;;
@@ -1862,12 +1879,6 @@ podman_start()
 
     run_opts="$run_opts --oom-score-adj=${OCF_RESKEY_oom}"
 
-    # check to see if the container has already started
-    podman_simple_status
-    if [ $? -eq $OCF_SUCCESS ]; then
-        return "$OCF_SUCCESS"
-    fi
-
     if ocf_is_true "$JOIN_AS_LEARNER"; then
         local wait_timeout_sec=$((10*60))
         local poll_interval_sec=5
@@ -1894,9 +1905,8 @@ podman_start()
 
     ocf_log info "check for changes in pod manifest to decide if the container should be reused or replaced"
     if ! can_reuse_container ; then
-        rc="$?"
-        ocf_log err "could not determine etcd container reuse strategy, rc: $rc"
-        return "$rc"
+        ocf_log err "could not determine etcd container reuse strategy"
+        return $OCF_ERR_GENERIC
     fi
 
     # Archive current container and its configuration before creating
@@ -1912,13 +1922,13 @@ podman_start()
     fi
 
     if ! prepare_env; then
-        ocf_log err "Could not prepare environment for podman, error code: $?"
+        ocf_log err "Could not prepare environment for podman"
         return $OCF_ERR_GENERIC
     fi
 
     if ! generate_etcd_configuration; then
-        ocf_log err "Could not generate etcd configuration, error code: $?"
-        return $OCF_ERR_GENERIC
+        ocf_log err "Could not generate etcd configuration"
+        return $OCF_ERR_CONFIGURED
     fi
 
     run_opts="$run_opts \
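One detail worth calling out in the hunk above: after a cat > file << EOF heredoc, a failure of the redirection (for example on a full filesystem) is still reported in $? on the next line, which is what the added rc=$? check relies on. A standalone sketch with an illustrative path:

    cat > /tmp/demo-config.yaml << EOF
    logger: zap
    log-level: info
    EOF
    rc=$?
    if [ $rc -ne 0 ]; then
        echo "could not create configuration, 'cat' error code: $rc" >&2
        exit 1
    fi
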
@ -0,0 +1,52 @@
From 8b70d5026fee0910a52f0fdefcaf930b2c0a3909 Mon Sep 17 00:00:00 2001
From: Carlo Lobrano <c.lobrano@gmail.com>
Date: Wed, 3 Dec 2025 11:38:25 +0100
Subject: [PATCH] podman-etcd: sync environment variables with Pod manifest

The EXPERIMENTAL substring was removed from
ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION and
ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL in the Pod
manifest. This change aligns our config with those updates.

NOTE: Some Etcd flags deprecated in v3.6 will be replaced in a future
change.

See: https://github.com/openshift/cluster-etcd-operator/pull/1507
---
 heartbeat/podman-etcd | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index 242226bb1..bb2900536 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -604,8 +604,8 @@ prepare_env() {
     fi
     ETCD_ELECTION_TIMEOUT=$(get_env_from_manifest "ETCD_ELECTION_TIMEOUT")
     ETCD_ENABLE_PPROF=$(get_env_from_manifest "ETCD_ENABLE_PPROF")
-    ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION=$(get_env_from_manifest "ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION")
-    ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL=$(get_env_from_manifest "ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL")
+    ETCD_WARNING_APPLY_DURATION=$(get_env_from_manifest "ETCD_WARNING_APPLY_DURATION")
+    ETCD_WATCH_PROGRESS_NOTIFY_INTERVAL=$(get_env_from_manifest "ETCD_WATCH_PROGRESS_NOTIFY_INTERVAL")
     ETCD_HEARTBEAT_INTERVAL=$(get_env_from_manifest "ETCD_HEARTBEAT_INTERVAL")
     ETCD_QUOTA_BACKEND_BYTES=$(get_env_from_manifest "ETCD_QUOTA_BACKEND_BYTES")
     ETCD_SOCKET_REUSE_ADDRESS=$(get_env_from_manifest "ETCD_SOCKET_REUSE_ADDRESS")
@@ -660,6 +660,7 @@ force-new-cluster-bump-amount: $BUMP_REV"
 
     # the space indentation for client-transport-security and peer-transport-security
     # is required for correct YAML formatting.
+    # TODO: replace flags deprecated in Etcd v3.6
     cat > "$ETCD_CONFIGURATION_FILE" << EOF
 logger: zap
 log-level: info
@@ -692,8 +693,8 @@ listen-metrics-urls: "$(ip_url ${LISTEN_METRICS_URLS}):9978"
 metrics: extensive
 experimental-initial-corrupt-check: true
 experimental-max-learners: 1
-experimental-warning-apply-duration: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION")
-experimental-watch-progress-notify-interval: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL")
+experimental-warning-apply-duration: $(convert_duration_in_nanoseconds "$ETCD_WARNING_APPLY_DURATION")
+experimental-watch-progress-notify-interval: $(convert_duration_in_nanoseconds "$ETCD_WATCH_PROGRESS_NOTIFY_INTERVAL")
 EOF
     rc=$?
     if [ $rc -ne 0 ]; then
@ -1,45 +0,0 @@
--- a/aliyun/aliyunsdkcore/vendored/requests/packages/urllib3/response.py	2023-10-17 19:42:56.000000000 +0200
+++ b/aliyun/aliyunsdkcore/vendored/requests/packages/urllib3/response.py	2026-01-02 11:19:25.583808492 +0100
@@ -135,8 +135,18 @@
     they were applied.
     """
 
+    # Maximum allowed number of chained HTTP encodings in the
+    # Content-Encoding header.
+    max_decode_links = 5
+
     def __init__(self, modes):
-        self._decoders = [_get_decoder(m.strip()) for m in modes.split(",")]
+        encodings = [m.strip() for m in modes.split(",")]
+        if len(encodings) > self.max_decode_links:
+            raise DecodeError(
+                "Too many content encodings in the chain: "
+                f"{len(encodings)} > {self.max_decode_links}"
+            )
+        self._decoders = [_get_decoder(e) for e in encodings]
 
     def flush(self):
         return self._decoders[0].flush()
 
--- a/gcp/google-cloud-sdk/lib/third_party/urllib3/response.py	2023-10-17 19:42:56.000000000 +0200
+++ b/gcp/google-cloud-sdk/lib/third_party/urllib3/response.py	2026-01-02 11:19:25.583808492 +0100
@@ -135,8 +135,18 @@
     they were applied.
     """
 
+    # Maximum allowed number of chained HTTP encodings in the
+    # Content-Encoding header.
+    max_decode_links = 5
+
     def __init__(self, modes):
-        self._decoders = [_get_decoder(m.strip()) for m in modes.split(",")]
+        encodings = [m.strip() for m in modes.split(",")]
+        if len(encodings) > self.max_decode_links:
+            raise DecodeError(
+                "Too many content encodings in the chain: "
+                f"{len(encodings)} > {self.max_decode_links}"
+            )
+        self._decoders = [_get_decoder(e) for e in encodings]
 
     def flush(self):
         return self._decoders[0].flush()
@ -0,0 +1,25 @@
From 7449fd88d21650db1eaafdc7ef85bf3553f6ac7f Mon Sep 17 00:00:00 2001
From: Pablo Fontanilla <pfontani@redhat.com>
Date: Thu, 8 Jan 2026 09:42:42 +0100
Subject: [PATCH] OCPBUGS-64765: podman-etcd: add -a option to crictl ps
 (#2112)

---
 heartbeat/podman-etcd | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index bb2900536..591a663bf 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -738,8 +738,8 @@ archive_data_folder()
 
 etcd_pod_container_exists() {
     local count_matches
-    # Check whether the etcd pod exists on the same node (header line included)
-    count_matches=$(crictl pods --label app=etcd -q | xargs -I {} crictl ps --pod {} -o json | jq -r '.containers[].metadata | select ( .name == "etcd" ).name' | wc -l)
+    # Check whether the etcd pod exists on the same node (including stopped/exited containers)
+    count_matches=$(crictl pods --label app=etcd -q | xargs -I {} crictl ps -a --pod {} -o json | jq -r '.containers[].metadata | select ( .name == "etcd" ).name' | wc -l)
     if [ "$count_matches" -eq 1 ]; then
         # etcd pod found
         return 0
@@ -1,563 +0,0 @@
--- a/aliyun/aliyunsdkcore/vendored/requests/packages/urllib3/response.py 2026-01-20 10:46:57.006470161 +0100
+++ b/aliyun/aliyunsdkcore/vendored/requests/packages/urllib3/response.py 2026-01-20 10:55:44.090084896 +0100
@@ -23,6 +23,7 @@
 from .exceptions import (
     BodyNotHttplibCompatible,
     DecodeError,
+    DependencyWarning,
     HTTPError,
     IncompleteRead,
     InvalidChunkLength,
@@ -41,34 +42,60 @@
 class DeflateDecoder(object):
     def __init__(self):
         self._first_try = True
-        self._data = b""
+        self._first_try_data = b""
+        self._unfed_data = b""
         self._obj = zlib.decompressobj()

     def __getattr__(self, name):
         return getattr(self._obj, name)

-    def decompress(self, data):
-        if not data:
+    def decompress(self, data: bytes, max_length: int = -1) -> bytes:
+        data = self._unfed_data + data
+        self._unfed_data = b""
+        if not data and not self._obj.unconsumed_tail:
             return data
+        original_max_length = max_length
+        if original_max_length < 0:
+            max_length = 0
+        elif original_max_length == 0:
+            # We should not pass 0 to the zlib decompressor because 0 is
+            # the default value that will make zlib decompress without a
+            # length limit.
+            # Data should be stored for subsequent calls.
+            self._unfed_data = data
+            return b""

+        # Subsequent calls always reuse `self._obj`. zlib requires
+        # passing the unconsumed tail if decompression is to continue.
         if not self._first_try:
-            return self._obj.decompress(data)
+            return self._obj.decompress(
+                self._obj.unconsumed_tail + data, max_length=max_length
+            )

-        self._data += data
+        # First call tries with RFC 1950 ZLIB format.
+        self._first_try_data += data
         try:
-            decompressed = self._obj.decompress(data)
+            decompressed = self._obj.decompress(data, max_length=max_length)
             if decompressed:
                 self._first_try = False
-                self._data = None
+                self._first_try_data = b""
             return decompressed
+        # On failure, it falls back to RFC 1951 DEFLATE format.
         except zlib.error:
             self._first_try = False
             self._obj = zlib.decompressobj(-zlib.MAX_WBITS)
             try:
-                return self.decompress(self._data)
+                return self.decompress(
+                    self._first_try_data, max_length=original_max_length
+                )
             finally:
-                self._data = None
+                self._first_try_data = b""

+    @property
+    def has_unconsumed_tail(self) -> bool:
+        return bool(self._unfed_data) or (
+            bool(self._obj.unconsumed_tail) and not self._first_try
+        )

 class GzipDecoderState(object):

@@ -81,30 +108,64 @@
     def __init__(self):
         self._obj = zlib.decompressobj(16 + zlib.MAX_WBITS)
         self._state = GzipDecoderState.FIRST_MEMBER
+        self._unconsumed_tail = b""

     def __getattr__(self, name):
         return getattr(self._obj, name)

-    def decompress(self, data):
+    def decompress(self, data: bytes, max_length: int = -1) -> bytes:
         ret = bytearray()
-        if self._state == GzipDecoderState.SWALLOW_DATA or not data:
+        if self._state == GzipDecoderState.SWALLOW_DATA:
+            return bytes(ret)
+
+        if max_length == 0:
+            # We should not pass 0 to the zlib decompressor because 0 is
+            # the default value that will make zlib decompress without a
+            # length limit.
+            # Data should be stored for subsequent calls.
+            self._unconsumed_tail += data
+            return b""
+
+        # zlib requires passing the unconsumed tail to the subsequent
+        # call if decompression is to continue.
+        data = self._unconsumed_tail + data
+        if not data and self._obj.eof:
             return bytes(ret)
+
         while True:
             try:
-                ret += self._obj.decompress(data)
+                ret += self._obj.decompress(
+                    data, max_length=max(max_length - len(ret), 0)
+                )
             except zlib.error:
                 previous_state = self._state
                 # Ignore data after the first error
                 self._state = GzipDecoderState.SWALLOW_DATA
+                self._unconsumed_tail = b""
                 if previous_state == GzipDecoderState.OTHER_MEMBERS:
                     # Allow trailing garbage acceptable in other gzip clients
                     return bytes(ret)
                 raise
-            data = self._obj.unused_data
+
+            self._unconsumed_tail = data = (
+                self._obj.unconsumed_tail or self._obj.unused_data
+            )
+            if max_length > 0 and len(ret) >= max_length:
+                break
+
             if not data:
                 return bytes(ret)
-            self._state = GzipDecoderState.OTHER_MEMBERS
-            self._obj = zlib.decompressobj(16 + zlib.MAX_WBITS)
+            # When the end of a gzip member is reached, a new decompressor
+            # must be created for unused (possibly future) data.
+            if self._obj.eof:
+                self._state = GzipDecoderState.OTHER_MEMBERS
+                self._obj = zlib.decompressobj(16 + zlib.MAX_WBITS)
+
+        return bytes(ret)
+
+    @property
+    def has_unconsumed_tail(self) -> bool:
+        return bool(self._unconsumed_tail)


 if brotli is not None:
@@ -116,9 +177,35 @@
         def __init__(self):
             self._obj = brotli.Decompressor()
             if hasattr(self._obj, "decompress"):
-                self.decompress = self._obj.decompress
+                setattr(self, "_decompress", self._obj.decompress)
             else:
-                self.decompress = self._obj.process
+                setattr(self, "_decompress", self._obj.process)
+
+        # Requires Brotli >= 1.2.0 for `output_buffer_limit`.
+        def _decompress(self, data: bytes, output_buffer_limit: int = -1) -> bytes:
+            raise NotImplementedError()
+
+        def decompress(self, data: bytes, max_length: int = -1) -> bytes:
+            try:
+                if max_length > 0:
+                    return self._decompress(data, output_buffer_limit=max_length)
+                else:
+                    return self._decompress(data)
+            except TypeError:
+                # Fallback for Brotli/brotlicffi/brotlipy versions without
+                # the `output_buffer_limit` parameter.
+                warnings.warn(
+                    "Brotli >= 1.2.0 is required to prevent decompression bombs.",
+                    DependencyWarning,
+                )
+                return self._decompress(data)
+
+        @property
+        def has_unconsumed_tail(self) -> bool:
+            try:
+                return not self._obj.can_accept_more_data()
+            except AttributeError:
+                return False

         def flush(self):
             if hasattr(self._obj, "flush"):
@@ -151,10 +238,35 @@
     def flush(self):
         return self._decoders[0].flush()

-    def decompress(self, data):
-        for d in reversed(self._decoders):
-            data = d.decompress(data)
-        return data
+    def decompress(self, data: bytes, max_length: int = -1) -> bytes:
+        if max_length <= 0:
+            for d in reversed(self._decoders):
+                data = d.decompress(data)
+            return data
+
+        ret = bytearray()
+        # Every while loop iteration goes through all decoders once.
+        # It exits when enough data is read or no more data can be read.
+        # It is possible that the while loop iteration does not produce
+        # any data because we retrieve up to `max_length` from every
+        # decoder, and the amount of bytes may be insufficient for the
+        # next decoder to produce enough/any output.
+        while True:
+            any_data = False
+            for d in reversed(self._decoders):
+                data = d.decompress(data, max_length=max_length - len(ret))
+                if data:
+                    any_data = True
+                # We should not break when no data is returned because
+                # next decoders may produce data even with empty input.
+            ret += data
+            if not any_data or len(ret) >= max_length:
+                return bytes(ret)
+            data = b""
+
+    @property
+    def has_unconsumed_tail(self) -> bool:
+        return any(d.has_unconsumed_tail for d in self._decoders)


 def _get_decoder(mode):
@@ -405,16 +517,25 @@
     if brotli is not None:
         DECODER_ERROR_CLASSES += (brotli.error,)

-    def _decode(self, data, decode_content, flush_decoder):
+    def _decode(
+        self,
+        data: bytes,
+        decode_content: bool,
+        flush_decoder: bool,
+        max_length: int = None,
+    ) -> bytes:
         """
         Decode the data passed in and potentially flush the decoder.
         """
         if not decode_content:
             return data

+        if max_length is None or flush_decoder:
+            max_length = -1
+
         try:
             if self._decoder:
-                data = self._decoder.decompress(data)
+                data = self._decoder.decompress(data, max_length=max_length)
         except self.DECODER_ERROR_CLASSES as e:
             content_encoding = self.headers.get("content-encoding", "").lower()
             raise DecodeError(
@@ -634,7 +755,10 @@
             for line in self.read_chunked(amt, decode_content=decode_content):
                 yield line
         else:
-            while not is_fp_closed(self._fp):
+            while (
+                not is_fp_closed(self._fp)
+                or (self._decoder and self._decoder.has_unconsumed_tail)
+            ):
                 data = self.read(amt=amt, decode_content=decode_content)

                 if data:
@@ -840,7 +964,10 @@
                     break
                 chunk = self._handle_chunk(amt)
                 decoded = self._decode(
-                    chunk, decode_content=decode_content, flush_decoder=False
+                    chunk,
+                    decode_content=decode_content,
+                    flush_decoder=False,
+                    max_length=amt,
                 )
                 if decoded:
                     yield decoded

--- a/gcp/google-cloud-sdk/lib/third_party/urllib3/response.py 2026-01-20 10:46:57.006470161 +0100
+++ b/gcp/google-cloud-sdk/lib/third_party/urllib3/response.py 2026-01-20 10:55:44.090084896 +0100
@@ -23,6 +23,7 @@
 from .exceptions import (
     BodyNotHttplibCompatible,
     DecodeError,
+    DependencyWarning,
     HTTPError,
     IncompleteRead,
     InvalidChunkLength,
@@ -41,34 +42,60 @@
 class DeflateDecoder(object):
     def __init__(self):
         self._first_try = True
-        self._data = b""
+        self._first_try_data = b""
+        self._unfed_data = b""
         self._obj = zlib.decompressobj()

     def __getattr__(self, name):
         return getattr(self._obj, name)

-    def decompress(self, data):
-        if not data:
+    def decompress(self, data: bytes, max_length: int = -1) -> bytes:
+        data = self._unfed_data + data
+        self._unfed_data = b""
+        if not data and not self._obj.unconsumed_tail:
             return data
+        original_max_length = max_length
+        if original_max_length < 0:
+            max_length = 0
+        elif original_max_length == 0:
+            # We should not pass 0 to the zlib decompressor because 0 is
+            # the default value that will make zlib decompress without a
+            # length limit.
+            # Data should be stored for subsequent calls.
+            self._unfed_data = data
+            return b""

+        # Subsequent calls always reuse `self._obj`. zlib requires
+        # passing the unconsumed tail if decompression is to continue.
         if not self._first_try:
-            return self._obj.decompress(data)
+            return self._obj.decompress(
+                self._obj.unconsumed_tail + data, max_length=max_length
+            )

-        self._data += data
+        # First call tries with RFC 1950 ZLIB format.
+        self._first_try_data += data
         try:
-            decompressed = self._obj.decompress(data)
+            decompressed = self._obj.decompress(data, max_length=max_length)
             if decompressed:
                 self._first_try = False
-                self._data = None
+                self._first_try_data = b""
             return decompressed
+        # On failure, it falls back to RFC 1951 DEFLATE format.
         except zlib.error:
             self._first_try = False
             self._obj = zlib.decompressobj(-zlib.MAX_WBITS)
             try:
-                return self.decompress(self._data)
+                return self.decompress(
+                    self._first_try_data, max_length=original_max_length
+                )
             finally:
-                self._data = None
+                self._first_try_data = b""

+    @property
+    def has_unconsumed_tail(self) -> bool:
+        return bool(self._unfed_data) or (
+            bool(self._obj.unconsumed_tail) and not self._first_try
+        )

 class GzipDecoderState(object):

@@ -81,30 +108,64 @@
     def __init__(self):
         self._obj = zlib.decompressobj(16 + zlib.MAX_WBITS)
         self._state = GzipDecoderState.FIRST_MEMBER
+        self._unconsumed_tail = b""

     def __getattr__(self, name):
         return getattr(self._obj, name)

-    def decompress(self, data):
+    def decompress(self, data: bytes, max_length: int = -1) -> bytes:
         ret = bytearray()
-        if self._state == GzipDecoderState.SWALLOW_DATA or not data:
+        if self._state == GzipDecoderState.SWALLOW_DATA:
+            return bytes(ret)
+
+        if max_length == 0:
+            # We should not pass 0 to the zlib decompressor because 0 is
+            # the default value that will make zlib decompress without a
+            # length limit.
+            # Data should be stored for subsequent calls.
+            self._unconsumed_tail += data
+            return b""
+
+        # zlib requires passing the unconsumed tail to the subsequent
+        # call if decompression is to continue.
+        data = self._unconsumed_tail + data
+        if not data and self._obj.eof:
             return bytes(ret)
+
         while True:
             try:
-                ret += self._obj.decompress(data)
+                ret += self._obj.decompress(
+                    data, max_length=max(max_length - len(ret), 0)
+                )
             except zlib.error:
                 previous_state = self._state
                 # Ignore data after the first error
                 self._state = GzipDecoderState.SWALLOW_DATA
+                self._unconsumed_tail = b""
                 if previous_state == GzipDecoderState.OTHER_MEMBERS:
                     # Allow trailing garbage acceptable in other gzip clients
                     return bytes(ret)
                 raise
-            data = self._obj.unused_data
+
+            self._unconsumed_tail = data = (
+                self._obj.unconsumed_tail or self._obj.unused_data
+            )
+            if max_length > 0 and len(ret) >= max_length:
+                break
+
             if not data:
                 return bytes(ret)
-            self._state = GzipDecoderState.OTHER_MEMBERS
-            self._obj = zlib.decompressobj(16 + zlib.MAX_WBITS)
+            # When the end of a gzip member is reached, a new decompressor
+            # must be created for unused (possibly future) data.
+            if self._obj.eof:
+                self._state = GzipDecoderState.OTHER_MEMBERS
+                self._obj = zlib.decompressobj(16 + zlib.MAX_WBITS)
+
+        return bytes(ret)
+
+    @property
+    def has_unconsumed_tail(self) -> bool:
+        return bool(self._unconsumed_tail)


 if brotli is not None:
@@ -116,9 +177,35 @@
         def __init__(self):
             self._obj = brotli.Decompressor()
             if hasattr(self._obj, "decompress"):
-                self.decompress = self._obj.decompress
+                setattr(self, "_decompress", self._obj.decompress)
             else:
-                self.decompress = self._obj.process
+                setattr(self, "_decompress", self._obj.process)
+
+        # Requires Brotli >= 1.2.0 for `output_buffer_limit`.
+        def _decompress(self, data: bytes, output_buffer_limit: int = -1) -> bytes:
+            raise NotImplementedError()
+
+        def decompress(self, data: bytes, max_length: int = -1) -> bytes:
+            try:
+                if max_length > 0:
+                    return self._decompress(data, output_buffer_limit=max_length)
+                else:
+                    return self._decompress(data)
+            except TypeError:
+                # Fallback for Brotli/brotlicffi/brotlipy versions without
+                # the `output_buffer_limit` parameter.
+                warnings.warn(
+                    "Brotli >= 1.2.0 is required to prevent decompression bombs.",
+                    DependencyWarning,
+                )
+                return self._decompress(data)
+
+        @property
+        def has_unconsumed_tail(self) -> bool:
+            try:
+                return not self._obj.can_accept_more_data()
+            except AttributeError:
+                return False

         def flush(self):
             if hasattr(self._obj, "flush"):
@@ -151,10 +238,35 @@
     def flush(self):
         return self._decoders[0].flush()

-    def decompress(self, data):
-        for d in reversed(self._decoders):
-            data = d.decompress(data)
-        return data
+    def decompress(self, data: bytes, max_length: int = -1) -> bytes:
+        if max_length <= 0:
+            for d in reversed(self._decoders):
+                data = d.decompress(data)
+            return data
+
+        ret = bytearray()
+        # Every while loop iteration goes through all decoders once.
+        # It exits when enough data is read or no more data can be read.
+        # It is possible that the while loop iteration does not produce
+        # any data because we retrieve up to `max_length` from every
+        # decoder, and the amount of bytes may be insufficient for the
+        # next decoder to produce enough/any output.
+        while True:
+            any_data = False
+            for d in reversed(self._decoders):
+                data = d.decompress(data, max_length=max_length - len(ret))
+                if data:
+                    any_data = True
+                # We should not break when no data is returned because
+                # next decoders may produce data even with empty input.
+            ret += data
+            if not any_data or len(ret) >= max_length:
+                return bytes(ret)
+            data = b""
+
+    @property
+    def has_unconsumed_tail(self) -> bool:
+        return any(d.has_unconsumed_tail for d in self._decoders)


 def _get_decoder(mode):
@@ -405,16 +517,25 @@
     if brotli is not None:
         DECODER_ERROR_CLASSES += (brotli.error,)

-    def _decode(self, data, decode_content, flush_decoder):
+    def _decode(
+        self,
+        data: bytes,
+        decode_content: bool,
+        flush_decoder: bool,
+        max_length: int = None,
+    ) -> bytes:
         """
         Decode the data passed in and potentially flush the decoder.
         """
         if not decode_content:
             return data

+        if max_length is None or flush_decoder:
+            max_length = -1
+
         try:
             if self._decoder:
-                data = self._decoder.decompress(data)
+                data = self._decoder.decompress(data, max_length=max_length)
         except self.DECODER_ERROR_CLASSES as e:
             content_encoding = self.headers.get("content-encoding", "").lower()
             raise DecodeError(
@@ -634,7 +755,10 @@
             for line in self.read_chunked(amt, decode_content=decode_content):
                 yield line
         else:
-            while not is_fp_closed(self._fp):
+            while (
+                not is_fp_closed(self._fp)
+                or (self._decoder and self._decoder.has_unconsumed_tail)
+            ):
                 data = self.read(amt=amt, decode_content=decode_content)

                 if data:
@@ -840,7 +964,10 @@
                     break
                 chunk = self._handle_chunk(amt)
                 decoded = self._decode(
-                    chunk, decode_content=decode_content, flush_decoder=False
+                    chunk,
+                    decode_content=decode_content,
+                    flush_decoder=False,
+                    max_length=amt,
                 )
                 if decoded:
                     yield decoded
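Both copies of this patch lean on zlib's bounded-output mode: Decompress.decompress(data, max_length) emits at most max_length bytes and parks the remaining compressed input in unconsumed_tail, which must be fed back on the next call. A minimal standalone sketch of that contract (illustrative, not part of the patch set):

    import zlib

    payload = zlib.compress(b"A" * 1_000_000)  # tiny input, huge output
    d = zlib.decompressobj()

    first = d.decompress(payload, 4096)        # bounded read
    assert len(first) == 4096
    # Resume from the parked input rather than re-feeding the payload.
    second = d.decompress(d.unconsumed_tail, 4096)
    assert len(second) == 4096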
@@ -1,63 +0,0 @@
--- a/aliyun/aliyunsdkcore/vendored/requests/packages/urllib3/response.py 2023-10-17 19:42:56.000000000 +0200
+++ b/aliyun/aliyunsdkcore/vendored/requests/packages/urllib3/response.py 2026-01-13 14:17:48.477104360 +0100
@@ -350,6 +350,7 @@
         self.reason = reason
         self.strict = strict
         self.decode_content = decode_content
+        self._has_decoded_content = False
         self.retries = retries
         self.enforce_content_length = enforce_content_length
         self.auto_close = auto_close
@@ -414,7 +415,11 @@
         Unread data in the HTTPResponse connection blocks the connection from being released back to the pool.
         """
         try:
-            self.read()
+            self.read(
+                # Do not spend resources decoding the content unless
+                # decoding has already been initiated.
+                decode_content=self._has_decoded_content,
+            )
         except (HTTPError, SocketError, BaseSSLError, HTTPException):
             pass

@@ -536,6 +541,7 @@
         try:
             if self._decoder:
                 data = self._decoder.decompress(data, max_length=max_length)
+                self._has_decoded_content = True
         except self.DECODER_ERROR_CLASSES as e:
             content_encoding = self.headers.get("content-encoding", "").lower()
             raise DecodeError(

--- a/gcp/google-cloud-sdk/lib/third_party/urllib3/response.py 2023-10-17 19:42:56.000000000 +0200
+++ b/gcp/google-cloud-sdk/lib/third_party/urllib3/response.py 2026-01-13 14:17:48.477104360 +0100
@@ -350,6 +350,7 @@
         self.reason = reason
         self.strict = strict
         self.decode_content = decode_content
+        self._has_decoded_content = False
         self.retries = retries
         self.enforce_content_length = enforce_content_length
         self.auto_close = auto_close
@@ -414,7 +415,11 @@
         Unread data in the HTTPResponse connection blocks the connection from being released back to the pool.
         """
         try:
-            self.read()
+            self.read(
+                # Do not spend resources decoding the content unless
+                # decoding has already been initiated.
+                decode_content=self._has_decoded_content,
+            )
         except (HTTPError, SocketError, BaseSSLError, HTTPException):
             pass

@@ -536,6 +541,7 @@
         try:
             if self._decoder:
                 data = self._decoder.decompress(data, max_length=max_length)
+                self._has_decoded_content = True
         except self.DECODER_ERROR_CLASSES as e:
             content_encoding = self.headers.get("content-encoding", "").lower()
             raise DecodeError(
@@ -1,28 +0,0 @@
--- a/gcp/google-cloud-sdk/lib/third_party/pyasn1/codec/ber/decoder.py 2019-10-17 07:00:19.000000000 +0200
+++ b/gcp/google-cloud-sdk/lib/third_party/pyasn1/codec/ber/decoder.py 2026-01-27 10:43:12.757563432 +0100
@@ -22,6 +22,10 @@

 noValue = base.noValue

+# Maximum number of continuation octets (high-bit set) allowed per OID arc.
+# 20 octets allows up to 140-bit integers, supporting UUID-based OIDs
+MAX_OID_ARC_CONTINUATION_OCTETS = 20
+

 class AbstractDecoder(object):
     protoComponent = None
@@ -342,7 +346,14 @@
                 # Construct subid from a number of octets
                 nextSubId = subId
                 subId = 0
+                continuationOctetCount = 0
                 while nextSubId >= 128:
+                    continuationOctetCount += 1
+                    if continuationOctetCount > MAX_OID_ARC_CONTINUATION_OCTETS:
+                        raise error.PyAsn1Error(
+                            'OID arc exceeds maximum continuation octets limit (%d) '
+                            'at position %d' % (MAX_OID_ARC_CONTINUATION_OCTETS, index)
+                        )
                     subId = (subId << 7) + (nextSubId & 0x7F)
                     if index >= substrateLen:
                         raise error.SubstrateUnderrunError(
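For orientation, BER packs each OID arc base-128, one septet per octet, with the high bit flagging continuation; 20 continuation octets therefore bound an arc at 140 bits. A standalone sketch of the same decode-with-limit loop (illustrative names, not pyasn1's API):

    MAX_CONTINUATION_OCTETS = 20  # assumption: mirrors the patch's constant

    def decode_oid_arc(octets, index):
        # Returns (arc_value, next_index); raises on runaway arcs.
        arc = 0
        count = 0
        while True:
            octet = octets[index]
            index += 1
            arc = (arc << 7) | (octet & 0x7F)
            if not octet & 0x80:  # high bit clear: final octet of the arc
                return arc, index
            count += 1
            if count > MAX_CONTINUATION_OCTETS:
                raise ValueError("OID arc exceeds continuation octet limit")

    assert decode_oid_arc(bytes([0x81, 0x00]), 0) == (128, 2)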
@@ -0,0 +1,54 @@
From 8f5c5a2a472ab404b6fd15ff492e72904dc8ac20 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Thu, 22 Jan 2026 07:37:40 +0100
Subject: [PATCH] powervs-move-ip/powervs-subnet: fix error logging

---
 heartbeat/powervs-move-ip.in |  4 ++--
 heartbeat/powervs-subnet.in  | 10 ++++++----
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/heartbeat/powervs-move-ip.in b/heartbeat/powervs-move-ip.in
index e2250c998..0eea89f1d 100755
--- a/heartbeat/powervs-move-ip.in
+++ b/heartbeat/powervs-move-ip.in
@@ -310,9 +310,9 @@ class PowerCloudTokenManager:
                 return json.load(f)
             finally:
                 fcntl.flock(f, fcntl.LOCK_UN)
-        except (json.JSONDecodeError, FileNotFoundError, PermissionError):
+        except (json.JSONDecodeError, FileNotFoundError, PermissionError) as e:
             ocf.logger.warning(
-                "[PowerCloudTokenManager] _read_cache: failed to read token cache read due to missing file or malformed JSON."
+                f"[PowerCloudTokenManager] _read_cache: failed to read token cache read due to missing file or malformed JSON: '{e}'"
             )
             return {}

diff --git a/heartbeat/powervs-subnet.in b/heartbeat/powervs-subnet.in
index 062b1235e..b8f3864e9 100755
--- a/heartbeat/powervs-subnet.in
+++ b/heartbeat/powervs-subnet.in
@@ -837,8 +837,9 @@ def start_action(
     if rc != ocf.OCF_SUCCESS:
         return rc

-    if monitor_action(**res_options) != ocf.OCF_SUCCESS:
-        raise PowerCloudAPIError(f"start_action: start subnet: {ws.subnet_name} failed")
+    rc = monitor_action(**res_options)
+    if rc != ocf.OCF_SUCCESS:
+        raise PowerCloudAPIError(f"start_action: start subnet: {ws.subnet_name} failed", rc)

     ocf.logger.info(
         f"start_action: finished, added connection {conn_name} for subnet {ws.subnet_name}"
@@ -872,8 +873,9 @@ def stop_action(

     ws.subnet_remove()

-    if monitor_action(**res_options) != ocf.OCF_NOT_RUNNING:
-        raise PowerCloudAPIError(f"stop_action: stop subnet {ws.subnet_name} failed")
+    rc = monitor_action(**res_options)
+    if rc != ocf.OCF_NOT_RUNNING:
+        raise PowerCloudAPIError(f"stop_action: stop subnet {ws.subnet_name} failed", rc)

     ocf.logger.info(
         f"stop_action: finished, deleted connection for subnet {ws.subnet_name}"
@@ -0,0 +1,278 @@
From 8df1e4dfdee960b971fb598c043b4ccb2b9fefca Mon Sep 17 00:00:00 2001
From: Carlo Lobrano <c.lobrano@gmail.com>
Date: Mon, 3 Nov 2025 12:34:29 +0100
Subject: [PATCH] podman-etcd: enhance etcd data backup with snapshots and
 retention

Replace basic data directory backup with proper etcd database snapshot
functionality. The new implementation:
- Creates timestamped snapshot files instead of moving the entire data directory
- Stores backups in a non-volatile location (backup_location parameter) instead
  of the previous volatile HA_RSCTMP directory
- Validates backup file existence and size after creation
- Implements configurable retention policy via max_backup_snapshots parameter
- Automatically cleans up old snapshots to control storage usage

Default retention is set to 3 snapshots, with backups stored in /var/lib/etcd
by default. This provides better backup reliability, persistence across reboots,
and storage management for etcd databases.
---
 heartbeat/podman-etcd | 205 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 196 insertions(+), 9 deletions(-)

diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index bb2900536..1d717ec00 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -49,6 +49,7 @@ OCF_RESKEY_reuse_default="0"
 OCF_RESKEY_oom_default="-997"
 OCF_RESKEY_config_location_default="/var/lib/etcd"
 OCF_RESKEY_backup_location_default="/var/lib/etcd"
+OCF_RESKEY_max_backup_snapshots_default="3"

 : ${OCF_RESKEY_image=${OCF_RESKEY_image_default}}
 : ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}}
@@ -61,6 +62,7 @@ OCF_RESKEY_backup_location_default="/var/lib/etcd"
 : ${OCF_RESKEY_oom=${OCF_RESKEY_oom_default}}
 : ${OCF_RESKEY_config_location=${OCF_RESKEY_config_location_default}}
 : ${OCF_RESKEY_backup_location=${OCF_RESKEY_backup_location_default}}
+: ${OCF_RESKEY_max_backup_snapshots=${OCF_RESKEY_max_backup_snapshots_default}}


 #######################################################################
@@ -275,6 +277,17 @@ The directory where the resource agent stores its backups.
 <content type="string" default="${OCF_RESKEY_backup_location_default}"/>
 </parameter>

+<parameter name="max_backup_snapshots" required="0" unique="0">
+<longdesc lang="en">
+Maximum number of etcd database snapshots to retain. When a new snapshot is created,
+older snapshots will be automatically removed to maintain this limit. This helps
+control storage usage while ensuring recent backups are available for recovery.
+Set max_backup_snapshots=0 to disable backups.
+</longdesc>
+<shortdesc lang="en">Maximum number of backup snapshots to retain</shortdesc>
+<content type="integer" default="${OCF_RESKEY_max_backup_snapshots_default}"/>
+</parameter>
+
 </parameters>

 <actions>
@@ -720,20 +733,190 @@ EOF
     return $OCF_SUCCESS
 }

+# Remove etcd member directory to allow the node to rejoin the cluster as a learner.
+#
+# When a node rejoins an etcd cluster, it must start fresh as a learner to prevent
+# data inconsistencies. This function removes the member directory and syncs to disk.
+#
+# Returns:
+#   OCF_SUCCESS - Member directory successfully removed
+#   OCF_ERR_GENERIC - Failed to remove member directory (critical error)
+wipe_data_folder_for_learner()
+{
+    ocf_log info "deleting etcd member directory ($ETCD_MEMBER_DIR) to enable learner rejoin"
+    if ! rm -rf "$ETCD_MEMBER_DIR"; then
+        ocf_log err "could not delete etcd member directory ($ETCD_MEMBER_DIR), error code: $?"
+        return $OCF_ERR_GENERIC
+    fi
+    sync
+    return $OCF_SUCCESS
+}
+
+
+# Calculate available disk space in bytes for a given directory.
+#
+# This function queries the filesystem and returns available space in bytes.
+# It converts df output (KB) to bytes for consistent size comparisons.
+#
+# Arguments:
+#   $1 - Target directory path to check
+#
+# Returns:
+#   OCF_SUCCESS - Available space in bytes (via stdout)
+#   OCF_ERR_GENERIC - Failed to determine available space (error message via stdout)
+get_available_space_in_directory()
+{
+    local target_dir=$1
+    local available_space_kb
+    local available_space_bytes
+
+    available_space_kb=$(df -P "$target_dir" | awk 'NR==2 {print $4}' 2>&1)
+
+    # Validate output is numeric
+    if ! echo "$available_space_kb" | grep -q '^[0-9]\+$'; then
+        echo "df command failed or returned invalid value: $available_space_kb"
+        return $OCF_ERR_GENERIC
+    fi
+
+    available_space_bytes=$((available_space_kb*1024))
+    echo "$available_space_bytes"
+    return $OCF_SUCCESS
+}
+
+# Archive etcd database with backup and cleanup
+#
+# This function creates a backup copy of the etcd database, validates it, and
+# removes old backups according to the retention policy. Backups are optional
+# and can be disabled by setting max_backup_snapshots=0.
+#
+# Error handling strategy:
+# All backup failures return OCF_SUCCESS to prevent blocking cluster recovery.
+# Backups are beneficial but not critical for recovery operations.
+#
+# NOTE: This function cannot use etcdctl/etcdutl utilities because the etcd
+# server is not running when this backup is performed.
 archive_data_folder()
 {
-    # TODO: use etcd snapshots
-    local dest_dir_name
-    local data_dir="/var/lib/etcd/member"
+    local backup_dir="$OCF_RESKEY_backup_location"
+    local etcd_db_path="$ETCD_MEMBER_DIR/snap/db"

-    dest_dir_name="members-snapshot-$(date +%Y%M%d%H%M%S)"
-    if [ ! -d $data_dir ]; then
-        ocf_log info "no data dir to backup"
+    if [ "$OCF_RESKEY_max_backup_snapshots" -eq 0 ]; then
+        ocf_log debug "etcd backup disabled (max_backup_snapshots=0)"
         return $OCF_SUCCESS
     fi
-    ocf_log info "backing up $data_dir under $HA_RSCTMP/$dest_dir_name"
-    mv "$data_dir" "$HA_RSCTMP/$dest_dir_name"
-    sync
+
+    # Check if the etcd database file exists
+    if [ ! -f "$etcd_db_path" ]; then
+        ocf_log warn "backup skipped: etcd database file not found at '$etcd_db_path'"
+        return $OCF_SUCCESS
+    fi
+
+    # Ensure backup directory exists
+    if [ ! -d "$backup_dir" ]; then
+        ocf_log debug "creating backup directory: '$backup_dir'"
+        if ! mkdir -p "$backup_dir"; then
+            ocf_log warn "backup skipped: failed to create backup directory '$backup_dir'"
+            return $OCF_SUCCESS
+        fi
+    fi
+
+    ocf_log debug "checking disk space: backup_dir=$backup_dir"
+    local available_space_bytes
+    if ! available_space_bytes=$(get_available_space_in_directory "$backup_dir"); then
+        ocf_log warn "backup skipped: could not compute available disk space in '$backup_dir', error msg: $available_space_bytes"
+        return $OCF_SUCCESS
+    fi
+
+    local required_space_bytes
+    required_space_bytes=$(stat -c %s "$etcd_db_path" 2>&1)
+    if ! echo "$required_space_bytes" | grep -q '^[0-9]\+$'; then
+        ocf_log warn "backup skipped: could not compute etcd database size at '$etcd_db_path', error msg: $required_space_bytes"
+        return $OCF_SUCCESS
+    fi
+
+    if [ "$required_space_bytes" -gt "$available_space_bytes" ]; then
+        ocf_log warn "backup skipped: insufficient disk space (required: ${required_space_bytes}B, available: ${available_space_bytes}B)"
+        return $OCF_SUCCESS
+    fi
+
+    # Generate timestamp and backup filename
+    local timestamp
+    timestamp=$(date +%Y%m%d-%H%M%S)
+
+    local backup_file
+    backup_file="$backup_dir/snapshot-$timestamp.db"
+
+    ocf_log info "creating etcd database backup: '$backup_file'"
+
+    # Create the backup by copying the database file (enable Copy-on-Write copy)
+    if ! cp --reflink=auto "$etcd_db_path" "$backup_file"; then
+        ocf_log warn "backup creation failed: could not copy '$etcd_db_path' to '$backup_file', error code: $?"
+        return $OCF_SUCCESS
+    fi
+
+    # Validate the backup file exists and has the expected size
+    if [ ! -f "$backup_file" ]; then
+        ocf_log warn "backup validation failed: snapshot file '$backup_file' does not exist"
+        return $OCF_SUCCESS
+    fi
+
+    local backup_size_bytes
+    backup_size_bytes=$(stat -c %s "$backup_file" 2>/dev/null || echo "0")
+    if [ "$backup_size_bytes" -ne "$required_space_bytes" ]; then
+        ocf_log warn "backup validation failed: size mismatch (expected: ${required_space_bytes}B, got: ${backup_size_bytes}B)"
+        rm -f "$backup_file"
+        return $OCF_SUCCESS
+    fi
+
+    ocf_log info "backup created successfully: $backup_file (${backup_size_bytes}B)"
+
+    # Cleanup old backups based on retention policy
+    cleanup_old_backups "$backup_dir"
+
+    return $OCF_SUCCESS
+}
+
+cleanup_old_backups()
+{
+    local backup_dir="$1"
+    local max_snapshots="$OCF_RESKEY_max_backup_snapshots"
+    local backup_count
+    local backups_to_remove
+    local old_backups
+
+    # Validate max_snapshots is a positive integer
+    if ! echo "$max_snapshots" | grep -q '^[1-9][0-9]*$'; then
+        ocf_log warn "invalid max_backup_snapshots value. Positive integer expected, got '$max_snapshots' instead, skipping cleanup"
+        return $OCF_SUCCESS
+    fi
+
+    # Count existing backup files
+    backup_count=$(find "$backup_dir" -maxdepth 1 -name "snapshot-*.db" -type f 2>/dev/null | wc -l)
+
+    if [ "$backup_count" -le "$max_snapshots" ]; then
+        ocf_log info "backup count ($backup_count) is within retention limit ($max_snapshots), no cleanup needed"
+        return $OCF_SUCCESS
+    fi
+
+    # Calculate how many backups to remove
+    backups_to_remove=$((backup_count - max_snapshots))
+    ocf_log info "removing $backups_to_remove old backup(s) to maintain retention limit of $max_snapshots"
+
+    # Find oldest backups sorted by modification time
+    # -t sorts by modification time, -r reverses (oldest first)
+    # -print0 and -0 handle filenames with spaces/special characters
+    old_backups=$(find "$backup_dir" -maxdepth 1 -name "snapshot-*.db" -type f -print0 2>/dev/null | \
+        xargs -0 -r ls -tr | \
+        head -n "$backups_to_remove")
+
+    if [ -n "$old_backups" ]; then
+        ocf_log info "removing old backups: $old_backups"
+        if ! echo "$old_backups" | xargs -r rm -f; then
+            ocf_log warn "failed to remove some old backups, error code: $?"
+        fi
+    fi
+
+    return $OCF_SUCCESS
 }

 etcd_pod_container_exists() {
@@ -1902,6 +2085,9 @@ podman_start()
         fi

         archive_data_folder
+        if ! wipe_data_folder_for_learner; then
+            return "$OCF_ERR_GENERIC"
+        fi
     fi

 ocf_log info "check for changes in pod manifest to decide if the container should be reused or replaced"
@@ -2251,6 +2437,7 @@ CONTAINER=$OCF_RESKEY_name
 POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml"
 ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml"
 ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz"
+ETCD_MEMBER_DIR="/var/lib/etcd/member"
 ETCD_REVISION_JSON="/var/lib/etcd/revision.json"
 ETCD_REVISION_BUMP_PERCENTAGE=0.2
 ETCD_BUMP_REV_DEFAULT=1000000000
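The retention step above is the standard keep-newest-N pattern: enumerate snapshot files, order by modification time, drop everything past the limit. The same logic as a standalone sketch (Python purely for illustration; the agent itself is shell):

    from pathlib import Path

    def prune_snapshots(backup_dir, max_snapshots):
        # Delete the oldest snapshot-*.db files beyond max_snapshots.
        snaps = sorted(
            Path(backup_dir).glob("snapshot-*.db"),
            key=lambda p: p.stat().st_mtime,
            reverse=True,  # newest first
        )
        for old in snaps[max_snapshots:]:
            old.unlink(missing_ok=True)

    # prune_snapshots("/var/lib/etcd", 3)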
@@ -0,0 +1,111 @@
From e4d311b40d8ded2a1921a0e5c01cb49a07c9fb35 Mon Sep 17 00:00:00 2001
From: Carlo Lobrano <c.lobrano@gmail.com>
Date: Thu, 5 Feb 2026 19:31:42 +0100
Subject: [PATCH] podman-etcd: fix learner node attribute not set after etcdctl
 failure

Ensure that learner_node attribute is always set when the member list
contains one learner member.

Moreover:
* Ensure set_standalone_node is called after adding a learner member.
* Capture stderr from etcdctl for better error logging.
---
 heartbeat/podman-etcd | 61 +++++++++++++++++++++++++++----------------
 1 file changed, 38 insertions(+), 23 deletions(-)

diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
index 77525ddb7..06814ad89 100755
--- a/heartbeat/podman-etcd
+++ b/heartbeat/podman-etcd
@@ -1082,7 +1082,7 @@ add_member_as_learner()
     local peer_url=$(ip_url $member_ip)

     ocf_log info "add $member_name ($member_ip) to the member list as learner"
-    out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner)
+    out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner 2>&1)
     rc=$?
     if [ $rc -ne 0 ]; then
         ocf_log err "could not add $member_name as learner, error code $rc, etcdctl output: $out"
@@ -1429,10 +1429,22 @@ detect_cluster_leadership_loss()
 manage_peer_membership()
 {
     local member_list_json="$1"
+    local peer_ip_map_entry
+    local peer_member_name
+    local peer_member_ip
+    local peer_member_id
+
+    # Get peer node name and IP
+    peer_ip_map_entry=$(echo "$OCF_RESKEY_node_ip_map" | tr ';' '\n' | grep -vF "$NODENAME")
+    if [ -z "$peer_ip_map_entry" ]; then
+        ocf_exit_reason "manage_peer_membership: could not parse node_ip_map: '$OCF_RESKEY_node_ip_map'"
+        exit $OCF_ERR_CONFIGURED
+    fi
+    peer_member_name=$(echo "$peer_ip_map_entry" | cut -d: -f1)
+    peer_member_ip=$(echo "$peer_ip_map_entry" | cut -d: -f2-)

-    # Example of .members[] instance fields in member list json format:
-    # NOTE that "name" is present in voting members only, while "isLearner" in learner members only
-    # and the value is always true (not a string) in that case.
+    # Parsing the member list's json output to find a "learner" member.
+    # Example of .members[] instance fields in member list json format:
     # {
     #   "ID": <member ID>,
     #   "name": "<node hostname>",
@@ -1443,26 +1455,28 @@ manage_peer_membership()
     #     "https://<node IP>:2379"
     #   ]
     # }
-    for node in $(echo "$OCF_RESKEY_node_ip_map" | sed "s/\s//g;s/;/ /g"); do
-        name=$(echo "$node" | cut -d: -f1)
-        # do not check itself
-        if [ "$name" = "$NODENAME" ]; then
-            continue
-        fi
+    # NOTE that the "name" field is present in voting members only, while "isLearner"
+    # field in learner members only and the value is always true (not a string) in that case.
+    peer_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$peer_member_ip\")) | any).ID")
+    if [ -z "$peer_member_id" ]; then
+        ocf_log info "$peer_member_name is not in the members list"
+        add_member_as_learner "$peer_member_name" "$peer_member_ip"
+        set_standalone_node
+        return
+    fi

-        # Check by IP instead of Name since "learner" members appear only in peerURLs, not by Name.
-        ip=$(echo "$node" | cut -d: -f2-) # Grab everything after the first : this covers ipv4/ipv6
-        peer_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .peerURLs | map(test(\"$ip\")) | any).ID")
-        if [ -z "$peer_member_id" ]; then
-            ocf_log info "$name is not in the members list"
-            add_member_as_learner "$name" "$ip"
-            set_standalone_node
-        else
-            ocf_log debug "$name is in the members list by IP: $ip"
-            # Errors from reconcile_member_state are logged internally. Ignoring them here prevents stopping a healthy voter agent; critical local failures are caught by detect_cluster_leadership_loss.
-            reconcile_member_state "$member_list_json"
-        fi
-    done
+    # Ensure learner_node attribute is always set when we have a learner member
+    local learner_member_id=$(printf "%s" "$member_list_json" | jq -r ".members[] | select( .isLearner==true ).ID")
+    local current_learner_node=$(attribute_learner_node get)
+    if [ -n "$learner_member_id" ] && [ -z "$current_learner_node" ]; then
+        ocf_log debug "$peer_member_name found as learner in member list, but learner_node attribute was not set. Updating"
+        attribute_learner_node update "$peer_member_name"
+        return
+    fi
+
+    ocf_log debug "$peer_member_name is in the members list by IP: $peer_member_ip"
+    # Errors from reconcile_member_state are logged internally. Ignoring them here prevents stopping a healthy voter agent; critical local failures are caught by detect_cluster_leadership_loss.
+    reconcile_member_state "$member_list_json"
 }

 check_peer()
@@ -2209,6 +2223,7 @@ podman_start()
     peer_node_ip="$(attribute_node_ip_peer)"
     if [ -n "$peer_node_name" ] && [ -n "$peer_node_ip" ]; then
         add_member_as_learner "$peer_node_name" "$peer_node_ip"
+        set_standalone_node
     else
         ocf_log err "could not add peer as learner (peer node name: ${peer_node_name:-unknown}, peer ip: ${peer_node_ip:-unknown})"
     fi
@@ -1,156 +0,0 @@
--- a/heartbeat/portblock 2026-02-27 08:43:50.813925268 +0100
+++ b/heartbeat/portblock 2026-02-27 08:44:40.481824601 +0100
@@ -29,12 +29,17 @@
 OCF_RESKEY_direction_default="in"
 OCF_RESKEY_action_default=""
 OCF_RESKEY_method_default="drop"
-OCF_RESKEY_status_check_default="rule"
 OCF_RESKEY_ip_default="0.0.0.0/0"
 OCF_RESKEY_reset_local_on_unblock_stop_default="false"
 OCF_RESKEY_tickle_dir_default=""
 OCF_RESKEY_sync_script_default=""

+if ocf_is_ms; then
+	OCF_RESKEY_status_check_default="rule"
+else
+	OCF_RESKEY_status_check_default="pseudo"
+fi
+
 : ${OCF_RESKEY_protocol=${OCF_RESKEY_protocol_default}}
 : ${OCF_RESKEY_portno=${OCF_RESKEY_portno_default}}
 : ${OCF_RESKEY_direction=${OCF_RESKEY_direction_default}}
@@ -401,6 +406,10 @@
 	done
 }

+# A long time ago, these messages needed to go to stdout,
+# "running" / "OK" being the trigger string
+# for heartbeat in haresources mode.
+# Now they are still useful for debugging.
 SayActive()
 {
 	ocf_log debug "$CMD $method rule [$*] is running (OK)"
@@ -416,6 +425,11 @@
 	ocf_log debug "$CMD $method rule [$*] is inactive"
 }

+SayConsideredInactive()
+{
+	ocf_log debug "$CMD $method rule [$*] considered to be inactive"
+}
+
 #IptablesStatus {udp|tcp} portno,portno ip {in|out|both} {block|unblock}
 IptablesStatus() {
 	local rc
@@ -441,8 +455,17 @@
 		fi
 		;;
 	*)
-		SayInactive $*
-		rc=$OCF_NOT_RUNNING
+		if [ "$OCF_RESKEY_status_check" != "rule" ] \
+		&& test -e "$state_file" && test "$inverse_state_file" -nt "$state_file"; then
+			# rule present, action=unblock, unblock statefile present,
+			# block state file more recent.
+			# apparently an unusual setup: unblock first, block later
+			SayConsideredActive $*
+			rc=$OCF_SUCCESS
+		else
+			SayInactive $*
+			rc=$OCF_NOT_RUNNING
+		fi
 		;;
 	esac
 elif [ "$OCF_RESKEY_status_check" = "rule" ]; then
@@ -454,6 +477,7 @@
 	*)
 		SayActive $*
 		if [ "$__OCF_ACTION" = "monitor" ] && [ "$promotion_score" = "$SCORE_PROMOTED" ]; then
+			save_tcp_connections
 			rc=$OCF_RUNNING_MASTER
 		else
 			rc=$OCF_SUCCESS
@@ -463,7 +487,10 @@
 else
 	case $5 in
 	block)
-		if ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" status; then
+		if test -e "$state_file" && test "$inverse_state_file" -nt "$state_file"; then
+			# rule NOT present, action=block, block state file present,
+			# unblock state file more recent.
+			# expected setup: block first, unblock later
 			SayConsideredActive $*
 			rc=$OCF_SUCCESS
 		else
@@ -472,13 +499,15 @@
 		fi
 		;;
 	*)
-		if ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" status; then
+		if test -e "$state_file" ; then
+			# rule NOT present, action=unblock, unblock state file present
 			SayActive $*
-			#This is only run on real monitor events.
+			# This is only run on real monitor events (state file present).
 			save_tcp_connections
 			rc=$OCF_SUCCESS
 		else
-			SayInactive $*
+			# rule NOT present, action=unblock, unblock state file NOT present
+			SayConsideredInactive $*
 			rc=$OCF_NOT_RUNNING
 		fi
 		;;
@@ -562,7 +591,7 @@
 #IptablesStart {udp|tcp} portno,portno ip {in|out|both} {block|unblock}
 IptablesStart()
 {
-	ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" start
+	ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" start "$state_file"
 	case $5 in
 	block)	IptablesBLOCK "$@"
 		rc=$?
@@ -584,7 +613,8 @@
 #IptablesStop {udp|tcp} portno,portno ip {in|out|both} {block|unblock}
 IptablesStop()
 {
-	ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" stop
+	ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" stop "$state_file"
+
 	case $5 in
 	block)	IptablesUNBLOCK "$@"
 		rc=$?
@@ -797,6 +827,33 @@

 IptablesValidateAll

+# State file name for ha_pseudo_resource
+#
+# The expected usage of this agent is to pair a "block" with an "unblock",
+# and order startup and configuration of some service between these.
+#
+# The established idiom is to have two separate instances with inverse actions.
+# To "reliably" report the status of "block" during a monitor action,
+# it is not sufficient to check the existence of the blocking rule.
+#
+# It is also insufficient to rely on the pseudo resource state file
+# of this instance only.
+#
+# To know our actual expectation, we need to check the state file of the
+# "inverse" instance as well.
+#
+# Because we don't know the OCF_RESOURCE_INSTANCE value of the other instance,
+# we override the state file name for both instances to something derived from
+# our parameters.
+#
+# This should give use the same "global state" view as the "promotion score"
+# does for the promotable clone variant of this agent.
+#
+[ "$action" = block ] && inverse_action=unblock || inverse_action=block
+state_file_base=$(echo "portblock_${protocol}_${portno}_${ip}_${direction}" | tr -c '0-9a-zA-Z._' _)
+state_file=${HA_RSCTMP}/${state_file_base}_${action}
+inverse_state_file=${HA_RSCTMP}/${state_file_base}_${inverse_action}
+
 case $__OCF_ACTION in
 start)
 	IptablesStart "$protocol" "$portno" "$ip" "$direction" "$action"
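The core of that fix is deriving one state-file base from the rule parameters so the paired block and unblock instances can observe each other's state. A standalone sketch of the derivation (illustrative; tmpdir stands in for HA_RSCTMP, and the agent does the same with tr):

    import re

    def state_files(protocol, portno, ip, direction, action,
                    tmpdir="/run/resource-agents"):  # assumption: HA_RSCTMP equivalent
        # Shared base: both instances of a pair compute the same name.
        base = re.sub(r"[^0-9a-zA-Z._]", "_",
                      f"portblock_{protocol}_{portno}_{ip}_{direction}")
        inverse = "unblock" if action == "block" else "block"
        return f"{tmpdir}/{base}_{action}", f"{tmpdir}/{base}_{inverse}"

    own, inverse = state_files("tcp", "3306", "0.0.0.0/0", "in", "block")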
@@ -1,46 +0,0 @@
From 66885ea0227e847b571608015b150d391a6234d7 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Mon, 23 Feb 2026 13:35:58 +0100
Subject: [PATCH] db2: set reintegration when promotion is successful

---
 heartbeat/db2 | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/heartbeat/db2 b/heartbeat/db2
index 82f2f82c3..4420b9989 100755
--- a/heartbeat/db2
+++ b/heartbeat/db2
@@ -955,6 +955,16 @@ db2_promote() {
     PRIMARY/PEER/*|PRIMARY/REMOTE_CATCHUP/*|PRIMARY/REMOTE_CATCHUP_PENDING/CONNECTED|Primary/Peer)
         # nothing to do, only update pacemaker's view
         echo MASTER > $STATE_FILE
+
+        if [ -n "$remote_host" ]; then
+            for db in $dblist
+            do
+                reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint"
+                ocf_log debug "Promotion succeeded, setting $reint_attr = 1"
+                crm_attribute -n "$reint_attr" -N "$remote_host" -v "1" -l forever
+            done
+        fi
+
         return $OCF_SUCCESS
         ;;

@@ -981,6 +991,15 @@ db2_promote() {
         # update pacemaker's view
         echo MASTER > $STATE_FILE

+        if [ -n "$remote_host" ]; then
+            for db in $dblist
+            do
+                reint_attr="db2hadr-${inst1}_${inst2}_${db}_reint"
+                ocf_log debug "Promotion succeeded, setting $reint_attr = 1"
+                crm_attribute -n "$reint_attr" -N "$remote_host" -v "1" -l forever
+            done
+        fi
+
         return $OCF_SUCCESS
     fi
@@ -1,32 +0,0 @@
From 3712b1f52bccddc767ad6f16ec67d6c8c29f1f71 Mon Sep 17 00:00:00 2001
From: Valentin Vidic <vvidic@valentin-vidic.from.hr>
Date: Sun, 3 Apr 2022 20:39:01 +0200
Subject: [PATCH] db2: fix bashism

dash only allows -a as AND operator.
---
heartbeat/db2 | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/heartbeat/db2 b/heartbeat/db2
index ea24d33fc8..4a4b2f477f 100755
--- a/heartbeat/db2
+++ b/heartbeat/db2
@@ -407,7 +407,7 @@ db2_start() {
# partition is explicitly specified, activate without
# partition information. This allows db2 instances without
# partition support to be managed.
- if [ -z "$OCF_RESKEY_dbpartitionnum" ] && ! [ -a "$db2sql/db2nodes.cfg" ]; then
+ if [ -z "$OCF_RESKEY_dbpartitionnum" ] && ! [ -e "$db2sql/db2nodes.cfg" ]; then
start_opts=""
fi

@@ -511,7 +511,7 @@ db2_stop_bg() {

rc=$OCF_SUCCESS

- if [ -z "$OCF_RESKEY_dbpartitionnum" ] && ! [ -a "$db2sql/db2nodes.cfg" ]; then
+ if [ -z "$OCF_RESKEY_dbpartitionnum" ] && ! [ -e "$db2sql/db2nodes.cfg" ]; then
stop_opts=""
fi

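The portability issue fixed above is easy to reproduce outside the agent (illustrative one-liners; the exact error text varies by shell):

    bash -c '[ -a /etc/hosts ] && echo exists'   # bash accepts unary -a: prints "exists"
    dash -c '[ -a /etc/hosts ] && echo exists'   # dash knows -a only as AND: typically errors out
    dash -c '[ -e /etc/hosts ] && echo exists'   # POSIX -e works in both: prints "exists"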
@@ -1,143 +0,0 @@
From 26c0d48bc69da1859f1ce5205a8bb6eaf6297b81 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Wed, 25 Mar 2026 10:46:09 +0100
Subject: [PATCH] db2: do not use db2stop, as it sends truncation messages,
which in some cases are not delivered

This caused divergence in the log, and the user would have to manually rebuild
the DB to recover from it.
---
heartbeat/db2 | 104 +++++++-------------------------------------------
1 file changed, 13 insertions(+), 91 deletions(-)

diff --git a/heartbeat/db2 b/heartbeat/db2
index 4420b9989..9de18639d 100755
--- a/heartbeat/db2
+++ b/heartbeat/db2
@@ -596,45 +596,10 @@ db2_start() {
return $OCF_SUCCESS
}

-#
-# helper function to be spawned
-# so we can detect a hang of the db2stop command
-#
-db2_stop_bg() {
- local rc output
- local stop_opts="dbpartitionnum $db2node"
-
- rc=$OCF_SUCCESS
-
- if [ -z "$OCF_RESKEY_dbpartitionnum" ] && ! [ -e "$db2sql/db2nodes.cfg" ]; then
- stop_opts=""
- fi
-
- if output=$(runasdb2 db2stop force $stop_opts)
- then
- ocf_log info "DB2 instance $instance($db2node) stopped: $output"
- else
- case $output in
- *SQL1032N*)
- #SQL1032N No start database manager command was issued
- ocf_log info "$output"
- ;;
-
- *)
- ocf_log err "DB2 instance $instance($db2node) stop failed: $output"
- rc=$OCF_ERR_GENERIC
- esac
- fi
-
- return $rc
-}
-
#
# Stop the given db2 database instance
#
db2_stop() {
- local stop_timeout grace_timeout stop_bg_pid i must_kill
-
# remove master score
master_score -D -l reboot

@@ -647,67 +612,24 @@ db2_stop() {
return $OCF_SUCCESS
fi

- stop_timeout=${OCF_RESKEY_CRM_meta_timeout:-20000}
-
- # grace_time is 4/5 (unit is ms)
- grace_timeout=$((stop_timeout/1250))
-
- # start db2stop in background as this may hang
- db2_stop_bg &
- stop_bg_pid=$!
-
- # wait for grace_timeout
- i=0
- while [ $i -lt $grace_timeout ]
- do
- kill -0 $stop_bg_pid 2>/dev/null || break;
- sleep 1
- i=$((i+1))
- done
-
- # collect exit status but don't hang
- if kill -0 $stop_bg_pid 2>/dev/null
- then
- stoprc=1
- kill -9 $stop_bg_pid 2>/dev/null
- else
- wait $stop_bg_pid
- stoprc=$?
- fi
-
- must_kill=0
-
- if [ $stoprc -ne 0 ]
+ # db2nkill kills *all* partitions on the node
+ if [ -x $db2bin/db2nkill ]
then
- ocf_log warn "DB2 instance $instance($db2node): db2stop failed, using db2nkill"
- must_kill=1
- elif ! db2_instance_dead
+ logasdb2 $db2bin/db2nkill $db2node
+ elif [ -x $db2bin/db2_kill ]
then
- ocf_log warn "DB2 instance $instance($db2node): db2stop indicated success but there a still processes, using db2nkill"
- must_kill=1
+ logasdb2 $db2bin/db2_kill
fi

- if [ $must_kill -eq 1 ]
- then
- # db2nkill kills *all* partitions on the node
- if [ -x $db2bin/db2nkill ]
- then
- logasdb2 $db2bin/db2nkill $db2node
- elif [ -x $db2bin/db2_kill ]
- then
- logasdb2 $db2bin/db2_kill
- fi
-
- # loop forever (or lrmd kills us due to timeout) until the
- # instance is dead
- while ! db2_instance_dead
- do
- ocf_log info "DB2 instance $instance($db2node): waiting for processes to exit"
- sleep 1
- done
+ # loop forever (or lrmd kills us due to timeout) until the
+ # instance is dead
+ while ! db2_instance_dead
+ do
+ ocf_log info "DB2 instance $instance($db2node): waiting for processes to exit"
+ sleep 1
+ done

- ocf_log info "DB2 instance $instance($db2node) is now dead"
- fi
+ ocf_log info "DB2 instance $instance($db2node) is now dead"

return $OCF_SUCCESS
}
@@ -0,0 +1,71 @@
From 54fa7a59c36697cd8df5b619fff0b50af00df76e Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Mon, 20 Nov 2023 16:35:52 +0100
Subject: [PATCH 1/2] storage_mon: fix file handler out of scope leak and
uninitialized values

---
tools/storage_mon.c | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/tools/storage_mon.c b/tools/storage_mon.c
index 1aae29e58..cc415e97f 100644
--- a/tools/storage_mon.c
+++ b/tools/storage_mon.c
@@ -382,9 +382,11 @@ static int write_pid_file(const char *pidfile)
syslog(LOG_ERR, "Failed to write '%s' to %s: %s", pid, pidfile, strerror(errno));
goto done;
}
- close(fd);
rc = 0;
done:
+ if (fd != -1) {
+ close(fd);
+ }
if (pid != NULL) {
free(pid);
}
@@ -663,6 +665,7 @@ storage_mon_client(void)
snprintf(request.message, SMON_MAX_MSGSIZE, "%s", SMON_GET_RESULT_COMMAND);
request.hdr.id = 0;
request.hdr.size = sizeof(struct storage_mon_check_value_req);
+ response.hdr.id = 0;
rc = qb_ipcc_send(conn, &request, request.hdr.size);
if (rc < 0) {
syslog(LOG_ERR, "qb_ipcc_send error : %d\n", rc);
@@ -683,7 +686,11 @@ storage_mon_client(void)
/* greater than 0 : monitoring error. */
/* -1 : communication system error. */
/* -2 : Not all checks completed for first device in daemon mode. */
- rc = atoi(response.message);
+ if (strnlen(response.message, 1)) {
+ rc = atoi(response.message);
+ } else {
+ rc = -1;
+ }

syslog(LOG_DEBUG, "daemon response[%d]: %s \n", response.hdr.id, response.message);


From b23ba4eaefb500199c4845751f4c5545c81f42f1 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Mon, 20 Nov 2023 16:37:37 +0100
Subject: [PATCH 2/2] findif: also check that netmaskbits != EOS

---
tools/findif.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/findif.c b/tools/findif.c
index a25395fec..ab108a3c4 100644
--- a/tools/findif.c
+++ b/tools/findif.c
@@ -669,7 +669,7 @@ main(int argc, char ** argv) {
}
}

- if (netmaskbits) {
+ if (netmaskbits != NULL && *netmaskbits != EOS) {
best_netmask = netmask;
}else if (best_netmask == 0L) {
/*
@@ -0,0 +1,23 @@
From cb968378959b8aa334e98daf62a1b08ef6525fb4 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Wed, 22 Nov 2023 10:32:31 +0100
Subject: [PATCH] storage_mon: use memset() to fix "uninitialized value"
covscan error, as qb_ipcc_recv() will always set a message (according to
honzaf)

---
tools/storage_mon.c | 1 +
1 file changed, 1 insertion(+)

diff --git a/tools/storage_mon.c b/tools/storage_mon.c
index cc415e97f..a9227ef90 100644
--- a/tools/storage_mon.c
+++ b/tools/storage_mon.c
@@ -655,6 +655,7 @@ storage_mon_client(void)
int32_t rc;


+ memset(&response, 0, sizeof(response));
snprintf(ipcs_name, SMON_MAX_IPCSNAME, "storage_mon_%s", attrname);
conn = qb_ipcc_connect(ipcs_name, 0);
if (conn == NULL) {
@@ -1,22 +0,0 @@
From b23ba4eaefb500199c4845751f4c5545c81f42f1 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Mon, 20 Nov 2023 16:37:37 +0100
Subject: [PATCH 2/2] findif: also check that netmaskbits != EOS

---
tools/findif.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/findif.c b/tools/findif.c
index a25395fec..ab108a3c4 100644
--- a/tools/findif.c
+++ b/tools/findif.c
@@ -669,7 +669,7 @@ main(int argc, char ** argv) {
}
}

- if (netmaskbits) {
+ if (netmaskbits != NULL && *netmaskbits != EOS) {
best_netmask = netmask;
}else if (best_netmask == 0L) {
/*
46 SOURCES/RHEL-22715-LVM-activate-fix-false-positive.patch Normal file
@@ -0,0 +1,46 @@
From 65a066cf9066390db65c4875e21c4c391793b9ae Mon Sep 17 00:00:00 2001
From: Arslan Ahmad <arslan.ahmad97@googlemail.com>
Date: Tue, 16 Jan 2024 09:11:17 +0530
Subject: [PATCH] Avoid false positive for VG activation

When lvm.conf file has `volume_list` parameter configured and the
cluster is managing the shared storage using `system_id_source`,
then the activation of the LV fails to happen. However it is
reported as a success.

The fixes will avoid starting of `LVM-activate` resource when
the cluster is configured with both `system_id_source` and
`volume_list`.

Signed-off-by: Arslan Ahmad <arslan.ahmad97@googlemail.com>
---
heartbeat/LVM-activate | 9 +++++++++
1 file changed, 9 insertions(+)

diff --git a/heartbeat/LVM-activate b/heartbeat/LVM-activate
index f6f24a3b5..3858ed8dc 100755
--- a/heartbeat/LVM-activate
+++ b/heartbeat/LVM-activate
@@ -448,6 +448,10 @@ systemid_check()
{
# system_id_source is set in lvm.conf
source=$(lvmconfig 'global/system_id_source' 2>/dev/null | cut -d"=" -f2)
+
+ # Is volume_list set in lvm.conf
+ vol_list=$(lvmconfig 'activation/volume_list' 2>/dev/null | cut -d"=" -f2)
+
if [ "$source" = "" ] || [ "$source" = "none" ]; then
ocf_exit_reason "system_id_source in lvm.conf is not set correctly!"
exit $OCF_ERR_ARGS
@@ -458,6 +462,11 @@ systemid_check()
exit $OCF_ERR_ARGS
fi

+ if [ -n "$source" ] && [ -n "$vol_list" ]; then
+ ocf_exit_reason "Both system_id_source & volume_list cannot be defined!"
+ exit $OCF_ERR_ARGS
+ fi
+
return $OCF_SUCCESS
}
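As context for the conflict check above, both settings can be inspected directly with lvmconfig; the sample output values are hypothetical:

    lvmconfig 'global/system_id_source'   # e.g. system_id_source="uname"
    lvmconfig 'activation/volume_list'    # e.g. volume_list=["vg00"]
    # the agent now refuses to start when both commands print a value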
@@ -0,0 +1,40 @@
From 264e38e02cb4c04877e412bac254e42c7f6b2e1c Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Tue, 20 Feb 2024 12:34:42 +0100
Subject: [PATCH] Filesystem: fail when leading or trailing whitespace is
present in device or directory parameters

---
heartbeat/Filesystem | 12 ++++++++++++
1 file changed, 12 insertions(+)

diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
index e1378f781..f88e3b552 100755
--- a/heartbeat/Filesystem
+++ b/heartbeat/Filesystem
@@ -995,6 +995,12 @@ if [ -n "${OCF_RESKEY_force_unmount}" ]; then
fi

DEVICE="$OCF_RESKEY_device"
+case "$DEVICE" in
+ [[:space:]]*|*[[:space:]])
+ ocf_exit_reason "device parameter does not accept leading or trailing whitespace characters"
+ exit $OCF_ERR_CONFIGURED
+ ;;
+esac
FSTYPE=$OCF_RESKEY_fstype
if [ ! -z "$OCF_RESKEY_options" ]; then
options="-o $OCF_RESKEY_options"
@@ -1032,6 +1038,12 @@ if [ -z "$OCF_RESKEY_directory" ]; then
else
MOUNTPOINT="$(echo "$OCF_RESKEY_directory" | sed 's/\/*$//')"
: ${MOUNTPOINT:=/}
+ case "$MOUNTPOINT" in
+ [[:space:]]*|*[[:space:]])
+ ocf_exit_reason "directory parameter does not accept leading or trailing whitespace characters"
+ exit $OCF_ERR_CONFIGURED
+ ;;
+ esac
if [ -e "$MOUNTPOINT" ] ; then
CANONICALIZED_MOUNTPOINT="$(readlink -f "$MOUNTPOINT")"
if [ $? -ne 0 ]; then
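A self-contained sketch of the whitespace guard used in both hunks (test values invented); the case pattern matches any value that begins or ends with a blank character:

    check_ws() {
        case "$1" in
        [[:space:]]*|*[[:space:]]) echo "rejected: '$1'" ;;
        *) echo "accepted: '$1'" ;;
        esac
    }
    check_ws "/dev/sdb1"    # accepted
    check_ws " /dev/sdb1"   # rejected: leading space
    check_ws "/mnt/data "   # rejected: trailing space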
@@ -0,0 +1,30 @@
From 1317efc72af6b72d9fb37aea18dc16129c146148 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Tue, 25 Jun 2024 13:33:19 +0200
Subject: [PATCH] Filesystem: return success during stop-action when leading or
trailing whitespace is present in device or directory parameters

---
heartbeat/Filesystem | 2 ++
1 file changed, 2 insertions(+)

diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
index 8e0127531..3eb520e0c 100755
--- a/heartbeat/Filesystem
+++ b/heartbeat/Filesystem
@@ -1037,6 +1037,7 @@ fi
DEVICE="$OCF_RESKEY_device"
case "$DEVICE" in
[[:space:]]*|*[[:space:]])
+ [ "$__OCF_ACTION" = "stop" ] && exit $OCF_SUCCESS
ocf_exit_reason "device parameter does not accept leading or trailing whitespace characters"
exit $OCF_ERR_CONFIGURED
;;
@@ -1080,6 +1081,7 @@ else
: ${MOUNTPOINT:=/}
case "$MOUNTPOINT" in
[[:space:]]*|*[[:space:]])
+ [ "$__OCF_ACTION" = "stop" ] && exit $OCF_SUCCESS
ocf_exit_reason "directory parameter does not accept leading or trailing whitespace characters"
exit $OCF_ERR_CONFIGURED
;;
@@ -0,0 +1,75 @@
From 4357f0dbb8668ac4090cd7070c2ea195e5683326 Mon Sep 17 00:00:00 2001
From: Damien Ciabrini <dciabrin@redhat.com>
Date: Wed, 24 Jan 2024 13:27:26 +0100
Subject: [PATCH] galera: allow joiner to report non-Primary during initial IST

It seems that with recent galera versions, when a galera node
joins a cluster, there is a small time window where the node is
connected to the primary component of the galera cluster, but it
might still be preparing its IST. During this time, it can report
itself as being 'not ready' and in 'non-primary' state.

Update the galera resource agent to allow the node to be in
non-primary state, but only if running a "promote" operation. Any
network partition during the promotion will be caught by the
promote timeout.

In reworking the promotion code, we move the check for primary
partition into the "galera_monitor" function. The check works
as before for regular "monitor" or "probe" operations.

Related-Bug: rhbz#2255414
---
heartbeat/galera.in | 25 +++++++++++++++++--------
1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/heartbeat/galera.in b/heartbeat/galera.in
index 6aed3e4b6d..b518595cb0 100755
--- a/heartbeat/galera.in
+++ b/heartbeat/galera.in
@@ -822,6 +822,11 @@ galera_promote()
return $rc
fi

+ # At this point, the mysql pidfile is created on disk and the
+ # mysql server is reacheable via its UNIX socket. If we are a
+ # joiner, SST transfers (rsync) have finished, but an IST may
+ # still be requested or ongoing
+
galera_monitor
rc=$?
if [ $rc != $OCF_SUCCESS -a $rc != $OCF_RUNNING_MASTER ]; then
@@ -835,12 +840,6 @@ galera_promote()
return $OCF_ERR_GENERIC
fi

- is_primary
- if [ $? -ne 0 ]; then
- ocf_exit_reason "Failure. Master instance started, but is not in Primary mode."
- return $OCF_ERR_GENERIC
- fi
-
if ocf_is_true $bootstrap; then
promote_everyone
clear_bootstrap_node
@@ -991,8 +990,18 @@ galera_monitor()
fi
rc=$OCF_RUNNING_MASTER
else
- ocf_exit_reason "local node <${NODENAME}> is started, but not in primary mode. Unknown state."
- rc=$OCF_ERR_GENERIC
+ # It seems that with recent galera (26.4+), a joiner that is
+ # connected to a Primary component and is preparing its IST
+ # request might still temporarily report its state as
+ # Non-Primary. Do not fail in this case as the promote
+ # operation will loop until the IST finishes or the promote
+ # times out.
+ if [ "$__OCF_ACTION" = "promote" ] && ! ocf_is_true $(is_bootstrap); then
+ ocf_log info "local node <${NODENAME}> is receiving a State Transfer."
+ else
+ ocf_exit_reason "local node <${NODENAME}> is started, but not in primary mode. Unknown state."
+ rc=$OCF_ERR_GENERIC
+ fi
fi

return $rc
25 SOURCES/RHEL-32265-1-findif.sh-fix-corner-cases.patch Normal file
@@ -0,0 +1,25 @@
From f717b4a3aa83c9124e62716f421b99e314d00233 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Fri, 12 Apr 2024 12:23:21 +0200
Subject: [PATCH] findif.sh: fix corner cases

---
heartbeat/findif.sh | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/heartbeat/findif.sh b/heartbeat/findif.sh
index 7c766e6e0..13484f827 100644
--- a/heartbeat/findif.sh
+++ b/heartbeat/findif.sh
@@ -215,9 +215,9 @@ findif()
fi
if [ -n "$nic" ] ; then
# NIC supports more than two.
- set -- $(ip -o -f $family route list match $match $scope | grep "dev $nic " | awk 'BEGIN{best=0} /\// { mask=$1; sub(".*/", "", mask); if( int(mask)>=best ) { best=int(mask); best_ln=$0; } } END{print best_ln}')
+ set -- $(ip -o -f $family route list match $match $scope | grep "dev $nic " | sed -e 's,^\([0-9.]\+\) ,\1/32 ,;s,^\([0-9a-f:]\+\) ,\1/128 ,' | sort -t/ -k2,2nr)
else
- set -- $(ip -o -f $family route list match $match $scope | awk 'BEGIN{best=0} /\// { mask=$1; sub(".*/", "", mask); if( int(mask)>=best ) { best=int(mask); best_ln=$0; } } END{print best_ln}')
+ set -- $(ip -o -f $family route list match $match $scope | sed -e 's,^\([0-9.]\+\) ,\1/32 ,;s,^\([0-9a-f:]\+\) ,\1/128 ,' | sort -t/ -k2,2nr)
fi
if [ $# = 0 ] ; then
case $OCF_RESKEY_ip in
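To see what the sed/sort replacement above does, feed it a canned route list (addresses invented): bare host routes are normalized to /32 (or /128 for IPv6) and the candidates are then ordered by prefix length, most specific first:

    printf '%s\n' \
        "10.0.0.0/24 dev eth0 proto kernel scope link" \
        "10.0.0.5 dev eth0 proto kernel scope link" |
        sed -e 's,^\([0-9.]\+\) ,\1/32 ,;s,^\([0-9a-f:]\+\) ,\1/128 ,' |
        sort -t/ -k2,2nr
    # -> 10.0.0.5/32 dev eth0 proto kernel scope link   (longest prefix wins)
    #    10.0.0.0/24 dev eth0 proto kernel scope link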
365 SOURCES/RHEL-32265-2-IPsrcaddr-add-IPv6-support.patch Normal file
@@ -0,0 +1,365 @@
From 12d73d53026d219be67c0d5353010ba08ab49e98 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Tue, 28 May 2024 09:45:55 +0200
Subject: [PATCH 1/3] findif.sh: add metric for IPv6 support and fail when
matching more than 1 route

---
heartbeat/findif.sh | 19 ++++++++++++++++---
1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/heartbeat/findif.sh b/heartbeat/findif.sh
index 13484f827..ca5d1a5c1 100644
--- a/heartbeat/findif.sh
+++ b/heartbeat/findif.sh
@@ -196,10 +196,13 @@ findif()
{
local match="$OCF_RESKEY_ip"
local family
+ local proto
local scope
local nic="$OCF_RESKEY_nic"
local netmask="$OCF_RESKEY_cidr_netmask"
local brdcast="$OCF_RESKEY_broadcast"
+ local metric
+ local routematch

echo $match | grep -qs ":"
if [ $? = 0 ] ; then
@@ -215,10 +218,19 @@ findif()
fi
if [ -n "$nic" ] ; then
# NIC supports more than two.
- set -- $(ip -o -f $family route list match $match $scope | grep "dev $nic " | sed -e 's,^\([0-9.]\+\) ,\1/32 ,;s,^\([0-9a-f:]\+\) ,\1/128 ,' | sort -t/ -k2,2nr)
+ routematch=$(ip -o -f $family route list match $match $proto $scope | grep "dev $nic " | sed -e 's,^\([0-9.]\+\) ,\1/32 ,;s,^\([0-9a-f:]\+\) ,\1/128 ,' | sort -t/ -k2,2nr)
else
- set -- $(ip -o -f $family route list match $match $scope | sed -e 's,^\([0-9.]\+\) ,\1/32 ,;s,^\([0-9a-f:]\+\) ,\1/128 ,' | sort -t/ -k2,2nr)
+ routematch=$(ip -o -f $family route list match $match $proto $scope | sed -e 's,^\([0-9.]\+\) ,\1/32 ,;s,^\([0-9a-f:]\+\) ,\1/128 ,' | sort -t/ -k2,2nr)
fi
+ if [ "$family" = "inet6" ]; then
+ routematch=$(echo "$routematch" | grep -v "^default")
+ fi
+
+ if [ $(echo "$routematch" | wc -l) -gt 1 ]; then
+ ocf_exit_reason "More than 1 routes match $match. Unable to decide which route to use."
+ return $OCF_ERR_GENERIC
+ fi
+ set -- $routematch
if [ $# = 0 ] ; then
case $OCF_RESKEY_ip in
127.*)
@@ -255,6 +267,7 @@ findif()
return $OCF_ERR_GENERIC
fi
fi
- echo "$nic netmask $netmask broadcast $brdcast"
+ metric=$(echo "$@" | sed "s/.*metric[[:blank:]]\([^ ]\+\).*/\1/")
+ echo "$nic netmask $netmask broadcast $brdcast metric $metric"
return $OCF_SUCCESS
}

From 488c096d63fe0f7e15938e65483ba20628080198 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Tue, 28 May 2024 09:47:11 +0200
Subject: [PATCH 2/3] IPaddr2: use metric for IPv6

---
heartbeat/IPaddr2 | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/heartbeat/IPaddr2 b/heartbeat/IPaddr2
index 5f30b8f98..091bea418 100755
--- a/heartbeat/IPaddr2
+++ b/heartbeat/IPaddr2
@@ -561,10 +561,11 @@ ip_init() {
if
[ $rc -eq 0 ]
then
- NICINFO=`echo "$NICINFO" | sed -e 's/netmask\ //;s/broadcast\ //'`
+ NICINFO=`echo "$NICINFO" | sed -e 's/netmask\ //;s/broadcast\ //;s/metric\ //'`
NIC=`echo "$NICINFO" | cut -d" " -f1`
NETMASK=`echo "$NICINFO" | cut -d" " -f2`
BRDCAST=`echo "$NICINFO" | cut -d" " -f3`
+ METRIC=`echo "$NICINFO" | cut -d" " -f4`
else
# findif couldn't find the interface
if ocf_is_probe; then
@@ -659,13 +660,14 @@ delete_interface () {
# Add an interface
#
add_interface () {
- local cmd msg extra_opts ipaddr netmask broadcast iface label
+ local cmd msg extra_opts ipaddr netmask broadcast iface label metric

ipaddr="$1"
netmask="$2"
broadcast="$3"
iface="$4"
label="$5"
+ metric="$6"

if [ "$FAMILY" = "inet" ] && ocf_is_true $OCF_RESKEY_run_arping &&
check_binary arping; then
@@ -688,6 +690,9 @@ add_interface () {
fi

extra_opts=""
+ if [ "$FAMILY" = "inet6" ]; then
+ extra_opts="$extra_opts metric $metric"
+ fi
if [ "$FAMILY" = "inet6" ] && ocf_is_true "${OCF_RESKEY_nodad}"; then
extra_opts="$extra_opts nodad"
fi
@@ -1083,7 +1088,7 @@ ip_start() {
done
fi

- add_interface $OCF_RESKEY_ip $NETMASK ${BRDCAST:-none} $NIC $IFLABEL
+ add_interface "$OCF_RESKEY_ip" "$NETMASK" "${BRDCAST:-none}" "$NIC" "$IFLABEL" "$METRIC"
rc=$?

if [ $rc -ne $OCF_SUCCESS ]; then

From d1c4d1969381d3e35cfaaaaae522e5687a9ed88a Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Tue, 28 May 2024 09:47:56 +0200
Subject: [PATCH 3/3] IPsrcaddr: add IPv6 support

---
heartbeat/IPsrcaddr | 116 ++++++++++++++++++++++++++++++++------------
1 file changed, 85 insertions(+), 31 deletions(-)

diff --git a/heartbeat/IPsrcaddr b/heartbeat/IPsrcaddr
index c732ce8df..1c87d5b7f 100755
--- a/heartbeat/IPsrcaddr
+++ b/heartbeat/IPsrcaddr
@@ -60,6 +60,7 @@ OCF_RESKEY_cidr_netmask_default=""
OCF_RESKEY_destination_default="0.0.0.0/0"
OCF_RESKEY_proto_default=""
OCF_RESKEY_metric_default=""
+OCF_RESKEY_pref_default=""
OCF_RESKEY_table_default=""

: ${OCF_RESKEY_ipaddress=${OCF_RESKEY_ipaddress_default}}
@@ -67,6 +68,7 @@ OCF_RESKEY_table_default=""
: ${OCF_RESKEY_destination=${OCF_RESKEY_destination_default}}
: ${OCF_RESKEY_proto=${OCF_RESKEY_proto_default}}
: ${OCF_RESKEY_metric=${OCF_RESKEY_metric_default}}
+: ${OCF_RESKEY_pref=${OCF_RESKEY_pref_default}}
: ${OCF_RESKEY_table=${OCF_RESKEY_table_default}}
#######################################################################

@@ -75,10 +77,13 @@ OCF_RESKEY_table_default=""

USAGE="usage: $0 {start|stop|status|monitor|validate-all|meta-data}";

- CMDSHOW="$IP2UTIL route show $TABLE to exact $OCF_RESKEY_destination"
-CMDCHANGE="$IP2UTIL route change to "
+echo "$OCF_RESKEY_ipaddress" | grep -q ":" && FAMILY="inet6" || FAMILY="inet"
+[ "$FAMILY" = "inet6" ] && [ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] && OCF_RESKEY_destination="::/0"

-if [ "$OCF_RESKEY_destination" != "0.0.0.0/0" ]; then
+ CMDSHOW="$IP2UTIL -f $FAMILY route show $TABLE to exact $OCF_RESKEY_destination"
+CMDCHANGE="$IP2UTIL -f $FAMILY route change to "
+
+if [ "$OCF_RESKEY_destination" != "0.0.0.0/0" ] && [ "$OCF_RESKEY_destination" != "::/0" ]; then
CMDSHOW="$CMDSHOW src $OCF_RESKEY_ipaddress"
fi

@@ -153,6 +158,14 @@ Metric. Only needed if incorrect metric value is used.
<content type="string" default="${OCF_RESKEY_metric_default}" />
</parameter>

+<parameter name="pref">
+<longdesc lang="en">
+IPv6 route preference (low, medium or high). Only needed if incorrect pref value is used.
+</longdesc>
+<shortdesc lang="en">IPv6 route preference.</shortdesc>
+<content type="string" default="${OCF_RESKEY_pref_default}" />
+</parameter>
+
<parameter name="table">
<longdesc lang="en">
Table to modify and use for interface lookup. E.g. "local".
@@ -196,12 +209,21 @@ errorexit() {
# where the src clause "src Y.Y.Y.Y" may or may not be present

WS="[[:blank:]]"
-OCTET="[0-9]\{1,3\}"
-IPADDR="\($OCTET\.\)\{3\}$OCTET"
+case "$FAMILY" in
+ inet)
+ GROUP="[0-9]\{1,3\}"
+ IPADDR="\($GROUP\.\)\{3\}$GROUP"
+ ;;
+ inet6)
+ GROUP="[0-9a-f]\{0,4\}"
+ IPADDR="\($GROUP\:\)\{0,\}$GROUP"
+ ;;
+esac
SRCCLAUSE="src$WS$WS*\($IPADDR\)"
-MATCHROUTE="\(.*${WS}\)\($SRCCLAUSE\)\($WS.*\|$\)"
-METRICCLAUSE=".*\(metric$WS[^ ]\+\)"
+MATCHROUTE="\(.*${WS}\)proto [^ ]\+\(.*${WS}\)\($SRCCLAUSE\)\($WS.*\|$\)"
+METRICCLAUSE=".*\(metric$WS[^ ]\+\).*"
PROTOCLAUSE=".*\(proto$WS[^ ]\+\).*"
+PREFCLAUSE=".*\(pref$WS[^ ]\+\).*"
FINDIF=findif

# findif needs that to be set
@@ -216,17 +238,17 @@ srca_read() {
errorexit "more than 1 matching route exists"

# But there might still be no matching route
- [ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] && [ -z "$ROUTE" ] && \
+ ([ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] || [ "$OCF_RESKEY_destination" = "::/0" ]) && [ -z "$ROUTE" ] && \
! ocf_is_probe && [ "$__OCF_ACTION" != stop ] && errorexit "no matching route exists"

# Sed out the source ip address if it exists
- SRCIP=`echo $ROUTE | sed -n "s/$MATCHROUTE/\3/p"`
+ SRCIP=`echo $ROUTE | sed -n "s/$MATCHROUTE/\4/p"`

# and what remains after stripping out the source ip address clause
- ROUTE_WO_SRC=`echo $ROUTE | sed "s/$MATCHROUTE/\1\5/"`
+ ROUTE_WO_SRC=`echo $ROUTE | sed "s/$MATCHROUTE/\1\2\6/"`

# using "src <ip>" only returns output if there's a match
- if [ "$OCF_RESKEY_destination" != "0.0.0.0/0" ]; then
+ if [ "$OCF_RESKEY_destination" != "0.0.0.0/0" ] && [ "$OCF_RESKEY_destination" != "::/0" ]; then
[ -z "$ROUTE" ] && return 1 || return 0
fi

@@ -249,12 +271,15 @@ srca_start() {
rc=$OCF_SUCCESS
ocf_log info "The ip route has been already set.($NETWORK, $INTERFACE, $ROUTE_WO_SRC)"
else
- $IP2UTIL route replace $TABLE $NETWORK dev $INTERFACE $PROTO src $1 $METRIC || \
- errorexit "command 'ip route replace $TABLE $NETWORK dev $INTERFACE $PROTO src $1 $METRIC' failed"
+ # NetworkManager manages routes with proto static/kernel
+ [ -z "$OCF_RESKEY_proto" ] && echo "$PROTO" | grep -q "proto \(kernel\|static\)" && PROTO="proto keepalived"

- if [ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] ;then
- $CMDCHANGE $ROUTE_WO_SRC src $1 || \
- errorexit "command '$CMDCHANGE $ROUTE_WO_SRC src $1' failed"
+ $IP2UTIL route replace $TABLE $NETWORK dev $INTERFACE $PROTO src $1 $METRIC $PREF || \
+ errorexit "command 'ip route replace $TABLE $NETWORK dev $INTERFACE $PROTO src $1 $METRIC $PREF' failed"
+
+ if [ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] || [ "$OCF_RESKEY_destination" = "::/0" ]; then
+ $CMDCHANGE $ROUTE_WO_SRC $PROTO src $1 || \
+ errorexit "command '$CMDCHANGE $ROUTE_WO_SRC $PROTO src $1' failed"
fi
rc=$?
fi
@@ -290,14 +315,15 @@ srca_stop() {
fi

PRIMARY_IP="$($IP2UTIL -4 -o addr show dev $INTERFACE primary | awk '{split($4,a,"/");print a[1]}')"
- OPTS="proto kernel scope $SCOPE src $PRIMARY_IP"
+ OPTS="proto kernel scope $SCOPE"
+ [ "$FAMILY" = "inet" ] && OPTS="$OPTS src $PRIMARY_IP"

- $IP2UTIL route replace $TABLE $NETWORK dev $INTERFACE $OPTS $METRIC || \
- errorexit "command 'ip route replace $TABLE $NETWORK dev $INTERFACE $OPTS $METRIC' failed"
+ $IP2UTIL route replace $TABLE $NETWORK dev $INTERFACE $OPTS $METRIC $PREF || \
+ errorexit "command 'ip route replace $TABLE $NETWORK dev $INTERFACE $OPTS $METRIC $PREF' failed"

- if [ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] ;then
- $CMDCHANGE $ROUTE_WO_SRC src $PRIMARY_IP || \
- errorexit "command '$CMDCHANGE $ROUTE_WO_SRC src $PRIMARY_IP' failed"
+ if [ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] || [ "$OCF_RESKEY_destination" = "::/0" ]; then
+ $CMDCHANGE $ROUTE_WO_SRC proto static || \
+ errorexit "command '$CMDCHANGE $ROUTE_WO_SRC proto static' failed"
fi

return $?
@@ -330,7 +356,7 @@ CheckIP() {
case $ip in
*[!0-9.]*) #got invalid char
false;;
- .*|*.) #begin or end by ".", which is invalid
+ .*|*.) #begin or end with ".", which is invalid
false;;
*..*) #consecutive ".", which is invalid
false;;
@@ -356,6 +382,18 @@ CheckIP() {
return $? # This return is unnecessary, this comment too :)
}

+CheckIP6() {
+ ip="$1"
+ case $ip in
+ *[!0-9a-f:]*) #got invalid char
+ false;;
+ *:::*) # more than 2 consecutive ":", which is invalid
+ false;;
+ *::*::*) # more than 1 "::", which is invalid
+ false;;
+ esac
+}
+
#
# Find out which interface or alias serves the given IP address
# The argument is an IP address, and its output
@@ -396,8 +434,7 @@ find_interface_solaris() {
# is an (aliased) interface name (e.g., "eth0" and "eth0:0").
#
find_interface_generic() {
-
- local iface=`$IP2UTIL -o -f inet addr show | grep "\ $BASEIP" \
+ local iface=`$IP2UTIL -o -f $FAMILY addr show | grep "\ $BASEIP" \
| cut -d ' ' -f2 | grep -v '^ipsec[0-9][0-9]*$'`
if [ -z "$iface" ]; then
return $OCF_ERR_GENERIC
@@ -502,7 +539,9 @@ srca_validate_all() {

# The IP address should be in good shape
if CheckIP "$ipaddress"; then
- :
+ :
+ elif CheckIP6 "$ipaddress"; then
+ :
else
ocf_exit_reason "Invalid IP address [$ipaddress]"
return $OCF_ERR_CONFIGURED
@@ -570,21 +609,36 @@ rc=$?
}

INTERFACE=`echo $findif_out | awk '{print $1}'`
-LISTROUTE=`$IP2UTIL route list dev $INTERFACE scope link $PROTO match $ipaddress`
+case "$FAMILY" in
+ inet)
+ LISTCMD="$IP2UTIL -f $FAMILY route list dev $INTERFACE scope link $PROTO match $ipaddress"
+ ;;
+ inet6)
+ LISTCMD="$IP2UTIL -f $FAMILY route list dev $INTERFACE $PROTO match $ipaddress"
+ ;;
+esac
+LISTROUTE=`$LISTCMD`
+
[ -z "$PROTO" ] && PROTO=`echo $LISTROUTE | sed -n "s/$PROTOCLAUSE/\1/p"`
if [ -n "$OCF_RESKEY_metric" ]; then
METRIC="metric $OCF_RESKEY_metric"
-elif [ -z "$TABLE" ] || [ "${TABLE#table }" = "main" ]; then
+elif [ -z "$TABLE" ] || [ "${TABLE#table }" = "main" ] || [ "$FAMILY" = "inet6" ]; then
METRIC=`echo $LISTROUTE | sed -n "s/$METRICCLAUSE/\1/p"`
else
METRIC=""
fi
-if [ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] ;then
+if [ "$FAMILY" = "inet6" ]; then
+ if [ -z "$OCF_RESKEY_pref" ]; then
+ PREF=`echo $LISTROUTE | sed -n "s/$PREFCLAUSE/\1/p"`
+ else
+ PREF="pref $OCF_RESKEY_pref"
+ fi
+fi
+if [ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] || [ "$OCF_RESKEY_destination" = "::/0" ] ;then
NETWORK=`echo $LISTROUTE | grep -m 1 -o '^[^ ]*'`

if [ -z "$NETWORK" ]; then
- err_str="command '$IP2UTIL route list dev $INTERFACE scope link $PROTO"
- err_str="$err_str match $ipaddress' failed to find a matching route"
+ err_str="command '$LISTCMD' failed to find a matching route"

if [ "$__OCF_ACTION" = "start" ]; then
ocf_exit_reason "$err_str"
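Assuming the pref parameter is registered exactly as shown in the metadata hunk above, an IPv6 configuration could look like this (resource name and address are placeholders):

    pcs resource create srcaddr6 ocf:heartbeat:IPsrcaddr \
        ipaddress=2001:db8::10 cidr_netmask=64 pref=medium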
@@ -0,0 +1,22 @@
From 4075aff88776e2811ebc83b735b2a70bcf46247f Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Mon, 24 Jun 2024 09:45:29 +0200
Subject: [PATCH] IPaddr2: only set metric value for IPv6 when detected

---
heartbeat/IPaddr2 | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/heartbeat/IPaddr2 b/heartbeat/IPaddr2
index 091bea418..3bc5abec1 100755
--- a/heartbeat/IPaddr2
+++ b/heartbeat/IPaddr2
@@ -690,7 +690,7 @@ add_interface () {
fi

extra_opts=""
- if [ "$FAMILY" = "inet6" ]; then
+ if [ "$FAMILY" = "inet6" ] && [ -n "$metric" ]; then
extra_opts="$extra_opts metric $metric"
fi
if [ "$FAMILY" = "inet6" ] && ocf_is_true "${OCF_RESKEY_nodad}"; then
@@ -0,0 +1,25 @@
From f561e272e9b7fe94ba598b70c6d2f44d034446ed Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Wed, 14 Aug 2024 12:05:54 +0200
Subject: [PATCH] findif.sh: ignore unreachable, blackhole, and prohibit routes

---
heartbeat/findif.sh | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/heartbeat/findif.sh b/heartbeat/findif.sh
index ca5d1a5c1..7b817f75c 100644
--- a/heartbeat/findif.sh
+++ b/heartbeat/findif.sh
@@ -218,9 +218,9 @@ findif()
fi
if [ -n "$nic" ] ; then
# NIC supports more than two.
- routematch=$(ip -o -f $family route list match $match $proto $scope | grep "dev $nic " | sed -e 's,^\([0-9.]\+\) ,\1/32 ,;s,^\([0-9a-f:]\+\) ,\1/128 ,' | sort -t/ -k2,2nr)
+ routematch=$(ip -o -f $family route list match $match $proto $scope | grep -v "^\(unreachable\|prohibit\|blackhole\)" | grep "dev $nic " | sed -e 's,^\([0-9.]\+\) ,\1/32 ,;s,^\([0-9a-f:]\+\) ,\1/128 ,' | sort -t/ -k2,2nr)
else
- routematch=$(ip -o -f $family route list match $match $proto $scope | sed -e 's,^\([0-9.]\+\) ,\1/32 ,;s,^\([0-9a-f:]\+\) ,\1/128 ,' | sort -t/ -k2,2nr)
+ routematch=$(ip -o -f $family route list match $match $proto $scope | grep -v "^\(unreachable\|prohibit\|blackhole\)" | sed -e 's,^\([0-9.]\+\) ,\1/32 ,;s,^\([0-9a-f:]\+\) ,\1/128 ,' | sort -t/ -k2,2nr)
fi
if [ "$family" = "inet6" ]; then
routematch=$(echo "$routematch" | grep -v "^default")
@@ -0,0 +1,36 @@
From f23ae9c1e9ff9a44a053c7c2378975ac5b807478 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Thu, 29 Aug 2024 16:24:02 +0200
Subject: [PATCH] IPsrcaddr: specify dev for default route, as e.g. fe80::
routes can be present on multiple interfaces

---
heartbeat/IPsrcaddr | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/heartbeat/IPsrcaddr b/heartbeat/IPsrcaddr
index 1c87d5b7f..58d89a280 100755
--- a/heartbeat/IPsrcaddr
+++ b/heartbeat/IPsrcaddr
@@ -278,8 +278,8 @@ srca_start() {
errorexit "command 'ip route replace $TABLE $NETWORK dev $INTERFACE $PROTO src $1 $METRIC $PREF' failed"

if [ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] || [ "$OCF_RESKEY_destination" = "::/0" ]; then
- $CMDCHANGE $ROUTE_WO_SRC $PROTO src $1 || \
- errorexit "command '$CMDCHANGE $ROUTE_WO_SRC $PROTO src $1' failed"
+ $CMDCHANGE $ROUTE_WO_SRC dev $INTERFACE $PROTO src $1 || \
+ errorexit "command '$CMDCHANGE $ROUTE_WO_SRC dev $INTERFACE $PROTO src $1' failed"
fi
rc=$?
fi
@@ -322,8 +322,8 @@ srca_stop() {
errorexit "command 'ip route replace $TABLE $NETWORK dev $INTERFACE $OPTS $METRIC $PREF' failed"

if [ "$OCF_RESKEY_destination" = "0.0.0.0/0" ] || [ "$OCF_RESKEY_destination" = "::/0" ]; then
- $CMDCHANGE $ROUTE_WO_SRC proto static || \
- errorexit "command '$CMDCHANGE $ROUTE_WO_SRC proto static' failed"
+ $CMDCHANGE $ROUTE_WO_SRC dev $INTERFACE proto static || \
+ errorexit "command '$CMDCHANGE $ROUTE_WO_SRC dev $INTERFACE proto static' failed"
fi

return $?
@@ -0,0 +1,110 @@
From 66a5308d2e8f61093716a076f4386416dc18045c Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Mon, 22 Apr 2024 11:26:09 +0200
Subject: [PATCH] Filesystem: fail when incorrect device mounted on mountpoint,
and dont unmount the mountpoint in this case, or if mountpoint set to "/"

---
heartbeat/Filesystem | 71 ++++++++++++++++++++++++++++++++++++--------
1 file changed, 58 insertions(+), 13 deletions(-)

diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
index e1378f781..cec71f1a6 100755
--- a/heartbeat/Filesystem
+++ b/heartbeat/Filesystem
@@ -582,10 +582,16 @@ Filesystem_start()
fi

# See if the device is already mounted.
- if Filesystem_status >/dev/null 2>&1 ; then
- ocf_log info "Filesystem $MOUNTPOINT is already mounted."
- return $OCF_SUCCESS
- fi
+ Filesystem_status
+ case "$?" in
+ $OCF_SUCCESS)
+ ocf_log info "Filesystem $MOUNTPOINT is already mounted."
+ return $OCF_SUCCESS
+ ;;
+ $OCF_ERR_CONFIGURED)
+ return $OCF_ERR_CONFIGURED
+ ;;
+ esac

fstype_supported || exit $OCF_ERR_INSTALLED

@@ -801,10 +807,42 @@ Filesystem_stop()
#
Filesystem_status()
{
- match_string="${TAB}${CANONICALIZED_MOUNTPOINT}${TAB}"
- if list_mounts | grep "$match_string" >/dev/null 2>&1; then
- rc=$OCF_SUCCESS
- msg="$MOUNTPOINT is mounted (running)"
+ local match_string="${TAB}${CANONICALIZED_MOUNTPOINT}${TAB}"
+ local mounted_device=$(list_mounts | grep "$match_string" | awk '{print $1}')
+
+ if [ -n "$mounted_device" ]; then
+ if [ "X$blockdevice" = "Xyes" ]; then
+ if [ -e "$DEVICE" ] ; then
+ local canonicalized_device="$(readlink -f "$DEVICE")"
+ if [ $? -ne 0 ]; then
+ ocf_exit_reason "Could not canonicalize $DEVICE because readlink failed"
+ exit $OCF_ERR_GENERIC
+ fi
+ else
+ local canonicalized_device="$DEVICE"
+ fi
+ if [ -e "$mounted_device" ] ; then
+ local canonicalized_mounted_device="$(readlink -f "$mounted_device")"
+ if [ $? -ne 0 ]; then
+ ocf_exit_reason "Could not canonicalize $mounted_device because readlink failed"
+ exit $OCF_ERR_GENERIC
+ fi
+ else
+ local canonicalized_mounted_device="$mounted_device"
+ fi
+ if [ "$canonicalized_device" != "$canonicalized_mounted_device" ]; then
+ if ocf_is_probe || [ "$__OCF_ACTION" = "stop" ]; then
+ ocf_log debug "Another device ($mounted_device) is already mounted on $MOUNTPOINT"
+ rc=$OCF_NOT_RUNNING
+ else
+ ocf_exit_reason "Another device ($mounted_device) is already mounted on $MOUNTPOINT"
+ rc=$OCF_ERR_CONFIGURED
+ fi
+ fi
+ else
+ rc=$OCF_SUCCESS
+ msg="$MOUNTPOINT is mounted (running)"
+ fi
else
rc=$OCF_NOT_RUNNING
msg="$MOUNTPOINT is unmounted (stopped)"
@@ -1041,9 +1079,18 @@ else
else
CANONICALIZED_MOUNTPOINT="$MOUNTPOINT"
fi
- # At this stage, $MOUNTPOINT does not contain trailing "/" unless it is "/"
- # TODO: / mounted via Filesystem sounds dangerous. On stop, we'll
- # kill the whole system. Is that a good idea?
+
+ if echo "$CANONICALIZED_MOUNTPOINT" | grep -q "^\s*/\s*$"; then
+ if ocf_is_probe; then
+ ocf_log debug "/ cannot be managed in a cluster"
+ exit $OCF_NOT_RUNNING
+ elif [ "$__OCF_ACTION" = "start" ] || [ "$__OCF_ACTION" = "monitor" ] || [ "$__OCF_ACTION" = "status" ]; then
+ ocf_exit_reason "/ cannot be managed in a cluster"
+ exit $OCF_ERR_CONFIGURED
+ elif [ "$__OCF_ACTION" = "stop" ]; then
+ exit $OCF_SUCCESS
+ fi
+ fi
fi

# Check to make sure the utilites are found
@@ -1124,5 +1171,3 @@ case $OP in
;;
esac
exit $?
-
-
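The device comparison added above relies on readlink -f so that different spellings of the same block device still match; a quick illustration with hypothetical paths:

    readlink -f /dev/disk/by-uuid/0a1b2c3d   # -> /dev/sdb1
    readlink -f /dev/sdb1                    # -> /dev/sdb1
    # both resolve to the same canonical node, so no false mismatch is reported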
@@ -0,0 +1,333 @@
From 7739c2a802c1dddb6757ff75cf7f6582a89bd518 Mon Sep 17 00:00:00 2001
From: id <happytobi@tscoding.de>
Date: Fri, 31 May 2024 09:00:18 +0200
Subject: [PATCH] azure-events-az: update to API versions, add retry
functionality for metadata requests, update tests

---
heartbeat/azure-events-az.in | 117 ++++++++++++++++++++++++-----------
heartbeat/ocf.py | 50 +++++++++++++--
2 files changed, 126 insertions(+), 41 deletions(-)

diff --git a/heartbeat/azure-events-az.in b/heartbeat/azure-events-az.in
index 46d4d1f3d9..6d31e5abae 100644
--- a/heartbeat/azure-events-az.in
+++ b/heartbeat/azure-events-az.in
@@ -27,7 +27,7 @@ import ocf
##############################################################################


-VERSION = "0.10"
+VERSION = "0.20"
USER_AGENT = "Pacemaker-ResourceAgent/%s %s" % (VERSION, ocf.distro())

attr_globalPullState = "azure-events-az_globalPullState"
@@ -39,9 +39,6 @@ attr_healthstate = "#health-azure"
default_loglevel = ocf.logging.INFO
default_relevantEventTypes = set(["Reboot", "Redeploy"])

-global_pullMaxAttempts = 3
-global_pullDelaySecs = 1
-
##############################################################################

class attrDict(defaultdict):
@@ -71,16 +68,22 @@ class azHelper:
metadata_host = "http://169.254.169.254/metadata"
instance_api = "instance"
events_api = "scheduledevents"
- api_version = "2019-08-01"
+ events_api_version = "2020-07-01"
+ instance_api_version = "2021-12-13"

@staticmethod
- def _sendMetadataRequest(endpoint, postData=None):
+ def _sendMetadataRequest(endpoint, postData=None, api_version="2019-08-01"):
"""
Send a request to Azure's Azure Metadata Service API
"""
- url = "%s/%s?api-version=%s" % (azHelper.metadata_host, endpoint, azHelper.api_version)
+
+ retryCount = int(ocf.get_parameter("retry_count",3))
+ retryWaitTime = int(ocf.get_parameter("retry_wait",20))
+ requestTimeout = int(ocf.get_parameter("request_timeout",15))
+
+ url = "%s/%s?api-version=%s" % (azHelper.metadata_host, endpoint, api_version)
data = ""
- ocf.logger.debug("_sendMetadataRequest: begin; endpoint = %s, postData = %s" % (endpoint, postData))
+ ocf.logger.debug("_sendMetadataRequest: begin; endpoint = %s, postData = %s, retry_count = %s, retry_wait time = %s, request_timeout = %s" % (endpoint, postData, retryCount, retryWaitTime, requestTimeout))
ocf.logger.debug("_sendMetadataRequest: url = %s" % url)

if postData and type(postData) != bytes:
@@ -89,18 +92,37 @@ class azHelper:
req = urllib2.Request(url, postData)
req.add_header("Metadata", "true")
req.add_header("User-Agent", USER_AGENT)
- try:
- resp = urllib2.urlopen(req)
- except URLError as e:
- if hasattr(e, 'reason'):
- ocf.logger.warning("Failed to reach the server: %s" % e.reason)
- clusterHelper.setAttr(attr_globalPullState, "IDLE")
- elif hasattr(e, 'code'):
- ocf.logger.warning("The server couldn\'t fulfill the request. Error code: %s" % e.code)
- clusterHelper.setAttr(attr_globalPullState, "IDLE")
- else:
- data = resp.read()
- ocf.logger.debug("_sendMetadataRequest: response = %s" % data)
+
+ if retryCount > 0:
+ ocf.logger.debug("_sendMetadataRequest: retry enabled")
+
+ successful = None
+ for retry in range(retryCount+1):
+ try:
+ resp = urllib2.urlopen(req, timeout=requestTimeout)
+ except Exception as e:
+ excType = e.__class__.__name__
+ if excType == TimeoutError.__name__:
+ ocf.logger.warning("Request timed out after %s seconds Error: %s" % (requestTimeout, e))
+ if excType == URLError.__name__:
+ if hasattr(e, 'reason'):
+ ocf.logger.warning("Failed to reach the server: %s" % e.reason)
+ elif hasattr(e, 'code'):
+ ocf.logger.warning("The server couldn\'t fulfill the request. Error code: %s" % e.code)
+
+ if retryCount > 1 and retry != retryCount:
+ ocf.logger.warning("Request failed, retry (%s/%s) wait %s seconds before retry (wait time)" % (retry + 1,retryCount,retryWaitTime))
+ time.sleep(retryWaitTime)
+
+ else:
+ data = resp.read()
+ ocf.logger.debug("_sendMetadataRequest: response = %s" % data)
+ successful = 1
+ break
+
+ # When no request was successful also with retry enabled, set the cluster to idle
+ if successful is None:
+ clusterHelper.setAttr(attr_globalPullState, "IDLE")

if data:
data = json.loads(data)
@@ -115,14 +137,15 @@ class azHelper:
"""
ocf.logger.debug("getInstanceInfo: begin")

- jsondata = azHelper._sendMetadataRequest(azHelper.instance_api)
+ jsondata = azHelper._sendMetadataRequest(azHelper.instance_api, None, azHelper.instance_api_version)
ocf.logger.debug("getInstanceInfo: json = %s" % jsondata)

if jsondata:
ocf.logger.debug("getInstanceInfo: finished, returning {}".format(jsondata["compute"]))
return attrDict(jsondata["compute"])
else:
- ocf.ocf_exit_reason("getInstanceInfo: Unable to get instance info")
+ apiCall = "%s/%s?api-version=%s" % (azHelper.metadata_host, azHelper.instance_api, azHelper.instance_api_version)
+ ocf.ocf_exit_reason("getInstanceInfo: Unable to get instance info - call: %s" % apiCall)
sys.exit(ocf.OCF_ERR_GENERIC)

@staticmethod
@@ -132,11 +155,17 @@ class azHelper:
"""
ocf.logger.debug("pullScheduledEvents: begin")

- jsondata = azHelper._sendMetadataRequest(azHelper.events_api)
+ jsondata = azHelper._sendMetadataRequest(azHelper.events_api, None, azHelper.events_api_version)
ocf.logger.debug("pullScheduledEvents: json = %s" % jsondata)

- ocf.logger.debug("pullScheduledEvents: finished")
- return attrDict(jsondata)
+ if jsondata:
+ ocf.logger.debug("pullScheduledEvents: finished")
+ return attrDict(jsondata)
+ else:
+ apiCall = "%s/%s?api-version=%s" % (azHelper.metadata_host, azHelper.events_api, azHelper.events_api_version)
+ ocf.ocf_exit_reason("pullScheduledEvents: Unable to get scheduledevents info - call: %s" % apiCall)
+ sys.exit(ocf.OCF_ERR_GENERIC)
+

@staticmethod
def forceEvents(eventIDs):
@@ -534,7 +563,7 @@ class Node:
except ValueError:
# Handle the exception
ocf.logger.warn("Health attribute %s on node %s cannot be converted to an integer value" % (healthAttributeStr, node))
-
+
ocf.logger.debug("isNodeInStandby: finished - result %s" % isInStandy)
return isInStandy

@@ -584,7 +613,7 @@ class raAzEvents:

def monitor(self):
ocf.logger.debug("monitor: begin")
-
+
events = azHelper.pullScheduledEvents()

# get current document version
@@ -600,21 +629,21 @@ class raAzEvents:
ocf.logger.info("monitor: already handled curDocVersion, skip")
return ocf.OCF_SUCCESS

- localAzEventIDs = set()
+ localAzEventIds = dict()
for e in localEvents:
- localAzEventIDs.add(e.EventId)
+ localAzEventIds[e.EventId] = json.dumps(e)

curState = self.node.getState()
clusterEventIDs = self.node.getEventIDs()

ocf.logger.debug("monitor: curDocVersion has not been handled yet")
-
+
if clusterEventIDs:
# there are pending events set, so our state must be STOPPING or IN_EVENT
i = 0; touchedEventIDs = False
while i < len(clusterEventIDs):
# clean up pending events that are already finished according to AZ
- if clusterEventIDs[i] not in localAzEventIDs:
+ if clusterEventIDs[i] not in localAzEventIds.keys():
ocf.logger.info("monitor: remove finished local clusterEvent %s" % (clusterEventIDs[i]))
clusterEventIDs.pop(i)
touchedEventIDs = True
@@ -644,12 +673,12 @@ class raAzEvents:
ocf.logger.info("monitor: all local events finished, but some resources have not completed startup yet -> wait")
else:
if curState == AVAILABLE:
- if len(localAzEventIDs) > 0:
+ if len(localAzEventIds) > 0:
if clusterHelper.otherNodesAvailable(self.node):
- ocf.logger.info("monitor: can handle local events %s -> set state STOPPING" % (str(localAzEventIDs)))
- curState = self.node.updateNodeStateAndEvents(STOPPING, localAzEventIDs)
+ ocf.logger.info("monitor: can handle local events %s -> set state STOPPING - %s" % (str(list(localAzEventIds.keys())), str(list(localAzEventIds.values()))))
+ curState = self.node.updateNodeStateAndEvents(STOPPING, localAzEventIds.keys())
else:
- ocf.logger.info("monitor: cannot handle azEvents %s (only node available) -> set state ON_HOLD" % str(localAzEventIDs))
+ ocf.logger.info("monitor: cannot handle azEvents %s (only node available) -> set state ON_HOLD - %s" % (str(list(localAzEventIds.keys())), str(list(localAzEventIds.values()))))
self.node.setState(ON_HOLD)
else:
ocf.logger.debug("monitor: no local azEvents to handle")
@@ -761,6 +790,24 @@ def main():
longdesc="Set to true to enable verbose logging",
content_type="boolean",
default="false")
+ agent.add_parameter(
+ "retry_count",
+ shortdesc="Azure IMDS webservice retry count",
+ longdesc="Set to any number bigger than zero to enable retry count",
+ content_type="integer",
+ default="3")
+ agent.add_parameter(
+ "retry_wait",
+ shortdesc="Configure a retry wait time",
+ longdesc="Set retry wait time in seconds",
+ content_type="integer",
+ default="20")
+ agent.add_parameter(
+ "request_timeout",
+ shortdesc="Configure a request timeout",
+ longdesc="Set request timeout in seconds",
+ content_type="integer",
+ default="15")
agent.add_action("start", timeout=10, handler=lambda: ocf.OCF_SUCCESS)
agent.add_action("stop", timeout=10, handler=lambda: ocf.OCF_SUCCESS)
agent.add_action("validate-all", timeout=20, handler=validate_action)
diff --git a/heartbeat/ocf.py b/heartbeat/ocf.py
index dda2fed4bb..571cd19664 100644
--- a/heartbeat/ocf.py
+++ b/heartbeat/ocf.py
@@ -16,7 +16,7 @@
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-#
+#

import sys, os, logging, syslog

@@ -42,19 +42,19 @@
# OCF does not include the concept of master/slave resources so we
# need to extend it so we can discover a resource's complete state.
#
-# OCF_RUNNING_MASTER:
+# OCF_RUNNING_MASTER:
# The resource is in "master" mode and fully operational
# OCF_FAILED_MASTER:
# The resource is in "master" mode but in a failed state
-#
+#
# The extra two values should only be used during a probe.
#
# Probes are used to discover resources that were started outside of
# the CRM and/or left behind if the LRM fails.
-#
+#
# They can be identified in RA scripts by checking for:
# [ "${__OCF_ACTION}" = "monitor" -a "${OCF_RESKEY_CRM_meta_interval}" = "0" ]
-#
+#
# Failed "slaves" should continue to use: OCF_ERR_GENERIC
# Fully operational "slaves" should continue to use: OCF_SUCCESS
|
#
|
||||||
|
@@ -451,15 +451,17 @@ def value_for_parameter(param):
|
||||||
|
sys.exit(OCF_ERR_UNIMPLEMENTED)
|
||||||
|
|
||||||
|
|
||||||
|
+
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import unittest
|
||||||
|
+ import logging
|
||||||
|
|
||||||
|
class TestMetadata(unittest.TestCase):
|
||||||
|
def test_noparams_noactions(self):
|
||||||
|
m = Agent("foo", shortdesc="shortdesc", longdesc="longdesc")
|
||||||
|
self.assertEqual("""<?xml version="1.0"?>
|
||||||
|
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
|
||||||
|
-<resource-agent name="foo">
|
||||||
|
+<resource-agent name="foo" version="1.0">
|
||||||
|
<version>1.0</version>
|
||||||
|
<longdesc lang="en">
|
||||||
|
longdesc
|
||||||
|
@@ -483,4 +485,40 @@ def test_params_actions(self):
|
||||||
|
m.add_action("start")
|
||||||
|
self.assertEqual(str(m.actions[0]), '<action name="start" />\n')
|
||||||
|
|
||||||
|
+ def test_retry_params_actions(self):
|
||||||
|
+ log= logging.getLogger( "test_retry_params_actions" )
|
||||||
|
+
|
||||||
|
+ m = Agent("foo", shortdesc="shortdesc", longdesc="longdesc")
|
||||||
|
+ m.add_parameter(
|
||||||
|
+ "retry_count",
|
||||||
|
+ shortdesc="Azure ims webservice retry count",
|
||||||
|
+ longdesc="Set to any number bigger than zero to enable retry count",
|
||||||
|
+ content_type="integer",
|
||||||
|
+ default="0")
|
||||||
|
+ m.add_parameter(
|
||||||
|
+ "retry_wait",
|
||||||
|
+ shortdesc="Configure a retry wait time",
|
||||||
|
+ longdesc="Set retry wait time in seconds",
|
||||||
|
+ content_type="integer",
|
||||||
|
+ default="20")
|
||||||
|
+ m.add_parameter(
|
||||||
|
+ "request_timeout",
|
||||||
|
+ shortdesc="Configure a request timeout",
|
||||||
|
+ longdesc="Set request timeout in seconds",
|
||||||
|
+ content_type="integer",
|
||||||
|
+ default="15")
|
||||||
|
+
|
||||||
|
+ m.add_action("start")
|
||||||
|
+
|
||||||
|
+ log.debug( "actions= %s", str(m.actions[0] ))
|
||||||
|
+ self.assertEqual(str(m.actions[0]), '<action name="start" />\n')
|
||||||
|
+
|
||||||
|
+ log.debug( "parameters= %s", str(m.parameters[0] ))
|
||||||
|
+ log.debug( "parameters= %s", str(m.parameters[1] ))
|
||||||
|
+ log.debug( "parameters= %s", str(m.parameters[2] ))
|
||||||
|
+ self.assertEqual(str(m.parameters[0]), '<parameter name="retry_count">\n<longdesc lang="en">Set to any number bigger than zero to enable retry count</longdesc>\n<shortdesc lang="en">Azure ims webservice retry count</shortdesc>\n<content type="integer" default="0" />\n</parameter>\n')
|
||||||
|
+ self.assertEqual(str(m.parameters[1]), '<parameter name="retry_wait">\n<longdesc lang="en">Set retry wait time in seconds</longdesc>\n<shortdesc lang="en">Configure a retry wait time</shortdesc>\n<content type="integer" default="20" />\n</parameter>\n')
|
||||||
|
+ self.assertEqual(str(m.parameters[2]), '<parameter name="request_timeout">\n<longdesc lang="en">Set request timeout in seconds</longdesc>\n<shortdesc lang="en">Configure a request timeout</shortdesc>\n<content type="integer" default="15" />\n</parameter>\n')
|
||||||
|
+
|
||||||
|
+ logging.basicConfig( stream=sys.stderr )
|
||||||
|
unittest.main()
|
||||||
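
Editor's note (not part of the patch): the retry_count/retry_wait/request_timeout parameters added above configure how the agent retries the Azure IMDS call. A minimal Python sketch of that retry shape, assuming a plain urllib.request client; fetch_metadata and the header handling are illustrative, not the agent's actual code:

    import time
    import urllib.request

    def fetch_metadata(url, retry_count=3, retry_wait=20, request_timeout=15):
        # Azure IMDS requires the "Metadata: true" header; the URL is assumed.
        req = urllib.request.Request(url, headers={"Metadata": "true"})
        for attempt in range(retry_count + 1):
            try:
                with urllib.request.urlopen(req, timeout=request_timeout) as resp:
                    return resp.read()
            except OSError:  # URLError and socket timeouts both subclass OSError
                if attempt == retry_count:
                    raise
                time.sleep(retry_wait)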
SOURCES/RHEL-42513-1-powervs-subnet-new-ra.patch (new file, 1165 lines)
File diff suppressed because it is too large
@@ -0,0 +1,84 @@
From 277370f569b34e1cfb49637f9a00afc20bcd4c54 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Wed, 17 Jul 2024 10:43:29 +0200
Subject: [PATCH] build: dont build powervs-subnet if dependencies are missing

---
 configure.ac          | 9 +++++++++
 doc/man/Makefile.am   | 5 ++++-
 heartbeat/Makefile.am | 5 ++++-
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/configure.ac b/configure.ac
index b785e2c2c..21ce27423 100644
--- a/configure.ac
+++ b/configure.ac
@@ -519,6 +519,8 @@ fi

AC_PYTHON_MODULE(json)
AC_PYTHON_MODULE(pyroute2)
+AC_PYTHON_MODULE(requests)
+AC_PYTHON_MODULE(urllib3)

AS_VERSION_COMPARE([$PYTHON_VERSION], [3.6], [BUILD_OCF_PY=0], [BUILD_OCF_PY=1], [BUILD_OCF_PY=1])

@@ -557,6 +559,13 @@ if test -z "$PYTHON" || test $BUILD_OCF_PY -eq 0; then
fi
AM_CONDITIONAL(BUILD_GCP_VPC_MOVE_VIP, test $BUILD_GCP_VPC_MOVE_VIP -eq 1)

+BUILD_POWERVS_SUBNET=1
+if test -z "$PYTHON" || test $BUILD_OCF_PY -eq 0 || test "x${HAVE_PYMOD_REQUESTS}" != xyes || test "x${HAVE_PYMOD_URLLIB3}" != xyes; then
+        BUILD_POWERVS_SUBNET=0
+        AC_MSG_WARN("Not building powervs-subnet")
+fi
+AM_CONDITIONAL(BUILD_POWERVS_SUBNET, test $BUILD_POWERVS_SUBNET -eq 1)
+
AC_PATH_PROGS(ROUTE, route)
AC_DEFINE_UNQUOTED(ROUTE, "$ROUTE", path to route command)

diff --git a/doc/man/Makefile.am b/doc/man/Makefile.am
index e577e6357..ef7639bff 100644
--- a/doc/man/Makefile.am
+++ b/doc/man/Makefile.am
@@ -190,7 +190,6 @@ man_MANS = ocf_heartbeat_AoEtarget.7 \
                          ocf_heartbeat_portblock.7 \
                          ocf_heartbeat_postfix.7 \
                          ocf_heartbeat_pound.7 \
-                          ocf_heartbeat_powervs-subnet.7 \
                          ocf_heartbeat_proftpd.7 \
                          ocf_heartbeat_rabbitmq-cluster.7 \
                          ocf_heartbeat_rabbitmq-server-ha.7 \
@@ -238,6 +237,10 @@ if BUILD_GCP_VPC_MOVE_VIP
man_MANS += ocf_heartbeat_gcp-vpc-move-vip.7
endif

+if BUILD_POWERVS_SUBNET
+man_MANS += ocf_heartbeat_powervs-subnet.7
+endif
+
xmlfiles = $(man_MANS:.7=.xml)

%.1 %.5 %.7 %.8: %.xml
diff --git a/heartbeat/Makefile.am b/heartbeat/Makefile.am
index ff73a15aa..409847970 100644
--- a/heartbeat/Makefile.am
+++ b/heartbeat/Makefile.am
@@ -162,7 +162,6 @@ ocf_SCRIPTS = AoEtarget \
                          portblock \
                          postfix \
                          pound \
-                          powervs-subnet \
                          proftpd \
                          rabbitmq-cluster \
                          rabbitmq-server-ha \
@@ -207,6 +206,10 @@ if BUILD_GCP_VPC_MOVE_VIP
ocf_SCRIPTS += gcp-vpc-move-vip
endif

+if BUILD_POWERVS_SUBNET
+ocf_SCRIPTS += powervs-subnet
+endif
+
ocfcommondir = $(OCF_LIB_DIR_PREFIX)/heartbeat
ocfcommon_DATA = ocf-shellfuncs \
                        ocf-binaries \
SOURCES/RHEL-42513-powervs-subnet-wait-for-IP.patch (new file, 43 lines)
@@ -0,0 +1,43 @@
From 0b4bf9c23eb60455da6c6a16c1df19282ab2a8b5 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Fri, 9 Jan 2026 12:56:14 +0100
Subject: [PATCH] powervs-subnet: wait until IP is activated before running
 monitor-check

---
 heartbeat/powervs-subnet.in | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/heartbeat/powervs-subnet.in b/heartbeat/powervs-subnet.in
index 84e86c0c4..062b1235e 100755
--- a/heartbeat/powervs-subnet.in
+++ b/heartbeat/powervs-subnet.in
@@ -243,7 +243,16 @@ class nmcli:

    @staticmethod
    def up(name, **kwargs):
-        return nmcli._nmcli_cmd("connection", "up", name, **kwargs)
+        nmcli._nmcli_cmd("connection", "up", name, **kwargs)
+
+        for i in range(1, 10):
+            time.sleep(1)
+            status = nmcli._nmcli_cmd("connection", "show", name, **kwargs)
+            if len(status.get("IP4.ADDRESS[1]", "")) > 0:
+                return ocf.OCF_SUCCESS
+            ocf.logger.warning(f"nmcli.connection.up: check {i} of 10: IP not yet available.")
+
+        return ocf.OCF_ERR_GENERIC

    @staticmethod
    def find(match_key, match_value):
@@ -824,7 +833,9 @@ def start_action(
        conn_options.update({"802-3-ethernet.mtu": "9000", "ethtool.feature-tso": "on"})

        nmcli.connection.add(conn_name, options=conn_options)
-        nmcli.connection.up(conn_name)
+        rc = nmcli.connection.up(conn_name)
+        if rc != ocf.OCF_SUCCESS:
+            return rc

        if monitor_action(**res_options) != ocf.OCF_SUCCESS:
            raise PowerCloudAPIError(f"start_action: start subnet: {ws.subnet_name} failed")
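
Editor's note: the up() change above polls until NetworkManager reports an IPv4 address before declaring the connection started. A rough standalone equivalent using nmcli's terse get-values output (the -g flag and field name are assumptions based on nmcli's documented interface, not the agent's code):

    import subprocess
    import time

    def wait_for_ipv4(conn_name, checks=10, interval=1.0):
        for _ in range(checks):
            time.sleep(interval)
            out = subprocess.run(
                ["nmcli", "-g", "IP4.ADDRESS", "connection", "show", conn_name],
                capture_output=True, text=True,
            ).stdout.strip()
            if out:  # a non-empty address line means the IP is active
                return True
        return False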
@@ -0,0 +1,61 @@
From 481672f73d05666ab20a883cf8fc746cb1f3050f Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Thu, 20 Jun 2024 09:29:21 +0200
Subject: [PATCH] galera/mariadb/mysql/redis: remove Unpromoted monitor-action,
 as it's covered by the regular monitor-action

---
 heartbeat/galera.in  | 1 -
 heartbeat/mariadb.in | 1 -
 heartbeat/mysql      | 1 -
 heartbeat/redis.in   | 1 -
 4 files changed, 4 deletions(-)

diff --git a/heartbeat/galera.in b/heartbeat/galera.in
index b518595cb0..b29d68bf73 100755
--- a/heartbeat/galera.in
+++ b/heartbeat/galera.in
@@ -299,7 +299,6 @@ Use it with caution! (and fencing)
<action name="status" timeout="60s" />
<action name="monitor" depth="0" timeout="30s" interval="20s" />
<action name="monitor" role="Promoted" depth="0" timeout="30s" interval="10s" />
-<action name="monitor" role="Unpromoted" depth="0" timeout="30s" interval="30s" />
<action name="promote" timeout="300s" />
<action name="demote" timeout="120s" />
<action name="validate-all" timeout="5s" />
diff --git a/heartbeat/mariadb.in b/heartbeat/mariadb.in
index e0f1f3c9f1..1dca98ba68 100644
--- a/heartbeat/mariadb.in
+++ b/heartbeat/mariadb.in
@@ -255,7 +255,6 @@ The port on which the Promoted MariaDB instance is listening.
<action name="status" timeout="60s" />
<action name="monitor" depth="0" timeout="30s" interval="20s" />
<action name="monitor" role="Promoted" depth="0" timeout="30s" interval="10s" />
-<action name="monitor" role="Unpromoted" depth="0" timeout="30s" interval="30s" />
<action name="promote" timeout="120s" />
<action name="demote" timeout="120s" />
<action name="notify" timeout="90s" />
diff --git a/heartbeat/mysql b/heartbeat/mysql
index 1df2fc0f28..6b00889ff4 100755
--- a/heartbeat/mysql
+++ b/heartbeat/mysql
@@ -322,7 +322,6 @@ whether a node is usable for clients to read from.</shortdesc>
<action name="status" timeout="60s" />
<action name="monitor" depth="0" timeout="30s" interval="20s" />
<action name="monitor" role="Promoted" depth="0" timeout="30s" interval="10s" />
-<action name="monitor" role="Unpromoted" depth="0" timeout="30s" interval="30s" />
<action name="promote" timeout="120s" />
<action name="demote" timeout="120s" />
<action name="notify" timeout="90s" />
diff --git a/heartbeat/redis.in b/heartbeat/redis.in
index 6429477e11..1e541f13d5 100755
--- a/heartbeat/redis.in
+++ b/heartbeat/redis.in
@@ -221,7 +221,6 @@ is in use.
<action name="status" timeout="60s" />
<action name="monitor" depth="0" timeout="60s" interval="45s" />
<action name="monitor" role="Promoted" depth="0" timeout="60s" interval="20s" />
-<action name="monitor" role="Unpromoted" depth="0" timeout="60s" interval="60s" />
<action name="promote" timeout="120s" />
<action name="demote" timeout="120s" />
<action name="notify" timeout="90s" />
@@ -1,48 +0,0 @@
From accff72ecc2f6cf5a76d9570198a93ac7c90270e Mon Sep 17 00:00:00 2001
From: Quentin Pradet <quentin.pradet@gmail.com>
Date: Mon, 17 Jun 2024 11:09:06 +0400
Subject: [PATCH] Merge pull request from GHSA-34jh-p97f-mpxf

* Strip Proxy-Authorization header on redirects

* Fix test_retry_default_remove_headers_on_redirect

* Set release date
---
 CHANGES.rst                               |  5 +++++
 src/urllib3/util/retry.py                 |  4 +++-
 test/test_retry.py                        |  6 ++++-
 test/with_dummyserver/test_poolmanager.py | 27 ++++++++++++++++++++---
 4 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/aliyun/aliyunsdkcore/vendored/requests/packages/urllib3/util/retry.py b/aliyun/aliyunsdkcore/vendored/requests/packages/urllib3/util/retry.py
index 7a76a4a6ad..0456cceba4 100644
--- a/aliyun/aliyunsdkcore/vendored/requests/packages/urllib3/util/retry.py
+++ b/aliyun/aliyunsdkcore/vendored/requests/packages/urllib3/util/retry.py
@@ -189,7 +189,9 @@ class Retry:
    RETRY_AFTER_STATUS_CODES = frozenset([413, 429, 503])

    #: Default headers to be used for ``remove_headers_on_redirect``
-    DEFAULT_REMOVE_HEADERS_ON_REDIRECT = frozenset(["Cookie", "Authorization"])
+    DEFAULT_REMOVE_HEADERS_ON_REDIRECT = frozenset(
+        ["Cookie", "Authorization", "Proxy-Authorization"]
+    )

    #: Default maximum backoff time.
    DEFAULT_BACKOFF_MAX = 120

diff --git a/gcp/google-cloud-sdk/lib/third_party/urllib3/util/retry.py b/gcp/google-cloud-sdk/lib/third_party/urllib3/util/retry.py
index 7a76a4a6ad..0456cceba4 100644
--- a/gcp/google-cloud-sdk/lib/third_party/urllib3/util/retry.py
+++ b/gcp/google-cloud-sdk/lib/third_party/urllib3/util/retry.py
@@ -189,7 +189,9 @@ class Retry:
    RETRY_AFTER_STATUS_CODES = frozenset([413, 429, 503])

    #: Default headers to be used for ``remove_headers_on_redirect``
-    DEFAULT_REMOVE_HEADERS_ON_REDIRECT = frozenset(["Cookie", "Authorization"])
+    DEFAULT_REMOVE_HEADERS_ON_REDIRECT = frozenset(
+        ["Cookie", "Authorization", "Proxy-Authorization"]
+    )

    #: Default maximum backoff time.
    DEFAULT_BACKOFF_MAX = 120
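
Editor's note: the urllib3 change above only alters the default header set; callers can still choose their own through the documented Retry knob. For example:

    import urllib3
    from urllib3.util.retry import Retry

    # Opting back out of stripping Proxy-Authorization (sketch; usually unwise).
    retry = Retry(redirect=5, remove_headers_on_redirect=["Cookie", "Authorization"])
    http = urllib3.PoolManager(retries=retry)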
@@ -1,201 +0,0 @@
--- a/setuptools/package_index.py	1980-01-01 09:00:00.000000000 +0100
+++ b/setuptools/package_index.py	2024-07-25 10:11:40.537307665 +0200
@@ -1,5 +1,6 @@
"""PyPI and direct package downloading"""
import sys
+import subprocess
import os
import re
import shutil
@@ -563,7 +564,7 @@
        scheme = URL_SCHEME(spec)
        if scheme:
            # It's a url, download it to tmpdir
-            found = self._download_url(scheme.group(1), spec, tmpdir)
+            found = self._download_url(spec, tmpdir)
            base, fragment = egg_info_for_url(spec)
            if base.endswith('.py'):
                found = self.gen_setup(found, fragment, tmpdir)
@@ -775,7 +776,7 @@
            raise DistutilsError("Download error for %s: %s"
                                 % (url, v))

-    def _download_url(self, scheme, url, tmpdir):
+    def _download_url(self, url, tmpdir):
        # Determine download filename
        #
        name, fragment = egg_info_for_url(url)
@@ -790,19 +791,59 @@

        filename = os.path.join(tmpdir, name)

-        # Download the file
-        #
-        if scheme == 'svn' or scheme.startswith('svn+'):
-            return self._download_svn(url, filename)
-        elif scheme == 'git' or scheme.startswith('git+'):
-            return self._download_git(url, filename)
-        elif scheme.startswith('hg+'):
-            return self._download_hg(url, filename)
-        elif scheme == 'file':
-            return urllib.request.url2pathname(urllib.parse.urlparse(url)[2])
-        else:
-            self.url_ok(url, True)  # raises error if not allowed
-            return self._attempt_download(url, filename)
+        return self._download_vcs(url, filename) or self._download_other(url, filename)
+
+    @staticmethod
+    def _resolve_vcs(url):
+        """
+        >>> rvcs = PackageIndex._resolve_vcs
+        >>> rvcs('git+http://foo/bar')
+        'git'
+        >>> rvcs('hg+https://foo/bar')
+        'hg'
+        >>> rvcs('git:myhost')
+        'git'
+        >>> rvcs('hg:myhost')
+        >>> rvcs('http://foo/bar')
+        """
+        scheme = urllib.parse.urlsplit(url).scheme
+        pre, sep, post = scheme.partition('+')
+        # svn and git have their own protocol; hg does not
+        allowed = set(['svn', 'git'] + ['hg'] * bool(sep))
+        return next(iter({pre} & allowed), None)
+
+    def _download_vcs(self, url, spec_filename):
+        vcs = self._resolve_vcs(url)
+        if not vcs:
+            return
+        if vcs == 'svn':
+            raise DistutilsError(
+                f"Invalid config, SVN download is not supported: {url}"
+            )
+
+        filename, _, _ = spec_filename.partition('#')
+        url, rev = self._vcs_split_rev_from_url(url)
+
+        self.info(f"Doing {vcs} clone from {url} to {filename}")
+        subprocess.check_call([vcs, 'clone', '--quiet', url, filename])
+
+        co_commands = dict(
+            git=[vcs, '-C', filename, 'checkout', '--quiet', rev],
+            hg=[vcs, '--cwd', filename, 'up', '-C', '-r', rev, '-q'],
+        )
+        if rev is not None:
+            self.info(f"Checking out {rev}")
+            subprocess.check_call(co_commands[vcs])
+
+        return filename
+
+    def _download_other(self, url, filename):
+        scheme = urllib.parse.urlsplit(url).scheme
+        if scheme == 'file':  # pragma: no cover
+            return urllib.request.url2pathname(urllib.parse.urlparse(url).path)
+        # raise error if not allowed
+        self.url_ok(url, True)
+        return self._attempt_download(url, filename)

    def scan_url(self, url):
        self.process_url(url, True)
@@ -829,76 +870,37 @@
            os.unlink(filename)
            raise DistutilsError("Unexpected HTML page found at " + url)

-    def _download_svn(self, url, filename):
-        url = url.split('#', 1)[0]  # remove any fragment for svn's sake
-        creds = ''
-        if url.lower().startswith('svn:') and '@' in url:
-            scheme, netloc, path, p, q, f = urllib.parse.urlparse(url)
-            if not netloc and path.startswith('//') and '/' in path[2:]:
-                netloc, path = path[2:].split('/', 1)
-                auth, host = splituser(netloc)
-                if auth:
-                    if ':' in auth:
-                        user, pw = auth.split(':', 1)
-                        creds = " --username=%s --password=%s" % (user, pw)
-                    else:
-                        creds = " --username=" + auth
-                    netloc = host
-                    parts = scheme, netloc, url, p, q, f
-                    url = urllib.parse.urlunparse(parts)
-        self.info("Doing subversion checkout from %s to %s", url, filename)
-        os.system("svn checkout%s -q %s %s" % (creds, url, filename))
-        return filename
-
    @staticmethod
-    def _vcs_split_rev_from_url(url, pop_prefix=False):
-        scheme, netloc, path, query, frag = urllib.parse.urlsplit(url)
-
-        scheme = scheme.split('+', 1)[-1]
-
-        # Some fragment identification fails
-        path = path.split('#', 1)[0]
-
-        rev = None
-        if '@' in path:
-            path, rev = path.rsplit('@', 1)
-
-        # Also, discard fragment
-        url = urllib.parse.urlunsplit((scheme, netloc, path, query, ''))
-
-        return url, rev
-
-    def _download_git(self, url, filename):
-        filename = filename.split('#', 1)[0]
-        url, rev = self._vcs_split_rev_from_url(url, pop_prefix=True)
-
-        self.info("Doing git clone from %s to %s", url, filename)
-        os.system("git clone --quiet %s %s" % (url, filename))
+    def _vcs_split_rev_from_url(url):
+        """
+        Given a possible VCS URL, return a clean URL and resolved revision if any.

-        if rev is not None:
-            self.info("Checking out %s", rev)
-            os.system("(cd %s && git checkout --quiet %s)" % (
-                filename,
-                rev,
-            ))
+        >>> vsrfu = PackageIndex._vcs_split_rev_from_url
+        >>> vsrfu('git+https://github.com/pypa/setuptools@v69.0.0#egg-info=setuptools')
+        ('https://github.com/pypa/setuptools', 'v69.0.0')
+        >>> vsrfu('git+https://github.com/pypa/setuptools#egg-info=setuptools')
+        ('https://github.com/pypa/setuptools', None)
+        >>> vsrfu('http://foo/bar')
+        ('http://foo/bar', None)
+        """
+        parts = urllib.parse.urlsplit(url)

-        return filename
+        clean_scheme = parts.scheme.split('+', 1)[-1]

-    def _download_hg(self, url, filename):
-        filename = filename.split('#', 1)[0]
-        url, rev = self._vcs_split_rev_from_url(url, pop_prefix=True)
+        # Some fragment identification fails
+        no_fragment_path, _, _ = parts.path.partition('#')

-        self.info("Doing hg clone from %s to %s", url, filename)
-        os.system("hg clone --quiet %s %s" % (url, filename))
+        pre, sep, post = no_fragment_path.rpartition('@')
+        clean_path, rev = (pre, post) if sep else (post, None)

-        if rev is not None:
-            self.info("Updating to %s", rev)
-            os.system("(cd %s && hg up -C -r %s >&-)" % (
-                filename,
-                rev,
-            ))
+        resolved = parts._replace(
+            scheme=clean_scheme,
+            path=clean_path,
+            # discard the fragment
+            fragment='',
+        ).geturl()

-        return filename
+        return resolved, rev

    def debug(self, msg, *args):
        log.debug(msg, *args)
@@ -0,0 +1,43 @@
From 2ab2c832180dacb2e66d38541beae0957416eb96 Mon Sep 17 00:00:00 2001
From: Antonio Romito <aromito@redhat.com>
Date: Mon, 9 Sep 2024 17:30:38 +0200
Subject: [PATCH] Improve handling of "stopping" container removal in
 remove_container()

- Added handling for containers in a stopping state by checking the state and force-removing if necessary.
- Improved log messages to provide clearer information when force removal is needed.

Related: https://issues.redhat.com/browse/RHEL-58008
---
 heartbeat/podman | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/heartbeat/podman b/heartbeat/podman
index 53867bff20..643ec4d894 100755
--- a/heartbeat/podman
+++ b/heartbeat/podman
@@ -254,6 +254,13 @@ remove_container()
        ocf_run podman rm -v $CONTAINER
        rc=$?
        if [ $rc -ne 0 ]; then
+                if [ $rc -eq 2 ]; then
+                        if podman inspect --format '{{.State.Status}}' $CONTAINER | grep -wq "stopping"; then
+                                ocf_log err "Inactive container ${CONTAINER} is stuck in 'stopping' state. Force-remove it."
+                                ocf_run podman rm -f $CONTAINER
+                                rc=$?
+                        fi
+                fi
                # due to a podman bug (rhbz#1841485), sometimes a stopped
                # container can still be associated with Exec sessions, in
                # which case the "podman rm" has to be forced
@@ -517,8 +524,8 @@ podman_stop()
        # but the associated container exit code is -1. If that's the case,
        # assume there's no failure and continue with the rm as usual.
        if [ $rc -eq 125 ] && \
-                podman inspect --format '{{.State.Status}}:{{.State.ExitCode}}' $CONTAINER | grep -wq "stopped:-1"; then
-                ocf_log warn "Container ${CONTAINER} had an unexpected stop outcome. Trying to remove it anyway."
+                podman inspect --format '{{.State.Status}}:{{.State.ExitCode}}' $CONTAINER | grep -Eq '^(exited|stopped):-1$'; then
+                ocf_log err "Container ${CONTAINER} had an unexpected stop outcome. Trying to remove it anyway."
        else
                ocf_exit_reason "Failed to stop container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}."
                return $OCF_ERR_GENERIC
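
Editor's note: in outline, the new remove_container() behaviour is "plain rm first, force-remove only when the container is stuck in 'stopping'". A hedged Python rendering of that flow (podman CLI calls as in the patch; error handling simplified):

    import subprocess

    def remove_container(name):
        if subprocess.run(["podman", "rm", "-v", name]).returncode == 0:
            return True
        state = subprocess.run(
            ["podman", "inspect", "--format", "{{.State.Status}}", name],
            capture_output=True, text=True,
        ).stdout.strip()
        if state == "stopping":  # stuck container: escalate to force removal
            return subprocess.run(["podman", "rm", "-f", name]).returncode == 0
        return False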
@@ -0,0 +1,106 @@
From d66a52cfb25f5436255ecc65a407c0166a720146 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Tue, 3 Sep 2024 12:55:28 +0200
Subject: [PATCH 1/2] Filesystem: dont sleep during stop-action when there are
 no processes to kill

Thanks @SatomiOSAWA for the initial code.
---
 heartbeat/Filesystem | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
index 3eb520e0c..f54969f20 100755
--- a/heartbeat/Filesystem
+++ b/heartbeat/Filesystem
@@ -685,12 +685,13 @@ signal_processes() {
        pids=$(get_pids "$dir")
        if [ -z "$pids" ]; then
                ocf_log info "No processes on $dir were signalled. force_unmount is set to '$FORCE_UNMOUNT'"
-                return
+                return 1
        fi
        for pid in $pids; do
                ocf_log info "sending signal $sig to: $(ps -f $pid | tail -1)"
                kill -s $sig $pid
        done
+        return 0
}
try_umount() {
        local SUB="$1"
@@ -717,12 +718,13 @@ timeout_child() {
        return $ret
}
fs_stop_loop() {
-        local SUB="$1" signals="$2" sig
+        local SUB="$1" signals="$2" sig send_signal
        while true; do
+                send_signal=false
                for sig in $signals; do
-                        signal_processes "$SUB" $sig
+                        signal_processes "$SUB" $sig && send_signal=true
                done
-                sleep $OCF_RESKEY_signal_delay
+                $send_signal && sleep $OCF_RESKEY_signal_delay
                try_umount "$SUB" && return $OCF_SUCCESS
        done
}

From cb6aaffc260eea0f0fee6fab44393c6cf12b8a83 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Mon, 9 Sep 2024 10:58:12 +0200
Subject: [PATCH 2/2] Filesystem: only use $umount_force after sending
 kill_signals

---
 heartbeat/Filesystem | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
index f54969f20..4dd962fd9 100755
--- a/heartbeat/Filesystem
+++ b/heartbeat/Filesystem
@@ -694,8 +694,8 @@ signal_processes() {
        return 0
}
try_umount() {
-        local SUB="$1"
-        $UMOUNT $umount_force "$SUB"
+        local force_arg="$1" SUB="$2"
+        $UMOUNT $force_arg "$SUB"
        list_mounts | grep "${TAB}${SUB}${TAB}" >/dev/null 2>&1 || {
                ocf_log info "unmounted $SUB successfully"
                return $OCF_SUCCESS
@@ -718,14 +718,14 @@ timeout_child() {
        return $ret
}
fs_stop_loop() {
-        local SUB="$1" signals="$2" sig send_signal
+        local force_arg="$1" SUB="$2" signals="$3" sig send_signal
        while true; do
                send_signal=false
                for sig in $signals; do
                        signal_processes "$SUB" $sig && send_signal=true
                done
                $send_signal && sleep $OCF_RESKEY_signal_delay
-                try_umount "$SUB" && return $OCF_SUCCESS
+                try_umount "$force_arg" "$SUB" && return $OCF_SUCCESS
        done
}
fs_stop() {
@@ -733,13 +733,13 @@ fs_stop() {
        grace_time=$((timeout/2))

        # try gracefully terminating processes for up to half of the configured timeout
-        fs_stop_loop "$SUB" "$OCF_RESKEY_term_signals" &
+        fs_stop_loop "" "$SUB" "$OCF_RESKEY_term_signals" &
        timeout_child $! $grace_time
        ret=$?
        [ $ret -eq $OCF_SUCCESS ] && return $ret

        # try killing them for the rest of the timeout
-        fs_stop_loop "$SUB" "$OCF_RESKEY_kill_signals" &
+        fs_stop_loop "$umount_force" "$SUB" "$OCF_RESKEY_kill_signals" &
        timeout_child $! $grace_time
        ret=$?
        [ $ret -eq $OCF_SUCCESS ] && return $ret
@@ -0,0 +1,37 @@
From c72dc2f2e502486d93aeec26abc12e720b14a0a7 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Thu, 10 Oct 2024 16:41:03 +0200
Subject: [PATCH] azure-events*: use node name from cluster instead of hostname
 to avoid failing if they're not the same

---
 heartbeat/azure-events-az.in | 2 +-
 heartbeat/azure-events.in    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/heartbeat/azure-events-az.in b/heartbeat/azure-events-az.in
index 6d31e5aba..0ed001037 100644
--- a/heartbeat/azure-events-az.in
+++ b/heartbeat/azure-events-az.in
@@ -441,7 +441,7 @@ class Node:
        self.raOwner = ra
        self.azInfo = azHelper.getInstanceInfo()
        self.azName = self.azInfo.name
-        self.hostName = socket.gethostname()
+        self.hostName = clusterHelper._exec("crm_node", "-n")
        self.setAttr("azName", self.azName)
        clusterHelper.setAttr("hostName_%s" % self.azName, self.hostName)

diff --git a/heartbeat/azure-events.in b/heartbeat/azure-events.in
index 90acaba62..32f71ee26 100644
--- a/heartbeat/azure-events.in
+++ b/heartbeat/azure-events.in
@@ -411,7 +411,7 @@ class Node:
        self.raOwner = ra
        self.azInfo = azHelper.getInstanceInfo()
        self.azName = self.azInfo.name
-        self.hostName = socket.gethostname()
+        self.hostName = clusterHelper._exec("crm_node", "-n")
        self.setAttr("azName", self.azName)
        clusterHelper.setAttr("hostName_%s" % self.azName, self.hostName)

@@ -0,0 +1,100 @@
From f02afd0fadb581ca0fc9798beaf28044cf211200 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 18 Sep 2024 11:53:52 +0200
Subject: [PATCH 1/2] Filesystem: on stop, try umount directly, before scanning
 for users

48ed6e6d (Filesystem: improve stop-action and allow setting term/kill signals and signal_delay for large filesystems, 2023-07-04)
changed the logic from
"try umount; if that fails, find and kill users; repeat" to
"try to find and kill users; then try umount; repeat"

But even just walking /proc may take "a long time" on busy systems,
and may still turn up with "no users found".

It will take even longer for "force_umount=safe"
(observed 8 to 10 seconds just for "get_pids() with "safe" to return nothing)
than for "force_umount=yes" (still ~ 2 to 3 seconds),
but it will take "a long time" in any case.
(BTW, that may be longer than the hardcoded default of 6 seconds for "fast_stop",
which is also the default on many systems now)

If the dependencies are properly configured,
there should be no users left,
and the umount should just work.

Revert back to "try umount first", and only then try to find "rogue" users.
---
 heartbeat/Filesystem | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
index 4dd962fd9..99bddaf62 100755
--- a/heartbeat/Filesystem
+++ b/heartbeat/Filesystem
@@ -732,6 +732,11 @@ fs_stop() {
        local SUB="$1" timeout=$2 grace_time ret
        grace_time=$((timeout/2))

+        # Just walking /proc may take "a long time", even if we don't find any users of this FS.
+        # If dependencies are properly configured, umount should just work.
+        # Only if that fails, try to find and kill processes that still use it.
+        try_umount "" "$SUB" && return $OCF_SUCCESS
+
        # try gracefully terminating processes for up to half of the configured timeout
        fs_stop_loop "" "$SUB" "$OCF_RESKEY_term_signals" &
        timeout_child $! $grace_time

From b42d698f12aaeb871f4cc6a3c0327a27862b4376 Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Wed, 18 Sep 2024 13:42:38 +0200
Subject: [PATCH 2/2] Filesystem: stop/get_pids to be signaled

The "safe" way to get process ids that may be using a particular filesystem
currently uses shell globs ("find /proc/[0-9]*").
With a million processes (and/or a less capable shell),
that may result in "Argument list too long".

Replace with find /proc -path "/proc/[0-9]*" instead.
While at it, also fix the non-posix -or to be -o,
and add explicit grouping parentheses \( \) and explicit -print.

Add a comment to not include "interesting" characters in mount point names.
---
 heartbeat/Filesystem | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem
index 99bddaf62..3405e2c26 100755
--- a/heartbeat/Filesystem
+++ b/heartbeat/Filesystem
@@ -669,9 +669,26 @@ get_pids()
                        $FUSER -Mm $dir 2>/dev/null
                fi
        elif [ "$FORCE_UNMOUNT" = "safe" ]; then
-                procs=$(find /proc/[0-9]*/ -type l -lname "${dir}/*" -or -lname "${dir}" 2>/dev/null | awk -F/ '{print $3}')
-                mmap_procs=$(grep " ${dir}/" /proc/[0-9]*/maps | awk -F/ '{print $3}')
-                printf "${procs}\n${mmap_procs}" | sort | uniq
+                # Yes, in theory, ${dir} could contain "intersting" characters
+                # and would need to be quoted for glob (find) and regex (grep).
+                # Don't do that, then.
+
+                # Avoid /proc/[0-9]*, it may cause "Argument list too long".
+                # There are several ways to filter for /proc/<pid>
+                # -mindepth 1 -not -path "/proc/[0-9]*" -prune -o ...
+                # -path "/proc/[!0-9]*" -prune -o ...
+                # -path "/proc/[0-9]*" -a ...
+                # the latter seemd to be significantly faster for this one in my naive test.
+                procs=$(exec 2>/dev/null;
+                        find /proc -path "/proc/[0-9]*" -type l \( -lname "${dir}/*" -o -lname "${dir}" \) -print |
+                        awk -F/ '{print $3}' | uniq)
+
+                # This finds both /proc/<pid>/maps and /proc/<pid>/task/<tid>/maps;
+                # if you don't want the latter, add -maxdepth.
+                mmap_procs=$(exec 2>/dev/null;
+                        find /proc -path "/proc/[0-9]*/maps" -print |
+                        xargs -r grep -l " ${dir}/" | awk -F/ '{print $3}' | uniq)
+                printf "${procs}\n${mmap_procs}" | sort -u
        fi
}
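
Editor's note: the same "safe" scan can also be expressed without any shell globbing at all. A Python sketch that inspects each process's open file descriptors (deliberately narrower than the patch, which additionally matches cwd/root/exe symlinks and mapped files):

    import os

    def pids_using(dir_path):
        pids = set()
        for entry in os.scandir("/proc"):
            if not entry.name.isdigit():
                continue
            fd_dir = os.path.join(entry.path, "fd")
            try:
                for fd in os.listdir(fd_dir):
                    target = os.readlink(os.path.join(fd_dir, fd))
                    if target == dir_path or target.startswith(dir_path + "/"):
                        pids.add(int(entry.name))
                        break
            except OSError:  # process exited or fd dir not readable
                continue
        return sorted(pids)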
@@ -0,0 +1,48 @@
From 82958dc115c47232ae0468b1ddf64e728ec325e4 Mon Sep 17 00:00:00 2001
From: Georg Pfuetzenreuter <mail@georg-pfuetzenreuter.net>
Date: Wed, 9 Oct 2024 00:16:44 +0200
Subject: [PATCH] ocf-shellfuncs: systemd_drop_in only if needed

Avoid dbus overload upon many simultaneous "daemon-reload" invocations
(when a resource agent using systemd_drop_in() is called multiple times
as part of parallel resource operations in Pacemaker) by skipping the
file creation and reload if the expected data already exists.

Whilst at it, align the indentation of the heredoc with the other parts
of the function.

Signed-off-by: Georg Pfuetzenreuter <mail@georg-pfuetzenreuter.net>
---
 heartbeat/ocf-shellfuncs.in | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/heartbeat/ocf-shellfuncs.in b/heartbeat/ocf-shellfuncs.in
index 9335cbf00..5c4bb3264 100644
--- a/heartbeat/ocf-shellfuncs.in
+++ b/heartbeat/ocf-shellfuncs.in
@@ -662,14 +662,17 @@ systemd_drop_in()
        systemdrundir="/run/systemd/system/resource-agents-deps.target.d"
        mkdir -p "$systemdrundir"
        conf_file="$systemdrundir/$1.conf"
-        cat >"$conf_file" <<EOF
-[Unit]
-$2=$3
-EOF
-        # The information is accessible through systemd API and systemd would
-        # complain about improper permissions.
-        chmod o+r "$conf_file"
-        systemctl daemon-reload
+        conf_line="$2=$3"
+        if ! { [ -f "$conf_file" ] && grep -q "^$conf_line$" "$conf_file" ; } ; then
+                cat > "$conf_file" <<-EOF
+                        [Unit]
+                        $conf_line
+                EOF
+                # The information is accessible through systemd API and systemd would
+                # complain about improper permissions.
+                chmod o+r "$conf_file"
+                systemctl daemon-reload
+        fi
}

# usage: curl_retry RETRIES SLEEP ARGS URL
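
Editor's note: the idempotency idea above (skip both the write and the daemon-reload when the drop-in already carries the wanted line) in Python form, using the same paths as the shell function; a sketch, not the shipped code:

    import os
    import subprocess

    def systemd_drop_in(name, key, value):
        rundir = "/run/systemd/system/resource-agents-deps.target.d"
        os.makedirs(rundir, exist_ok=True)
        conf_file = os.path.join(rundir, name + ".conf")
        wanted = "[Unit]\n%s=%s\n" % (key, value)
        try:
            with open(conf_file) as f:
                current = f.read()
        except OSError:
            current = ""
        if current != wanted:
            with open(conf_file, "w") as f:
                f.write(wanted)
            os.chmod(conf_file, 0o644)  # systemd wants the file world-readable
            subprocess.run(["systemctl", "daemon-reload"], check=True)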
@@ -0,0 +1,132 @@
From 6fab544e702a7601714cd017aecc00193f23ae72 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Fri, 11 Oct 2024 13:13:10 +0200
Subject: [PATCH] IPaddr2: improve fail logic and check ip_status after adding
 IP

* check that the label got applied
* return OCF_ERR_GENERIC to avoid false-positive when IP was manually added before starting the resource
* check ip_status after adding IP to fail without having to wait for the first monitor-action

Co-authored-by: Evan J. Felix <evan.felix@pnnl.gov>
---
 heartbeat/IPaddr2 | 35 ++++++++++++++++++++++++++---------
 1 file changed, 26 insertions(+), 9 deletions(-)

diff --git a/heartbeat/IPaddr2 b/heartbeat/IPaddr2
index e325aa574..27cae2d11 100755
--- a/heartbeat/IPaddr2
+++ b/heartbeat/IPaddr2
@@ -586,7 +586,7 @@ ip_init() {
                exit $rc
                fi
        fi
-
+
        SENDARPPIDFILE="$SENDARPPIDDIR/send_arp-$OCF_RESKEY_ip"

        if [ -n "$IFLABEL" ]; then
@@ -985,6 +985,7 @@ run_send_ua() {
#        ok = served (for CIP: + hash bucket)
#        partial = served and no hash bucket (CIP only)
#        partial2 = served and no CIP iptables rule
+#        partial3 = served with no label
#        no = nothing
#
ip_served() {
@@ -1002,6 +1003,11 @@ ip_served() {

        if [ -z "$IP_CIP" ]; then
                for i in $cur_nic; do
+                        # check address label
+                        if [ -n "$IFLABEL" ] && [ -z "`$IP2UTIL -o -f $FAMILY addr show $nic label $IFLABEL`" ]; then
+                                echo partial3
+                                return 0
+                        fi
                        # only mark as served when on the same interfaces as $NIC
                        [ "$i" = "$NIC" ] || continue
                        echo "ok"
@@ -1065,7 +1071,12 @@ ip_start() {
        if [ "$ip_status" = "ok" ]; then
                exit $OCF_SUCCESS
        fi
-
+
+        if [ "$ip_status" = "partial3" ]; then
+                ocf_exit_reason "IP $OCF_RESKEY_ip available, but label missing"
+                exit $OCF_ERR_GENERIC
+        fi
+
        if [ -n "$IP_CIP" ] && ([ $ip_status = "no" ] || [ $ip_status = "partial2" ]); then
                $MODPROBE ip_conntrack
                $IPADDR2_CIP_IPTABLES -I INPUT -d $OCF_RESKEY_ip -i $NIC -j CLUSTERIP \
@@ -1083,7 +1094,7 @@ ip_start() {
        if [ -n "$IP_CIP" ] && [ $ip_status = "partial" ]; then
                echo "+$IP_INC_NO" >$IP_CIP_FILE
        fi
-
+
        if [ "$ip_status" = "no" ]; then
                if ocf_is_true ${OCF_RESKEY_lvs_support}; then
                        for i in `find_interface $OCF_RESKEY_ip 32`; do
@@ -1094,7 +1105,7 @@ ip_start() {
                        esac
                        done
                fi
-
+
                add_interface "$OCF_RESKEY_ip" "$NETMASK" "${BRDCAST:-none}" "$NIC" "$IFLABEL" "$METRIC"
                rc=$?

@@ -1102,6 +1113,12 @@ ip_start() {
                        ocf_exit_reason "Failed to add $OCF_RESKEY_ip"
                        exit $rc
                fi
+
+                ip_status=`ip_served`
+                if [ "$ip_status" != "ok" ]; then
+                        ocf_exit_reason "Failed to add $OCF_RESKEY_ip with error $ip_status"
+                        exit $OCF_ERR_GENERIC
+                fi
        fi

        case $NIC in
@@ -1134,7 +1151,7 @@ ip_stop() {
        ocf_take_lock $CIP_lockfile
        ocf_release_lock_on_exit $CIP_lockfile
        fi
-
+
        if [ -f "$SENDARPPIDFILE" ] ; then
        kill `cat "$SENDARPPIDFILE"`
        if [ $? -ne 0 ]; then
@@ -1171,17 +1188,17 @@ ip_stop() {
                i=`expr $i + 1`
        done
        else
-        ip_del_if="no"
+                ip_del_if="no"
        fi
        fi
-
+
        if [ "$ip_del_if" = "yes" ]; then
        delete_interface $OCF_RESKEY_ip $NIC $NETMASK
        if [ $? -ne 0 ]; then
                ocf_exit_reason "Unable to remove IP [${OCF_RESKEY_ip} from interface [ $NIC ]"
                exit $OCF_ERR_GENERIC
        fi
-
+
        if ocf_is_true ${OCF_RESKEY_lvs_support}; then
                restore_loopback "$OCF_RESKEY_ip"
        fi
@@ -1200,7 +1217,7 @@ ip_monitor() {
        run_arp_sender refresh
        return $OCF_SUCCESS
        ;;
-        partial|no|partial2)
+        no)
        exit $OCF_NOT_RUNNING
        ;;
        *)
@@ -0,0 +1,23 @@
From eac983c14f4695f491fe430a78d8d18a1481c60c Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Wed, 29 Oct 2025 15:15:54 +0100
Subject: [PATCH] oracle: improve monpassword description

---
 heartbeat/oracle | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/heartbeat/oracle b/heartbeat/oracle
index 8cf4e3649c..c85e499833 100755
--- a/heartbeat/oracle
+++ b/heartbeat/oracle
@@ -132,8 +132,7 @@ that the password for this user does not expire.
<longdesc lang="en">
Password for the monitoring user. Make sure
that the password for this user does not expire.
-Need to explicitly set a password to a new monitor
-user for the security reason.
+Set to avoid using the agents default password for "monuser".
</longdesc>
<shortdesc lang="en">monpassword</shortdesc>
<content type="string" default="$OCF_RESKEY_monpassword_default" />
SOURCES/RHEL-68739-awsvip-add-interface-parameter.patch (new file, 184 lines)
@@ -0,0 +1,184 @@
|
|||||||
|
From 392d40048a25d7cb73ec5b5e9f7a5862f7a3fd48 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||||
|
Date: Mon, 11 Nov 2024 12:22:27 +0100
|
||||||
|
Subject: [PATCH 1/2] aws.sh: add get_interface_mac()
|
||||||
|
|
||||||
|
---
|
||||||
|
heartbeat/aws.sh | 21 +++++++++++++++++++++
|
||||||
|
1 file changed, 21 insertions(+)
|
||||||
|
|
||||||
|
diff --git a/heartbeat/aws.sh b/heartbeat/aws.sh
|
||||||
|
index 64f2e13a7..ebb4eb1f4 100644
|
||||||
|
--- a/heartbeat/aws.sh
|
||||||
|
+++ b/heartbeat/aws.sh
|
||||||
|
@@ -69,3 +69,24 @@ get_instance_id() {
|
||||||
|
echo "$INSTANCE_ID"
|
||||||
|
return "$OCF_SUCCESS"
|
||||||
|
}
|
||||||
|
+
|
||||||
|
+get_interface_mac() {
|
||||||
|
+ local MAC_FILE MAC_ADDR rc
|
||||||
|
+ MAC_FILE="/sys/class/net/${OCF_RESKEY_interface}/address"
|
||||||
|
+ if [ -f "$MAC_FILE" ]; then
|
||||||
|
+ cmd="cat ${MAC_FILE}"
|
||||||
|
+ else
|
||||||
|
+ cmd="ip -br link show dev ${OCF_RESKEY_interface} | tr -s ' ' | cut -d' ' -f3"
|
||||||
|
+ fi
|
||||||
|
+ ocf_log debug "executing command: $cmd"
|
||||||
|
+ MAC_ADDR="$(eval $cmd)"
|
||||||
|
+ rc=$?
|
||||||
|
+ if [ $rc != 0 ]; then
|
||||||
|
+ ocf_log warn "command failed, rc: $rc"
|
||||||
|
+ return $OCF_ERR_GENERIC
|
||||||
|
+ fi
|
||||||
|
+ ocf_log debug "MAC address associated with interface ${OCF_RESKEY_interface}: ${MAC_ADDR}"
|
||||||
|
+
|
||||||
|
+ echo $MAC_ADDR
|
||||||
|
+ return $OCF_SUCCESS
|
||||||
|
+}
|
||||||
|
|
||||||
|
From 87337ac4da931d5a53c83d53d4bab17ee123ba9f Mon Sep 17 00:00:00 2001
|
||||||
|
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||||
|
Date: Mon, 11 Nov 2024 12:26:38 +0100
|
||||||
|
Subject: [PATCH 2/2] awsvip: let user specify which interface to use, and make
|
||||||
|
the parameter optional in aws-vpc-move-ip
|
||||||
|
|
||||||
|
---
|
||||||
|
heartbeat/aws-vpc-move-ip | 20 ++++----------------
|
||||||
|
heartbeat/aws.sh | 4 +++-
|
||||||
|
heartbeat/awsvip | 24 +++++++++++++++++-------
|
||||||
|
3 files changed, 24 insertions(+), 24 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/heartbeat/aws-vpc-move-ip b/heartbeat/aws-vpc-move-ip
|
||||||
|
index 09ae68b57..2afc0ba53 100755
|
||||||
|
--- a/heartbeat/aws-vpc-move-ip
|
||||||
|
+++ b/heartbeat/aws-vpc-move-ip
|
||||||
|
@@ -157,7 +157,7 @@ Role to use to query/update the route table
|
||||||
|
<content type="string" default="${OCF_RESKEY_routing_table_role_default}" />
|
||||||
|
</parameter>
|
||||||
|
|
||||||
|
-<parameter name="interface" required="1">
|
||||||
|
+<parameter name="interface" required="0">
|
||||||
|
<longdesc lang="en">
|
||||||
|
Name of the network interface, i.e. eth0
|
||||||
|
</longdesc>
|
||||||
|
@@ -321,7 +321,7 @@ ec2ip_monitor() {
|
||||||
|
ocf_log debug "monitor: Enhanced Monitoring disabled - omitting API call"
|
||||||
|
fi
|
||||||
|
|
||||||
|
- cmd="ip addr show to $OCF_RESKEY_ip up"
|
||||||
|
+ cmd="ip addr show dev $OCF_RESKEY_interface to $OCF_RESKEY_ip up"
|
||||||
|
ocf_log debug "executing command: $cmd"
|
||||||
|
RESULT=$($cmd | grep "$OCF_RESKEY_ip")
|
||||||
|
if [ -z "$RESULT" ]; then
|
||||||
|
@@ -331,7 +331,7 @@ ec2ip_monitor() {
|
||||||
|
level="info"
|
||||||
|
fi
|
||||||
|
|
||||||
|
- ocf_log "$level" "IP $OCF_RESKEY_ip not assigned to running interface"
|
||||||
|
+ ocf_log "$level" "IP $OCF_RESKEY_ip not assigned to interface $OCF_RESKEY_interface"
|
||||||
|
return $OCF_NOT_RUNNING
|
||||||
|
fi
|
||||||
|
|
||||||
|
@@ -369,19 +369,7 @@ ec2ip_drop() {
|
||||||
|
}
|
||||||
|
|
||||||
|
ec2ip_get_instance_eni() {
|
||||||
|
- MAC_FILE="/sys/class/net/${OCF_RESKEY_interface}/address"
|
||||||
|
- if [ -f $MAC_FILE ]; then
|
||||||
|
- cmd="cat ${MAC_FILE}"
|
||||||
|
- else
|
||||||
|
- cmd="ip -br link show dev ${OCF_RESKEY_interface} | tr -s ' ' | cut -d' ' -f3"
|
||||||
|
- fi
|
||||||
|
- ocf_log debug "executing command: $cmd"
|
||||||
|
- MAC_ADDR="$(eval $cmd)"
|
||||||
|
- rc=$?
|
||||||
|
- if [ $rc != 0 ]; then
|
||||||
|
- ocf_log warn "command failed, rc: $rc"
|
||||||
|
- return $OCF_ERR_GENERIC
|
||||||
|
- fi
|
||||||
|
+ MAC_ADDR=$(get_interface_mac)
|
||||||
|
ocf_log debug "MAC address associated with interface ${OCF_RESKEY_interface}: ${MAC_ADDR}"
|
||||||
|
|
||||||
|
cmd="curl_retry \"$OCF_RESKEY_curl_retries\" \"$OCF_RESKEY_curl_sleep\" \"--show-error -s -H 'X-aws-ec2-metadata-token: $TOKEN'\" \"http://169.254.169.254/latest/meta-data/network/interfaces/macs/${MAC_ADDR}/interface-id\""
diff --git a/heartbeat/aws.sh b/heartbeat/aws.sh
index ebb4eb1f4..216033afe 100644
--- a/heartbeat/aws.sh
+++ b/heartbeat/aws.sh
@@ -73,7 +73,9 @@ get_instance_id() {
get_interface_mac() {
local MAC_FILE MAC_ADDR rc
MAC_FILE="/sys/class/net/${OCF_RESKEY_interface}/address"
- if [ -f "$MAC_FILE" ]; then
+ if [ -z "$OCF_RESKEY_interface" ]; then
+ cmd="curl_retry \"$OCF_RESKEY_curl_retries\" \"$OCF_RESKEY_curl_sleep\" \"--show-error -s -H 'X-aws-ec2-metadata-token: $TOKEN'\" \"http://169.254.169.254/latest/meta-data/mac\""
+ elif [ -f "$MAC_FILE" ]; then
cmd="cat ${MAC_FILE}"
else
cmd="ip -br link show dev ${OCF_RESKEY_interface} | tr -s ' ' | cut -d' ' -f3"
diff --git a/heartbeat/awsvip b/heartbeat/awsvip
index 0856ac5e4..015180d5a 100755
--- a/heartbeat/awsvip
+++ b/heartbeat/awsvip
@@ -49,12 +49,14 @@ OCF_RESKEY_auth_type_default="key"
OCF_RESKEY_profile_default="default"
OCF_RESKEY_region_default=""
OCF_RESKEY_api_delay_default="3"
+OCF_RESKEY_interface_default=""

: ${OCF_RESKEY_awscli=${OCF_RESKEY_awscli_default}}
: ${OCF_RESKEY_auth_type=${OCF_RESKEY_auth_type_default}}
: ${OCF_RESKEY_profile=${OCF_RESKEY_profile_default}}
: ${OCF_RESKEY_region=${OCF_RESKEY_region_default}}
: ${OCF_RESKEY_api_delay=${OCF_RESKEY_api_delay_default}}
+: ${OCF_RESKEY_interface=${OCF_RESKEY_interface_default}}

meta_data() {
cat <<END
@@ -125,6 +127,14 @@ a short delay between API calls, to avoid sending API too quick
<content type="integer" default="${OCF_RESKEY_api_delay_default}" />
</parameter>

+<parameter name="interface" required="0">
+<longdesc lang="en">
+Name of the network interface, i.e. eth0
+</longdesc>
+<shortdesc lang="en">network interface name</shortdesc>
+<content type="string" default="${OCF_RESKEY_interface_default}" />
+</parameter>
+
<parameter name="curl_retries" unique="0">
<longdesc lang="en">
curl retries before failing
@@ -207,16 +217,16 @@ awsvip_stop() {
}

awsvip_monitor() {
- $AWSCLI_CMD ec2 describe-instances \
- --instance-id "${INSTANCE_ID}" \
- --query 'Reservations[].Instances[].NetworkInterfaces[].PrivateIpAddresses[].PrivateIpAddress[]' \
+ $AWSCLI_CMD ec2 describe-network-interfaces \
+ --network-interface-ids "${NETWORK_ID}" \
+ --query 'NetworkInterfaces[].PrivateIpAddresses[].PrivateIpAddress[]' \
--output text | \
grep -qE "(^|\s)${SECONDARY_PRIVATE_IP}(\s|$)"
- RET=$?
-
- if [ $RET -ne 0 ]; then
+ if [ $? -ne 0 ]; then
+ [ "$__OCF_ACTION" = "monitor" ] && ! ocf_is_probe && ocf_log error "IP $SECONDARY_PRIVATE_IP not assigned to interface ${NETWORK_ID}"
return $OCF_NOT_RUNNING
fi
+
return $OCF_SUCCESS
}

@@ -267,7 +277,7 @@ TOKEN=$(get_token)
[ $? -ne 0 ] && exit $OCF_ERR_GENERIC
INSTANCE_ID=$(get_instance_id)
[ $? -ne 0 ] && exit $OCF_ERR_GENERIC
-MAC_ADDRESS=$(curl_retry "$OCF_RESKEY_curl_retries" "$OCF_RESKEY_curl_sleep" "--show-error -s -H 'X-aws-ec2-metadata-token: $TOKEN'" "http://169.254.169.254/latest/meta-data/mac")
+MAC_ADDRESS=$(get_interface_mac)
[ $? -ne 0 ] && exit $OCF_ERR_GENERIC
NETWORK_ID=$(curl_retry "$OCF_RESKEY_curl_retries" "$OCF_RESKEY_curl_sleep" "--show-error -s -H 'X-aws-ec2-metadata-token: $TOKEN'" "http://169.254.169.254/latest/meta-data/network/interfaces/macs/${MAC_ADDRESS}/interface-id")
[ $? -ne 0 ] && exit $OCF_ERR_GENERIC
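Taken together, the aws.sh and awsvip hunks above change how the agent resolves the MAC address: the new optional interface parameter is honored when set, and when it is empty the agent falls back to asking the EC2 instance metadata service for the primary MAC. A minimal standalone sketch of that lookup order, using plain curl in place of the agent's curl_retry wrapper (IMDSv2 token flow assumed):

    # Sketch only: mirrors the lookup order of the patched get_interface_mac().
    TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" \
        -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")
    if [ -z "$OCF_RESKEY_interface" ]; then
        # no interface configured: ask IMDS for the instance's primary MAC
        MAC_ADDR=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" \
            "http://169.254.169.254/latest/meta-data/mac")
    elif [ -f "/sys/class/net/${OCF_RESKEY_interface}/address" ]; then
        # interface exists locally: read the MAC straight from sysfs
        MAC_ADDR=$(cat "/sys/class/net/${OCF_RESKEY_interface}/address")
    else
        # last resort: parse brief ip-link output
        MAC_ADDR=$(ip -br link show dev "$OCF_RESKEY_interface" | tr -s ' ' | cut -d' ' -f3)
    fi

The awsvip_monitor change follows the same idea: describe-network-interfaces addresses the ENI directly by ID, scoping the query to the one interface the agent manages instead of enumerating every interface on the instance.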
@ -0,0 +1,37 @@
From d0d2a0ff92dd23ee36cb57324c1eeaa3daed65bc Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Tue, 4 Feb 2025 16:13:27 +0100
Subject: [PATCH] findif.sh: fix to avoid duplicate route issues

---
heartbeat/findif.sh | 14 +++++---------
1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/heartbeat/findif.sh b/heartbeat/findif.sh
index 2ae91e958..6fb47110c 100644
--- a/heartbeat/findif.sh
+++ b/heartbeat/findif.sh
@@ -217,18 +217,14 @@ findif()
fi
if [ -n "$nic" ] ; then
# NIC supports more than two.
- routematch=$(ip -o -f $family route list match $match $proto $scope | grep -v "^\(unreachable\|prohibit\|blackhole\)" | grep "dev $nic " | sed -e 's,^\([0-9.]\+\) ,\1/32 ,;s,^\([0-9a-f:]\+\) ,\1/128 ,' | sort -t/ -k2,2nr)
+ routematch=$(ip -o -f $family route list match $match $proto $scope | grep "dev $nic " | sed -e 's,^\([0-9.]\+\) ,\1/32 ,;s,^\([0-9a-f:]\+\) ,\1/128 ,' | sort -t/ -k2,2nr)
else
- routematch=$(ip -o -f $family route list match $match $proto $scope | grep -v "^\(unreachable\|prohibit\|blackhole\)" | sed -e 's,^\([0-9.]\+\) ,\1/32 ,;s,^\([0-9a-f:]\+\) ,\1/128 ,' | sort -t/ -k2,2nr)
- fi
- if [ "$family" = "inet6" ]; then
- routematch=$(echo "$routematch" | grep -v "^default")
+ routematch=$(ip -o -f $family route list match $match $proto $scope | sed -e 's,^\([0-9.]\+\) ,\1/32 ,;s,^\([0-9a-f:]\+\) ,\1/128 ,' | sort -t/ -k2,2nr)
fi

- if [ $(echo "$routematch" | wc -l) -gt 1 ]; then
- ocf_exit_reason "More than 1 routes match $match. Unable to decide which route to use."
- return $OCF_ERR_GENERIC
- fi
+ # ignore matches from unrelated tables, and sort by metric to get the route with the lowest metric
+ routematch=$(echo "$routematch" | awk '!/^(default|unreachable|prohibit|blackhole)/{match($0, /metric ([^ ]+)/, arr); print arr[1], $0}' | sort -k 1n -u | cut -d" " -f 2- | head -1)
+
set -- $routematch
if [ $# = 0 ] ; then
case $OCF_RESKEY_ip in
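The new routematch pipeline filters out default/unreachable/prohibit/blackhole entries and then, instead of refusing to act when several routes match, prefixes each candidate with its metric and keeps the lowest one. Note that the three-argument match() is a GNU awk extension. A rough illustration on canned route output (addresses and devices hypothetical):

    # Pick the matching route with the lowest metric, as the patched findif() does.
    # Requires gawk for match(string, regex, array).
    printf '%s\n' \
        '192.168.1.0/24 dev eth1 proto kernel scope link metric 200' \
        '192.168.1.0/24 dev eth0 proto kernel scope link metric 100' |
    awk '!/^(default|unreachable|prohibit|blackhole)/ {
            match($0, /metric ([^ ]+)/, arr); print arr[1], $0
         }' | sort -k 1n -u | cut -d" " -f 2- | head -1
    # prints the eth0 route (metric 100)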
@ -0,0 +1,23 @@
From a1e22c5c612f369bac0830588642560dcea92e7c Mon Sep 17 00:00:00 2001
From: Fujii Masao <fujii@postgresql.org>
Date: Sat, 9 Nov 2024 02:33:37 +0900
Subject: [PATCH] Remove unused macro variables from storage_mon.c.

---
tools/storage_mon.c | 3 ---
1 file changed, 3 deletions(-)

diff --git a/tools/storage_mon.c b/tools/storage_mon.c
index f94268f6f..2519a9e72 100644
--- a/tools/storage_mon.c
+++ b/tools/storage_mon.c
@@ -33,9 +33,6 @@
#define DEFAULT_PIDFILE HA_VARRUNDIR "storage_mon.pid"
#define DEFAULT_ATTRNAME "#health-storage_mon"
#define SMON_GET_RESULT_COMMAND "get_check_value"
-#define SMON_RESULT_OK "green"
-#define SMON_RESULT_NG "red"
-#define SMON_RESULT_COMMAND_ERROR "unknown command"
#define SMON_BUFF_1MEG 1048576
#define SMON_MAX_IPCSNAME 256
#define SMON_MAX_MSGSIZE 128
@ -0,0 +1,79 @@
From 46715c638829598d949dffab0898fe4c07074895 Mon Sep 17 00:00:00 2001
From: Hideo Yamauchi <renayama19661014@ybb.ne.jp>
Date: Thu, 21 Nov 2024 15:21:19 +0900
Subject: [PATCH 1/2] High: storage-mon: Correct the timing of setting
notification values to storage-mon(RA) clients.

---
tools/storage_mon.c | 17 ++++++++---------
1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/tools/storage_mon.c b/tools/storage_mon.c
index 2519a9e72..27d2ff1d1 100644
--- a/tools/storage_mon.c
+++ b/tools/storage_mon.c
@@ -320,7 +320,14 @@ static int32_t sigchld_handler(int32_t sig, void *data)

finished_count++;
test_forks[index] = 0;
-
+
+ /* Update the result value for the client response once all checks have completed. */
+ if (device_count == finished_count) {
+ response_final_score = final_score;
+ if (!daemon_check_first_all_devices) {
+ daemon_check_first_all_devices = TRUE;
+ }
+ }
}
}
} else {
@@ -441,15 +448,7 @@ static int test_device_main(gpointer data)
if (is_child_runnning()) {
device_check = FALSE;
}
-
- if (device_count == finished_count && device_check) {
- /* Update the result value for the client response once all checks have completed. */
- response_final_score = final_score;

- if (!daemon_check_first_all_devices) {
- daemon_check_first_all_devices = TRUE;
- }
- }
}

if (device_check) {

From 1201390fb219d1b566c5d31463daacef60c31ab4 Mon Sep 17 00:00:00 2001
From: Hideo Yamauchi <renayama19661014@ybb.ne.jp>
Date: Thu, 21 Nov 2024 15:43:33 +0900
Subject: [PATCH 2/2] Mid: storage-mon RA: Wait until monitor confirms the
startup pid according to the OCF resource specification.

---
heartbeat/storage-mon.in | 11 +++++++++++
1 file changed, 11 insertions(+)

diff --git a/heartbeat/storage-mon.in b/heartbeat/storage-mon.in
index 284dec30f..7c9943d4f 100644
--- a/heartbeat/storage-mon.in
+++ b/heartbeat/storage-mon.in
@@ -325,6 +325,17 @@ storage-mon_start() {
if [ "$?" -ne 0 ]; then
return $OCF_ERR_GENERIC
fi
+
+ #Wait until monitor confirms the startup pid according to the ocf resource specification.
+ while true; do
+ storage-mon_monitor pid_check_only
+ rc="$?"
+ if [ $rc -eq $OCF_SUCCESS ]; then
+ break
+ fi
+ sleep 1
+ ocf_log debug "storage-mon daemon still hasn't started yet. Waiting..."
+ done
fi
}

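PATCH 2/2 applies the standard OCF idiom for start actions: start must not return until a subsequent monitor would report the resource as running, so the agent polls its own monitor function until the daemon's pid is confirmed. In generic form (a sketch, assuming the usual ocf-shellfuncs environment for $OCF_SUCCESS and ocf_log; my_monitor is a hypothetical stand-in for the agent's monitor entry point, and Pacemaker's action timeout bounds the loop):

    # Block until the freshly started daemon passes a pid check.
    while true; do
        my_monitor pid_check_only
        rc="$?"
        [ "$rc" -eq "$OCF_SUCCESS" ] && break
        ocf_log debug "daemon still hasn't started yet. Waiting..."
        sleep 1
    done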
@ -0,0 +1,148 @@
From b72b329a45c058fda720c6739f881b9597fc8b30 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Thu, 23 Jan 2025 16:18:20 +0100
Subject: [PATCH] storage-mon: replace dashes with underscores in functions

Dashes in function names produce "`storage-mon_usage': not a valid identifier"
error when run with sh -x.
---
heartbeat/storage-mon.in | 44 ++++++++++++++++++++--------------------
1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/heartbeat/storage-mon.in b/heartbeat/storage-mon.in
index 7c9943d4f..5edb96979 100644
--- a/heartbeat/storage-mon.in
+++ b/heartbeat/storage-mon.in
@@ -152,7 +152,7 @@ END

#######################################################################

-storage-mon_usage() {
+storage_mon_usage() {
cat <<END
usage: $0 {start|stop|monitor|validate-all|meta-data}

@@ -161,7 +161,7 @@ END
return $1
}

-storage-mon_init() {
+storage_mon_init() {
#Test for presence of storage_mon helper
if [ ! -x "$STORAGEMON" ] ; then
ocf_log err "${STORAGEMON} not installed."
@@ -205,7 +205,7 @@ storage-mon_init() {
fi
}

-storage-mon_update_attribute() {
+storage_mon_update_attribute() {

while :
do
@@ -224,9 +224,9 @@ storage-mon_update_attribute() {
return $OCF_SUCCESS
}

-storage-mon_monitor() {
+storage_mon_monitor() {
if ! ocf_is_true "$OCF_RESKEY_daemonize"; then
- storage-mon_init
+ storage_mon_init

# Monitor _MUST!_ differentiate correctly between running
# (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING).
@@ -252,7 +252,7 @@ storage-mon_monitor() {
status="green"
fi

- storage-mon_update_attribute $status
+ storage_mon_update_attribute $status
return "$?"
else
ocf_pidfile_status "${PIDFILE}" > /dev/null 2>&1
@@ -298,20 +298,20 @@ storage-mon_monitor() {
esac
done

- storage-mon_update_attribute $status
+ storage_mon_update_attribute $status
return "$?"
fi
}

-storage-mon_start() {
+storage_mon_start() {
if ! ocf_is_true "$OCF_RESKEY_daemonize"; then
- storage-mon_monitor
+ storage_mon_monitor
if [ $? -eq $OCF_SUCCESS ]; then
return $OCF_SUCCESS
fi
touch "${OCF_RESKEY_state_file}"
else
- storage-mon_init
+ storage_mon_init
# generate command line
cmdline=""
for DRIVE in ${OCF_RESKEY_drives}; do
@@ -328,7 +328,7 @@ storage-mon_start() {

#Wait until monitor confirms the startup pid according to the ocf resource specification.
while true; do
- storage-mon_monitor pid_check_only
+ storage_mon_monitor pid_check_only
rc="$?"
if [ $rc -eq $OCF_SUCCESS ]; then
break
@@ -339,8 +339,8 @@ storage-mon_start() {
fi
}

-storage-mon_stop() {
- storage-mon_monitor
+storage_mon_stop() {
+ storage_mon_monitor
rc=$?

if ! ocf_is_true "$OCF_RESKEY_daemonize"; then
@@ -363,7 +363,7 @@ storage-mon_stop() {
fi

while true; do
- storage-mon_monitor pid_check_only
+ storage_mon_monitor pid_check_only
rc="$?"
case "$rc" in
$OCF_SUCCESS)
@@ -379,8 +379,8 @@ storage-mon_stop() {
return $OCF_SUCCESS
}

-storage-mon_validate() {
- storage-mon_init
+storage_mon_validate() {
+ storage_mon_init

if ! ocf_is_true "$OCF_RESKEY_daemonize"; then
# Is the state directory writable?
@@ -396,13 +396,13 @@ storage-mon_validate() {
}

case "$__OCF_ACTION" in
- start) storage-mon_start;;
- stop) storage-mon_stop;;
- monitor) storage-mon_monitor;;
- validate-all) storage-mon_validate;;
+ start) storage_mon_start;;
+ stop) storage_mon_stop;;
+ monitor) storage_mon_monitor;;
+ validate-all) storage_mon_validate;;
meta-data) meta_data;;
- usage|help) storage-mon_usage $OCF_SUCCESS;;
- *) storage-mon_usage $OCF_ERR_UNIMPLEMENTED;;
+ usage|help) storage_mon_usage $OCF_SUCCESS;;
+ *) storage_mon_usage $OCF_ERR_UNIMPLEMENTED;;
esac
rc=$?
ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
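The motivation for the rename is that '-' is not valid in a POSIX shell identifier: bash happens to accept function names containing dashes, but stricter shells reject them outright, which is what surfaces when the script is traced with sh -x. A quick reproduction (exact error wording varies by shell; dash reports a bad function name):

    # Fails under dash/POSIX sh:
    sh -c 'storage-mon_usage() { echo hi; }; storage-mon_usage'
    # Works everywhere:
    sh -c 'storage_mon_usage() { echo hi; }; storage_mon_usage'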
@ -0,0 +1,25 @@
From c6f520344e830a7c946b2222f9f251be038b1b28 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Fri, 24 Jan 2025 10:01:30 +0100
Subject: [PATCH] storage-mon: check if daemon is already running during
start-action

---
heartbeat/storage-mon.in | 4 ++++
1 file changed, 4 insertions(+)

diff --git a/heartbeat/storage-mon.in b/heartbeat/storage-mon.in
index 5edb96979..00e42f68d 100644
--- a/heartbeat/storage-mon.in
+++ b/heartbeat/storage-mon.in
@@ -311,6 +311,10 @@ storage_mon_start() {
fi
touch "${OCF_RESKEY_state_file}"
else
+ storage_mon_monitor pid_check_only
+ if [ $? -eq $OCF_SUCCESS ]; then
+ return $OCF_SUCCESS
+ fi
storage_mon_init
# generate command line
cmdline=""
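This makes the start action idempotent, as the OCF spec expects: the pidfile is probed before the daemon is spawned, so a second start on a node where storage_mon already runs returns success instead of forking a duplicate. The shape of the guard in isolation (sketch; same hypothetical my_monitor stand-in as above):

    # Idempotent start: succeed early if a previous instance is still up.
    my_monitor pid_check_only
    if [ $? -eq "$OCF_SUCCESS" ]; then
        return "$OCF_SUCCESS"
    fi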
@ -0,0 +1,22 @@
From de51a1705ce761f1fb5f1b2294cfc1153af70c1c Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Mon, 27 Jan 2025 09:54:06 +0100
Subject: [PATCH] storage-mon: log "storage_mon is already running" in
start-action

---
heartbeat/storage-mon.in | 1 +
1 file changed, 1 insertion(+)

diff --git a/heartbeat/storage-mon.in b/heartbeat/storage-mon.in
index 00e42f68d..d60db4ad4 100644
--- a/heartbeat/storage-mon.in
+++ b/heartbeat/storage-mon.in
@@ -313,6 +313,7 @@ storage_mon_start() {
else
storage_mon_monitor pid_check_only
if [ $? -eq $OCF_SUCCESS ]; then
+ ocf_log info "storage_mon is already running. PID=`cat $PIDFILE`"
return $OCF_SUCCESS
fi
storage_mon_init
@ -0,0 +1,118 @@
From 4a228f3d8212368124134c01f958ac43e32cec08 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Mon, 7 Apr 2025 09:19:37 +0200
Subject: [PATCH] IPaddr2: add link status DOWN/LOWERLAYERDOWN check

---
heartbeat/IPaddr2 | 42 +++++++++++++++++++++++++++++++++++++++++-
1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/heartbeat/IPaddr2 b/heartbeat/IPaddr2
index cf03e4426..230ac853c 100755
--- a/heartbeat/IPaddr2
+++ b/heartbeat/IPaddr2
@@ -92,6 +92,19 @@ OCF_RESKEY_nodad_default=false
OCF_RESKEY_noprefixroute_default="false"
OCF_RESKEY_preferred_lft_default="forever"
OCF_RESKEY_network_namespace_default=""
+OCF_RESKEY_check_link_status_default="true"
+
+# RHEL specific defaults
+if is_redhat_based; then
+ get_os_ver
+ ocf_version_cmp "$VER" "10.1" 2>/dev/null
+
+ case "$?" in
+ # RHEL < 10.1
+ 0)
+ OCF_RESKEY_check_link_status_default="false";;
+ esac
+fi

: ${OCF_RESKEY_ip=${OCF_RESKEY_ip_default}}
: ${OCF_RESKEY_cidr_netmask=${OCF_RESKEY_cidr_netmask_default}}
@@ -116,6 +129,7 @@ OCF_RESKEY_network_namespace_default=""
: ${OCF_RESKEY_noprefixroute=${OCF_RESKEY_noprefixroute_default}}
: ${OCF_RESKEY_preferred_lft=${OCF_RESKEY_preferred_lft_default}}
: ${OCF_RESKEY_network_namespace=${OCF_RESKEY_network_namespace_default}}
+: ${OCF_RESKEY_check_link_status=${OCF_RESKEY_check_link_status_default}}

#######################################################################

@@ -449,6 +463,14 @@ the namespace.
<shortdesc lang="en">Network namespace to use</shortdesc>
<content type="string" default="${OCF_RESKEY_network_namespace_default}"/>
</parameter>
+
+<parameter name="check_link_status">
+<longdesc lang="en">
+Consider the resource failed if the interface has status DOWN or LOWERLAYERDOWN.
+</longdesc>
+<shortdesc lang="en">Consider the resource failed if the interface has status DOWN or LOWERLAYERDOWN</shortdesc>
+<content type="string" default="${OCF_RESKEY_check_link_status_default}"/>
+</parameter>
</parameters>

<actions>
@@ -581,6 +603,9 @@ ip_init() {
elif [ "$__OCF_ACTION" = stop ]; then
ocf_log warn "[$FINDIF] failed"
exit $OCF_SUCCESS
+ elif [ "$__OCF_ACTION" = start ]; then
+ ocf_exit_reason "[$FINDIF] failed"
+ exit $OCF_ERR_INSTALLED
else
ocf_exit_reason "[$FINDIF] failed"
exit $rc
@@ -1002,6 +1027,12 @@ ip_served() {
return 0
fi

+ if ocf_is_true "$OCF_RESKEY_check_link_status" && $IP2UTIL -f $FAMILY addr show $cur_nic | \
+ grep -q "[[:space:]]\(DOWN\|LOWERLAYERDOWN\)[[:space:]]"; then
+ echo "down"
+ return 0
+ fi
+
if [ -z "$IP_CIP" ]; then
for i in $cur_nic; do
# check address label
@@ -1073,6 +1104,11 @@ ip_start() {
exit $OCF_SUCCESS
fi

+ if [ "$ip_status" = "down" ]; then
+ ocf_exit_reason "IP $OCF_RESKEY_ip available, but device has status $ip_status"
+ exit $OCF_ERR_INSTALLED
+ fi
+
if [ "$ip_status" = "partial3" ]; then
ocf_exit_reason "IP $OCF_RESKEY_ip available, but label missing"
exit $OCF_ERR_GENERIC
@@ -1096,7 +1132,7 @@ ip_start() {
echo "+$IP_INC_NO" >$IP_CIP_FILE
fi

- if [ "$ip_status" = "no" ]; then
+ if [ "$ip_status" != "ok" ]; then
if ocf_is_true ${OCF_RESKEY_lvs_support}; then
for i in `find_interface $OCF_RESKEY_ip 32`; do
case $i in
@@ -1213,6 +1249,7 @@ ip_monitor() {
# interface health maybe via a daemon like FailSafe etc...

local ip_status=`ip_served`
+ ocf_log debug "monitor: $ip_status"
case $ip_status in
ok)
run_arp_sender refresh
@@ -1221,6 +1258,9 @@ ip_monitor() {
no)
exit $OCF_NOT_RUNNING
;;
+ down)
+ exit $OCF_ERR_INSTALLED
+ ;;
*)
# Errors on this interface?
return $OCF_ERR_GENERIC
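Two details in this patch are worth calling out. First, a link in DOWN or LOWERLAYERDOWN operational state is now reported as a distinct "down" status and mapped to OCF_ERR_INSTALLED, so the cluster treats it as a hard local failure and moves the address elsewhere rather than restarting it in place. Second, the default is version-gated with ocf_version_cmp, so the new check stays disabled by default on RHEL releases older than 10.1. The probe itself boils down to one pipeline (sketch; eth0 is a hypothetical interface, and the agent itself goes through its $IP2UTIL wrapper rather than bare ip):

    # Detect a down link the way the patched ip_served() does.
    nic=eth0
    if ip addr show "$nic" |
            grep -q "[[:space:]]\(DOWN\|LOWERLAYERDOWN\)[[:space:]]"; then
        echo "link on $nic is down"
    fi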
@ -1,4 +1,4 @@
--- a/heartbeat/portblock 2025-02-20 14:54:18.047134471 +0100
--- a/heartbeat/portblock 2021-11-03 10:12:01.000000000 +0100
+++ b/heartbeat/portblock 2025-02-20 14:09:44.546869740 +0100
@@ -25,6 +25,7 @@
# Defaults
@ -35,7 +35,7 @@
</parameters>

<actions>
@@ -240,36 +254,73 @@
@@ -240,19 +254,34 @@
# and disable us -- but we're still in some sense active...
#

@ -72,58 +72,10 @@
+ [ "$4" = "OUTPUT" ] && ds="s" || ds="d"
+ PAT=$(active_grep_pat "$1" "$2" "$3" "$ds")
+ $IPTABLES $wait -n -L "$4" | grep -qE "$PAT"
+}
+
+# netstat -tn and ss -Htn, split on whitespace and colon,
+# look very similar:
+# tcp 0 0 10.43.55.1 675 10.43.9.8 2049 ESTABLISHED
+# ESTAB 0 0 10.43.55.1 675 10.43.9.8 2049
+# so we can write one awk script for both
+get_established_tcp_connections()
+{
+ local columns
+ if [ -z "$1" ] ; then
+ columns='$4,$5, $6,$7'
+ else
+ # swap local and remote for "tickle_local"
+ columns='$6,$7, $4,$5'
+ fi
+ $ss_or_netstat | awk -F '[:[:space:]]+' '
+ ( $8 == "ESTABLISHED" || $1 == "ESTAB" ) && $4 == "'$OCF_RESKEY_ip'" \
+ {printf "%s:%s\t%s:%s\n", '"$columns"'}'
}

save_tcp_connections()
# netstat -tn and ss -Htn, split on whitespace and colon,
{
@@ -299,7 +328,6 @@
[ -z "$OCF_RESKEY_tickle_dir" ] && return
statefile=$OCF_RESKEY_tickle_dir/$OCF_RESKEY_ip
+ # If we have _no_ sync script, we probably have a shared
+ # (or replicated) directory, and need to fsync, or we might
+ # end up with the just truncated file after failover, exactly
+ # when we need it.
+ #
+ # If we _do_ have a sync script, it is not that important whether
+ # the local state file is fsync'ed or not, the sync script is
+ # responsible to "atomically" communicate the state to the peer(s).
if [ -z "$OCF_RESKEY_sync_script" ]; then
- netstat -tn |awk -F '[:[:space:]]+' '
- $8 == "ESTABLISHED" && $4 == "'$OCF_RESKEY_ip'" \
- {printf "%s:%s\t%s:%s\n", $4,$5, $6,$7}' |
- dd of="$statefile".new conv=fsync status=none &&
- mv "$statefile".new "$statefile"
+ get_established_tcp_connections |
+ dd of="$statefile".new conv=fsync status=none &&
+ mv "$statefile".new "$statefile"
else
- netstat -tn |awk -F '[:[:space:]]+' '
- $8 == "ESTABLISHED" && $4 == "'$OCF_RESKEY_ip'" \
- {printf "%s:%s\t%s:%s\n", $4,$5, $6,$7}' \
- > $statefile
+ get_established_tcp_connections > $statefile
$OCF_RESKEY_sync_script $statefile > /dev/null 2>&1 &
fi
}

@@ -277,7 +328,6 @@
tickle_remote()
{
[ -z "$OCF_RESKEY_tickle_dir" ] && return
@ -131,34 +83,7 @@
f=$OCF_RESKEY_tickle_dir/$OCF_RESKEY_ip
[ -r $f ] || return
$TICKLETCP -n 3 < $f
@@ -289,11 +339,6 @@
@@ -331,112 +359,140 @@
f=$OCF_RESKEY_tickle_dir/$OCF_RESKEY_ip
[ -r $f ] || return

- checkcmd="netstat -tn"
- if ! have_binary "netstat"; then
- checkcmd="ss -Htn"
- fi
-
# swap "local" and "remote" address,
# so we tickle ourselves.
# We set up a REJECT with tcp-reset before we do so, so we get rid of
@@ -302,122 +347,152 @@
# the way if we switch-over and then switch-back in quick succession.
local i
awk '{ print $2, $1; }' $f | $TICKLETCP
- $checkcmd | grep -Fw $OCF_RESKEY_ip || return
+ $ss_or_netstat | grep -Fw $OCF_RESKEY_ip || return
for i in 0.1 0.5 1 2 4 ; do
sleep $i
- awk '{ print $2, $1; }' $f | $TICKLETCP
- $checkcmd | grep -Fw $OCF_RESKEY_ip || break
+ # now kill what is currently in the list,
+ # not what was recorded during last monitor
+ get_established_tcp_connections swap | $TICKLETCP
+ $ss_or_netstat | grep -Fw $OCF_RESKEY_ip || break
done
}

SayActive()
{
@ -374,7 +299,7 @@
block) IptablesBLOCK "$@";;
unblock)
IptablesUNBLOCK "$@"
@@ -432,11 +507,11 @@
@@ -451,11 +507,11 @@
return $?
}

@ -388,7 +313,7 @@
block) IptablesUNBLOCK "$@";;
unblock)
save_tcp_connections
@@ -454,7 +529,7 @@
@@ -473,7 +529,7 @@
CheckPort() {
# Examples of valid port: "1080", "1", "0080"
# Examples of invalid port: "1080bad", "0", "0000", ""
@ -397,7 +322,7 @@
}

IptablesValidateAll()
@@ -543,7 +618,7 @@
@@ -562,7 +618,7 @@
fi

# iptables v1.4.20+ is required to use -w (wait)
@ -406,7 +331,7 @@
ocf_version_cmp "$version" "1.4.19.1"
if [ "$?" -eq "2" ]; then
wait="-w"
@@ -553,21 +628,36 @@
@@ -572,6 +628,7 @@

protocol=$OCF_RESKEY_protocol
portno=$OCF_RESKEY_portno
@ -414,21 +339,8 @@
action=$OCF_RESKEY_action
ip=$OCF_RESKEY_ip
reset_local_on_unblock_stop=$OCF_RESKEY_reset_local_on_unblock_stop
@@ -592,15 +649,15 @@

+
+# If "tickle" is enabled, we need to record the list of currently established
+# connections during monitor. Use ss where available, and netstat otherwise.
+if [ -n "$OCF_RESKEY_tickle_dir" ] ; then
+ if have_binary ss ; then
+ ss_or_netstat="ss -Htn"
+ elif have_binary netstat ; then
+ ss_or_netstat="netstat -tn"
+ else
+ ocf_log err "Neither ss nor netstat found, but needed to record estblished connections."
+ exit $OCF_ERR_INSTALLED
+ fi
+fi
+
case $1 in
start)
- IptablesStart $protocol $portno $ip $action
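The heart of the reworked patch is that netstat -tn and ss -Htn, once split on colons and whitespace, put the addresses and ports in the same fields; only the position of the connection state differs, so a single awk body serves both tools, and the wrapper picks ss when available with netstat as the fallback. The field layout can be checked directly (sketch, reusing the sample rows from the patch's own comment):

    # netstat row: state is field 8; ss row: state is field 1.
    printf '%s\n' \
        'tcp        0      0 10.43.55.1:675      10.43.9.8:2049      ESTABLISHED' \
        'ESTAB      0      0 10.43.55.1:675      10.43.9.8:2049' |
    awk -F '[:[:space:]]+' '($8 == "ESTABLISHED" || $1 == "ESTAB") {printf "%s:%s\t%s:%s\n", $4,$5, $6,$7}'
    # both lines print: 10.43.55.1:675    10.43.9.8:2049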
Some files were not shown because too many files have changed in this diff.