- db2: add "skip_basic_sql_health_check" parameter to avoid failing on
  systems with high load
- db2: add "monitor_retries", "monitor_sleep", and "monitor_retry_all_errors"
  parameters to be able to avoid failing on first try

Resolves: RHEL-115783, RHEL-115781
This commit is contained in:
parent
b6c14c941f
commit
97caa584c3
@ -0,0 +1,258 @@
|
||||
From fc240bdff60aae7133a532c7752c6253ce8f65ca Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Mon, 4 Aug 2025 16:53:09 +0200
|
||||
Subject: [PATCH 1/2] db2: add "skip_basic_sql_health_check" parameter to avoid
|
||||
failing on systems with high load
|
||||
|
||||
---
|
||||
heartbeat/db2 | 63 +++++++++++++++++++++++++++++++--------------------
|
||||
1 file changed, 38 insertions(+), 25 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/db2 b/heartbeat/db2
|
||||
index 1cd66f15a..da6c9d5f1 100755
|
||||
--- a/heartbeat/db2
|
||||
+++ b/heartbeat/db2
|
||||
@@ -40,10 +40,12 @@
|
||||
# Parameter defaults
|
||||
|
||||
OCF_RESKEY_instance_default=""
|
||||
+OCF_RESKEY_skip_basic_sql_health_check_default="false"
|
||||
OCF_RESKEY_admin_default=""
|
||||
OCF_RESKEY_dbpartitionnum_default="0"
|
||||
|
||||
: ${OCF_RESKEY_instance=${OCF_RESKEY_instance_default}}
|
||||
+: ${OCF_RESKEY_skip_basic_sql_health_check=${OCF_RESKEY_skip_basic_sql_health_check_default}}
|
||||
: ${OCF_RESKEY_admin=${OCF_RESKEY_admin_default}}
|
||||
: ${OCF_RESKEY_dbpartitionnum=${OCF_RESKEY_dbpartitionnum_default}}
|
||||
|
||||
@@ -102,6 +104,15 @@ Defaults to all databases in the instance. Specify one db for HADR mode.
|
||||
<shortdesc lang="en">List of databases to be managed</shortdesc>
|
||||
<content type="string"/>
|
||||
</parameter>
|
||||
+<parameter name="skip_basic_sql_health_check" unique="0" required="0">
|
||||
+<longdesc lang="en">
|
||||
+Skip basic health check SQL query.
|
||||
+
|
||||
+Only set to "true" to avoid issues during high load.
|
||||
+</longdesc>
|
||||
+<shortdesc lang="en">Skip basic health check SQL query</shortdesc>
|
||||
+<content type="boolean" default="${OCF_RESKEY_skip_basic_sql_health_check_default}" />
|
||||
+</parameter>
|
||||
<parameter name="admin" unique="0" required="0">
|
||||
<longdesc lang="en">
|
||||
DEPRECATED: The admin user of the instance.
|
||||
@@ -695,31 +706,33 @@ db2_monitor() {
|
||||
# set master preference accordingly
|
||||
case "$hadr" in
|
||||
PRIMARY/*|Primary/*|Standard/*)
|
||||
- # perform a basic health check
|
||||
- CMD="if db2 connect to $db;
|
||||
- then
|
||||
- db2 select \* from sysibm.sysversions ; rc=\$?;
|
||||
- db2 terminate;
|
||||
- else
|
||||
- rc=\$?;
|
||||
- fi;
|
||||
- exit \$rc"
|
||||
-
|
||||
- if ! output=$(runasdb2 $CMD)
|
||||
- then
|
||||
- case "$output" in
|
||||
- SQL1776N*)
|
||||
- # can't connect/select on standby, may be spurious turing takeover
|
||||
- ;;
|
||||
-
|
||||
- *)
|
||||
- ocf_log err "DB2 database $instance($db2node)/$db is not working"
|
||||
- ocf_log err "DB2 message: $output"
|
||||
-
|
||||
- # dead primary, remove master score
|
||||
- master_score -D -l reboot
|
||||
- return $OCF_ERR_GENERIC
|
||||
- esac
|
||||
+ if ! ocf_is_true "$OCF_RESKEY_skip_basic_sql_health_check"; then
|
||||
+ # perform a basic health check
|
||||
+ CMD="if db2 connect to $db;
|
||||
+ then
|
||||
+ db2 select \* from sysibm.sysversions ; rc=\$?;
|
||||
+ db2 terminate;
|
||||
+ else
|
||||
+ rc=\$?;
|
||||
+ fi;
|
||||
+ exit \$rc"
|
||||
+
|
||||
+ if ! output=$(runasdb2 $CMD)
|
||||
+ then
|
||||
+ case "$output" in
|
||||
+ SQL1776N*)
|
||||
+ # can't connect/select on standby, may be spurious during takeover
|
||||
+ ;;
|
||||
+
|
||||
+ *)
|
||||
+ ocf_log err "DB2 database $instance($db2node)/$db is not working"
|
||||
+ ocf_log err "DB2 message: $output"
|
||||
+
|
||||
+ # dead primary, remove master score
|
||||
+ master_score -D -l reboot
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ esac
|
||||
+ fi
|
||||
fi
|
||||
|
||||
ocf_log debug "DB2 database $instance($db2node)/$db appears to be working"
|
||||
|
||||
From ded016f84d3fb77dc0542e3f4226774526910d97 Mon Sep 17 00:00:00 2001
|
||||
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
|
||||
Date: Thu, 7 Aug 2025 13:55:11 +0200
|
||||
Subject: [PATCH 2/2] db2: add "monitor_retries", "monitor_sleep", and
|
||||
"monitor_retry_all_errors" parameters to be able to avoid failing on first
|
||||
try
|
||||
|
||||
---
|
||||
heartbeat/db2 | 80 +++++++++++++++++++++++++++++++++++++++++++++------
|
||||
1 file changed, 72 insertions(+), 8 deletions(-)
|
||||
|
||||
diff --git a/heartbeat/db2 b/heartbeat/db2
|
||||
index da6c9d5f1..fe1d9b892 100755
|
||||
--- a/heartbeat/db2
|
||||
+++ b/heartbeat/db2
|
||||
@@ -41,11 +41,17 @@
|
||||
|
||||
OCF_RESKEY_instance_default=""
|
||||
OCF_RESKEY_skip_basic_sql_health_check_default="false"
|
||||
+OCF_RESKEY_monitor_retries_default="1"
|
||||
+OCF_RESKEY_monitor_sleep_default="1"
|
||||
+OCF_RESKEY_monitor_retry_all_errors_default="false"
|
||||
OCF_RESKEY_admin_default=""
|
||||
OCF_RESKEY_dbpartitionnum_default="0"
|
||||
|
||||
: ${OCF_RESKEY_instance=${OCF_RESKEY_instance_default}}
|
||||
: ${OCF_RESKEY_skip_basic_sql_health_check=${OCF_RESKEY_skip_basic_sql_health_check_default}}
|
||||
+: ${OCF_RESKEY_monitor_retries=${OCF_RESKEY_monitor_retries_default}}
|
||||
+: ${OCF_RESKEY_monitor_sleep=${OCF_RESKEY_monitor_sleep_default}}
|
||||
+: ${OCF_RESKEY_monitor_retry_all_errors=${OCF_RESKEY_monitor_retry_all_errors_default}}
|
||||
: ${OCF_RESKEY_admin=${OCF_RESKEY_admin_default}}
|
||||
: ${OCF_RESKEY_dbpartitionnum=${OCF_RESKEY_dbpartitionnum_default}}
|
||||
|
||||
@@ -108,11 +114,33 @@ Defaults to all databases in the instance. Specify one db for HADR mode.
|
||||
<longdesc lang="en">
|
||||
Skip basic health check SQL query.
|
||||
|
||||
-Only set to "true" to avoid issues during high load.
|
||||
+Only set to "true" when the "monitor_retries" and "monitor_retry_all_errors" parameters aren't
|
||||
+enough to avoid issues under high load.
|
||||
</longdesc>
|
||||
<shortdesc lang="en">Skip basic health check SQL query</shortdesc>
|
||||
<content type="boolean" default="${OCF_RESKEY_skip_basic_sql_health_check_default}" />
|
||||
</parameter>
|
||||
+<parameter name="monitor_retries" unique="0" required="0">
|
||||
+<longdesc lang="en">
|
||||
+Monitor retries before failing.
|
||||
+</longdesc>
|
||||
+<shortdesc lang="en">Monitor retries</shortdesc>
|
||||
+<content type="string" default="${OCF_RESKEY_monitor_retries_default}" />
|
||||
+</parameter>
|
||||
+<parameter name="monitor_sleep" unique="0" required="0">
|
||||
+<longdesc lang="en">
|
||||
+Monitor sleep between tries.
|
||||
+</longdesc>
|
||||
+<shortdesc lang="en">Monitor sleep</shortdesc>
|
||||
+<content type="string" default="${OCF_RESKEY_monitor_sleep_default}" />
|
||||
+</parameter>
|
||||
+<parameter name="monitor_retry_all_errors" unique="0" required="0">
|
||||
+<longdesc lang="en">
|
||||
+Set to true to retry monitor-action for all errors instead of the default "db2pd" race conditions.
|
||||
+</longdesc>
|
||||
+<shortdesc lang="en">Retry monitor for all errors</shortdesc>
|
||||
+<content type="string" default="${OCF_RESKEY_monitor_retry_all_errors_default}" />
|
||||
+</parameter>
|
||||
<parameter name="admin" unique="0" required="0">
|
||||
<longdesc lang="en">
|
||||
DEPRECATED: The admin user of the instance.
|
||||
@@ -666,6 +694,7 @@ db2_hadr_status() {
|
||||
local output
|
||||
|
||||
output=$(runasdb2 db2pd -hadr -db $db)
|
||||
+ ocf_log debug "db2_hadr_status: $output"
|
||||
if [ $? != 0 ]
|
||||
then
|
||||
echo "Down/Off"
|
||||
@@ -676,7 +705,34 @@ db2_hadr_status() {
|
||||
awk '/^\s+HADR_(ROLE|STATE) =/ {printf $3"/"}
|
||||
/^\s+HADR_CONNECT_STATUS =/ {print $3; exit; }
|
||||
/^HADR is not active/ {print "Standard/Standalone"; exit; }
|
||||
- /^Role *State */ {getline; printf "%s/%s\n", $1, $2; exit; }'
|
||||
+ /^Role *State */ {getline; printf "%s/%s\n", $1, $2; exit; }
|
||||
+ /^Option -hadr requires -db <database> or -alldbs option and active database./ { exit 255 }
|
||||
+ /^Another possibility of this failure is the Virtual Address Space Randomization is currently enabled on this system./ { exit 255 }
|
||||
+ /^Changing data structure forced command termination./ { exit 255 }'
|
||||
+}
|
||||
+
|
||||
+db2_monitor_retry() {
|
||||
+ local tries=$(($OCF_RESKEY_monitor_retries + 1))
|
||||
+
|
||||
+ for try in $(seq $tries); do
|
||||
+ ocf_log debug "monitor try $try of $tries"
|
||||
+ db2_monitor
|
||||
+ rc=$?
|
||||
+ [ $rc -ne $OCF_SUCCESS ] && [ $rc -ne $OCF_RUNNING_MASTER ] && [ $rc -ne $OCF_NOT_RUNNING ] && ocf_log warn "Monitor failed with rc $rc."
|
||||
+ if [ $rc -eq $OCF_SUCCESS ] || [ $rc -eq $OCF_RUNNING_MASTER ] || [ $rc -eq $OCF_NOT_RUNNING ] || { [ $rc -ne 255 ] && ! ocf_is_true "$OCF_RESKEY_monitor_retry_all_errors" ;} ;then
|
||||
+ break
|
||||
+ fi
|
||||
+ [ $try -lt $tries ] && sleep $OCF_RESKEY_monitor_sleep
|
||||
+ done
|
||||
+
|
||||
+ [ $rc -eq 255 ] && rc=$OCF_ERR_GENERIC
|
||||
+
|
||||
+ if [ $rc -ne $OCF_SUCCESS ] && [ $rc -ne $OCF_RUNNING_MASTER ]; then
|
||||
+ # instance is dead remove master score
|
||||
+ master_score -D -l reboot
|
||||
+ fi
|
||||
+
|
||||
+ return $rc
|
||||
}
|
||||
|
||||
#
|
||||
@@ -690,9 +746,7 @@ db2_monitor() {
|
||||
db2_instance_status
|
||||
rc=$?
|
||||
if [ $rc -ne $OCF_SUCCESS ]; then
|
||||
- # instance is dead remove master score
|
||||
- master_score -D -l reboot
|
||||
- exit $rc
|
||||
+ return $rc
|
||||
fi
|
||||
|
||||
[ $db2node = 0 ] || return 0
|
||||
@@ -700,8 +754,18 @@ db2_monitor() {
|
||||
|
||||
for db in $dblist
|
||||
do
|
||||
- hadr=$(db2_hadr_status $db) || return $OCF_ERR_GENERIC
|
||||
+ hadr=$(db2_hadr_status $db)
|
||||
+ rc=$?
|
||||
ocf_log debug "Monitor: DB2 database $instance($db2node)/$db has HADR status $hadr"
|
||||
+ if [ "$rc" -eq 255 ]; then
|
||||
+ if [ "$__OCF_ACTION" = "monitor" ]; then
|
||||
+ return $rc
|
||||
+ else
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
+ elif [ "$rc" -ne 0 ]; then
|
||||
+ return $OCF_ERR_GENERIC
|
||||
+ fi
|
||||
|
||||
# set master preference accordingly
|
||||
case "$hadr" in
|
||||
@@ -915,9 +979,9 @@ case "$__OCF_ACTION" in
|
||||
exit $?
|
||||
;;
|
||||
|
||||
- monitor)
|
||||
+ monitor)
|
||||
db2_validate
|
||||
- db2_monitor
|
||||
+ db2_monitor_retry
|
||||
exit $?
|
||||
;;
|
||||
|
||||
@ -73,7 +73,7 @@
|
||||
Name: resource-agents
|
||||
Summary: Open Source HA Reusable Cluster Resource Scripts
|
||||
Version: 4.9.0
|
||||
Release: 54%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}.16
|
||||
Release: 54%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}.17
|
||||
License: GPLv2+ and LGPLv2+
|
||||
URL: https://github.com/ClusterLabs/resource-agents
|
||||
%if 0%{?fedora} || 0%{?centos_version} || 0%{?rhel}
|
||||
@ -173,6 +173,7 @@ Patch76: RHEL-81960-2-aws-agents-reuse-imds-token-improvements.patch
|
||||
Patch77: RHEL-85048-tomcat-fix-CATALINA_PID-not-set-and-parameter-defaults.patch
|
||||
Patch78: RHEL-91257-Filesystem-add-support-for-aznfs.patch
|
||||
Patch79: RHEL-102731-ocf-shellfuncs-remove-extra-sleep-from-curl_retry.patch
|
||||
Patch80: RHEL-115783-RHEL-115781-db2-add-skip_basic_sql_health_check-and-monitor-parameters.patch
|
||||
|
||||
# bundle patches
|
||||
Patch1000: 7-gcp-bundled.patch
|
||||
@ -437,6 +438,7 @@ exit 1
|
||||
%patch -p1 -P 77
|
||||
%patch -p1 -P 78 -F2
|
||||
%patch -p1 -P 79
|
||||
%patch -p1 -P 80
|
||||
|
||||
chmod 755 heartbeat/nova-compute-wait
|
||||
chmod 755 heartbeat/NovaEvacuate
|
||||
@ -1027,6 +1029,14 @@ ccs_update_schema > /dev/null 2>&1 ||:
|
||||
%{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm
|
||||
|
||||
%changelog
|
||||
* Thu Sep 18 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.9.0-54.17
|
||||
- db2: add "skip_basic_sql_health_check" parameter to avoid failing on
|
||||
systems with high load
|
||||
- db2: add "monitor_retries", "monitor_sleep", and "monitor_retry_all_errors"
|
||||
parameters to be able to avoid failing on first try
|
||||
|
||||
Resolves: RHEL-115783, RHEL-115781
|
||||
|
||||
* Fri Aug 15 2025 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.9.0-54.16
|
||||
- bundled requests: fix CVE-2024-47081
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user