- mysql: fix replication issues

- LVM-activate: failover with missing PVs

  Resolves: rhbz#2179003
  Resolves: rhbz#2174911
This commit is contained in:
Oyvind Albrigtsen 2023-03-21 13:52:22 +01:00
parent 99121b9174
commit 87b017967c
3 changed files with 238 additions and 1 deletions

View File

@ -0,0 +1,156 @@
From 51dd5d5d051aa3b3f0c104f8e80f212cd5780fc3 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Tue, 14 Mar 2023 09:14:28 +0100
Subject: [PATCH] LVM-activate: failover with missing PVs
There are two changes included:
- Allow the system ID to be changed on a VG when the VG is
missing PVs, as long as a majority of PVs are still present.
This requires a recent version of lvm that supports the
--majoritypvs option for vgchange.
- Use --activationmode degraded when activating LVs so that
raid LVs can be activated when legs are missing, as long as
sufficient devices are available for raid to provide all the
data in the LV.
By David Teigland.
---
heartbeat/LVM-activate | 82 ++++++++++++++++++++++++++++++++----------
1 file changed, 64 insertions(+), 18 deletions(-)
diff --git a/heartbeat/LVM-activate b/heartbeat/LVM-activate
index e951a08e9c..f6f24a3b52 100755
--- a/heartbeat/LVM-activate
+++ b/heartbeat/LVM-activate
@@ -50,6 +50,8 @@ OCF_RESKEY_vg_access_mode_default=""
OCF_RESKEY_activation_mode_default="exclusive"
OCF_RESKEY_tag_default="pacemaker"
OCF_RESKEY_partial_activation_default="false"
+OCF_RESKEY_degraded_activation_default="false"
+OCF_RESKEY_majority_pvs_default="false"
: ${OCF_RESKEY_vgname=${OCF_RESKEY_vgname_default}}
: ${OCF_RESKEY_lvname=${OCF_RESKEY_lvname_default}}
@@ -57,6 +59,8 @@ OCF_RESKEY_partial_activation_default="false"
: ${OCF_RESKEY_activation_mode=${OCF_RESKEY_activation_mode_default}}
: ${OCF_RESKEY_tag=${OCF_RESKEY_tag_default}}
: ${OCF_RESKEY_partial_activation=${OCF_RESKEY_partial_activation_default}}
+: ${OCF_RESKEY_degraded_activation=${OCF_RESKEY_degraded_activation_default}}
+: ${OCF_RESKEY_majority_pvs=${OCF_RESKEY_majority_pvs_default}}
# If LV is given, only activate this named LV; otherwise, activate all
# LVs in the named VG.
@@ -191,6 +195,29 @@ logical volumes.
<content type="string" default="${OCF_RESKEY_partial_activation_default}" />
</parameter>
+<parameter name="degraded_activation" unique="0" required="0">
+<longdesc lang="en">
+Activate RAID LVs using the "degraded" activation mode. This allows RAID
+LVs to be activated with missing PVs if all data can be provided with
+RAID redundancy. The RAID level determines the number of PVs that are
+required for degraded activation to succeed. If fewer PVs are available,
+then degraded activation will fail. Also enable majority_pvs.
+</longdesc>
+<shortdesc lang="en">Activate RAID LVs in degraded mode when missing PVs</shortdesc>
+<content type="string" default="${OCF_RESKEY_degraded_activation_default}" />
+</parameter>
+
+<parameter name="majority_pvs" unique="0" required="0">
+<longdesc lang="en">
+If set, the VG system ID can be reassigned to a new host if a majority
+of PVs in the VG are present. Otherwise, VG failover with system ID
+will fail when the VG is missing PVs. Also enable degraded_activation
+when RAID LVs are used.
+</longdesc>
+<shortdesc lang="en">Allow changing the system ID of a VG with a majority of PVs</shortdesc>
+<content type="string" default="${OCF_RESKEY_majority_pvs_default}" />
+</parameter>
+
</parameters>
<actions>
@@ -524,24 +551,27 @@ lvm_validate() {
exit $OCF_ERR_GENERIC
fi
- # Inconsistency might be due to missing physical volumes, which doesn't
- # automatically mean we should fail. If partial_activation=true then
- # we should let start try to handle it, or if no PVs are listed as
- # "unknown device" then another node may have marked a device missing
- # where we have access to all of them and can start without issue.
- case $(vgs -o attr --noheadings $VG | tr -d ' ') in
- ???p??*)
- if ! ocf_is_true "$OCF_RESKEY_partial_activation" ; then
- # We are missing devices and cannot activate partially
- ocf_exit_reason "Volume group [$VG] has devices missing. Consider partial_activation=true to attempt to activate partially"
- exit $OCF_ERR_GENERIC
+ vg_missing_pv_count=$(vgs -o missing_pv_count --noheadings ${VG} 2>/dev/null)
+
+ if [ $vg_missing_pv_count -gt 0 ]; then
+ ocf_log warn "Volume Group ${VG} is missing $vg_missing_pv_count PVs."
+
+ # Setting new system ID will succeed if over half of PVs remain.
+ # Don't try to calculate here if a majority is present,
+ # but leave this up to the vgchange command to determine.
+ if ocf_is_true "$OCF_RESKEY_majority_pvs" ; then
+ ocf_log warn "Attempting fail over with missing PVs (majority.)"
+
+ # Setting new system ID will fail, and behavior is undefined for
+ # other access modes.
+ elif ocf_is_true "$OCF_RESKEY_partial_activation" ; then
+ ocf_log warn "Attempting fail over with missing PVs (partial.)"
+
else
- # We are missing devices but are allowed to activate partially.
- # Assume that caused the vgck failure and carry on
- ocf_log warn "Volume group inconsistency detected with missing device(s) and partial_activation enabled. Proceeding with requested action."
+ ocf_exit_reason "Volume group [$VG] has devices missing. Consider majority_pvs=true"
+ exit $OCF_ERR_GENERIC
fi
- ;;
- esac
+ fi
# Get the access mode from VG metadata and check if it matches the input
# value. Skip to check "tagging" mode because there's no reliable way to
@@ -601,7 +631,18 @@ lvm_validate() {
do_activate() {
do_activate_opt=$1
- if ocf_is_true "$OCF_RESKEY_partial_activation" ; then
+ if ocf_is_true "$OCF_RESKEY_degraded_activation" ; then
+ # This will allow a RAID LV to be activated if sufficient
+ # devices are available to allow the LV to be usable
+ do_activate_opt="${do_activate_opt} --activationmode degraded"
+
+ elif ocf_is_true "$OCF_RESKEY_partial_activation" ; then
+ # This will allow a mirror LV to be activated if any
+ # devices are missing, but the activated LV may not be
+ # usable, so it is not recommended. Also, other LV
+ # types without data redundancy will be activated
+ # when partial is set.
+ # RAID LVs and degraded_activation should be used instead.
do_activate_opt="${do_activate_opt} --partial"
fi
@@ -661,11 +702,16 @@ clvmd_activate() {
}
systemid_activate() {
+ majority_opt=""
set_autoactivation=0
cur_systemid=$(vgs --foreign --noheadings -o systemid ${VG} | tr -d '[:blank:]')
+ if ocf_is_true "$OCF_RESKEY_majority_pvs" ; then
+ vgchange --help | grep '\--majoritypvs' >/dev/null 2>&1 && majority_opt="--majoritypvs"
+ fi
+
# Put our system ID on the VG
- vgchange -y --config "local/extra_system_ids=[\"${cur_systemid}\"]" \
+ vgchange -y $majority_opt --config "local/extra_system_ids=[\"${cur_systemid}\"]" \
--systemid ${SYSTEM_ID} ${VG}
vgchange --help | grep '\--setautoactivation' >/dev/null 2>&1 && set_autoactivation=1

View File

@ -0,0 +1,70 @@
From 706b48fd93a75a582c538013aea1418b6ed69dd0 Mon Sep 17 00:00:00 2001
From: Oyvind Albrigtsen <oalbrigt@redhat.com>
Date: Thu, 9 Mar 2023 15:57:59 +0100
Subject: [PATCH] mysql: promotable fixes to avoid nodes getting bounced around
by setting -v 1/-v 2, and added OCF_CHECK_LEVEL=10 for promotable resources
to be able to distinguish between promoted and not
---
heartbeat/mysql | 19 +++++++++++++------
1 file changed, 13 insertions(+), 6 deletions(-)
diff --git a/heartbeat/mysql b/heartbeat/mysql
index 9ab49ab20e..29ed427319 100755
--- a/heartbeat/mysql
+++ b/heartbeat/mysql
@@ -757,6 +757,10 @@ mysql_monitor() {
status_loglevel="info"
fi
+ if ocf_is_ms; then
+ OCF_CHECK_LEVEL=10
+ fi
+
mysql_common_status $status_loglevel
rc=$?
@@ -777,7 +781,13 @@ mysql_monitor() {
return $rc
fi
- if [ $OCF_CHECK_LEVEL -gt 0 -a -n "$OCF_RESKEY_test_table" ]; then
+ if [ $OCF_CHECK_LEVEL -eq 10 ]; then
+ if [ -z "$OCF_RESKEY_test_table" ]; then
+ ocf_exit_reason "test_table not set"
+ return $OCF_ERR_CONFIGURED
+
+ fi
+
# Check if this instance is configured as a slave, and if so
# check slave status
if is_slave; then
@@ -795,18 +805,16 @@ mysql_monitor() {
ocf_exit_reason "Failed to select from $test_table";
return $OCF_ERR_GENERIC;
fi
- else
- # In case no exnteded tests are enabled and we are in master/slave mode _always_ set the master score to 1 if we reached this point
- ocf_is_ms && $CRM_MASTER -v 1
fi
if ocf_is_ms && ! get_read_only; then
ocf_log debug "MySQL monitor succeeded (master)";
# Always set master score for the master
- $CRM_MASTER -v 2
+ $CRM_MASTER -v $((${OCF_RESKEY_max_slave_lag}+1))
return $OCF_RUNNING_MASTER
else
ocf_log debug "MySQL monitor succeeded";
+ ocf_is_ms && $CRM_MASTER -v 1
return $OCF_SUCCESS
fi
}
@@ -873,7 +881,6 @@ mysql_start() {
# preference set by the administrator. We choose a low
# greater-than-zero preference.
$CRM_MASTER -v 1
-
fi
# Initial monitor action

View File

@ -45,7 +45,7 @@
Name: resource-agents
Summary: Open Source HA Reusable Cluster Resource Scripts
Version: 4.10.0
Release: 35%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
Release: 36%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist}
License: GPLv2+ and LGPLv2+
URL: https://github.com/ClusterLabs/resource-agents
Source0: %{upstream_prefix}-%{upstream_version}.tar.gz
@ -96,6 +96,8 @@ Patch43: bz2157872-5-pgsqlms-alidate-all-OCF_CHECK_LEVEL-10.patch
Patch44: bz2142518-IPaddr2-IPsrcaddr-support-policy-based-routing.patch
Patch45: bz2149968-lvmlockd-add-use_lvmlockd-if-missing.patch
Patch46: bz2174896-ethmonitor-dont-log-iface-doesnt-exist-monitor.patch
Patch47: bz2179003-mysql-replication-fixes.patch
Patch48: bz2174911-LVM-activate-failover-with-missing-pvs.patch
# bundled ha-cloud-support libs
Patch500: ha-cloud-support-aws.patch
@ -266,6 +268,8 @@ exit 1
%patch44 -p1
%patch45 -p1
%patch46 -p1
%patch47 -p1
%patch48 -p1
# bundled ha-cloud-support libs
%patch500 -p1
@ -587,6 +591,13 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents
%{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm
%changelog
* Tue Mar 21 2023 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-36
- mysql: fix replication issues
- LVM-activate: failover with missing PVs
Resolves: rhbz#2179003
Resolves: rhbz#2174911
* Tue Mar 21 2023 Oyvind Albrigtsen <oalbrigt@redhat.com> - 4.10.0-35
- IPaddr2/IPsrcaddr: support policy-based routing
- lvmlockd: add "use_lvmlockd = 1" if it's commented out or missing