From 87b017967cd2dabbdb63c577bd9e9c90b5e64c14 Mon Sep 17 00:00:00 2001 From: Oyvind Albrigtsen Date: Tue, 21 Mar 2023 13:52:22 +0100 Subject: [PATCH] - mysql: fix replication issues - LVM-activate: failover with missing PVs Resolves: rhbz#2179003 Resolves: rhbz#2174911 --- ...M-activate-failover-with-missing-pvs.patch | 156 ++++++++++++++++++ bz2179003-mysql-replication-fixes.patch | 70 ++++++++ resource-agents.spec | 13 +- 3 files changed, 238 insertions(+), 1 deletion(-) create mode 100644 bz2174911-LVM-activate-failover-with-missing-pvs.patch create mode 100644 bz2179003-mysql-replication-fixes.patch diff --git a/bz2174911-LVM-activate-failover-with-missing-pvs.patch b/bz2174911-LVM-activate-failover-with-missing-pvs.patch new file mode 100644 index 0000000..9767c9a --- /dev/null +++ b/bz2174911-LVM-activate-failover-with-missing-pvs.patch @@ -0,0 +1,156 @@ +From 51dd5d5d051aa3b3f0c104f8e80f212cd5780fc3 Mon Sep 17 00:00:00 2001 +From: Oyvind Albrigtsen +Date: Tue, 14 Mar 2023 09:14:28 +0100 +Subject: [PATCH] LVM-activate: failover with missing PVs + +There are two changes included: + +- Allow the system ID to be changed on a VG when the VG is + missing PVs, as long as a majority of PVs are still present. + This requires a recent version of lvm that supports the + --majoritypvs option for vgchange. + +- Use --activationmode degraded when activating LVs so that + raid LVs can be activated when legs are missing, as long as + sufficient devices are available for raid to provide all the + data in the LV. + +By David Teigland. 
+--- + heartbeat/LVM-activate | 82 ++++++++++++++++++++++++++++++++---------- + 1 file changed, 64 insertions(+), 18 deletions(-) + +diff --git a/heartbeat/LVM-activate b/heartbeat/LVM-activate +index e951a08e9c..f6f24a3b52 100755 +--- a/heartbeat/LVM-activate ++++ b/heartbeat/LVM-activate +@@ -50,6 +50,8 @@ OCF_RESKEY_vg_access_mode_default="" + OCF_RESKEY_activation_mode_default="exclusive" + OCF_RESKEY_tag_default="pacemaker" + OCF_RESKEY_partial_activation_default="false" ++OCF_RESKEY_degraded_activation_default="false" ++OCF_RESKEY_majority_pvs_default="false" + + : ${OCF_RESKEY_vgname=${OCF_RESKEY_vgname_default}} + : ${OCF_RESKEY_lvname=${OCF_RESKEY_lvname_default}} +@@ -57,6 +59,8 @@ OCF_RESKEY_partial_activation_default="false" + : ${OCF_RESKEY_activation_mode=${OCF_RESKEY_activation_mode_default}} + : ${OCF_RESKEY_tag=${OCF_RESKEY_tag_default}} + : ${OCF_RESKEY_partial_activation=${OCF_RESKEY_partial_activation_default}} ++: ${OCF_RESKEY_degraded_activation=${OCF_RESKEY_degraded_activation_default}} ++: ${OCF_RESKEY_majority_pvs=${OCF_RESKEY_majority_pvs_default}} + + # If LV is given, only activate this named LV; otherwise, activate all + # LVs in the named VG. +@@ -191,6 +195,29 @@ logical volumes. + + + ++ ++ ++Activate RAID LVs using the "degraded" activation mode. This allows RAID ++LVs to be activated with missing PVs if all data can be provided with ++RAID redundancy. The RAID level determines the number of PVs that are ++required for degraded activation to succeed. If fewer PVs are available, ++then degraded activation will fail. Also enable majority_pvs. ++ ++Activate RAID LVs in degraded mode when missing PVs ++ ++ ++ ++ ++ ++If set, the VG system ID can be reassigned to a new host if a majority ++of PVs in the VG are present. Otherwise, VG failover with system ID ++will fail when the VG is missing PVs. Also enable degraded_activation ++when RAID LVs are used. 
++ ++Allow changing the system ID of a VG with a majority of PVs ++ ++ ++ + + + +@@ -524,24 +551,27 @@ lvm_validate() { + exit $OCF_ERR_GENERIC + fi + +- # Inconsistency might be due to missing physical volumes, which doesn't +- # automatically mean we should fail. If partial_activation=true then +- # we should let start try to handle it, or if no PVs are listed as +- # "unknown device" then another node may have marked a device missing +- # where we have access to all of them and can start without issue. +- case $(vgs -o attr --noheadings $VG | tr -d ' ') in +- ???p??*) +- if ! ocf_is_true "$OCF_RESKEY_partial_activation" ; then +- # We are missing devices and cannot activate partially +- ocf_exit_reason "Volume group [$VG] has devices missing. Consider partial_activation=true to attempt to activate partially" +- exit $OCF_ERR_GENERIC ++ vg_missing_pv_count=$(vgs -o missing_pv_count --noheadings ${VG} 2>/dev/null) ++ ++ if [ $vg_missing_pv_count -gt 0 ]; then ++ ocf_log warn "Volume Group ${VG} is missing $vg_missing_pv_count PVs." ++ ++ # Setting new system ID will succeed if over half of PVs remain. ++ # Don't try to calculate here if a majority is present, ++ # but leave this up to the vgchange command to determine. ++ if ocf_is_true "$OCF_RESKEY_majority_pvs" ; then ++ ocf_log warn "Attempting fail over with missing PVs (majority.)" ++ ++ # Setting new system ID will fail, and behavior is undefined for ++ # other access modes. ++ elif ocf_is_true "$OCF_RESKEY_partial_activation" ; then ++ ocf_log warn "Attempting fail over with missing PVs (partial.)" ++ + else +- # We are missing devices but are allowed to activate partially. +- # Assume that caused the vgck failure and carry on +- ocf_log warn "Volume group inconsistency detected with missing device(s) and partial_activation enabled. Proceeding with requested action." ++ ocf_exit_reason "Volume group [$VG] has devices missing. 
Consider majority_pvs=true" ++ exit $OCF_ERR_GENERIC + fi +- ;; +- esac ++ fi + + # Get the access mode from VG metadata and check if it matches the input + # value. Skip to check "tagging" mode because there's no reliable way to +@@ -601,7 +631,18 @@ lvm_validate() { + do_activate() { + do_activate_opt=$1 + +- if ocf_is_true "$OCF_RESKEY_partial_activation" ; then ++ if ocf_is_true "$OCF_RESKEY_degraded_activation" ; then ++ # This will allow a RAID LV to be activated if sufficient ++ # devices are available to allow the LV to be usable ++ do_activate_opt="${do_activate_opt} --activationmode degraded" ++ ++ elif ocf_is_true "$OCF_RESKEY_partial_activation" ; then ++ # This will allow a mirror LV to be activated if any ++ # devices are missing, but the activated LV may not be ++ # usable, so it is not recommended. Also, other LV ++ # types without data redundancy will be activated ++ # when partial is set. ++ # RAID LVs and degraded_activation should be used instead. + do_activate_opt="${do_activate_opt} --partial" + fi + +@@ -661,11 +702,16 @@ clvmd_activate() { + } + + systemid_activate() { ++ majority_opt="" + set_autoactivation=0 + cur_systemid=$(vgs --foreign --noheadings -o systemid ${VG} | tr -d '[:blank:]') + ++ if ocf_is_true "$OCF_RESKEY_majority_pvs" ; then ++ vgchange --help | grep '\--majoritypvs' >/dev/null 2>&1 && majority_opt="--majoritypvs" ++ fi ++ + # Put our system ID on the VG +- vgchange -y --config "local/extra_system_ids=[\"${cur_systemid}\"]" \ ++ vgchange -y $majority_opt --config "local/extra_system_ids=[\"${cur_systemid}\"]" \ + --systemid ${SYSTEM_ID} ${VG} + vgchange --help | grep '\--setautoactivation' >/dev/null 2>&1 && set_autoactivation=1 + diff --git a/bz2179003-mysql-replication-fixes.patch b/bz2179003-mysql-replication-fixes.patch new file mode 100644 index 0000000..e086e07 --- /dev/null +++ b/bz2179003-mysql-replication-fixes.patch @@ -0,0 +1,70 @@ +From 706b48fd93a75a582c538013aea1418b6ed69dd0 Mon Sep 17 00:00:00 2001 +From: 
Oyvind Albrigtsen +Date: Thu, 9 Mar 2023 15:57:59 +0100 +Subject: [PATCH] mysql: promotable fixes to avoid nodes getting bounced around + by setting -v 1/-v 2, and added OCF_CHECK_LEVEL=10 for promotable resources + to be able to distinguish between promoted and not + +--- + heartbeat/mysql | 19 +++++++++++++------ + 1 file changed, 13 insertions(+), 6 deletions(-) + +diff --git a/heartbeat/mysql b/heartbeat/mysql +index 9ab49ab20e..29ed427319 100755 +--- a/heartbeat/mysql ++++ b/heartbeat/mysql +@@ -757,6 +757,10 @@ mysql_monitor() { + status_loglevel="info" + fi + ++ if ocf_is_ms; then ++ OCF_CHECK_LEVEL=10 ++ fi ++ + mysql_common_status $status_loglevel + rc=$? + +@@ -777,7 +781,13 @@ mysql_monitor() { + return $rc + fi + +- if [ $OCF_CHECK_LEVEL -gt 0 -a -n "$OCF_RESKEY_test_table" ]; then ++ if [ $OCF_CHECK_LEVEL -eq 10 ]; then ++ if [ -z "$OCF_RESKEY_test_table" ]; then ++ ocf_exit_reason "test_table not set" ++ return $OCF_ERR_CONFIGURED ++ ++ fi ++ + # Check if this instance is configured as a slave, and if so + # check slave status + if is_slave; then +@@ -795,18 +805,16 @@ mysql_monitor() { + ocf_exit_reason "Failed to select from $test_table"; + return $OCF_ERR_GENERIC; + fi +- else +- # In case no exnteded tests are enabled and we are in master/slave mode _always_ set the master score to 1 if we reached this point +- ocf_is_ms && $CRM_MASTER -v 1 + fi + + if ocf_is_ms && ! get_read_only; then + ocf_log debug "MySQL monitor succeeded (master)"; + # Always set master score for the master +- $CRM_MASTER -v 2 ++ $CRM_MASTER -v $((${OCF_RESKEY_max_slave_lag}+1)) + return $OCF_RUNNING_MASTER + else + ocf_log debug "MySQL monitor succeeded"; ++ ocf_is_ms && $CRM_MASTER -v 1 + return $OCF_SUCCESS + fi + } +@@ -873,7 +881,6 @@ mysql_start() { + # preference set by the administrator. We choose a low + # greater-than-zero preference. 
+ $CRM_MASTER -v 1 +- + fi + + # Initial monitor action diff --git a/resource-agents.spec b/resource-agents.spec index dc27212..98399be 100644 --- a/resource-agents.spec +++ b/resource-agents.spec @@ -45,7 +45,7 @@ Name: resource-agents Summary: Open Source HA Reusable Cluster Resource Scripts Version: 4.10.0 -Release: 35%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist} +Release: 36%{?rcver:%{rcver}}%{?numcomm:.%{numcomm}}%{?alphatag:.%{alphatag}}%{?dirty:.%{dirty}}%{?dist} License: GPLv2+ and LGPLv2+ URL: https://github.com/ClusterLabs/resource-agents Source0: %{upstream_prefix}-%{upstream_version}.tar.gz @@ -96,6 +96,8 @@ Patch43: bz2157872-5-pgsqlms-alidate-all-OCF_CHECK_LEVEL-10.patch Patch44: bz2142518-IPaddr2-IPsrcaddr-support-policy-based-routing.patch Patch45: bz2149968-lvmlockd-add-use_lvmlockd-if-missing.patch Patch46: bz2174896-ethmonitor-dont-log-iface-doesnt-exist-monitor.patch +Patch47: bz2179003-mysql-replication-fixes.patch +Patch48: bz2174911-LVM-activate-failover-with-missing-pvs.patch # bundled ha-cloud-support libs Patch500: ha-cloud-support-aws.patch @@ -266,6 +268,8 @@ exit 1 %patch44 -p1 %patch45 -p1 %patch46 -p1 +%patch47 -p1 +%patch48 -p1 # bundled ha-cloud-support libs %patch500 -p1 @@ -587,6 +591,13 @@ rm -rf %{buildroot}/usr/share/doc/resource-agents %{_usr}/lib/ocf/lib/heartbeat/OCF_*.pm %changelog +* Tue Mar 21 2023 Oyvind Albrigtsen - 4.10.0-36 +- mysql: fix replication issues +- LVM-activate: failover with missing PVs + + Resolves: rhbz#2179003 + Resolves: rhbz#2174911 + * Tue Mar 21 2023 Oyvind Albrigtsen - 4.10.0-35 - IPaddr2/IPsrcaddr: support policy-based routing - lvmlockd: add "use_lvmlockd = 1" if it's commented out or missing