diff --color -uNr a/doc/man/Makefile.am b/doc/man/Makefile.am --- a/doc/man/Makefile.am 2021-04-12 12:51:56.831835953 +0200 +++ b/doc/man/Makefile.am 2021-04-13 13:38:14.198361848 +0200 @@ -154,6 +154,7 @@ ocf_heartbeat_ovsmonitor.7 \ ocf_heartbeat_pgagent.7 \ ocf_heartbeat_pgsql.7 \ + ocf_heartbeat_pgsqlms.7 \ ocf_heartbeat_pingd.7 \ ocf_heartbeat_podman.7 \ ocf_heartbeat_portblock.7 \ diff --color -uNr a/heartbeat/Makefile.am b/heartbeat/Makefile.am --- a/heartbeat/Makefile.am 2021-04-12 12:51:56.831835953 +0200 +++ b/heartbeat/Makefile.am 2021-04-13 13:37:45.741292178 +0200 @@ -149,6 +149,7 @@ ovsmonitor \ pgagent \ pgsql \ + pgsqlms \ pingd \ podman \ portblock \ @@ -209,7 +210,10 @@ mysql-common.sh \ nfsserver-redhat.sh \ findif.sh \ - ocf.py + ocf.py \ + OCF_Directories.pm \ + OCF_Functions.pm \ + OCF_ReturnCodes.pm # Legacy locations hbdir = $(sysconfdir)/ha.d diff --color -uNr a/heartbeat/OCF_Directories.pm b/heartbeat/OCF_Directories.pm --- a/heartbeat/OCF_Directories.pm 1970-01-01 01:00:00.000000000 +0100 +++ b/heartbeat/OCF_Directories.pm 2021-04-13 13:37:35.621267404 +0200 @@ -0,0 +1,139 @@ +#!/usr/bin/perl +# This program is open source, licensed under the PostgreSQL License. +# For license terms, see the LICENSE file. +# +# Copyright (C) 2016-2020: Jehan-Guillaume de Rorthais and Mael Rimbault + +=head1 NAME + +OCF_Directories - Binaries and binary options for use in Resource Agents + +=head1 SYNOPSIS + + use FindBin; + use lib "$FindBin::RealBin/../../lib/heartbeat/"; + + use OCF_Directories; + +=head1 DESCRIPTION + +This module has been ported from the ocf-directories shell script of the +resource-agents project. See L<https://github.com/ClusterLabs/resource-agents/>. 
+ +=head1 VARIABLES + +Here are the variables exported by this module: + +=over + +=item $INITDIR + +=item $HA_DIR + +=item $HA_RCDIR + +=item $HA_CONFDIR + +=item $HA_CF + +=item $HA_VARLIB + +=item $HA_RSCTMP + +=item $HA_RSCTMP_OLD + +=item $HA_FIFO + +=item $HA_BIN + +=item $HA_SBIN_DIR + +=item $HA_DATEFMT + +=item $HA_DEBUGLOG + +=item $HA_RESOURCEDIR + +=item $HA_DOCDIR + +=item $__SCRIPT_NAME + +=item $HA_VARRUN + +=item $HA_VARLOCK + +=item $ocf_prefix + +=item $ocf_exec_prefix + +=back + +=cut + +package OCF_Directories; + +use strict; +use warnings; +use 5.008; +use File::Basename; + +BEGIN { + use Exporter; + + + our $VERSION = 'v2.3.0'; + our @ISA = ('Exporter'); + our @EXPORT = qw( + $INITDIR + $HA_DIR + $HA_RCDIR + $HA_CONFDIR + $HA_CF + $HA_VARLIB + $HA_RSCTMP + $HA_RSCTMP_OLD + $HA_FIFO + $HA_BIN + $HA_SBIN_DIR + $HA_DATEFMT + $HA_DEBUGLOG + $HA_RESOURCEDIR + $HA_DOCDIR + $__SCRIPT_NAME + $HA_VARRUN + $HA_VARLOCK + $ocf_prefix + $ocf_exec_prefix + ); + our @EXPORT_OK = ( @EXPORT ); +} + +our $INITDIR = ( $ENV{'INITDIR'} || '/etc/init.d' ); +our $HA_DIR = ( $ENV{'HA_DIR'} || '/etc/ha.d' ); +our $HA_RCDIR = ( $ENV{'HA_RCDIR'} || '/etc/ha.d/rc.d' ); +our $HA_CONFDIR = ( $ENV{'HA_CONFDIR'} || '/etc/ha.d/conf' ); +our $HA_CF = ( $ENV{'HA_CF'} || '/etc/ha.d/ha.cf' ); +our $HA_VARLIB = ( $ENV{'HA_VARLIB'} || '/var/lib/heartbeat' ); +our $HA_RSCTMP = ( $ENV{'HA_RSCTMP'} || '/run/resource-agents' ); +our $HA_RSCTMP_OLD = ( $ENV{'HA_RSCTMP_OLD'} || '/var/run/heartbeat/rsctmp' ); +our $HA_FIFO = ( $ENV{'HA_FIFO'} || '/var/lib/heartbeat/fifo' ); +our $HA_BIN = ( $ENV{'HA_BIN'} || '/usr/libexec/heartbeat' ); +our $HA_SBIN_DIR = ( $ENV{'HA_SBIN_DIR'} || '/usr/sbin' ); +our $HA_DATEFMT = ( $ENV{'HA_DATEFMT'} || '%b %d %T ' ); +our $HA_DEBUGLOG = ( $ENV{'HA_DEBUGLOG'} || '/dev/null' ); +our $HA_RESOURCEDIR = ( $ENV{'HA_RESOURCEDIR'}|| '/etc/ha.d/resource.d' ); +our $HA_DOCDIR = ( $ENV{'HA_DOCDIR'} || '/usr/share/doc/heartbeat' ); +our $__SCRIPT_NAME = ( 
$ENV{'__SCRIPT_NAME'} || fileparse($0) ); +our $HA_VARRUN = ( $ENV{'HA_VARRUN'} || '/var/run' ); +our $HA_VARLOCK = ( $ENV{'HA_VARLOCK'} || '/var/lock/subsys' ); +our $ocf_prefix = '/usr'; +our $ocf_exec_prefix = '/usr'; + +1; + +=head1 COPYRIGHT AND LICENSE + +Copyright (C) 2016: Jehan-Guillaume de Rorthais and Mael Rimbault. + +Licensed under the PostgreSQL License. + diff --color -uNr a/heartbeat/OCF_Functions.pm b/heartbeat/OCF_Functions.pm --- a/heartbeat/OCF_Functions.pm 1970-01-01 01:00:00.000000000 +0100 +++ b/heartbeat/OCF_Functions.pm 2021-04-13 13:37:35.621267404 +0200 @@ -0,0 +1,631 @@ +#!/usr/bin/perl +# This program is open source, licensed under the PostgreSQL License. +# For license terms, see the LICENSE file. +# +# Copyright (C) 2016-2020: Jehan-Guillaume de Rorthais and Mael Rimbault + +=head1 NAME + +OCF_Functions - helper subroutines for OCF agent + +=head1 SYNOPSIS + + use FindBin; + use lib "$FindBin::RealBin/../../lib/heartbeat/"; + + use OCF_Functions; + +=head1 DESCRIPTION + +This module has been ported from the ocf-shellfuncs shell script of the +resource-agents project. See L<https://github.com/ClusterLabs/resource-agents/>. + +=head1 VARIABLE + +The only variable exported by this module is C<__OCF_ACTION>. 

=head1 SUBROUTINES

Here are the subroutines ported from ocf-shellfuncs and exported by this module:

=over

=item ha_debug

=item ha_log

=item hadate

=item ocf_is_clone

=item ocf_is_ms

=item ocf_is_probe

=item ocf_is_root

=item ocf_is_true

=item ocf_is_ver

=item ocf_local_nodename

=item ocf_log

=item ocf_exit_reason

=item ocf_maybe_random

=item ocf_ver2num

=item ocf_ver_complete_level

=item ocf_ver_level

=item ocf_version_cmp

=item set_logtag

=back

Here are the subroutines only existing in the perl module but not in the
ocf-shellfuncs script:

=over

=item ocf_notify_env

=back

=cut

package OCF_Functions;

use strict;
use warnings;
use 5.008;
use POSIX qw( strftime setlocale LC_ALL );
use English;

use FindBin;
use lib "$FindBin::RealBin/../../lib/heartbeat/";

use OCF_ReturnCodes;
use OCF_Directories;

BEGIN {
    use Exporter;

    our $VERSION   = 'v2.3.0';
    our @ISA       = ('Exporter');
    our @EXPORT    = qw(
        $__OCF_ACTION
        ocf_is_root
        ocf_maybe_random
        ocf_is_true
        hadate
        set_logtag
        ha_log
        ha_debug
        ocf_log
        ocf_exit_reason
        ocf_is_probe
        ocf_is_clone
        ocf_is_ms
        ocf_is_ver
        ocf_ver2num
        ocf_ver_level
        ocf_ver_complete_level
        ocf_version_cmp
        ocf_local_nodename
        ocf_notify_env
    );
    our @EXPORT_OK = ( @EXPORT );
}

# Action requested from the agent (first command line argument).
our $__OCF_ACTION;

# Returns true if the effective user of the process is root.
sub ocf_is_root {
    return $EUID == 0;
}

# Returns a pseudo-random integer in the range [0, 32767).
sub ocf_maybe_random {
    return int( rand( 32767 ) );
}

# Returns true if the given value is one of the accepted spellings of a
# "true" boolean, false otherwise (including when it is undefined).
sub ocf_is_true {
    my $v = shift;
    return ( defined $v and $v =~ /^(?:yes|true|1|YES|TRUE|ja|on|ON)$/ );
}

# Returns the current local time formatted with $HA_DATEFMT.
sub hadate {
    return strftime( $HA_DATEFMT, localtime );
}

# Sets $ENV{'HA_LOGTAG'} from the script name, the resource instance (when
# known) and the PID. An already defined, non-empty tag is left untouched.
sub set_logtag {

    return if defined $ENV{'HA_LOGTAG'} and $ENV{'HA_LOGTAG'} ne '';

    if ( defined $ENV{'OCF_RESOURCE_INSTANCE'} and $ENV{'OCF_RESOURCE_INSTANCE'} ne '' ) {
        $ENV{'HA_LOGTAG'} = "$__SCRIPT_NAME($ENV{'OCF_RESOURCE_INSTANCE'})[$PID]";
    }
    else {
        $ENV{'HA_LOGTAG'}="${__SCRIPT_NAME}[$PID]";
    }
}

# Appends one already formatted line to the given log file.
# Logging is best-effort: when the file cannot be opened, a warning is
# emitted and the agent goes on instead of printing to a closed handle.
sub __ha_append_line {
    my ( $path, $line ) = @_;

    if ( open my $fh, '>>', $path ) {
        print $fh $line;
        close $fh;
    }
    else {
        warn "could not append to log file $path: $!\n";
    }

    return;
}

# Low level logger shared by ha_log and ocf_exit_reason.
# Arguments:
#   * optional first argument '--ignore-stderr': the message was already
#     printed on stderr by the caller, do not print it there again
#   * all other arguments are joined with a space to build the message
# Destinations, in order: stderr when it is a tty, ha_logger when
# HA_LOGD=yes, syslog when HA_LOGFACILITY is set, then HA_LOGFILE, stderr
# and HA_DEBUGLOG.
sub __ha_log {
    my $ignore_stderr = 0;
    my $loglevel      = '';
    my $message;

    if ( @ARG and defined $ARG[0] and $ARG[0] eq '--ignore-stderr' ) {
        $ignore_stderr = 1;
        shift;
    }

    $message = join ' ', @ARG;

    $ENV{'HA_LOGFACILITY'} = '' if not defined $ENV{'HA_LOGFACILITY'}
        or $ENV{'HA_LOGFACILITY'} eq 'none';

    # if we're connected to a tty, then output to stderr
    if ( -t STDERR ) {
        # FIXME
        # T.N.: this was ported with the bug on $loglevel being empty
        # and never set before the test here...
        if ( defined $ENV{'HA_debug'}
            and $ENV{'HA_debug'} == 0
            and $loglevel eq 'debug'
        ) {
            return 0;
        }
        elsif ( $ignore_stderr ) {
            # something already printed this error to stderr, so ignore
            return 0;
        }
        if ( defined $ENV{'HA_LOGTAG'} and $ENV{'HA_LOGTAG'} ne '' ) {
            printf STDERR "%s: %s\n", $ENV{'HA_LOGTAG'}, $message;
        }
        else {
            printf STDERR "%s\n", $message;
        }
        return 0;
    }

    set_logtag();

    if ( defined $ENV{'HA_LOGD'} and $ENV{'HA_LOGD'} eq 'yes' ) {
        system 'ha_logger', '-t', $ENV{'HA_LOGTAG'}, @ARG;
        # test the whole status word: a logger killed by a signal has a
        # zero exit code in the high byte but did not succeed
        return 0 if $? == 0;
    }

    unless ( $ENV{'HA_LOGFACILITY'} eq '' ) {
        # logging through syslog
        # loglevel is unknown, use 'notice' for now
        $loglevel = 'notice';
        if ( $message =~ /ERROR/ ) {
            $loglevel = 'err';
        }
        elsif ( $message =~ /WARN/ ) {
            $loglevel = 'warning';
        }
        elsif ( $message =~ /INFO|info/ ) {
            $loglevel = 'info';
        }

        system 'logger', '-t', $ENV{'HA_LOGTAG'}, '-p',
            "$ENV{'HA_LOGFACILITY'}.$loglevel", @ARG;
    }

    if ( defined $ENV{'HA_LOGFILE'} and $ENV{'HA_LOGFILE'} ne '' ) {
        # appending to $HA_LOGFILE
        __ha_append_line( $ENV{'HA_LOGFILE'},
            sprintf "%s: %s %s\n", $ENV{'HA_LOGTAG'}, hadate(), $message );
    }

    # appending to stderr
    printf STDERR "%s %s\n", hadate(), $message
        if (not defined $ENV{'HA_LOGFACILITY'} or $ENV{'HA_LOGFACILITY'} eq '')
            and (not defined $ENV{'HA_LOGFILE'} or $ENV{'HA_LOGFILE'} eq '' )
            and not $ignore_stderr;

    if ( defined $ENV{'HA_DEBUGLOG'} and $ENV{'HA_DEBUGLOG'} ne ''
        and ( defined $ENV{'HA_LOGFILE'} ? $ENV{'HA_LOGFILE'} : '' )
            ne $ENV{'HA_DEBUGLOG'}
    ) {
        # appending to $HA_DEBUGLOG, unless it is the same file as
        # $HA_LOGFILE which already received the message
        __ha_append_line( $ENV{'HA_DEBUGLOG'},
            sprintf "%s: %s %s\n", $ENV{'HA_LOGTAG'}, hadate(), $message );
    }

    return 0;
}

# Logs the given message parts (joined with a space).
sub ha_log {
    return __ha_log( @ARG );
}

# Logs the given message parts as a debug message.
# Nothing is logged when HA_debug is defined and equal to 0.
sub ha_debug {
    my $message;

    return 0 if defined $ENV{'HA_debug'} and $ENV{'HA_debug'} == 0;

    $message = join ' ', @ARG;

    if ( -t STDERR ) {
        if ( defined $ENV{'HA_LOGTAG'} and $ENV{'HA_LOGTAG'} ne '' ) {
            printf STDERR "%s: %s\n", $ENV{'HA_LOGTAG'}, $message;
        }
        else {
            printf STDERR "%s\n", $message;
        }

        return 0;
    }

    set_logtag();

    if ( defined $ENV{'HA_LOGD'} and $ENV{'HA_LOGD'} eq 'yes' ) {
        system 'ha_logger', '-t', $ENV{'HA_LOGTAG'}, '-D', 'ha-debug', @ARG;
        # see __ha_log: a signal-killed logger is not a success
        return 0 if $? == 0;
    }

    $ENV{'HA_LOGFACILITY'} = '' if not defined $ENV{'HA_LOGFACILITY'}
        or $ENV{'HA_LOGFACILITY'} eq 'none';

    unless ( $ENV{'HA_LOGFACILITY'} eq '' ) {
        # logging through syslog

        system 'logger', '-t', $ENV{'HA_LOGTAG'}, '-p',
            "$ENV{'HA_LOGFACILITY'}.debug", @ARG;
    }

    if ( defined $ENV{'HA_DEBUGLOG'} and -f $ENV{'HA_DEBUGLOG'} ) {
        # appending to $HA_DEBUGLOG
        __ha_append_line( $ENV{'HA_DEBUGLOG'},
            sprintf "%s: %s %s\n", $ENV{'HA_LOGTAG'}, hadate(), $message );
    }

    # appending to stderr
    printf STDERR "%s: %s %s\n", $ENV{'HA_LOGTAG'}, hadate(), $message
        if (not defined $ENV{'HA_LOGFACILITY'} or $ENV{'HA_LOGFACILITY'} eq '')
            and (not defined $ENV{'HA_DEBUGLOG'} or $ENV{'HA_DEBUGLOG'} eq '' );
}

#
# ocf_log: log messages from the resource agent
# This function is slightly different from its equivalent in ocf-shellfuncs.in
# as it behaves like printf.
# Arguments:
#   * __OCF_PRIO: log level
#   * __OCF_MSG:  printf-like format string
#   * all other arguments are values for the printf-like format string
#
sub ocf_log {
    my $__OCF_PRIO;
    my $__OCF_MSG;

    # TODO: Revisit and implement internally.
    if ( scalar @ARG < 2 ) {
        ocf_log ( 'err', "Not enough arguments [%d] to ocf_log", scalar @ARG );
    }

    $__OCF_PRIO = shift;
    $__OCF_MSG  = shift;

    # a missing priority or message is logged as an empty one instead of
    # raising "uninitialized value" warnings
    $__OCF_PRIO = '' unless defined $__OCF_PRIO;
    $__OCF_MSG  = '' unless defined $__OCF_MSG;
    $__OCF_MSG  = sprintf $__OCF_MSG, @ARG;

    if    ( $__OCF_PRIO =~ /crit/  ) { $__OCF_PRIO = 'CRIT'    }
    elsif ( $__OCF_PRIO =~ /err/   ) { $__OCF_PRIO = 'ERROR'   }
    elsif ( $__OCF_PRIO =~ /warn/  ) { $__OCF_PRIO = 'WARNING' }
    elsif ( $__OCF_PRIO =~ /info/  ) { $__OCF_PRIO = 'INFO'    }
    elsif ( $__OCF_PRIO =~ /debug/ ) { $__OCF_PRIO = 'DEBUG'   }
    else                             { $__OCF_PRIO =~ tr/a-z/A-Z/ }

    if ( $__OCF_PRIO eq 'DEBUG' ) {
        ha_debug( "$__OCF_PRIO: $__OCF_MSG");
    }
    else {
        ha_log( "$__OCF_PRIO: $__OCF_MSG");
    }
}


#
# ocf_exit_reason: print exit error string to stderr and log
# Usage:      Allows the OCF script to provide a string
#             describing why the exit code was returned.
# Arguments:  reason - required, The string that represents
#             why the error occurred.
#
sub ocf_exit_reason {
    my $cookie = $ENV{'OCF_EXIT_REASON_PREFIX'} || 'ocf-exit-reason:';
    my $msg;

    # No argument is likely not intentional.
    # Just one argument implies a printf format string of just "%s".
    # "Least surprise" in case some interpolated string from variable
    # expansion or other contains a percent sign.
    # More than one argument: first argument is going to be the format string.
    if ( scalar @ARG < 1 ) {
        ocf_log ( 'err', 'Not enough arguments [%d] to ocf_exit_reason',
            scalar @ARG );
        $msg = '';
    }
    elsif ( scalar @ARG == 1 ) {
        # taken verbatim: it must not go through sprintf
        $msg = defined $ARG[0] ? $ARG[0] : '';
    }
    else {
        my $fmt = shift;
        $msg = sprintf $fmt, @ARG;
    }

    print STDERR "$cookie$msg\n";
    __ha_log( '--ignore-stderr', "ERROR: $msg" );
}

# returns true if the CRM is currently running a probe. A probe is
# defined as a monitor operation with a monitoring interval of zero.
sub ocf_is_probe {
    return ( defined $__OCF_ACTION
        and $__OCF_ACTION eq 'monitor'
        and $ENV{'OCF_RESKEY_CRM_meta_interval'} == 0 );
}

# returns true if the resource is configured as a clone. This is
# defined as a resource where the clone-max meta attribute is present,
# and set to greater than zero.
sub ocf_is_clone {
    # true when the clone-max meta attribute is present and positive
    return ( defined $ENV{'OCF_RESKEY_CRM_meta_clone_max'}
        and $ENV{'OCF_RESKEY_CRM_meta_clone_max'} > 0 );
}

# returns true if the resource is configured as a multistate
# (master/slave) resource. This is defined as a resource where the
# master-max meta attribute is present, and set to greater than zero.
sub ocf_is_ms {
    return ( defined $ENV{'OCF_RESKEY_CRM_meta_master_max'}
        and $ENV{'OCF_RESKEY_CRM_meta_master_max'} > 0 );
}

# version check functions
# allow . and - to delimit version numbers
# max version number is 999
# letters and such are effectively ignored
#

# Returns true if the given string looks like a version: it starts and ends
# with a digit and only holds digits, dots and dashes in between.
# An undefined value is not a version.
sub ocf_is_ver {
    my ( $ver ) = @_;

    return unless defined $ver;

    return $ver =~ /^[0-9][0-9.-]*[0-9]$/;
}

# Converts a version string to a number, each group of digits being worth
# 1000 times the following one ("3.0.11" gives 3000011).
sub ocf_ver2num {
    my ( $ver ) = @_;
    my $num = 0;

    $num = $num * 1000 + $1 while $ver =~ /(\d+)/g;

    return $num;
}

# Returns the number of digit groups ("levels") of a version string.
sub ocf_ver_level {
    my ( $ver ) = @_;
    my $count = () = $ver =~ /(\d+)/g;

    return $count;
}

# Returns the given version with $level trailing ".0" levels appended, so
# two versions can be compared with the same number of levels.
# A $level lower than 1 returns the version unchanged.
sub ocf_ver_complete_level {
    my ( $ver, $level ) = @_;

    # append only ".0": appending "$ver.0" would duplicate the whole version
    # string ("1.1" would become "1.11.1.0") and break ocf_version_cmp
    return $ver . ( '.0' x $level );
}

# usage: ocf_version_cmp VER1 VER2
#     version strings can contain digits, dots, and dashes
#     must start and end with a digit
# returns:
#     0: VER1 smaller (older) than VER2
#     1: versions equal
#     2: VER1 greater (newer) than VER2
#     3: bad format
sub ocf_version_cmp {
    my ( $v1, $v2 ) = @_;
    my $v1_level;
    my $v2_level;

    return 3 unless ocf_is_ver( $v1 );
    return 3 unless ocf_is_ver( $v2 );

    $v1_level = ocf_ver_level( $v1 );
    $v2_level = ocf_ver_level( $v2 );

    # pad the shortest version so both have the same number of levels
    if ( $v1_level < $v2_level ) {
        $v1 = ocf_ver_complete_level( $v1, $v2_level - $v1_level );
    }
    elsif ( $v1_level > $v2_level ) {
        $v2 = ocf_ver_complete_level( $v2, $v1_level - $v2_level );
    }

    $v1 = ocf_ver2num( $v1 );
    $v2 = ocf_ver2num( $v2 );

    if    ( $v1 == $v2 ) { return 1; }
    elsif ( $v1 <  $v2 ) { return 0; }

    return 2; # -1 would look funny in shell ;-) ( T.N. not in perl ;) )
}

# Returns the name of the local node, without trailing newline.
# "crm_node -n" is used for Pacemaker > 1.1.8, "uname -n" in any other case,
# including when crm_node is missing or gives no answer.
sub ocf_local_nodename {
    # use crm_node -n for pacemaker > 1.1.8
    my $nodename;

    qx{ which pacemakerd > /dev/null 2>&1 };
    if ( $? == 0 ) {
        my $ret = qx{ pacemakerd -\$ };
        my ( $version ) = defined $ret ? $ret =~ /Pacemaker ([\d.]+)/ : ();

        if ( defined $version
            and ocf_version_cmp( $version, '1.1.8' ) == 2
        ) {
            qx{ which crm_node > /dev/null 2>&1 };
            $nodename = qx{ crm_node -n } if $? == 0;
        }
    }

    # otherwise use uname -n
    $nodename = qx{ uname -n }
        unless defined $nodename and $nodename ne '';

    chomp $nodename if defined $nodename;

    return $nodename;
}

# Parse and returns the notify environment variables in a convenient structure
# Returns undef if the action is not a notify
# Returns undef if the resource is neither a clone or a multistate one
# (an empty list when called in list context, so assigning the result to a
# hash gives an empty hash)
sub ocf_notify_env {
    my %notify_env;
    my $fill;

    return unless defined $__OCF_ACTION and $__OCF_ACTION eq 'notify';

    return unless ocf_is_clone() or ocf_is_ms();

    %notify_env = (
        'type'      => $ENV{'OCF_RESKEY_CRM_meta_notify_type'}      || '',
        'operation' => $ENV{'OCF_RESKEY_CRM_meta_notify_operation'} || '',
        'active'    => [ ],
        'inactive'  => [ ],
        'start'     => [ ],
        'stop'      => [ ],
    );

    # Store each whitespace separated value of the notify variable
    # "OCF_RESKEY_CRM_meta_notify_${action}_${suffix}" under the key $key of
    # the hash found at the same position in the list of $action.
    $fill = sub {
        my ( $action, $key, $suffix ) = @_;
        my $i = 0;

        $notify_env{ $action }[ $i++ ]{ $key } = $_ foreach split /\s+/ =>
            $ENV{"OCF_RESKEY_CRM_meta_notify_${action}_${suffix}"};
    };

    for my $action ( qw{ active start stop } ) {
        next unless
            defined $ENV{"OCF_RESKEY_CRM_meta_notify_${action}_resource"}
            and defined $ENV{"OCF_RESKEY_CRM_meta_notify_${action}_uname"};

        $fill->( $action, 'rsc',   'resource' );
        $fill->( $action, 'uname', 'uname'    );
    }

    # notify_inactive_uname doesn't exist. See:
    # http://lists.clusterlabs.org/pipermail/developers/2017-January/000406.html
    $fill->( 'inactive', 'rsc', 'resource' )
        if defined $ENV{"OCF_RESKEY_CRM_meta_notify_inactive_resource"};

    # exit if the resource is not a multistate one
    return %notify_env unless ocf_is_ms();

    for my $action ( qw{ master slave promote demote } ) {
        $notify_env{ $action } = [ ];

        next unless
            defined $ENV{"OCF_RESKEY_CRM_meta_notify_${action}_resource"}
            and defined $ENV{"OCF_RESKEY_CRM_meta_notify_${action}_uname"};

        $fill->( $action, 'rsc',   'resource' );
        $fill->( $action, 'uname', 'uname'    );
    }

    # Fix active and inactive fields for Pacemaker version < 1.1.16
    # ie. crm_feature_set < 3.0.11
    # See http://lists.clusterlabs.org/pipermail/developers/2016-August/000265.html
    # and git commit a6713c5d40327eff8549e7f596501ab1785b8765
    if (
        ocf_version_cmp( $ENV{"OCF_RESKEY_crm_feature_set"}, '3.0.11' ) == 0
    ) {
        $notify_env{ 'active' } = [
            @{ $notify_env{ 'master' } },
            @{ $notify_env{ 'slave' } }
        ];
    }

    return %notify_env;
}

$__OCF_ACTION = $ARGV[0];

# Return to sanity for the agents...

# Force the C locale for the agent and every command it runs.
# The variables are deleted (not set to undef) so they really are unset in
# the environment of the child processes.
$ENV{'LC_ALL'} = 'C';
setlocale( LC_ALL, 'C' );
delete $ENV{'LANG'};
delete $ENV{'LANGUAGE'};

$ENV{'OCF_ROOT'} = '/usr/lib/ocf'
    unless defined $ENV{'OCF_ROOT'} and $ENV{'OCF_ROOT'} ne '';

# old
delete $ENV{'OCF_FUNCTIONS_DIR'}
    if defined $ENV{'OCF_FUNCTIONS_DIR'}
    and $ENV{'OCF_FUNCTIONS_DIR'} eq "$ENV{'OCF_ROOT'}/resource.d/heartbeat";

# Define OCF_RESKEY_CRM_meta_interval in case it isn't already set,
# to make sure that ocf_is_probe() always works
$ENV{'OCF_RESKEY_CRM_meta_interval'} = 0
    unless defined $ENV{'OCF_RESKEY_CRM_meta_interval'};

# Strip the OCF_RESKEY_ prefix from this particular parameter:
# unless OCF_CHECK_LEVEL is already set, it takes the value of
# OCF_RESKEY_OCF_CHECK_LEVEL when this one is set, 0 otherwise.
unless ( defined $ENV{'OCF_CHECK_LEVEL'} and $ENV{'OCF_CHECK_LEVEL'} ne '' ) {
    my $check_level = $ENV{'OCF_RESKEY_OCF_CHECK_LEVEL'};

    $ENV{'OCF_CHECK_LEVEL'} =
        ( defined $check_level and $check_level ne '' ) ? $check_level : 0;
}

unless ( -d $ENV{'OCF_ROOT'} ) {
    ha_log( "ERROR: OCF_ROOT points to non-directory $ENV{'OCF_ROOT'}." );
    # "die" uses $! as the exit code of the process
    $! = $OCF_ERR_GENERIC;
    die;
}

$ENV{'OCF_RESOURCE_TYPE'} = $__SCRIPT_NAME
    unless defined $ENV{'OCF_RESOURCE_TYPE'}
        and $ENV{'OCF_RESOURCE_TYPE'} ne '';

unless ( defined $ENV{'OCF_RA_VERSION_MAJOR'}
    and $ENV{'OCF_RA_VERSION_MAJOR'} ne ''
) {
    # We are being invoked as an init script.
    # Fill in some things with reasonable values.
    $ENV{'OCF_RESOURCE_INSTANCE'} = 'default';
    # end the loading of the module here with a true value
    return 1;
}

$ENV{'OCF_RESOURCE_INSTANCE'} = "undef"
    if defined $__OCF_ACTION and $__OCF_ACTION eq 'meta-data';

unless ( defined $ENV{'OCF_RESOURCE_INSTANCE'}
    and $ENV{'OCF_RESOURCE_INSTANCE'} ne ''
) {
    ha_log( "ERROR: Need to tell us our resource instance name." );
    $! = $OCF_ERR_ARGS;
    die;
}

1;


=head1 COPYRIGHT AND LICENSE

Copyright (C) 2016: Jehan-Guillaume de Rorthais and Mael Rimbault.

Licensed under the PostgreSQL License.
diff --color -uNr a/heartbeat/OCF_ReturnCodes.pm b/heartbeat/OCF_ReturnCodes.pm --- a/heartbeat/OCF_ReturnCodes.pm 1970-01-01 01:00:00.000000000 +0100 +++ b/heartbeat/OCF_ReturnCodes.pm 2021-04-13 13:37:35.621267404 +0200 @@ -0,0 +1,97 @@ +#!/usr/bin/perl +# This program is open source, licensed under the PostgreSQL License. +# For license terms, see the LICENSE file. +# +# Copyright (C) 2016-2020: Jehan-Guillaume de Rorthais and Mael Rimbault + +=head1 NAME + +OCF_ReturnCodes - Common variables for the OCF Resource Agents supplied by +heartbeat. + +=head1 SYNOPSIS + + use FindBin; + use lib "$FindBin::RealBin/../../lib/heartbeat/"; + + use OCF_ReturnCodes; + +=head1 DESCRIPTION + +This module has been ported from the ocf-returncodes shell script of the +resource-agents project. See L<https://github.com/ClusterLabs/resource-agents/>. + +=head1 VARIABLES + +Here are the variables exported by this module: + +=over + +=item $OCF_SUCCESS + +=item $OCF_ERR_GENERIC + +=item $OCF_ERR_ARGS + +=item $OCF_ERR_UNIMPLEMENTED + +=item $OCF_ERR_PERM + +=item $OCF_ERR_INSTALLED + +=item $OCF_ERR_CONFIGURED + +=item $OCF_NOT_RUNNING + +=item $OCF_RUNNING_MASTER + +=item $OCF_FAILED_MASTER + +=back + +=cut + +package OCF_ReturnCodes; + +use strict; +use warnings; +use 5.008; + +BEGIN { + use Exporter; + + our $VERSION = 'v2.3.0'; + our @ISA = ('Exporter'); + our @EXPORT = qw( + $OCF_SUCCESS + $OCF_ERR_GENERIC + $OCF_ERR_ARGS + $OCF_ERR_UNIMPLEMENTED + $OCF_ERR_PERM + $OCF_ERR_INSTALLED + $OCF_ERR_CONFIGURED + $OCF_NOT_RUNNING + $OCF_RUNNING_MASTER + $OCF_FAILED_MASTER + ); + our @EXPORT_OK = ( @EXPORT ); +} + +our $OCF_SUCCESS = 0; +our $OCF_ERR_GENERIC = 1; +our $OCF_ERR_ARGS = 2; +our $OCF_ERR_UNIMPLEMENTED = 3; +our $OCF_ERR_PERM = 4; +our $OCF_ERR_INSTALLED = 5; +our $OCF_ERR_CONFIGURED = 6; +our $OCF_NOT_RUNNING = 7; +our $OCF_RUNNING_MASTER = 8; +our $OCF_FAILED_MASTER = 9; + +1; + +=head1 COPYRIGHT AND LICENSE + +Copyright (C) 2016: Jehan-Guillaume de Rorthais and Mael 
Rimbault. + +Licensed under the PostgreSQL License. diff --color -uNr a/heartbeat/pgsqlms b/heartbeat/pgsqlms --- a/heartbeat/pgsqlms 1970-01-01 01:00:00.000000000 +0100 +++ b/heartbeat/pgsqlms 2021-04-13 13:37:40.934280411 +0200 @@ -0,0 +1,2308 @@ +#!/usr/bin/perl +# This program is open source, licensed under the PostgreSQL License. +# For license terms, see the LICENSE file. +# +# Copyright (C) 2016-2020: Jehan-Guillaume de Rorthais and Mael Rimbault + +=head1 NAME + +ocf_heartbeat_pgsqlms - A PostgreSQL multi-state resource agent for Pacemaker + +=head1 SYNOPSIS + +B<pgsqlms> [start | stop | monitor | promote | demote | notify | reload | methods | meta-data | validate-all] + +=head1 DESCRIPTION + +Resource script for PostgreSQL in replication. It manages PostgreSQL servers using streaming replication as an HA resource. + +=cut + +use strict; +use warnings; +use 5.008; + +use POSIX qw(locale_h); +use Scalar::Util qw(looks_like_number); +use File::Spec; +use File::Temp; +use Data::Dumper; + +my $OCF_FUNCTIONS_DIR; +BEGIN { + $OCF_FUNCTIONS_DIR = defined $ENV{'OCF_FUNCTIONS_DIR'} ? 
"$ENV{'OCF_FUNCTIONS_DIR'}" : "$ENV{'OCF_ROOT'}/lib/heartbeat"; +} +use lib "$OCF_FUNCTIONS_DIR"; + +use OCF_ReturnCodes; +use OCF_Directories; +use OCF_Functions; + +our $VERSION = 'v2.3.0'; +our $PROGRAM = 'pgsqlms'; + +# OCF environment +my $OCF_RESOURCE_INSTANCE = $ENV{'OCF_RESOURCE_INSTANCE'}; +my $OCF_RUNNING_SLAVE = $OCF_SUCCESS; +my %OCF_NOTIFY_ENV = ocf_notify_env() if $__OCF_ACTION eq 'notify'; + +# Default parameters values +my $system_user_default = "postgres"; +my $bindir_default = "/usr/bin"; +my $pgdata_default = "/var/lib/pgsql/data"; +my $pghost_default = "/tmp"; +my $pgport_default = 5432; +my $start_opts_default = ""; +my $maxlag_default = "0"; + +# Set default values if not found in environment +my $system_user = $ENV{'OCF_RESKEY_system_user'} || $system_user_default; +my $bindir = $ENV{'OCF_RESKEY_bindir'} || $bindir_default; +my $pgdata = $ENV{'OCF_RESKEY_pgdata'} || $pgdata_default; +my $datadir = $ENV{'OCF_RESKEY_datadir'} || $pgdata; +my $pghost = $ENV{'OCF_RESKEY_pghost'} || $pghost_default; +my $pgport = $ENV{'OCF_RESKEY_pgport'} || $pgport_default; +my $start_opts = $ENV{'OCF_RESKEY_start_opts'} || $start_opts_default; +my $maxlag = $ENV{'OCF_RESKEY_maxlag'} || $maxlag_default; +my $recovery_tpl = $ENV{'OCF_RESKEY_recovery_template'} + || "$pgdata/recovery.conf.pcmk"; + + +# PostgreSQL commands path +my $POSTGRES = "$bindir/postgres"; +my $PGCTL = "$bindir/pg_ctl"; +my $PGPSQL = "$bindir/psql"; +my $PGCTRLDATA = "$bindir/pg_controldata"; +my $PGISREADY = "$bindir/pg_isready"; +my $PGWALDUMP = "$bindir/pg_waldump"; + +# pacemaker commands path +my $CRM_MASTER = "$HA_SBIN_DIR/crm_master --lifetime forever"; +my $CRM_NODE = "$HA_SBIN_DIR/crm_node"; +my $CRM_RESOURCE = "$HA_SBIN_DIR/crm_resource"; +my $ATTRD_PRIV = "$HA_SBIN_DIR/attrd_updater --private --lifetime reboot"; + +# Global vars +my $nodename; +my $exit_code = 0; +# numeric pgsql versions +my $PGVERNUM; +my $PGVER_93 = 90300; +my $PGVER_10 = 100000; +my $PGVER_12 = 120000; + +# Run 
a query using psql. +# +# This function returns an array with psql return code as first element and +# the result as second one. +# +sub _query { + my $query = shift; + my $res = shift; + my $connstr = "dbname=postgres"; + my $RS = chr(30); # ASCII RS (record separator) + my $FS = chr(3); # ASCII ETX (end of text) + my $postgres_uid = getpwnam( $system_user ); + my $oldeuid = $>; + my $tmpfile; + my @res; + my $ans; + my $pid; + my $rc; + + unless ( defined $res and defined $query and $query ne '' ) { + ocf_log( 'debug', '_query: wrong parameters!' ); + return -1; + } + + unless ( $tmpfile = File::Temp->new( + TEMPLATE => 'pgsqlms-XXXXXXXX', + DIR => $HA_RSCTMP + ) ) + { + ocf_exit_reason( 'Could not create or write in a temp file' ); + exit $OCF_ERR_INSTALLED; + } + + print $tmpfile $query; + chmod 0644, $tmpfile; + + ocf_log( 'debug', '_query: %s', $query ); + + # Change the effective user to the given system_user so after forking + # the given uid to the process should allow psql to connect w/o password + $> = $postgres_uid; + + # Forking + piping + $pid = open(my $KID, "-|"); + + if ( $pid == 0 ) { # child + exec $PGPSQL, '--set', 'ON_ERROR_STOP=1', '-qXAtf', $tmpfile, + '-R', $RS, '-F', $FS, '--port', $pgport, '--host', $pghost, + $connstr; + } + + # parent + $> = $oldeuid; + + { + local $/; + $ans = <$KID>; + } + + close $KID; + $rc = $? >> 8; + + ocf_log( 'debug', '_query: psql return code: %d', $rc ); + + if ( defined $ans ) { + chop $ans; + + push @{ $res }, [ split(chr(3) => $_, -1) ] + foreach split (chr(30) => $ans, -1); + + ocf_log( 'debug', '_query: @res: %s', + Data::Dumper->new( [ $res ] )->Terse(1)->Dump ); + } + + # Possible return codes: + # -1: wrong parameters + # 0: OK + # 1: failed to get resources (memory, missing file, ...) 
+ # 2: unable to connect + # 3: query failed + return $rc; +} + +# Get the last received location on a standby +# if the first argument is true, returns the value as decimal +# if the first argument is false, returns the value as LSN +# Returns undef if query failed +sub _get_last_received_lsn { + my ( $dec ) = @_; + my $pg_last_wal_receive_lsn = 'pg_last_wal_receive_lsn()'; + my $pg_wal_lsn_diff = 'pg_wal_lsn_diff'; + my $query; + my $rc; + my @rs; + + if ( $PGVERNUM < $PGVER_10 ) { + $pg_last_wal_receive_lsn = 'pg_last_xlog_receive_location()'; + $pg_wal_lsn_diff = 'pg_xlog_location_diff'; + } + + if ( $dec ) { + $query = "SELECT $pg_wal_lsn_diff( $pg_last_wal_receive_lsn, '0/0' )"; + } + else { + $query = "SELECT $pg_last_wal_receive_lsn"; + } + + $rc = _query( $query, \@rs ); + + return $rs[0][0] if $rc == 0 and $rs[0][0]; + + ocf_log( 'err', 'Could not query last received LSN (%s)', $rc ) if $rc != 0; + ocf_log( 'err', 'No values for last received LSN' ) + if $rc == 0 and not $rs[0][0]; + + return undef; +} + +# Get the master score for each connected standby +# Returns directly the result set of the query or exit with an error. +# Exits with OCF_ERR_GENERIC if the query failed +sub _get_lag_scores { + my $pg_current_wal_lsn = 'pg_current_wal_lsn()'; + my $pg_wal_lsn_diff = 'pg_wal_lsn_diff'; + my $write_lsn = 'write_lsn'; + my $query; + my $rc; + my @rs; + + if ( $PGVERNUM < $PGVER_10 ) { + $pg_current_wal_lsn = 'pg_current_xlog_location()'; + $pg_wal_lsn_diff = 'pg_xlog_location_diff'; + $write_lsn = 'write_location'; + } + + # We check locations of connected standbies by querying the + # "pg_stat_replication" view. + # The row_number applies on the result set ordered on write_location ASC so + # the highest row_number should be given to the closest node from the + # master, then the lowest node name (alphanumeric sort) in case of equality. + # The result set itself is order by priority DESC to process best known + # candidate first. 
+ $query = qq{ + SELECT application_name, priority, location, state, current_lag + FROM ( + SELECT application_name, + (1000 - ( + row_number() OVER ( + PARTITION BY state IN ('startup', 'backup') + ORDER BY location ASC, application_name ASC + ) - 1 + ) * 10 + ) * CASE WHEN ( $maxlag > 0 + AND current_lag > $maxlag) + THEN -1 + ELSE 1 + END AS priority, + location, state, current_lag + FROM ( + SELECT application_name, $write_lsn AS location, state, + $pg_wal_lsn_diff($pg_current_wal_lsn, $write_lsn) AS current_lag + FROM pg_stat_replication + ) AS s2 + ) AS s1 + ORDER BY priority DESC + }; + + $rc = _query( $query, \@rs ); + + if ( $rc != 0 ) { + ocf_exit_reason( 'Query to get standby locations failed (%d)', $rc ); + exit $OCF_ERR_GENERIC; + } + + return \@rs; +} + +# get the timeout for the current action given from environment var +# Returns timeout as integer +# undef if unknown +sub _get_action_timeout { + my $timeout = $ENV{'OCF_RESKEY_CRM_meta_timeout'} / 1000; + + ocf_log( 'debug', '_get_action_timeout: known timeout: %s', + defined $timeout ? $timeout : 'undef' ); + + return $timeout if defined $timeout and $timeout =~ /^\d+$/; + + return undef; +} + +# Get, parse and return the value of the given private attribute name +# Returns an empty string if not found. +sub _get_priv_attr { + my ( $name, $node ) = @_; + my $val = ''; + my $node_arg = ''; + my $ans; + + $node = '' unless defined $node; + $name = "$name-$OCF_RESOURCE_INSTANCE"; + + $node_arg= "--node $node" if $node ne ''; + + $ans = qx{ $ATTRD_PRIV --name "$name" --query $node_arg }; + + $ans =~ m/^name=".*" host=".*" value="(.*)"$/; + + $val = $1 if defined $1; + + ocf_log( 'debug', '_get_priv_attr: value of "%s"%s is "%s"', $name, + ( $node ? " on \"$node\"": ""), + $val ); + + return $val; +} + +# Set the given private attribute name to the given value +# As setting an attribute is asynchronous, this will return as soon as the +# attribute is really set by attrd and available. 
sub _set_priv_attr {
    my ( $name, $val ) = @_;
    # private attributes are stored per resource instance
    my $attr = "$name-$OCF_RESOURCE_INSTANCE";

    ocf_log( 'debug', '_set_priv_attr: set "%s=%s"...', $attr, $val );

    qx{ $ATTRD_PRIV --name "$attr" --update "$val" };

    # _get_priv_attr appends the resource instance name by itself, so it is
    # polled with the bare attribute name until attrd gives the new value back
    until ( _get_priv_attr( $name ) eq $val ) {
        ocf_log( 'debug', '_set_priv_attr: waiting attrd ack for "%s"...', $attr );
        select( undef, undef, undef, 0.1 );
    }

    return;
}

# Delete the given private attribute.
# As setting an attribute is asynchronous, this will return as soon as the
# attribute is really deleted by attrd.
sub _delete_priv_attr {
    my ( $name ) = @_;
    my $attr = "$name-$OCF_RESOURCE_INSTANCE";

    ocf_log( 'debug', '_delete_priv_attr: delete "%s"...', $attr );

    qx{ $ATTRD_PRIV --name "$attr" --delete };

    # poll with the bare attribute name (_get_priv_attr adds the resource
    # instance name) until attrd does not know the attribute anymore
    until ( _get_priv_attr( $name ) eq '' ) {
        ocf_log( 'debug', '_delete_priv_attr: waiting attrd ack for "%s"...',
            $attr );
        select( undef, undef, undef, 0.1 );
    }

    return;
}

# Get, parse and return the resource master score on given node.
# Without a node name, crm_master is called without any "--node" argument.
# Returns an empty string if the score is not found or if the call to
# crm_master failed.
sub _get_master_score {
    my ( $node ) = @_;
    my $node_arg = ( defined $node and $node ne '' )
        ? sprintf( '--node "%s"', $node )
        : '';
    my $score = qx{ $CRM_MASTER --quiet --get-value $node_arg 2> /dev/null };

    return '' if $? != 0 or not defined $score;

    chomp $score;

    return $score;
}

# Set the master score of the local node or the optionally given node.
# As setting an attribute is asynchronous, this will return as soon as the
# attribute is really set by attrd and available everywhere.
+sub _set_master_score {
+    my ( $score, $node ) = @_;
+    my $target = ( defined $node and $node ne '' )
+        ? sprintf( '--node "%s"', $node )
+        : '';
+
+    qx{ $CRM_MASTER $target --quiet --update "$score" };
+
+    # Poll every 0.1s until the score read back is the requested one.
+    while ( 1 ) {
+        my $current = _get_master_score( $node );
+
+        last if $current eq $score;
+
+        ocf_log( 'debug',
+            '_set_master_score: waiting to set score to "%s" (currently "%s")...',
+            $score, $current );
+        select(undef, undef, undef, 0.1);
+    }
+
+    return;
+}
+
+# _master_score_exists
+# Look for a master score greater than or equal to 0 on any node of the
+# current cluster partition, as listed by "crm_node --partition".
+# Returns 1 if at least one master score >= 0 is found.
+# Returns 0 otherwise
+sub _master_score_exists {
+    for my $member ( split /\s+/ => qx{ $CRM_NODE --partition } ) {
+        my $score = _get_master_score( $member );
+
+        next unless defined $score and $score ne '';
+
+        return 1 if $score > -1;
+    }
+
+    return 0;
+}
+
+# Check if the current transition is a recover of a master clone on given node:
+# the node is listed both in the "master" and in the "promote" notify lists.
+sub _is_master_recover {
+    my ( $n ) = @_;
+    my $as_master = grep { $_->{'uname'} eq $n } @{ $OCF_NOTIFY_ENV{'master'} };
+
+    return $as_master unless $as_master;
+
+    return scalar grep { $_->{'uname'} eq $n } @{ $OCF_NOTIFY_ENV{'promote'} };
+}
+
+# Check if the current transition is a recover of a slave clone on given node:
+# the node is listed both in the "slave" and in the "start" notify lists.
+sub _is_slave_recover {
+    my ( $n ) = @_;
+    my $as_slave = grep { $_->{'uname'} eq $n } @{ $OCF_NOTIFY_ENV{'slave'} };
+
+    return $as_slave unless $as_slave;
+
+    return scalar grep { $_->{'uname'} eq $n } @{ $OCF_NOTIFY_ENV{'start'} };
+}
+
+# check if the current transition is a switchover to the given node.
+sub _is_switchover {
+    my ( $n ) = @_;
+    my @masters = @{ $OCF_NOTIFY_ENV{'master'} || [] };
+    my $old;
+
+    # Count BEFORE looking at the first master: reading
+    # $OCF_NOTIFY_ENV{'master'}[0]{'uname'} on an empty list autovivifies a
+    # bogus first element, which makes the list look like it holds one master.
+    return 0 if scalar @masters != 1
+             or scalar @{ $OCF_NOTIFY_ENV{'demote'} }  != 1
+             or scalar @{ $OCF_NOTIFY_ENV{'promote'} } != 1;
+
+    $old = $masters[0]{'uname'};
+
+    # A switchover to $n: the single old master is demoted (not stopped) and
+    # $n, currently a slave, is the one being promoted.
+    return (
+        scalar grep { $_->{'uname'} eq $old } @{ $OCF_NOTIFY_ENV{'demote'} }
+        and scalar grep { $_->{'uname'} eq $n } @{ $OCF_NOTIFY_ENV{'slave'} }
+        and scalar grep { $_->{'uname'} eq $n } @{ $OCF_NOTIFY_ENV{'promote'} }
+        and not scalar grep { $_->{'uname'} eq $old } @{ $OCF_NOTIFY_ENV{'stop'} }
+    );
+}
+
+# Run the given command as the "system_user" given as parameter.
+# It basically forks and seteuid/setuid away from root.
+#
+# The command is given as a list: the program followed by its arguments.
+# Returns the exit status of the command, or 128 + the signal number if the
+# command was killed by a signal.
+# Exits the script with an error if the system user does not exist or if the
+# fork fails.
+sub _runas {
+    my $rc;
+    my $pid;
+    my @cmd = @_;
+    my (undef, undef, $postgres_uid, $postgres_gid ) = getpwnam( $system_user );
+
+    # Assigning an undefined uid would leave the child running as uid 0.
+    unless ( defined $postgres_uid and defined $postgres_gid ) {
+        ocf_exit_reason( 'System user "%s" does not exist', $system_user );
+        exit $OCF_ERR_ARGS;
+    }
+
+    $pid = fork;
+
+    # fork returns undef on failure. Without this check, undef compares equal
+    # to 0 and the parent itself would drop its privileges and exec.
+    unless ( defined $pid ) {
+        ocf_exit_reason( 'Could not fork to run command "%s": %s',
+            join(' ', @cmd), $! );
+        exit $OCF_ERR_GENERIC;
+    }
+
+    if ( $pid == 0 ) { # in child
+        $) = "$postgres_gid $postgres_gid";
+        while ( my ( undef, undef, $gid, $members ) = getgrent ) {
+            $) .= " $gid" if grep { $system_user eq $_ } split /\s+/, $members
+        }
+        $( = $postgres_gid;
+
+        $< = $> = $postgres_uid;
+
+        # exec only returns on failure: never let the child fall through and
+        # run the rest of the agent a second time.
+        exec @cmd or do {
+            ocf_exit_reason( 'Could not execute command "%s": %s',
+                join(' ', @cmd), $! );
+            exit 127;
+        };
+    }
+
+    ocf_log( 'debug', '_runas: launching as "%s" command "%s"', $system_user,
+        join(' ', @cmd) );
+
+    waitpid $pid, 0;
+
+    # A command killed by a signal has no exit status: do not report it as a
+    # success (0), use the shell convention 128 + signal number instead.
+    $rc = ( $? & 127 ) ? 128 + ( $? & 127 ) : $? >> 8;
+
+    return $rc;
+}
+
+# Check if instance is listening on the given host/port.
+# Returns the return code of pg_isready as reported by _runas.
+#
+sub _pg_isready {
+    # Add 60s to the timeout or use a 24h timeout fallback to make sure
+    # Pacemaker will give up before us and take decisions
+    my $timeout = ( _get_action_timeout() || 60*60*24 ) + 60;
+    my $rc = _runas( $PGISREADY, '-h', $pghost, '-p', $pgport, '-d', 'postgres', '-t', $timeout );
+
+    # Possible error codes:
+    #   1: ping rejected (usually when instance is in startup, in crash
+    #      recovery, in warm standby, or when a shutdown is in progress)
+    #   2: no response, usually means the instance is down
+    #   3: no attempt, probably a syntax error, should not happen
+    return $rc;
+}
+
+# Check the postmaster.pid file and the postmaster process.
+# WARNING: we do not distinguish the scenario where postmaster.pid does not
+# exist from the scenario where the process is still alive. It should be ok
+# though, as this is considered a hard error from monitor.
+#
+# Returns the return code of "pg_ctl status" as reported by _runas.
+sub _pg_ctl_status {
+    my $rc = _runas( $PGCTL, '--pgdata', $pgdata, 'status' );
+
+    # Possible error codes:
+    #   3: postmaster.pid file does not exist OR it does but the process
+    #      with the PID found in the file is not alive
+    return $rc;
+}
+
+# Start the local instance using pg_ctl
+# Returns the return code of "pg_ctl start" as reported by _runas.
+#
+sub _pg_ctl_start {
+    # Add 60s to the timeout or use a 24h timeout fallback to make sure
+    # Pacemaker will give up before us and take decisions
+    my $timeout = ( _get_action_timeout() || 60*60*24 ) + 60;
+
+    my @cmd = ( $PGCTL, '--pgdata', $pgdata, '-w', '--timeout', $timeout, 'start' );
+
+    push @cmd => ( '-o', $start_opts ) if $start_opts ne '';
+
+    return _runas( @cmd );
+}
+
+# Enable the Standby mode.
+#
+# Up to v11, creates the recovery.conf file based on the given template.
+# Since v12, creates standby.signal.
+#
+# The file is given to the system user. This sub exits the script with an
+# error if the template cannot be read or if the file cannot be written or
+# chown'ed.
+sub _enable_recovery {
+    my $fh;
+    my $content      = '';
+    my $standby_file = "$datadir/standby.signal";
+    my (undef, undef, $uid, $gid) = getpwnam($system_user);
+
+    if ( $PGVERNUM < $PGVER_12 ) {
+        $standby_file = "$datadir/recovery.conf";
+
+        ocf_log( 'debug',
+            '_enable_recovery: get replication configuration from the template file "%s"',
+            $recovery_tpl );
+
+        # Create the recovery.conf file to start the instance as a secondary.
+        # NOTE: the recovery.conf is supposed to be set up so the secondary can
+        # connect to the primary instance, eg. using a virtual IP address.
+        # As there is no primary instance available at startup, secondaries will
+        # complain about failing to connect.
+        # As we can not reload a recovery.conf file on a standby without restarting
+        # it, we will live with this.
+        # FIXME how would the reload help us in this case ?
+        unless ( open( $fh, '<', $recovery_tpl ) ) {
+            ocf_exit_reason( 'Could not open file "%s": %s', $recovery_tpl, $! );
+            exit $OCF_ERR_CONFIGURED;
+        }
+
+        # Copy all parameters from the template file
+        while (my $line = <$fh>) {
+            chomp $line;
+            $content .= "$line\n";
+        }
+        close $fh;
+    }
+
+    ocf_log( 'debug', '_enable_recovery: write the standby file "%s"', $standby_file );
+
+    unless ( open( $fh, '>', $standby_file ) ) {
+        ocf_exit_reason( 'Could not open file "%s": %s', $standby_file, $! );
+        exit $OCF_ERR_CONFIGURED;
+    }
+
+    # Write the recovery.conf file using configuration from the template file.
+    # Buffered write errors (eg. disk full) only surface on print/close: do
+    # not start the instance on a truncated file.
+    unless ( print { $fh } $content and close $fh ) {
+        ocf_exit_reason( 'Could not write file "%s": %s', $standby_file, $! );
+        exit $OCF_ERR_CONFIGURED;
+    }
+
+    unless ( chown $uid, $gid, $standby_file ) {
+        ocf_exit_reason( 'Could not set owner of "%s"', $standby_file );
+        exit $OCF_ERR_CONFIGURED;
+    };
+
+    return;
+}
+
+# Parse and return various informations about the local PostgreSQL instance as
+# reported by its controldata file.
+#
+# Returns a hash with the keys "state", "redo", "tl" and "wal_level".
+#
+# WARNING: the status is NOT updated in case of crash.
+#
+# This sub exit the script with an error on failure
+sub _get_controldata {
+    my %controldata;
+    my $ans;
+
+    $ans = qx{ $PGCTRLDATA "$datadir" 2>/dev/null };
+    $ans = '' unless defined $ans;
+
+    # Parse the output of pg_controldata.
+    # This output is quite stable between pg versions, but we might need to sort
+    # it at some point if things are moving in there...
+    #
+    # The captures are only read when the match succeeded: after a failed
+    # match, $1..$4 still hold whatever an earlier successful match left.
+    if ( $ans =~ m{
+        # get the current state
+        ^\QDatabase cluster state\E:\s+(.*?)\s*$
+        .*
+        # Get the latest known REDO location
+        ^\QLatest checkpoint's REDO location\E:\s+([/0-9A-F]+)\s*$
+        .*
+        # Get the latest known TL
+        ^\QLatest checkpoint's TimeLineID\E:\s+(\d+)\s*$
+        .*
+        # Get the wal level
+        # NOTE: pg_controldata output changed with PostgreSQL 9.5, so we need to
+        #       account for both syntaxes
+        ^(?:\QCurrent \E)?\Qwal_level setting\E:\s+(.*?)\s*$
+    }smx ) {
+        @controldata{ 'state', 'redo', 'tl', 'wal_level' } = ( $1, $2, $3, $4 );
+    }
+
+    ocf_log( 'debug',
+        "_get_controldata: found: %s",
+        Data::Dumper->new( [ \%controldata ] )->Terse(1)->Dump );
+
+    return %controldata if defined $controldata{'state'}
+        and defined $controldata{'tl'}
+        and defined $controldata{'redo'}
+        and defined $controldata{'wal_level'};
+
+    ocf_exit_reason( 'Could not read all datas from controldata file for "%s"',
+        $datadir );
+
+    # Log the raw output of pg_controldata: this is what is needed to
+    # understand why the parsing failed.
+    ocf_log( 'debug',
+        "_get_controldata: controldata file: %s",
+        $ans );
+
+    exit $OCF_ERR_ARGS;
+}
+
+# Read major version from datadir/PG_VERSION and return it as numeric version
+# Eg. "9.6" is returned as 90600 and "12" as 120000.
+#
+# This sub exits the script with an error if the file is missing, empty,
+# unreadable or if its content is not a version number.
+sub _get_pg_version {
+    my $fh;
+    my $PGVERSION;
+    my $PGVERNUM;
+    my $major;
+    my $minor;
+
+    # check PG_VERSION
+    if ( ! -s "$datadir/PG_VERSION" ) {
+        ocf_exit_reason( 'PG_VERSION does not exist in "%s"', $datadir );
+        exit $OCF_ERR_ARGS;
+    }
+
+    unless ( open( $fh, '<', "$datadir/PG_VERSION" ) ) {
+        ocf_exit_reason( 'Could not open file "%s/PG_VERSION": %s',
+            $datadir, $! );
+        exit $OCF_ERR_ARGS;
+    }
+
+    read( $fh, $PGVERSION, 32 );
+    close $fh;
+
+    chomp $PGVERSION if defined $PGVERSION;
+
+    # Do not compute a version from captures of a failed match.
+    unless ( defined $PGVERSION and $PGVERSION =~ /^(\d+)(?:\.(\d+))?$/ ) {
+        ocf_exit_reason( 'Could not parse the version found in "%s/PG_VERSION"',
+            $datadir );
+        exit $OCF_ERR_ARGS;
+    }
+
+    ( $major, $minor ) = ( $1, $2 );
+
+    $PGVERNUM  = $major * 10000;
+    # no 2nd num in the major version from v10
+    $PGVERNUM += ( defined $minor ? $minor : 0 ) * 100 if $major < 10;
+
+    return $PGVERNUM;
+}
+
+# Use pg_controldata to check the state of the PostgreSQL server.
This
+# function returns codes depending on this state, so we can find whether the
+# instance is a primary or a secondary, or use it to detect any inconsistency
+# that could indicate the instance has crashed.
+#
+# Returns $OCF_RUNNING_MASTER for "in production", $OCF_SUCCESS for
+# "in archive recovery" and $OCF_NOT_RUNNING for "shut down" or
+# "shut down in recovery". Any other state is polled every second until it
+# turns into one of these.
+sub _controldata_to_ocf {
+    my %rc_for = (
+        # Instance should be running as a primary.
+        'in production'         => $OCF_RUNNING_MASTER,
+        # Instance should be running as a secondary.
+        # This state includes warm standby (rejects connections attempts,
+        # including pg_isready)
+        'in archive recovery'   => $OCF_SUCCESS,
+        # The instance should be stopped.
+        # We don't care if it was a primary or secondary before, because we
+        # always start instances as secondaries, and then promote if necessary.
+        'shut down'             => $OCF_NOT_RUNNING,
+        'shut down in recovery' => $OCF_NOT_RUNNING,
+    );
+
+    for ( ;; ) {
+        my %cdata = _get_controldata();
+        my $state = $cdata{'state'};
+
+        ocf_log( 'debug', '_controldata: instance "%s" state is "%s"',
+            $OCF_RESOURCE_INSTANCE, $state );
+
+        return $rc_for{ $state } if exists $rc_for{ $state };
+
+        # The state is "in crash recovery", "starting up" or "shutting down".
+        # This state should be transitional, so we wait and loop to check if
+        # it changes.
+        # If it does not, pacemaker will eventually abort with a timeout.
+        ocf_log( 'debug',
+            '_controldata: waiting for transitionnal state "%s" to finish',
+            $state );
+        sleep 1;
+    }
+
+    # If we reach this point, something went really wrong with this code or
+    # pg_controldata.
+    ocf_exit_reason( 'Unable get instance "%s" state using pg_controldata',
+        $OCF_RESOURCE_INSTANCE );
+
+    return $OCF_ERR_INSTALLED ;
+}
+
+# Check the write_location of all secondaries, and adapt their master score so
+# that the instance closest to the master will be the selected candidate should
+# a promotion be triggered.
+# NOTE: This is only a hint to pacemaker!
The selected candidate to promotion
+# actually re-check it is the best candidate and force a re-election by failing
+# if a better one exists. This avoid a race condition between the call of the
+# monitor action and the promotion where another slave might have catchup faster
+# with the master.
+# NOTE: we cannot directly use the write_location, neither a lsn_diff value as
+# promotion score as Pacemaker considers any value greater than 1,000,000 as
+# INFINITY.
+#
+# This sub must be executed from a master monitor action.
+#
+# Returns $OCF_SUCCESS.
+sub _check_locations {
+    my $partition_nodes;
+    my @known_nodes;
+    my %not_seen;
+    my $node_score;
+    my $row_num;
+    my $row;
+    my @rs;
+
+    # Set the master score if not already done
+    $node_score = _get_master_score();
+    _set_master_score( '1001' ) unless $node_score eq '1001';
+
+    # Ask crm_node what nodes are present in our current cluster partition
+    $partition_nodes = qx{ $CRM_NODE --partition };
+    $partition_nodes = '' unless defined $partition_nodes;
+
+    # Node names are compared as exact strings. The application_name is
+    # chosen by the standby: used as a regex, "node1" would match "node10"
+    # and a name holding regex metacharacters would break the match.
+    @known_nodes = grep { $_ ne '' } split /\s+/ => $partition_nodes;
+    %not_seen    = map { $_ => 1 } @known_nodes;
+
+    @rs = @{ _get_lag_scores() };
+
+    $row_num = scalar @rs;
+
+    # If no lag are reported at this point, it means that there is no
+    # secondary instance connected.
+    ocf_log( 'warning', 'No secondary connected to the master' )
+        if $row_num == 0;
+
+    # For each standby connected, set their master score based on the following
+    # rule: the first known node/application, with the highest priority and
+    # an acceptable state.
+    # Row columns: 0 application_name, 1 priority, 2 location, 3 state,
+    # 4 current_lag.
+    while ( $row = shift @rs ) {
+
+        unless ( exists $not_seen{ $row->[0] } ) {
+            ocf_log( 'info', 'Ignoring unknown application_name/node "%s"',
+                $row->[0] );
+            next;
+        }
+
+        if ( $row->[0] eq $nodename ) {
+            ocf_log( 'warning', 'Streaming replication with myself!' );
+            next;
+        }
+
+        $node_score = _get_master_score( $row->[0] );
+
+        if ( $row->[3] =~ /^\s*(?:startup|backup)\s*$/ ) {
+            # We exclude any standby being in state backup (pg_basebackup) or
+            # startup (new standby or failing standby)
+            ocf_log( 'info', 'Forbidding promotion on "%s" in state "%s"',
+                $row->[0], $row->[3] );
+
+            _set_master_score( '-1', $row->[0] ) unless $node_score eq '-1';
+        }
+        else {
+            ocf_log( 'debug',
+                '_check_locations: checking "%s" promotion ability (current_score: %s, priority: %s, location: %s, lag: %s)',
+                $row->[0], $node_score, $row->[1], $row->[2], $row->[4] );
+
+            if ( $node_score ne $row->[1] ) {
+                if ( $row->[1] < -1 ) {
+                    ocf_log( 'info', 'Update score of "%s" from %s to %s because replication lag (%s) is higher than given maxlag (%s).',
+                        $row->[0], $node_score, $row->[1], $row->[4], $maxlag );
+                }
+                else {
+                    ocf_log( 'info', 'Update score of "%s" from %s to %s because of a change in the replication lag (%s).',
+                        $row->[0], $node_score, $row->[1], $row->[4] );
+                }
+                _set_master_score( $row->[1], $row->[0] );
+            }
+            else {
+                ocf_log( 'debug',
+                    '_check_locations: "%s" keeps its current score of %s',
+                    $row->[0], $row->[1] );
+            }
+        }
+
+        # Remove this node from the known nodes list.
+        delete $not_seen{ $row->[0] };
+    }
+
+    # If there are still nodes in "not_seen", it means there is no
+    # corresponding line in "pg_stat_replication".
+    # Exclude these nodes that are not part of the cluster at this
+    # point.
+    foreach my $node ( grep { exists $not_seen{ $_ } } @known_nodes ) {
+        # Exclude the current node.
+        next if $node eq $nodename;
+
+        # do not warn if the master score is already set to -1000.
+        # this avoid log flooding (gh #138)
+        $node_score = _get_master_score( $node );
+        next if $node_score eq '-1000';
+
+        ocf_log( 'warning', '"%s" is not connected to the primary', $node );
+        _set_master_score( '-1000', $node );
+    }
+
+    return $OCF_SUCCESS;
+}
+
+# _check_switchover
+# check if the pgsql switchover to the localnode is safe.
+# This is supposed to be called **after** the master has been stopped or demoted.
+# This sub checks if the local standby received the shutdown checkpoint from the
+# old master to make sure it can take over the master role and the old master
+# will be able to catchup as a standby after.
+#
+# Returns 0 if switchover is safe
+# Returns 1 if switchover is not safe
+# Returns 2 for internal error
+sub _check_switchover {
+    my $last_redo;
+    my $last_lsn;
+    my $ans;
+    my $rc;
+    my $tl;
+    my %cdata;
+
+    $PGWALDUMP = "$bindir/pg_xlogdump" if $PGVERNUM < $PGVER_10;
+
+    ocf_log( 'info', 'Switchover in progress from "%s" to "%s".'
+        .' Need to check the last record in WAL',
+        $OCF_NOTIFY_ENV{'demote'}[0]{'uname'}, $nodename );
+
+    # check if we received the shutdown checkpoint of the master during its
+    # demote process.
+    # We need the last local checkpoint LSN and the last received LSN from
+    # master to check in the WAL between these addresses if we have a
+    # "checkpoint shutdown" using pg_xlogdump/pg_waldump.
+    #
+    # Force a checkpoint to make sure the controldata shows the very last TL
+    # and the master's shutdown checkpoint
+    _query( q{ CHECKPOINT }, {} );
+    %cdata     = _get_controldata();
+    $tl        = $cdata{'tl'};
+    $last_redo = $cdata{'redo'};
+
+    # Get the last received LSN from master
+    $last_lsn = _get_last_received_lsn();
+
+    unless ( defined $last_lsn ) {
+        ocf_exit_reason( 'Could not fetch last received LSN!' );
+
+        return 2;
+    }
+
+    $ans = qx{ $PGWALDUMP --path "$datadir" --timeline "$tl" \\
+               --start "$last_redo" --end "$last_lsn" 2>&1 };
+    $rc = $?;
+
+    ocf_log( 'debug',
+        '_check_switchover: %s rc: "%s", tl: "%s", last_chk: %s, last_lsn: %s, output: "%s"',
+        $PGWALDUMP, $rc, $tl, $last_redo, $last_lsn, $ans
+    );
+
+    if ( $rc == 0 and
+         $ans =~ m{^rmgr: XLOG.*desc: (?i:checkpoint)(?::|_SHUTDOWN) redo [0-9A-F/]+; tli $tl;.*; shutdown$}m
+    ) {
+        ocf_log( 'info', 'Slave received the shutdown checkpoint' );
+        return 0;
+    }
+
+    ocf_exit_reason(
+        'Did not receive the shutdown checkpoint from the old master!' );
+
+    return 1;
+}
+
+# Check to confirm if the instance is really started as _pg_isready stated and
+# check if the instance is primary or secondary.
+#
+# Returns $OCF_SUCCESS for a secondary, $OCF_RUNNING_MASTER for a primary,
+# $OCF_ERR_GENERIC if psql could not connect and $OCF_ERR_CONFIGURED on any
+# other failure.
+sub _confirm_role {
+    my $is_in_recovery;
+    my $rc;
+    my @rs;
+
+    $rc = _query( "SELECT pg_is_in_recovery()", \@rs );
+
+    # An empty result set must end in the "unexpected result" branch below,
+    # not in comparisons and messages built on an undefined value.
+    $is_in_recovery = $rs[0][0];
+    $is_in_recovery = '' unless defined $is_in_recovery;
+
+    if ( $rc == 0 ) {
+        # The query was executed, check the result.
+        if ( $is_in_recovery eq 't' ) {
+            # The instance is a secondary.
+            ocf_log( 'debug', "_confirm_role: instance $OCF_RESOURCE_INSTANCE is a secondary");
+            return $OCF_SUCCESS;
+        }
+        elsif ( $is_in_recovery eq 'f' ) {
+            # The instance is a primary.
+            ocf_log( 'debug', "_confirm_role: instance $OCF_RESOURCE_INSTANCE is a primary");
+            # Check lsn diff with current slaves if any
+            _check_locations() if $__OCF_ACTION eq 'monitor';
+            return $OCF_RUNNING_MASTER;
+        }
+
+        # This should not happen, raise a hard configuration error.
+        ocf_exit_reason(
+            'Unexpected result from query to check if "%s" is a primary or a secondary: "%s"',
+            $OCF_RESOURCE_INSTANCE, $is_in_recovery );
+
+        return $OCF_ERR_CONFIGURED;
+    }
+    elsif ( $rc == 1 or $rc == 2 ) {
+        # psql could not connect to the instance.
+        # As pg_isready reported the instance was listening, this error
+        # could be a max_connection saturation. Just report a soft error.
+        ocf_exit_reason( 'psql could not connect to instance "%s"',
+            $OCF_RESOURCE_INSTANCE );
+        return $OCF_ERR_GENERIC;
+    }
+
+    # The query failed (rc: 3) or bad parameters (rc: -1).
+    # This should not happen, raise a hard configuration error.
+    ocf_exit_reason(
+        'The query to check if instance "%s" is a primary or a secondary failed (rc: %d)',
+        $OCF_RESOURCE_INSTANCE, $rc );
+
+    return $OCF_ERR_CONFIGURED;
+}
+
+
+# Check to confirm if the instance is really stopped as _pg_isready stated
+# and if it was properly shut down.
+#
+# Returns $OCF_NOT_RUNNING if the instance is cleanly stopped,
+# $OCF_FAILED_MASTER if the controldata still shows a running primary and
+# $OCF_ERR_GENERIC in any other case.
+sub _confirm_stopped {
+    my $pgctlstatus_rc;
+    my $controldata_rc;
+
+    # Check the postmaster process status.
+    $pgctlstatus_rc = _pg_ctl_status();
+
+    if ( $pgctlstatus_rc == 0 ) {
+        # The PID file exists and the process is available.
+        # That should not be the case, return an error.
+        ocf_exit_reason(
+            'Instance "%s" is not listening, but the process referenced in postmaster.pid exists',
+            $OCF_RESOURCE_INSTANCE );
+        return $OCF_ERR_GENERIC;
+    }
+
+    # The PID file does not exist or the process is not available.
+    ocf_log( 'debug',
+        '_confirm_stopped: no postmaster process found for instance "%s"',
+        $OCF_RESOURCE_INSTANCE );
+
+    if ( -f "$datadir/backup_label" ) {
+        # We are probably on a freshly built secondary that was not started yet.
+        ocf_log( 'debug',
+            '_confirm_stopped: backup_label file exists: probably on a never started secondary',
+        );
+        return $OCF_NOT_RUNNING;
+    }
+
+    # Continue the check with pg_controldata.
+    $controldata_rc = _controldata_to_ocf();
+    if ( $controldata_rc == $OCF_RUNNING_MASTER ) {
+        # The controldata has not been updated to "shutdown".
+        # It should mean we had a crash on a primary instance.
+        ocf_exit_reason(
+            'Instance "%s" controldata indicates a running primary instance, the instance has probably crashed',
+            $OCF_RESOURCE_INSTANCE );
+        return $OCF_FAILED_MASTER;
+    }
+    elsif ( $controldata_rc == $OCF_SUCCESS ) {
+        # The controldata has not been updated to "shutdown in recovery".
+        # It should mean we had a crash on a secondary instance.
+        # There is no "FAILED_SLAVE" return code, so we return a generic error.
+        ocf_exit_reason(
+            'Instance "%s" controldata indicates a running secondary instance, the instance has probably crashed',
+            $OCF_RESOURCE_INSTANCE );
+        return $OCF_ERR_GENERIC;
+    }
+    elsif ( $controldata_rc == $OCF_NOT_RUNNING ) {
+        # The controldata state is consistent, the instance was probably
+        # properly shut down.
+        ocf_log( 'debug',
+            '_confirm_stopped: instance "%s" controldata indicates that the instance was propertly shut down',
+            $OCF_RESOURCE_INSTANCE );
+        return $OCF_NOT_RUNNING;
+    }
+
+    # Something went wrong with the controldata check.
+    ocf_exit_reason(
+        'Could not get instance "%s" status from controldata (returned: %d)',
+        $OCF_RESOURCE_INSTANCE, $controldata_rc );
+
+    return $OCF_ERR_GENERIC;
+}
+
+############################################################
+#### OCF FUNCS
+
+
+
+=head1 SUPPORTED PARAMETERS
+
+=over
+
+=item B<pgdata>
+
+Location of the PGDATA of your instance
+
+(optional, string, default "/var/lib/pgsql/data")
+
+=item B<pghost>
+
+The socket directory or IP address to use to connect to the local instance
+
+(optional, string, default "/tmp")
+
+=item B<pgport>
+
+The port to connect to the local instance
+
+(optional, integer, default "5432")
+
+=item B<bindir>
+
+Location of the PostgreSQL binaries.
+
+(optional, string, default "/usr/bin")
+
+=item B<system_user>
+
+The system owner of your instance's process
+
+(optional, string, default "postgres")
+
+=item B<recovery_template>
+
+B<ONLY> for PostgreSQL 11 and below.
+
+The local template that will be copied as the C<PGDATA/recovery.conf> file.
+This template file must exist on all nodes.
+
+With PostgreSQL 12 and higher, the cluster will refuse to start if this
+parameter is set or a template file is found.
+
+(optional, string, default "$PGDATA/recovery.conf.pcmk")
+
+=item B<maxlag>
+
+Maximum lag allowed on a standby before we set a negative master score on it.
+The calculation is based on the difference between the current xlog location on
+the master and the write location on the standby.
+
+(optional, integer, default "0" disables this feature)
+
+=item B<datadir>
+
+Path to the directory set in C<data_directory> from your postgresql.conf file.
+This parameter has the same default as PostgreSQL itself: the C<pgdata>
+parameter value.
+
+Unless you have a special PostgreSQL setup and you understand this parameter,
+B<ignore it>
+
+(optional, string, default to the value of C<pgdata>)
+
+=item B<start_opts>
+
+Additional arguments given to the postgres process on startup. See
+"postgres --help" for available options. Useful when the postgresql.conf file
+is not in the data directory (PGDATA), eg.:
+
+  -c config_file=/etc/postgresql/9.3/main/postgresql.conf
+
+(optional, string, default "")
+
+=back
+
+=cut
+
+# Print the XML meta-data of the resource agent (parameters with their
+# defaults, and supported actions) on the standard output.
+# Returns $OCF_SUCCESS.
+sub ocf_meta_data {
+    print qq{<?xml version="1.0"?>
+        <!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
+        <resource-agent name="pgsqlms">
+          <version>1.0</version>
+
+          <longdesc lang="en">
+            Resource script for PostgreSQL in replication. It manages PostgreSQL servers using streaming replication as an HA resource.
+          </longdesc>
+          <shortdesc lang="en">Manages PostgreSQL servers in replication</shortdesc>
+          <parameters>
+            <parameter name="system_user" unique="0" required="0">
+              <longdesc lang="en">
+                System user account used to run the PostgreSQL server
+              </longdesc>
+              <shortdesc lang="en">PostgreSQL system User</shortdesc>
+              <content type="string" default="$system_user_default" />
+            </parameter>
+
+            <parameter name="bindir" unique="0" required="0">
+              <longdesc lang="en">
+                Path to the directory storing the PostgreSQL binaries. The agent uses psql, pg_isready, pg_controldata and pg_ctl.
+              </longdesc>
+              <shortdesc lang="en">Path to the PostgreSQL binaries</shortdesc>
+              <content type="string" default="$bindir_default" />
+            </parameter>
+
+            <parameter name="pgdata" unique="1" required="0">
+              <longdesc lang="en">
+                Path to the data directory, e.g. PGDATA
+              </longdesc>
+              <shortdesc lang="en">Path to the data directory</shortdesc>
+              <content type="string" default="$pgdata_default" />
+            </parameter>
+
+            <parameter name="datadir" unique="1" required="0">
+              <longdesc lang="en">
+                Path to the directory set in data_directory from your postgresql.conf file. This parameter
+                has the same default than PostgreSQL itself: the pgdata parameter value. Unless you have a
+                special PostgreSQL setup and you understand this parameter, ignore it.
+              </longdesc>
+              <shortdesc lang="en">Path to the directory set in data_directory from your postgresql.conf file</shortdesc>
+              <content type="string" default="PGDATA" />
+            </parameter>
+
+            <parameter name="pghost" unique="0" required="0">
+              <longdesc lang="en">
+                Host IP address or unix socket folder the instance is listening on.
+              </longdesc>
+              <shortdesc lang="en">Instance IP or unix socket folder</shortdesc>
+              <content type="string" default="$pghost_default" />
+            </parameter>
+
+            <parameter name="pgport" unique="0" required="0">
+              <longdesc lang="en">
+                Port the instance is listening on.
+              </longdesc>
+              <shortdesc lang="en">Instance port</shortdesc>
+              <content type="integer" default="$pgport_default" />
+            </parameter>
+
+            <parameter name="maxlag" unique="0" required="0">
+              <longdesc lang="en">
+                Maximum lag allowed on a standby before we set a negative master score on it. The calculation
+                is based on the difference between the current LSN on the master and the LSN
+                written on the standby.
+                This parameter must be a valid positive number as described in PostgreSQL documentation.
+                See: https://www.postgresql.org/docs/current/static/sql-syntax-lexical.html#SQL-SYNTAX-CONSTANTS-NUMERIC
+              </longdesc>
+              <shortdesc lang="en">Maximum write lag before we mark a standby as inappropriate to promote</shortdesc>
+              <content type="integer" default="$maxlag_default" />
+            </parameter>
+
+            <parameter name="recovery_template" unique="1" required="0">
+              <longdesc lang="en">
+                Path to the recovery.conf template. This file is simply copied to \$PGDATA
+                before starting the instance as slave.
+                ONLY for PostgreSQL 11 and bellow. This parameter is IGNORED for
+                PostgreSQL 12 and higher. The cluster will refuse to start if a template
+                file is found.
+              </longdesc>
+              <shortdesc lang="en">Path to the recovery.conf template for PostgreSQL 11 and older.</shortdesc>
+              <content type="string" default="PGDATA/recovery.conf.pcmk" />
+            </parameter>
+
+            <parameter name="start_opts" unique="0" required="0">
+              <longdesc lang="en">
+                Additionnal arguments given to the postgres process on startup.
+                See "postgres --help" for available options. Usefull when the
+                postgresql.conf file is not in the data directory (PGDATA), eg.:
+                "-c config_file=/etc/postgresql/9.3/main/postgresql.conf".
+              </longdesc>
+              <shortdesc lang="en">Additionnal arguments given to the postgres process on startup.</shortdesc>
+              <content type="string" default="$start_opts_default" />
+            </parameter>
+
+          </parameters>
+          <actions>
+            <action name="start" timeout="60" />
+            <action name="stop" timeout="60" />
+            <action name="reload" timeout="20" />
+            <action name="promote" timeout="30" />
+            <action name="demote" timeout="120" />
+            <action name="monitor" depth="0" timeout="10" interval="15"/>
+            <action name="monitor" depth="0" timeout="10" interval="15" role="Master"/>
+            <action name="monitor" depth="0" timeout="10" interval="16" role="Slave"/>
+            <action name="notify" timeout="60" />
+            <action name="meta-data" timeout="5" />
+            <action name="validate-all" timeout="5" />
+            <action name="methods" timeout="5" />
+          </actions>
+        </resource-agent>
+    };
+    return $OCF_SUCCESS;
+}
+
+
+=head1 SUPPORTED ACTIONS
+
+This resource agent supports the following actions (operations):
+
+=over
+
+=item B<start>
+
+Starts the resource. Suggested minimum timeout: 60.
+
+=item B<stop>
+
+Stops the resource. Suggested minimum timeout: 60.
+
+=item B<reload>
+
+Suggested minimum timeout: 20.
+
+=item B<promote>
+
+Promotes the resource to the Master role. Suggested minimum timeout: 30.
+
+=item B<demote>
+
+Demotes the resource to the Slave role. Suggested minimum timeout: 120.
+
+=item B<monitor (Master role)>
+
+Performs a detailed status check. Suggested minimum timeout: 10.
+Suggested interval: 15.
+
+=item B<monitor (Slave role)>
+
+Performs a detailed status check. Suggested minimum timeout: 10.
+Suggested interval: 16.
+
+=item B<notify>
+
+Suggested minimum timeout: 60
+
+=item B<meta-data>
+
+Retrieves resource agent metadata (internal use only).
+Suggested minimum timeout: 5.
+
+=item B<methods>
+
+Suggested minimum timeout: 5.
+
+=item B<validate-all>
+
+Performs a validation of the resource configuration.
+Suggested minimum timeout: 5.
+
+=back
+
+=cut
+
+# Print the list of the actions supported by the resource agent on the
+# standard output.
+# Returns $OCF_SUCCESS.
+sub ocf_methods {
+    print q{
+        start
+        stop
+        reload
+        promote
+        demote
+        monitor
+        notify
+        methods
+        meta-data
+        validate-all
+    };
+
+    return $OCF_SUCCESS;
+}
+
+############################################################
+#### RA FUNCS
+
+# Validate the cluster setup (Pacemaker feature set, notify and master-max
+# meta parameters) and the PostgreSQL setup (recovery configuration, maxlag,
+# system user, version, binaries and wal_level).
+# Returns $OCF_SUCCESS if everything is fine, $OCF_ERR_INSTALLED or
+# $OCF_ERR_ARGS otherwise. Note that one check exits the script directly:
+# a recovery template found with PostgreSQL 12 and above.
+sub pgsql_validate_all {
+    my $fh;
+    my $ans = '';
+    my %cdata;
+
+    unless (
+        ocf_version_cmp( $ENV{"OCF_RESKEY_crm_feature_set"}, '3.0.9' ) == 2
+    ) {
+        ocf_exit_reason(
+            'PAF %s is compatible with Pacemaker 1.1.13 and greater',
+            $VERSION
+        );
+        return $OCF_ERR_INSTALLED;
+    }
+
+    # check notify=true
+    $ans = qx{ $CRM_RESOURCE --resource "$OCF_RESOURCE_INSTANCE" \\
+                 --meta --get-parameter notify 2>/dev/null };
+    chomp $ans;
+    unless ( lc($ans) =~ /^true$|^on$|^yes$|^y$|^1$/ ) {
+        ocf_exit_reason(
+            'You must set meta parameter notify=true for your master resource'
+        );
+        return $OCF_ERR_INSTALLED;
+    }
+
+    # check master-max=1
+    unless (
+        defined $ENV{'OCF_RESKEY_CRM_meta_master_max'}
+            and $ENV{'OCF_RESKEY_CRM_meta_master_max'} eq '1'
+    ) {
+        ocf_exit_reason(
+            'You must set meta parameter master-max=1 for your master resource'
+        );
+        return $OCF_ERR_INSTALLED;
+    }
+
+    if ( $PGVERNUM >= $PGVER_12 ) {
+        # check PostgreSQL setup: checks related to v12 and after
+        my $guc;
+
+        # recovery.conf template must not exist
+        if ( -f $recovery_tpl ) {
+            ocf_exit_reason(
+                'Recovery template file "%s" is forbidden for PostgreSQL 12 and above',
+                $recovery_tpl );
+            exit $OCF_ERR_ARGS;
+        }
+
+        # WARNING: you MUST put -C as first argument to bypass the root check
+        $guc = qx{ $POSTGRES -C recovery_target_timeline -D "$pgdata" $start_opts};
+        chomp $guc;
+        unless ( $guc eq 'latest' ) {
+            ocf_exit_reason(
+                q{Parameter "recovery_target_timeline" MUST be set to 'latest'. } .
+                q{It is currently set to '%s'}, $guc );
+            return $OCF_ERR_ARGS;
+        }
+
+        # The node name is matched literally (\Q..\E): unquoted, the dots of
+        # a FQDN would match any character.
+        $guc = qx{ $POSTGRES -C primary_conninfo -D "$pgdata" $start_opts};
+        chomp $guc;
+        unless ($guc =~ /\bapplication_name='?\Q$nodename\E'?\b/) {
+            ocf_exit_reason(
+                q{Parameter "primary_conninfo" MUST contain 'application_name=%s'. }.
+                q{It is currently set to '%s'}, $nodename, $guc );
+            return $OCF_ERR_ARGS;
+        }
+    }
+    else {
+        my @content;
+
+        # check recovery template
+        if ( ! -f $recovery_tpl ) {
+            ocf_exit_reason( 'Recovery template file "%s" does not exist',
+                $recovery_tpl );
+            return $OCF_ERR_ARGS;
+        }
+
+        # check content of the recovery template file
+        unless ( open( $fh, '<', $recovery_tpl ) ) {
+            ocf_exit_reason( 'Could not open file "%s": %s', $recovery_tpl, $! );
+            return $OCF_ERR_ARGS;
+        }
+        @content = <$fh>;
+        close $fh;
+
+
+        unless ( grep /^\s*standby_mode\s*=\s*'?on'?\s*$/, @content ) {
+            ocf_exit_reason(
+                'Recovery template file must contain "standby_mode = on"' );
+            return $OCF_ERR_ARGS;
+        }
+
+        unless ( grep /^\s*recovery_target_timeline\s*=\s*'?latest'?\s*$/, @content ) {
+            ocf_exit_reason(
+                "Recovery template file must contain \"recovery_target_timeline = 'latest'\""
+            );
+            return $OCF_ERR_ARGS;
+        }
+
+        # The node name is matched literally (\Q..\E), see above.
+        unless (
+            grep /^\s*primary_conninfo\s*=.*['\s]application_name=\Q$nodename\E['\s]/,
+            @content
+        ) {
+            ocf_exit_reason(
+                'Recovery template file must contain in primary_conninfo parameter "application_name=%s"',
+                $nodename );
+            return $OCF_ERR_ARGS;
+        }
+    }
+
+    unless ( looks_like_number($maxlag) ) {
+        ocf_exit_reason( 'maxlag is not a number: "%s"', $maxlag );
+        return $OCF_ERR_INSTALLED;
+    }
+
+    # check system user
+    unless ( defined getpwnam $system_user ) {
+        ocf_exit_reason( 'System user "%s" does not exist', $system_user );
+        return $OCF_ERR_ARGS;
+    }
+
+    # require 9.3 minimum
+    if ( $PGVERNUM < $PGVER_93 ) {
+        ocf_exit_reason( "Require 9.3 and more" );
+        return $OCF_ERR_INSTALLED;
+    }
+
+    # check binaries
+    unless ( -x $PGCTL and -x $PGPSQL and -x $PGCTRLDATA and -x $PGISREADY
+             and ( -x $PGWALDUMP or -x "$bindir/pg_xlogdump")
+    ) {
+        ocf_exit_reason(
+            "Missing one or more binary. Check following path: %s, %s, %s, %s, %s or %s",
+            $PGCTL, $PGPSQL, $PGCTRLDATA, $PGISREADY, $PGWALDUMP, "$bindir/pg_xlogdump" );
+        return $OCF_ERR_ARGS;
+    }
+
+    # require wal_level >= hot_standby
+    %cdata = _get_controldata();
+    unless ( $cdata{'wal_level'} =~ m{hot_standby|logical|replica} ) {
+        ocf_exit_reason(
+            'wal_level must be one of "hot_standby", "logical" or "replica"' );
+        return $OCF_ERR_ARGS;
+    }
+
+    return $OCF_SUCCESS;
+}
+
+
+# Start the PostgreSQL instance as a *secondary*
+#
+# Returns $OCF_SUCCESS if the instance is (or already was) running as a
+# secondary, $OCF_ERR_GENERIC otherwise.
+sub pgsql_start {
+    my $rc         = pgsql_monitor();
+    my %cdata      = _get_controldata();
+    my $prev_state = $cdata{'state'};
+
+    # Instance must be running as secondary or being stopped.
+    # Anything else is an error.
+    if ( $rc == $OCF_SUCCESS ) {
+        ocf_log( 'info', 'Instance "%s" already started',
+            $OCF_RESOURCE_INSTANCE );
+        return $OCF_SUCCESS;
+    }
+    elsif ( $rc != $OCF_NOT_RUNNING ) {
+        ocf_exit_reason( 'Unexpected state for instance "%s" (returned %d)',
+            $OCF_RESOURCE_INSTANCE, $rc );
+        return $OCF_ERR_GENERIC;
+    }
+
+    #
+    # From here, the instance is NOT running for sure.
+    #
+
+    ocf_log( 'debug',
+        'pgsql_start: instance "%s" is not running, starting it as a secondary',
+        $OCF_RESOURCE_INSTANCE );
+
+    # Must start as a standby, so enable recovery.
+    _enable_recovery();
+
+    # Start the instance as a secondary.
+    $rc = _pg_ctl_start();
+
+    if ( $rc == 0 ) {
+
+        # Wait for the start to finish.
+        sleep 1 while ( $rc = pgsql_monitor() ) == $OCF_NOT_RUNNING;
+
+        if ( $rc == $OCF_SUCCESS ) {
+            ocf_log( 'info', 'Instance "%s" started', $OCF_RESOURCE_INSTANCE );
+
+            # Check if a master score exists in the cluster.
+            # During the very first start of the cluster, no master score will
+            # exists on any of the existing slaves, unless an admin designated
+            # one of them using crm_master. If no master exists the cluster will
+            # not promote a master among the slaves.
+            # To solve this situation, we check if there is at least one master
+            # score existing on one node in the cluster. Do nothing if at least
+            # one master score is found among the clones of the resource. If no
+            # master score exists, set a score of 1 only if the resource was a
+            # shut downed master before the start.
+            if ( $prev_state eq "shut down" and not _master_score_exists() ) {
+                ocf_log( 'info', 'No master score around. Set mine to 1' );
+
+                _set_master_score( '1' );
+            }
+
+            return $OCF_SUCCESS;
+        }
+
+        ocf_exit_reason(
+            'Instance "%s" is not running as a slave (returned %d)',
+            $OCF_RESOURCE_INSTANCE, $rc );
+
+        return $OCF_ERR_GENERIC;
+    }
+
+    ocf_exit_reason( 'Instance "%s" failed to start (rc: %d)',
+        $OCF_RESOURCE_INSTANCE, $rc );
+
+    return $OCF_ERR_GENERIC;
+}
+
+# Stop the PostgreSQL instance
+#
+sub pgsql_stop {
+    my $rc;
+    my $state;
+    my $pidfile = "$datadir/postmaster.pid";
+    # Add 60s to the timeout or use a 24h timeout fallback to make sure
+    # Pacemaker will give up before us and take decisions
+    my $timeout = ( _get_action_timeout() || 60*60*24 ) + 60;
+
+    # Instance must be running as secondary or primary or being stopped.
+    # Anything else is an error.
+    $rc = pgsql_monitor();
+    if ( $rc == $OCF_NOT_RUNNING ) {
+        ocf_log( 'info', 'Instance "%s" already stopped',
+            $OCF_RESOURCE_INSTANCE );
+        return $OCF_SUCCESS;
+    }
+    elsif ( $rc != $OCF_SUCCESS and $rc != $OCF_RUNNING_MASTER ) {
+        ocf_exit_reason( 'Unexpected state for instance "%s" (returned %d)',
+            $OCF_RESOURCE_INSTANCE, $rc );
+        return $OCF_ERR_GENERIC;
+    }
+
+    #
+    # From here, the instance is running for sure.
+    #
+
+    ocf_log( 'debug', 'pgsql_stop: instance "%s" is running, stopping it',
+        $OCF_RESOURCE_INSTANCE );
+
+    # Try to quit with proper shutdown.
+
+
+    $rc = _runas( $PGCTL, '--pgdata', $pgdata, '-w', '--timeout', $timeout,
+        '-m', 'fast', 'stop' );
+
+    if ( $rc == 0 ) {
+        # Wait for the stop to finish.
+ sleep 1 while ( $rc = pgsql_monitor() ) != $OCF_NOT_RUNNING ; + + ocf_log( 'info', 'Instance "%s" stopped', $OCF_RESOURCE_INSTANCE ); + + return $OCF_SUCCESS; + } + + ocf_exit_reason( 'Instance "%s" failed to stop', $OCF_RESOURCE_INSTANCE ); + return $OCF_ERR_GENERIC; +} + +# Monitor the PostgreSQL instance +# +sub pgsql_monitor { + my $pgisready_rc; + my $controldata_rc; + + ocf_log( 'debug', 'pgsql_monitor: monitor is a probe' ) if ocf_is_probe(); + + # First check, verify if the instance is listening. + $pgisready_rc = _pg_isready(); + + if ( $pgisready_rc == 0 ) { + # The instance is listening. + # We confirm that the instance is up and return if it is a primary or a + # secondary + ocf_log( 'debug', 'pgsql_monitor: instance "%s" is listening', + $OCF_RESOURCE_INSTANCE ); + return _confirm_role(); + } + + if ( $pgisready_rc == 1 ) { + # The attempt was rejected. + # This could happen in several cases: + # - at startup + # - during shutdown + # - during crash recovery + # - if instance is a warm standby + # Except for the warm standby case, this should be a transitional state. + # We try to confirm using pg_controldata. + ocf_log( 'debug', + 'pgsql_monitor: instance "%s" rejects connections - checking again...', + $OCF_RESOURCE_INSTANCE ); + $controldata_rc = _controldata_to_ocf(); + + if ( $controldata_rc == $OCF_RUNNING_MASTER + or $controldata_rc == $OCF_SUCCESS + ) { + # This state indicates that pg_isready check should succeed. + # We check again. + ocf_log( 'debug', + 'pgsql_monitor: instance "%s" controldata shows a running status', + $OCF_RESOURCE_INSTANCE ); + + $pgisready_rc = _pg_isready(); + if ( $pgisready_rc == 0 ) { + # Consistent with pg_controdata output. + # We can check if the instance is primary or secondary + ocf_log( 'debug', 'pgsql_monitor: instance "%s" is listening', + $OCF_RESOURCE_INSTANCE ); + return _confirm_role(); + } + + # Still not consistent, raise an error. + # NOTE: if the instance is a warm standby, we end here. 
+ # TODO raise an hard error here ? + ocf_exit_reason( + 'Instance "%s" controldata is not consistent with pg_isready (returned: %d)', + $OCF_RESOURCE_INSTANCE, $pgisready_rc ); + ocf_log( 'info', + 'If this instance is in warm standby, this resource agent only supports hot standby', + $OCF_RESOURCE_INSTANCE, $pgisready_rc ); + + return $OCF_ERR_GENERIC; + } + + if ( $controldata_rc == $OCF_NOT_RUNNING ) { + # This state indicates that pg_isready check should fail with rc 2. + # We check again. + $pgisready_rc = _pg_isready(); + if ( $pgisready_rc == 2 ) { + # Consistent with pg_controdata output. + # We check the process status using pg_ctl status and check + # if it was propertly shut down using pg_controldata. + ocf_log( 'debug', + 'pgsql_monitor: instance "%s" is not listening', + $OCF_RESOURCE_INSTANCE ); + return _confirm_stopped(); + } + # Still not consistent, raise an error. + # TODO raise an hard error here ? + ocf_exit_reason( + 'Instance "%s" controldata is not consistent with pg_isready (returned: %d)', + $OCF_RESOURCE_INSTANCE, $pgisready_rc ); + + return $OCF_ERR_GENERIC; + } + + # Something went wrong with the controldata check, hard fail. + ocf_exit_reason( + 'Could not get instance "%s" status from controldata (returned: %d)', + $OCF_RESOURCE_INSTANCE, $controldata_rc ); + + return $OCF_ERR_INSTALLED; + } + + elsif ( $pgisready_rc == 2 ) { + # The instance is not listening. + # We check the process status using pg_ctl status and check + # if it was propertly shut down using pg_controldata. + ocf_log( 'debug', 'pgsql_monitor: instance "%s" is not listening', + $OCF_RESOURCE_INSTANCE ); + return _confirm_stopped(); + } + + elsif ( $pgisready_rc == 3 ) { + # No attempt was done, probably a syntax error. + # Hard configuration error, we don't want to retry or failover here. 
+ ocf_exit_reason( + 'Unknown error while checking if instance "%s" is listening (returned %d)', + $OCF_RESOURCE_INSTANCE, $pgisready_rc ); + + return $OCF_ERR_CONFIGURED; + } + + ocf_exit_reason( 'Unexpected result when checking instance "%s" status', + $OCF_RESOURCE_INSTANCE ); + + return $OCF_ERR_GENERIC; +} + + +# Demote the PostgreSQL instance from primary to secondary +# To demote a PostgreSQL instance, we must: +# * stop it gracefully +# * create recovery.conf with standby_mode = on +# * start it +# +sub pgsql_demote { + my $rc; + + $rc = pgsql_monitor(); + + # Running as primary. Normal, expected behavior. + if ( $rc == $OCF_RUNNING_MASTER ) { + ocf_log( 'debug', 'pgsql_demote: "%s" currently running as a primary', + $OCF_RESOURCE_INSTANCE ) ; + } + elsif ( $rc == $OCF_SUCCESS ) { + # Already running as secondary. Nothing to do. + ocf_log( 'debug', + 'pgsql_demote: "%s" currently running as a secondary', + $OCF_RESOURCE_INSTANCE ); + return $OCF_SUCCESS; + } + elsif ( $rc == $OCF_NOT_RUNNING ) { + # Instance is stopped. Nothing to do. + ocf_log( 'debug', 'pgsql_demote: "%s" currently shut down', + $OCF_RESOURCE_INSTANCE ); + } + elsif ( $rc == $OCF_ERR_CONFIGURED ) { + # We actually prefer raising a hard or fatal error instead of leaving + # the CRM abording its transition for a new one because of a soft error. + # The hard error will force the CRM to move the resource immediately. + return $OCF_ERR_CONFIGURED; + } + else { + return $OCF_ERR_GENERIC; + } + + # TODO we need to make sure at least one slave is connected!! + + # WARNING if the resource state is stopped instead of master, the ocf ra dev + # rsc advises to return OCF_ERR_GENERIC, misleading the CRM in a loop where + # it computes transitions of demote(failing)->stop->start->promote actions + # until failcount == migration-threshold. + # This is a really ugly trick to keep going with the demode action if the + # rsc is already stopped gracefully. 
+ # See discussion "CRM trying to demote a stopped resource" on + # developers@clusterlabs.org + unless ( $rc == $OCF_NOT_RUNNING ) { + # Add 60s to the timeout or use a 24h timeout fallback to make sure + # Pacemaker will give up before us and take decisions + my $timeout = ( _get_action_timeout() || 60*60*24 ) + 60; + + # WARNING the instance **MUST** be stopped gracefully. + # Do **not** use pg_stop() or service or systemctl here as these + # commands might force-stop the PostgreSQL instance using immediate + # after some timeout and return success, which is misleading. + + $rc = _runas( $PGCTL, '--pgdata', $pgdata, '--mode', 'fast', '-w', + '--timeout', $timeout , 'stop' ); + + # No need to wait for stop to complete, this is handled in pg_ctl + # using -w option. + unless ( $rc == 0 ) { + ocf_exit_reason( 'Failed to stop "%s" using pg_ctl (returned %d)', + $OCF_RESOURCE_INSTANCE, $rc ); + return $OCF_ERR_GENERIC; + } + + # Double check that the instance is stopped correctly. + $rc = pgsql_monitor(); + unless ( $rc == $OCF_NOT_RUNNING ) { + ocf_exit_reason( + 'Unexpected "%s" state: monitor status (%d) disagree with pg_ctl return code', + $OCF_RESOURCE_INSTANCE, $rc ); + return $OCF_ERR_GENERIC; + } + } + + # + # At this point, the instance **MUST** be stopped gracefully. + # + + # Note: We do not need to handle the recovery.conf file here as pgsql_start + # deal with that itself. Equally, no need to wait for the start to complete + # here, handled in pgsql_start. + $rc = pgsql_start(); + if ( $rc == $OCF_SUCCESS ) { + ocf_log( 'info', 'pgsql_demote: "%s" started as a secondary', + $OCF_RESOURCE_INSTANCE ); + return $OCF_SUCCESS; + } + + # NOTE: No need to double check the instance state as pgsql_start already use + # pgsql_monitor to check the state before returning. 
+ + ocf_exit_reason( 'Starting "%s" as a standby failed (returned %d)', + $OCF_RESOURCE_INSTANCE, $rc ); + return $OCF_ERR_GENERIC; +} + + +# Promote the secondary instance to primary +# +sub pgsql_promote { + my $rc; + my $cancel_switchover; + + $rc = pgsql_monitor(); + + if ( $rc == $OCF_SUCCESS ) { + # Running as slave. Normal, expected behavior. + ocf_log( 'debug', 'pgsql_promote: "%s" currently running as a standby', + $OCF_RESOURCE_INSTANCE ); + } + elsif ( $rc == $OCF_RUNNING_MASTER ) { + # Already a master. Unexpected, but not a problem. + ocf_log( 'info', '"%s" already running as a primary', + $OCF_RESOURCE_INSTANCE ); + return $OCF_SUCCESS; + } + elsif ( $rc == $OCF_NOT_RUNNING ) { # INFO this is not supposed to happen. + # Currently not running. Need to start before promoting. + ocf_log( 'info', '"%s" currently not running, starting it', + $OCF_RESOURCE_INSTANCE ); + + $rc = pgsql_start(); + if ( $rc != $OCF_SUCCESS ) { + ocf_exit_reason( 'Failed to start the instance "%s"', + $OCF_RESOURCE_INSTANCE ); + return $OCF_ERR_GENERIC; + } + } + else { + ocf_exit_reason( 'Unexpected error, cannot promote "%s"', + $OCF_RESOURCE_INSTANCE ); + return $OCF_ERR_GENERIC; + } + + # + # At this point, the instance **MUST** be started as a secondary. + # + + # Cancel the switchover if it has been considered not safe during the + # pre-promote action + $cancel_switchover = _get_priv_attr('cancel_switchover'); + if ( $cancel_switchover ) { # if not empty or not 0 + ocf_exit_reason( 'Switchover has been canceled from pre-promote action' ); + + _delete_priv_attr( 'cancel_switchover' ); + + return $OCF_ERR_GENERIC if $cancel_switchover eq '1'; + return $OCF_ERR_ARGS; # ban the resource from the node if we have an + # internal error during _check_switchover + } + + # Do not check for a better candidate if we try to recover the master + # Recover of a master is detected during the pre-promote action. 
It sets the + # private attribute 'recover_master' to '1' if this is a master recover. + if ( _get_priv_attr( 'recover_master' ) eq '1' ) { + ocf_log( 'info', 'Recovering old master, no election needed'); + } + else { + + # The promotion is occurring on the best known candidate (highest + # master score), as chosen by pacemaker during the last working monitor + # on previous master (see pgsql_monitor/_check_locations subs). + # To avoid any race condition between the last monitor action on the + # previous master and the **real** most up-to-date standby, we + # set each standby location during the "pre-promote" action, and stored + # them using the "lsn_location" resource attribute. + # + # The best standby to promote would have the highest known LSN. If the + # current resource is not the best one, we need to modify the master + # scores accordingly, and abort the current promotion. + ocf_log( 'debug', + 'pgsql_promote: checking if current node is the best candidate for promotion' ); + + # Exclude nodes that are known to be unavailable (not in the current + # partition) using the "crm_node" command + my @active_nodes = split /\s+/ => _get_priv_attr( 'nodes' ); + my $node_to_promote = ''; + my $ans; + my $max_tl; + my $max_lsn; + my $node_tl; + my $node_lsn; + my $wal_num; + my $wal_off; + + # Get the "lsn_location" attribute value for the current node, as set + # during the "pre-promote" action. + # It should be the greatest among the secondary instances. + $ans = _get_priv_attr( 'lsn_location' ); + + if ( $ans eq '' ) { + # This should not happen as the "lsn_location" attribute should have + # been updated during the "pre-promote" action. + ocf_exit_reason( 'Can not get current node LSN location' ); + return $OCF_ERR_GENERIC; + } + + chomp $ans; + ( $max_tl, $max_lsn ) = split /#/, $ans; + + ocf_log( 'debug', 'pgsql_promote: current node TL#LSN location: %s#%s', + $max_tl, $max_lsn ); + + # Now we compare with the other available nodes. 
+ foreach my $node ( @active_nodes ) { + # We exclude the current node from the check. + next if $node eq $nodename; + + # Get the "lsn_location" attribute value for the node, as set during + # the "pre-promote" action. + # This is implemented as a loop as private attributes are asynchronously + # available from other nodes. + # see: https://github.com/ClusterLabs/PAF/issues/131 + # NOTE: if a node did not set its lsn_location for some reason, this will end + # with a timeout and the whole promotion will start again. + WAIT_FOR_LSN: { + $ans = _get_priv_attr( 'lsn_location', $node ); + if ( $ans eq '' ) { + ocf_log( 'info', 'pgsql_promote: waiting for LSN from %s', $node ); + select( undef, undef, undef, 0.1 ); + redo WAIT_FOR_LSN; + } + } + + chomp $ans; + ( $node_tl, $node_lsn ) = split /#/, $ans; + + ocf_log( 'debug', + 'pgsql_promote: comparing with "%s": TL#LSN is %s#%s', + $node, $node_tl, $node_lsn ); + + # If the node has a higher LSN, select it as a best candidate to + # promotion and keep looping to check the TL/LSN of other nodes. + if ( $node_tl > $max_tl + or ( $node_tl == $max_tl and $node_lsn > $max_lsn ) + ) { + ocf_log( 'debug', + 'pgsql_promote: "%s" is a better candidate to promote (%s#%s > %s#%s)', + $node, $node_tl, $node_lsn, $max_tl, $max_lsn ); + $node_to_promote = $node; + $max_tl = $node_tl; + $max_lsn = $node_lsn; + } + } + + # If any node has been selected, we adapt the master scores accordingly + # and break the current promotion. + if ( $node_to_promote ne '' ) { + ocf_exit_reason( + '%s is the best candidate to promote, aborting current promotion', + $node_to_promote ); + + # Reset current node master score. + _set_master_score( '1' ); + + # Set promotion candidate master score. + _set_master_score( '1000', $node_to_promote ); + + # We fail the promotion to trigger another promotion transition + # with the new scores. + return $OCF_ERR_GENERIC; + } + + # Else, we will keep on promoting the current node. 
+ } + + unless ( + # Promote the instance on the current node. + _runas( $PGCTL, '--pgdata', $pgdata, '-w', 'promote' ) == 0 ) + { + ocf_exit_reason( 'Error during promotion command' ); + return $OCF_ERR_GENERIC; + } + + # The instance promotion is asynchronous, so we need to wait for this + # process to complete. + while ( pgsql_monitor() != $OCF_RUNNING_MASTER ) { + ocf_log( 'info', 'Waiting for the promote to complete' ); + sleep 1; + } + + ocf_log( 'info', 'Promote complete' ); + + return $OCF_SUCCESS; +} + +# This action is called **before** the actual promotion when a failing master is +# considered unreclaimable, recoverable or a new master must be promoted +# (switchover or first start). +# As every "notify" action, it is executed almost simultaneously on all +# available nodes. +sub pgsql_notify_pre_promote { + my $rc; + my $node_tl; + my $node_lsn; + my %cdata; + my %active_nodes; + my $attr_nodes; + + ocf_log( 'info', 'Promoting instance on node "%s"', + $OCF_NOTIFY_ENV{'promote'}[0]{'uname'} ); + + # No need to do an election between slaves if this is recovery of the master + if ( _is_master_recover( $OCF_NOTIFY_ENV{'promote'}[0]{'uname'} ) ) { + ocf_log( 'warning', 'This is a master recovery!' ); + + _set_priv_attr( 'recover_master', '1' ) + if $OCF_NOTIFY_ENV{'promote'}[0]{'uname'} eq $nodename; + + return $OCF_SUCCESS; + } + + # Environment cleanup! 
+ _delete_priv_attr( 'lsn_location' ); + _delete_priv_attr( 'recover_master' ); + _delete_priv_attr( 'nodes' ); + _delete_priv_attr( 'cancel_switchover' ); + + # check for the last received entry of WAL from the master if we are + # the designated slave to promote + if ( _is_switchover( $nodename ) and scalar + grep { $_->{'uname'} eq $nodename } @{ $OCF_NOTIFY_ENV{'promote'} } + ) { + $rc = _check_switchover(); + + unless ( $rc == 0 ) { + # Shortcut the election process as the switchover will be + # canceled + _set_priv_attr( 'cancel_switchover', $rc ); + return $OCF_SUCCESS; # return code is ignored during notify + } + + # If the sub keeps going, that means the switchover is safe. + # Keep going with the election process in case the switchover was + # instruct to the wrong node. + # FIXME: should we allow a switchover to a lagging slave? + } + + # We need to trigger an election between existing slaves to promote the best + # one based on its current LSN location. Each node set a private attribute + # "lsn_location" with its TL and LSN location. + # + # During the following promote action, The designated standby for + # promotion use these attributes to check if the instance to be promoted + # is the best one, so we can avoid a race condition between the last + # successful monitor on the previous master and the current promotion. + + # As we can not break the transition from a notification action, we check + # during the promotion if each node TL and LSN are valid. + + # Force a checpoint to make sure the controldata shows the very last TL + _query( q{ CHECKPOINT }, {} ); + %cdata = _get_controldata(); + $node_lsn = _get_last_received_lsn( 'in decimal' ); + + unless ( defined $node_lsn ) { + ocf_log( 'warning', 'Unknown current node LSN' ); + # Return code are ignored during notifications... 
+ return $OCF_SUCCESS; + } + + $node_lsn = "$cdata{'tl'}#$node_lsn"; + + ocf_log( 'info', 'Current node TL#LSN: %s', $node_lsn ); + + # Set the "lsn_location" attribute value for this node so we can use it + # during the following "promote" action. + _set_priv_attr( 'lsn_location', $node_lsn ); + + ocf_log( 'warning', 'Could not set the current node LSN' ) + if $? != 0 ; + + # If this node is the future master, keep track of the slaves that + # received the same notification to compare our LSN with them during + # promotion + if ( $OCF_NOTIFY_ENV{'promote'}[0]{'uname'} eq $nodename ) { + # Build the list of active nodes: + # master + slave + start - stop + # FIXME: Deal with rsc started during the same transaction but **after** + # the promotion ? + $active_nodes{ $_->{'uname'} }++ foreach @{ $OCF_NOTIFY_ENV{'active'} }, + @{ $OCF_NOTIFY_ENV{'start'} }; + $active_nodes{ $_->{'uname'} }-- foreach @{ $OCF_NOTIFY_ENV{'stop'} }; + + $attr_nodes = join " " + => grep { $active_nodes{$_} > 0 } keys %active_nodes; + + _set_priv_attr( 'nodes', $attr_nodes ); + } + + return $OCF_SUCCESS; +} + +# This action is called after a promote action. +sub pgsql_notify_post_promote { + + # We have a new master (or the previous one recovered). + # Environment cleanup! + _delete_priv_attr( 'lsn_location' ); + _delete_priv_attr( 'recover_master' ); + _delete_priv_attr( 'nodes' ); + _delete_priv_attr( 'cancel_switchover' ); + + return $OCF_SUCCESS; +} + +# This is called before a demote occurs. +sub pgsql_notify_pre_demote { + my $rc; + my %cdata; + + # do nothing if the local node will not be demoted + return $OCF_SUCCESS unless scalar + grep { $_->{'uname'} eq $nodename } @{ $OCF_NOTIFY_ENV{'demote'} }; + + $rc = pgsql_monitor(); + + # do nothing if this is not a master recovery + return $OCF_SUCCESS unless _is_master_recover( $nodename ) + and $rc == $OCF_FAILED_MASTER; + + # in case of master crash, we need to detect if the CRM tries to recover + # the master clone. 
The usual transition is to do: + # demote->stop->start->promote + # + # There are multiple flaws with this transition: + # * the 1st and 2nd actions will fail because the instance is in + # OCF_FAILED_MASTER step + # * the usual start action is dangerous as the instance will start with + # a recovery.conf instead of entering a normal recovery process + # + # To avoid this, we try to start the instance in recovery from here. + # If it success, at least it will be demoted correctly with a normal + # status. If it fails, it will be catched up in next steps. + + ocf_log( 'info', 'Trying to start failing master "%s"...', + $OCF_RESOURCE_INSTANCE ); + + # Either the instance managed to start or it couldn't. + # We rely on the pg_ctk '-w' switch to take care of this. If it couldn't + # start, this error will be catched up later during the various checks + _pg_ctl_start(); + + %cdata = _get_controldata(); + + ocf_log( 'info', 'State is "%s" after recovery attempt', $cdata{'state'} ); + + return $OCF_SUCCESS; +} + +# This is called before a stop occurs. +sub pgsql_notify_pre_stop { + my $rc; + my %cdata; + + # do nothing if the local node will not be stopped + return $OCF_SUCCESS unless scalar + grep { $_->{'uname'} eq $nodename } @{ $OCF_NOTIFY_ENV{'stop'} }; + + $rc = _controldata_to_ocf(); + + # do nothing if this is not a slave recovery + return $OCF_SUCCESS unless _is_slave_recover( $nodename ) + and $rc == $OCF_RUNNING_SLAVE; + + # in case of slave crash, we need to detect if the CRM tries to recover + # the slaveclone. The usual transition is to do: stop->start + # + # This transition can no twork because the instance is in + # OCF_ERR_GENERIC step. So the stop action will fail, leading most + # probably to fencing action. + # + # To avoid this, we try to start the instance in recovery from here. + # If it success, at least it will be stopped correctly with a normal + # status. If it fails, it will be catched up in next steps. 
+ + ocf_log( 'info', 'Trying to start failing slave "%s"...', + $OCF_RESOURCE_INSTANCE ); + + # Either the instance managed to start or it couldn't. + # We rely on the pg_ctk '-w' switch to take care of this. If it couldn't + # start, this error will be catched up later during the various checks + _pg_ctl_start(); + + %cdata = _get_controldata(); + + ocf_log( 'info', 'State is "%s" after recovery attempt', $cdata{'state'} ); + + return $OCF_SUCCESS; +} + +# Notify type actions, called on all available nodes before (pre) and after +# (post) other actions, like promote, start, ... +# +sub pgsql_notify { + my $type_op; + + ocf_log( 'debug', "pgsql_notify: environment variables: %s", + Data::Dumper->new( [ \%OCF_NOTIFY_ENV ] )->Sortkeys(1)->Terse(1)->Dump ); + + return unless %OCF_NOTIFY_ENV; + + $type_op = "$OCF_NOTIFY_ENV{'type'}-$OCF_NOTIFY_ENV{'operation'}"; + + for ( $type_op ) { + if ( /^pre-promote$/ ) { return pgsql_notify_pre_promote() } + elsif ( /^post-promote$/ ) { return pgsql_notify_post_promote() } + elsif ( /^pre-demote$/ ) { return pgsql_notify_pre_demote() } + elsif ( /^pre-stop$/ ) { return pgsql_notify_pre_stop() } + } + + return $OCF_SUCCESS; +} + +# Action used to allow for online modification of resource parameters value. +# +sub pgsql_reload { + + # No action necessary, the action declaration is enough to inform pacemaker + # that the modification of any non-unique parameter can be applied without + # having to restart the resource. + ocf_log( 'info', 'Instance "%s" reloaded', $OCF_RESOURCE_INSTANCE ); + return $OCF_SUCCESS; + +} + +############################################################ +#### MAIN + +exit ocf_meta_data() if $__OCF_ACTION eq 'meta-data'; +exit ocf_methods() if $__OCF_ACTION eq 'methods'; + +# Avoid "could not change directory" when executing commands as "system-user". +chdir File::Spec->tmpdir(); + +# mandatory sanity checks +# check pgdata +if ( ! 
-d $pgdata ) { + ocf_exit_reason( 'PGDATA "%s" does not exist', $pgdata ); + exit $OCF_ERR_ARGS; +} + +# check datadir +if ( ! -d $datadir ) { + ocf_exit_reason( 'data_directory "%s" does not exist', $datadir ); + exit $OCF_ERR_ARGS; +} + +# Set PostgreSQL version +$PGVERNUM = _get_pg_version(); + +# Set current node name. +$nodename = ocf_local_nodename(); + +$exit_code = pgsql_validate_all(); + +exit $exit_code if $exit_code != $OCF_SUCCESS or $__OCF_ACTION eq 'validate-all'; + +# Run action +for ( $__OCF_ACTION ) { + if ( /^start$/ ) { $exit_code = pgsql_start() } + elsif ( /^stop$/ ) { $exit_code = pgsql_stop() } + elsif ( /^monitor$/ ) { $exit_code = pgsql_monitor() } + elsif ( /^promote$/ ) { $exit_code = pgsql_promote() } + elsif ( /^demote$/ ) { $exit_code = pgsql_demote() } + elsif ( /^notify$/ ) { $exit_code = pgsql_notify() } + elsif ( /^reload$/ ) { $exit_code = pgsql_reload() } + else { $exit_code = $OCF_ERR_UNIMPLEMENTED } +} + +exit $exit_code; + + +=head1 EXAMPLE CRM SHELL + +The following is an example configuration for a pgsqlms resource using the +crm(8) shell: + + primitive pgsqld pgsqlms \ + params pgdata="/var/lib/postgresql/9.6/main" \ + bindir="/usr/lib/postgresql/9.6/bin" \ + pghost="/var/run/postgresql" \ + recovery_template="/etc/postgresql/9.6/main/recovery.conf.pcmk" \ + start_opts="-c config_file=/etc/postgresql/9.6/main/postgresql.conf" \ + op start timeout=60s \ + op stop timeout=60s \ + op promote timeout=30s \ + op demote timeout=120s \ + op monitor interval=15s timeout=10s role="Master" \ + op monitor interval=16s timeout=10s role="Slave" \ + op notify timeout=60s + + ms pgsql-ha pgsqld meta notify=true + + +=head1 EXAMPLE PCS + +The following is an example configuration for a pgsqlms resource using pcs(8): + + pcs resource create pgsqld ocf:heartbeat:pgsqlms \ + bindir=/usr/pgsql-9.6/bin pgdata=/var/lib/pgsql/9.6/data \ + op start timeout=60s \ + op stop timeout=60s \ + op promote timeout=30s \ + op demote timeout=120s \ + op 
monitor interval=15s timeout=10s role="Master" \ + op monitor interval=16s timeout=10s role="Slave" \ + op notify timeout=60s --master notify=true + +=head1 SEE ALSO + +http://clusterlabs.org/ + +=head1 AUTHOR + +Jehan-Guillaume de Rorthais and Mael Rimbault. + +=cut diff --color -uNr a/paf_LICENSE b/paf_LICENSE --- a/paf_LICENSE 1970-01-01 01:00:00.000000000 +0100 +++ b/paf_LICENSE 2021-04-14 09:16:39.083555835 +0200 @@ -0,0 +1,19 @@ +Copyright (c) 2016-2020, Jehan-Guillaume de Rorthais, Mael Rimbault. + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose, without fee, and without a written agreement +is hereby granted, provided that the above copyright notice and this +paragraph and the following two paragraphs appear in all copies. + +IN NO EVENT SHALL THE AUTHOR OR DISTRIBUTORS BE LIABLE TO ANY PARTY FOR +DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING +LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS +DOCUMENTATION, EVEN IF THE AUTHOR OR DISTRIBUTORS HAVE BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +THE AUTHOR AND DISTRIBUTORS SPECIFICALLY DISCLAIMS ANY WARRANTIES, +INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY +AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS +ON AN "AS IS" BASIS, AND THE AUTHOR AND DISTRIBUTORS HAS NO OBLIGATIONS TO +PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. + diff --color -uNr a/paf_README.md b/paf_README.md --- a/paf_README.md 1970-01-01 01:00:00.000000000 +0100 +++ b/paf_README.md 2021-04-14 09:18:57.450968048 +0200 @@ -0,0 +1,86 @@ +# PostgreSQL Automatic Failover + +High-Availability for Postgres, based on industry references Pacemaker and +Corosync. + +## Description + +Pacemaker is nowadays the industry reference for High Availability.
In the same +fashion as for Systemd, all Linux distributions moved (or are moving) to this +unique Pacemaker+Corosync stack, removing all other existing high availability +stacks (CMAN, RGManager, OpenAIS, ...). It is able to detect failure on various +services and automatically decide to fail over the failing resource to another +node when possible. + +To be able to manage a specific service resource, Pacemaker interacts with it +through a so-called "Resource Agent". Resource agents must comply with the OCF +specification which defines what they must implement (start, stop, promote, +etc), how they should behave and inform Pacemaker of their results. + +PostgreSQL Automatic Failover is a new OCF resource Agent dedicated to +PostgreSQL. Its original wish is to keep a clear limit between the Pacemaker +administration and the PostgreSQL one, to keep things simple, documented and +yet powerful. + +Once your PostgreSQL cluster is built using internal streaming replication, PAF is +able to expose to Pacemaker what is the current status of the PostgreSQL +instance on each node: master, slave, stopped, catching up, etc. Should a +failure occur on the master, Pacemaker will try to recover it by default. +Should the failure be non-recoverable, PAF allows the slaves to be able to +elect the best of them (the closest one to the old master) and promote it as +the new master. All of this thanks to the robust, feature-full and most +importantly experienced project: Pacemaker. + +For information about how to install this agent, see `INSTALL.md`. + +## Setup and requirements + +PAF supports PostgreSQL 9.3 and higher. It has been extensively tested under +CentOS 6 and 7 in various scenarios. + +PAF has been written to give the administrator the maximum control +over their PostgreSQL configuration and architecture. Thus, you are 100% +responsible for the master/slave creations and their setup. The agent +will NOT edit your setup.
It only requires you to follow these pre-requisites: + + * slave __must__ be in hot_standby (accept read-only connections); + * the following parameters __must__ be configured in the appropriate place: + * `standby_mode = on` (for PostgreSQL 11 and before) + * `recovery_target_timeline = 'latest'` + * `primary_conninfo` with `application_name` set to the node name as seen + in Pacemaker. + * these last parameters have been merged inside the instance configuration + file with PostgreSQL 12. For PostgreSQL 11 and before, you __must__ + provide a `recovery.conf` template file. + +When setting up the resource in Pacemaker, here are the available parameters you +can set: + + * `bindir`: location of the PostgreSQL binaries (default: `/usr/bin`) + * `pgdata`: location of the PGDATA of your instance (default: + `/var/lib/pgsql/data`) + * `datadir`: path to the directory set in `data_directory` from your + postgresql.conf file. This parameter has the same default as PostgreSQL + itself: the `pgdata` parameter value. Unless you have a special PostgreSQL + setup and you understand this parameter, __ignore it__ + * `pghost`: the socket directory or IP address to use to connect to the + local instance (default: `/tmp` or `/var/run/postgresql` for DEBIAN) + * `pgport`: the port to connect to the local instance (default: `5432`) + * `recovery_template`: __only__ for PostgreSQL 11 and before. The local + template that will be copied as the `PGDATA/recovery.conf` file. This + file must not exist on any node for PostgreSQL 12 and after. + (default: `$PGDATA/recovery.conf.pcmk`) + * `start_opts`: Additional arguments given to the postgres process on startup. + See "postgres --help" for available options.
Useful when the postgresql.conf + file is not in the data directory (PGDATA), e.g.: + `-c config_file=/etc/postgresql/9.3/main/postgresql.conf` + * `system_user`: the system owner of your instance's process (default: + `postgres`) + * `maxlag`: maximum lag allowed on a standby before we set a negative master + score on it. The calculation is based on the difference between the current + xlog location on the master and the write location on the standby. + (default: 0, which disables this feature) + +For a demonstration of how to set up a cluster, see +[http://clusterlabs.github.io/PAF/documentation.html](http://clusterlabs.github.io/PAF/documentation.html). +