103 lines
3.8 KiB
Diff
103 lines
3.8 KiB
Diff
|
From 3ae6d8f0a34d099945d9bf005ed45dbfe9452202 Mon Sep 17 00:00:00 2001
|
||
|
From: kj1724 <78624900+kj1724@users.noreply.github.com>
|
||
|
Date: Wed, 28 Apr 2021 10:22:38 -0400
|
||
|
Subject: [PATCH] gcp-vpc-move-vip.in: Adds retries
|
||
|
|
||
|
If the cluster fails a monitoring event, it will try to restart the resource. If the resource agent makes an API/metadata call that fails at that time, the resource will be considered "failed", but in certain cases also "unconfigured", which prevents further operations.
|
||
|
|
||
|
These changes can help the agent recover from certain intermittent failures.
|
||
|
---
|
||
|
heartbeat/gcp-vpc-move-vip.in | 62 ++++++++++++++++++++---------------
|
||
|
1 file changed, 35 insertions(+), 27 deletions(-)
|
||
|
|
||
|
diff --git a/heartbeat/gcp-vpc-move-vip.in b/heartbeat/gcp-vpc-move-vip.in
|
||
|
index bbbd87b7a9..c411555110 100755
|
||
|
--- a/heartbeat/gcp-vpc-move-vip.in
|
||
|
+++ b/heartbeat/gcp-vpc-move-vip.in
|
||
|
@@ -50,6 +50,8 @@ REMOVE = 1
|
||
|
CONN = None
|
||
|
THIS_VM = None
|
||
|
ALIAS = None
|
||
|
+MAX_RETRIES = 3
|
||
|
+RETRY_BACKOFF_SECS = 1
|
||
|
METADATA_SERVER = 'http://metadata.google.internal/computeMetadata/v1/'
|
||
|
METADATA_HEADERS = {'Metadata-Flavor': 'Google'}
|
||
|
METADATA = \
|
||
|
@@ -111,18 +113,37 @@ def get_metadata(metadata_key, params=None, timeout=None):
|
||
|
|
||
|
Returns:
|
||
|
HTTP response from the GET request.
|
||
|
-
|
||
|
- Raises:
|
||
|
- urlerror.HTTPError: raises when the GET request fails.
|
||
|
"""
|
||
|
- timeout = timeout or 60
|
||
|
- metadata_url = os.path.join(METADATA_SERVER, metadata_key)
|
||
|
- params = urlparse.urlencode(params or {})
|
||
|
- url = '%s?%s' % (metadata_url, params)
|
||
|
- request = urlrequest.Request(url, headers=METADATA_HEADERS)
|
||
|
- request_opener = urlrequest.build_opener(urlrequest.ProxyHandler({}))
|
||
|
- return request_opener.open(
|
||
|
- request, timeout=timeout * 1.1).read().decode("utf-8")
|
||
|
+ for i in range(MAX_RETRIES):
|
||
|
+ try:
|
||
|
+ timeout = timeout or 60
|
||
|
+ metadata_url = os.path.join(METADATA_SERVER, metadata_key)
|
||
|
+ params = urlparse.urlencode(params or {})
|
||
|
+ url = '%s?%s' % (metadata_url, params)
|
||
|
+ request = urlrequest.Request(url, headers=METADATA_HEADERS)
|
||
|
+ request_opener = urlrequest.build_opener(urlrequest.ProxyHandler({}))
|
||
|
+ return request_opener.open(
|
||
|
+ request, timeout=timeout * 1.1).read().decode("utf-8")
|
||
|
+ except Exception as e:
|
||
|
+ logger.error('Couldn\'t get instance name, is this running inside GCE?: '
|
||
|
+ + str(e))
|
||
|
+ time.sleep(RETRY_BACKOFF_SECS * (i + 1))
|
||
|
+
|
||
|
+ # If the retries are exhausted we exit with a generic error.
|
||
|
+ sys.exit(OCF_ERR_GENERIC)
|
||
|
+
|
||
|
+
|
||
|
+def create_api_connection():
|
||
|
+ for i in range(MAX_RETRIES):
|
||
|
+ try:
|
||
|
+ return googleapiclient.discovery.build('compute', 'v1',
|
||
|
+ cache_discovery=False)
|
||
|
+ except Exception as e:
|
||
|
+ logger.error('Couldn\'t connect with google api: ' + str(e))
|
||
|
+ time.sleep(RETRY_BACKOFF_SECS * (i + 1))
|
||
|
+
|
||
|
+ # If the retries are exhausted we exit with a generic error.
|
||
|
+ sys.exit(OCF_ERR_GENERIC)
|
||
|
|
||
|
|
||
|
def get_instance(project, zone, instance):
|
||
|
@@ -358,24 +379,11 @@ def gcp_alias_status(alias):
|
||
|
|
||
|
def validate():
|
||
|
global ALIAS
|
||
|
- global CONN
|
||
|
global THIS_VM
|
||
|
+ global CONN
|
||
|
|
||
|
- # Populate global vars
|
||
|
- try:
|
||
|
- CONN = googleapiclient.discovery.build('compute', 'v1',
|
||
|
- cache_discovery=False)
|
||
|
- except Exception as e:
|
||
|
- logger.error('Couldn\'t connect with google api: ' + str(e))
|
||
|
- sys.exit(OCF_ERR_CONFIGURED)
|
||
|
-
|
||
|
- try:
|
||
|
- THIS_VM = get_metadata('instance/name')
|
||
|
- except Exception as e:
|
||
|
- logger.error('Couldn\'t get instance name, is this running inside GCE?: '
|
||
|
- + str(e))
|
||
|
- sys.exit(OCF_ERR_CONFIGURED)
|
||
|
-
|
||
|
+ CONN = create_api_connection()
|
||
|
+ THIS_VM = get_metadata('instance/name')
|
||
|
ALIAS = os.environ.get('OCF_RESKEY_alias_ip')
|
||
|
if not ALIAS:
|
||
|
logger.error('Missing alias_ip parameter')
|