From 1742baf17954c58a84b9c668a617bac78303ce95 Mon Sep 17 00:00:00 2001 From: Guilherme Felix Date: Tue, 17 Mar 2020 13:18:38 +0000 Subject: [PATCH 1/9] fence_aws: Fix fence race condition by checking local instance status --- agents/aws/fence_aws.py | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/agents/aws/fence_aws.py b/agents/aws/fence_aws.py index 4a4d9de2..f37f68d6 100644 --- a/agents/aws/fence_aws.py +++ b/agents/aws/fence_aws.py @@ -3,6 +3,7 @@ import sys, re import logging import atexit +import requests sys.path.append("@FENCEAGENTSLIBDIR@") from fencing import * from fencing import fail, fail_usage, run_delay, EC_STATUS @@ -10,6 +11,17 @@ import boto3 from botocore.exceptions import ClientError, EndpointConnectionError, NoRegionError +def get_instance_id(): + try: + r = requests.get('http://169.254.169.254/latest/meta-data/instance-id') + return r.content + except HTTPError as http_err: + logging.error('HTTP error occurred while trying to access EC2 metadata server: %s', http_err) + except Exception as err: + logging.error('A fatal error occurred while trying to access EC2 metadata server: %s', err) + return None + + def get_nodes_list(conn, options): result = {} try: @@ -45,10 +57,33 @@ def get_power_status(conn, options): logging.error("Failed to get power status: %s", e) fail(EC_STATUS) +def get_self_power_status(conn, options): + try: + instance = conn.instances.filter(Filters=[{"Name": "instance-id", "Values": [instance_id]}]) + state = list(instance)[0].state["Name"] + if state == "running": + logging.debug("Captured my (%s) state and it %s - returning OK - Proceeding with fencing",instance_id,state.upper()) + return "ok" + else: + logging.debug("Captured my (%s) state it is %s - returning Alert - Unable to fence other nodes",instance_id,state.upper()) + return "alert" + + except ClientError: + fail_usage("Failed: Incorrect Access Key or Secret Key.") + except EndpointConnectionError: + fail_usage("Failed: Incorrect Region.") + except IndexError: + return "fail" + def set_power_status(conn, options): + my_instance = get_instance_id() try: if (options["--action"]=="off"): - conn.instances.filter(InstanceIds=[options["--plug"]]).stop(Force=True) + if (get_self_power_status(conn,myinstance) == "ok"): + logging.info("Called StopInstance API call for %s", options["--plug"]) + conn.instances.filter(InstanceIds=[options["--plug"]]).stop(Force=True) + else: + logging.info("Skipping fencing as instance is not in running status") elif (options["--action"]=="on"): conn.instances.filter(InstanceIds=[options["--plug"]]).start() except Exception as e: From 45e429b3132ebc9e78121c3fbb15f0bf46845a59 Mon Sep 17 00:00:00 2001 From: Guilherme Felix Date: Tue, 17 Mar 2020 13:28:34 +0000 Subject: [PATCH 2/9] fence_aws: Use local logger and improve logging experience --- agents/aws/fence_aws.py | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/agents/aws/fence_aws.py b/agents/aws/fence_aws.py index f37f68d6..b0b6685a 100644 --- a/agents/aws/fence_aws.py +++ b/agents/aws/fence_aws.py @@ -6,7 +6,7 @@ import requests sys.path.append("@FENCEAGENTSLIBDIR@") from fencing import * -from fencing import fail, fail_usage, run_delay, EC_STATUS +from fencing import fail, fail_usage, run_delay, EC_STATUS, SyslogLibHandler import boto3 from botocore.exceptions import ClientError, EndpointConnectionError, NoRegionError @@ -16,13 +16,14 @@ def get_instance_id(): r = requests.get('http://169.254.169.254/latest/meta-data/instance-id') return r.content except HTTPError as http_err: - logging.error('HTTP error occurred while trying to access EC2 metadata server: %s', http_err) + logger.error('HTTP error occurred while trying to access EC2 metadata server: %s', http_err) except Exception as err: - logging.error('A fatal error occurred while trying to access EC2 metadata server: %s', err) + logger.error('A fatal error occurred while trying to access EC2 metadata server: %s', err) return None def get_nodes_list(conn, options): + logger.info("Starting monitor operation") result = {} try: for instance in conn.instances.all(): @@ -32,14 +33,16 @@ def get_nodes_list(conn, options): except EndpointConnectionError: fail_usage("Failed: Incorrect Region.") except Exception as e: - logging.error("Failed to get node list: %s", e) - + logger.error("Failed to get node list: %s", e) + logger.debug("Monitor operation OK: %s",result) return result def get_power_status(conn, options): + logger.debug("Starting status operation") try: instance = conn.instances.filter(Filters=[{"Name": "instance-id", "Values": [options["--plug"]]}]) state = list(instance)[0].state["Name"] + logger.info("Status operation for EC2 instance %s returned state: %s",options["--plug"],state.upper()) if state == "running": return "on" elif state == "stopped": @@ -80,14 +83,14 @@ def set_power_status(conn, options): try: if (options["--action"]=="off"): if (get_self_power_status(conn,myinstance) == "ok"): - logging.info("Called StopInstance API call for %s", options["--plug"]) conn.instances.filter(InstanceIds=[options["--plug"]]).stop(Force=True) + logger.info("Called StopInstance API call for %s", options["--plug"]) else: - logging.info("Skipping fencing as instance is not in running status") + logger.info("Skipping fencing as instance is not in running status") elif (options["--action"]=="on"): conn.instances.filter(InstanceIds=[options["--plug"]]).start() except Exception as e: - logging.error("Failed to power %s %s: %s", \ + logger.error("Failed to power %s %s: %s", \ options["--action"], options["--plug"], e) def define_new_opts(): @@ -142,6 +145,13 @@ def main(): run_delay(options) + if options.get("--verbose") is not None: + lh = logging.FileHandler('/var/log/fence_aws_debug.log') + logger.addHandler(lh) + lhf = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + lh.setFormatter(lhf) + logger.setLevel(logging.DEBUG) + region = options.get("--region") access_key = options.get("--access-key") secret_key = options.get("--secret-key") @@ -157,4 +167,12 @@ def main(): sys.exit(result) if __name__ == "__main__": + + logger = logging.getLogger("fence_aws") + logger.propagate = False + logger.setLevel(logging.INFO) + logger.addHandler(SyslogLibHandler()) + logger.getLogger('botocore.vendored').propagate = False + + main() From 00569921597b8007c67296ab8332747baf1e6fae Mon Sep 17 00:00:00 2001 From: Guilherme Felix Date: Tue, 17 Mar 2020 13:33:02 +0000 Subject: [PATCH 3/9] fence_aws: Decouple boto3 and botocore debug logging from local logging --- agents/aws/fence_aws.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/agents/aws/fence_aws.py b/agents/aws/fence_aws.py index b0b6685a..11714315 100644 --- a/agents/aws/fence_aws.py +++ b/agents/aws/fence_aws.py @@ -118,18 +118,27 @@ def define_new_opts(): "required" : "0", "order" : 4 } + all_opt["boto3_debug"] = { + "getopt" : "b:", + "longopt" : "boto3_debug", + "help" : "-b, --boto3_debug=on|off Boto3 and Botocore library debug logging", + "shortdesc": "Boto Lib debug", + "required": "0", + "order": 5 + } # Main agent method def main(): conn = None - device_opt = ["port", "no_password", "region", "access_key", "secret_key"] + device_opt = ["port", "no_password", "region", "access_key", "secret_key", "boto3_debug"] atexit.register(atexit_handler) define_new_opts() all_opt["power_timeout"]["default"] = "60" + all_opt["boto3_debug"]["default"] = "off" options = check_input(device_opt, process_input(device_opt)) @@ -151,6 +160,21 @@ def main(): lhf = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') lh.setFormatter(lhf) logger.setLevel(logging.DEBUG) + + if options["--boto3_debug"] != "on": + boto3.set_stream_logger('boto3',logging.INFO) + boto3.set_stream_logger('botocore',logging.INFO) + logging.getLogger('botocore').propagate = False + logging.getLogger('boto3').propagate = False + else: + log_format = logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s') + logging.getLogger('botocore').propagate = False + logging.getLogger('boto3').propagate = False + fdh = logging.FileHandler('/var/log/fence_aws_boto3.log') + fdh.setFormatter(log_format) + logging.getLogger('boto3').addHandler(fdh) + logging.getLogger('botocore').addHandler(fdh) + logging.debug("Boto debug level is %s and sending debug info to /var/log/fence_aws_boto3.log", options["--boto3_debug"]) region = options.get("--region") access_key = options.get("--access-key") From ed309bd51dfd5e0fed30156e7a312d5b5a8f4bd4 Mon Sep 17 00:00:00 2001 From: Guilherme Felix Date: Thu, 19 Mar 2020 16:02:47 +0000 Subject: [PATCH 4/9] fence_aws: Fix typos and variable names --- agents/aws/fence_aws.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/agents/aws/fence_aws.py b/agents/aws/fence_aws.py index 11714315..207631e8 100644 --- a/agents/aws/fence_aws.py +++ b/agents/aws/fence_aws.py @@ -60,7 +60,7 @@ def get_power_status(conn, options): logging.error("Failed to get power status: %s", e) fail(EC_STATUS) -def get_self_power_status(conn, options): +def get_self_power_status(conn, instance_id): try: instance = conn.instances.filter(Filters=[{"Name": "instance-id", "Values": [instance_id]}]) state = list(instance)[0].state["Name"] @@ -82,7 +82,7 @@ def set_power_status(conn, options): my_instance = get_instance_id() try: if (options["--action"]=="off"): - if (get_self_power_status(conn,myinstance) == "ok"): + if (get_self_power_status(conn,my_instance) == "ok"): conn.instances.filter(InstanceIds=[options["--plug"]]).stop(Force=True) logger.info("Called StopInstance API call for %s", options["--plug"]) else: @@ -196,7 +196,7 @@ def main(): logger.propagate = False logger.setLevel(logging.INFO) logger.addHandler(SyslogLibHandler()) - logger.getLogger('botocore.vendored').propagate = False + logging.getLogger('botocore.vendored').propagate = False main() From 624c652a95a676286af408898186186b7d7fcf55 Mon Sep 17 00:00:00 2001 From: Guilherme Felix Date: Thu, 19 Mar 2020 16:58:45 +0000 Subject: [PATCH 5/9] fence_aws: Missing brackets on boto3_debug metadata --- agents/aws/fence_aws.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agents/aws/fence_aws.py b/agents/aws/fence_aws.py index 207631e8..8916f4a0 100644 --- a/agents/aws/fence_aws.py +++ b/agents/aws/fence_aws.py @@ -121,7 +121,7 @@ def define_new_opts(): all_opt["boto3_debug"] = { "getopt" : "b:", "longopt" : "boto3_debug", - "help" : "-b, --boto3_debug=on|off Boto3 and Botocore library debug logging", + "help" : "-b, --boto3_debug=[on|off] Boto3 and Botocore library debug logging", "shortdesc": "Boto Lib debug", "required": "0", "order": 5 From 7c641a6885c4ab67b7739a43892d92d95a6f566c Mon Sep 17 00:00:00 2001 From: Guilherme Felix Date: Thu, 19 Mar 2020 17:04:31 +0000 Subject: [PATCH 6/9] fence_aws: Fix travis build #1 --- agents/aws/fence_aws.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agents/aws/fence_aws.py b/agents/aws/fence_aws.py index 8916f4a0..f41a47e4 100644 --- a/agents/aws/fence_aws.py +++ b/agents/aws/fence_aws.py @@ -121,7 +121,7 @@ def define_new_opts(): all_opt["boto3_debug"] = { "getopt" : "b:", "longopt" : "boto3_debug", - "help" : "-b, --boto3_debug=[on|off] Boto3 and Botocore library debug logging", + "help" : "-b, --boto3_debug=[option] Boto3 and Botocore library debug logging", "shortdesc": "Boto Lib debug", "required": "0", "order": 5 From 257af7ccc9789646adc7abf1e7dbac744b756071 Mon Sep 17 00:00:00 2001 From: Guilherme Felix Date: Fri, 20 Mar 2020 10:59:56 +0000 Subject: [PATCH 7/9] fence_aws: Updated metadata XML file --- tests/data/metadata/fence_aws.xml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/data/metadata/fence_aws.xml b/tests/data/metadata/fence_aws.xml index 5e5d5d99..acfebb61 100644 --- a/tests/data/metadata/fence_aws.xml +++ b/tests/data/metadata/fence_aws.xml @@ -36,6 +36,11 @@ For instructions see: https://boto3.readthedocs.io/en/latest/guide/quickstart.ht Secret Key. + + + + Boto Lib debug + From 8f78bc19356b5e07d0021aaf7da3fc4e712e00f0 Mon Sep 17 00:00:00 2001 From: Guilherme Felix Date: Fri, 20 Mar 2020 12:13:16 +0000 Subject: [PATCH 8/9] fence_aws: Moving logger config next to import statements for visibility --- agents/aws/fence_aws.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/agents/aws/fence_aws.py b/agents/aws/fence_aws.py index f41a47e4..72fb8843 100644 --- a/agents/aws/fence_aws.py +++ b/agents/aws/fence_aws.py @@ -11,6 +11,12 @@ import boto3 from botocore.exceptions import ClientError, EndpointConnectionError, NoRegionError +logger = logging.getLogger("fence_aws") +logger.propagate = False +logger.setLevel(logging.INFO) +logger.addHandler(SyslogLibHandler()) +logging.getLogger('botocore.vendored').propagate = False + def get_instance_id(): try: r = requests.get('http://169.254.169.254/latest/meta-data/instance-id') @@ -192,11 +198,4 @@ def main(): if __name__ == "__main__": - logger = logging.getLogger("fence_aws") - logger.propagate = False - logger.setLevel(logging.INFO) - logger.addHandler(SyslogLibHandler()) - logging.getLogger('botocore.vendored').propagate = False - - main() From 570a05c425fe55008c8892ebaad8a73d36143909 Mon Sep 17 00:00:00 2001 From: Guilherme Felix Date: Fri, 20 Mar 2020 14:17:55 +0000 Subject: [PATCH 9/9] fence_aws: Remove empty line --- agents/aws/fence_aws.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/agents/aws/fence_aws.py b/agents/aws/fence_aws.py index 72fb8843..ed55f390 100644 --- a/agents/aws/fence_aws.py +++ b/agents/aws/fence_aws.py @@ -197,5 +197,4 @@ def main(): sys.exit(result) if __name__ == "__main__": - - main() + main() \ No newline at end of file