316 lines
14 KiB
Diff
316 lines
14 KiB
Diff
|
From ca5b83cee7b45bf56eec258db739cb5fe51b3231 Mon Sep 17 00:00:00 2001
|
||
|
From: aswinrajamannar <39812128+aswinrajamannar@users.noreply.github.com>
|
||
|
Date: Mon, 26 Apr 2021 07:28:39 -0700
|
||
|
Subject: [PATCH 6/7] Azure: Retry net metadata during nic attach for
|
||
|
non-timeout errs (#878)
|
||
|
|
||
|
RH-Author: Eduardo Otubo <otubo@redhat.com>
|
||
|
RH-MergeRequest: 45: Add support for userdata on Azure from IMDS
|
||
|
RH-Commit: [6/7] 4e6e44f017d5ffcb72ac8959a94f80c71fef9560
|
||
|
RH-Bugzilla: 2023940
|
||
|
RH-Acked-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
|
||
|
RH-Acked-by: Mohamed Gamal Morsy <mmorsy@redhat.com>
|
||
|
|
||
|
When network interfaces are hot-attached to the VM, attempting to get
|
||
|
network metadata might return 410 (or 500, 503 etc) because the info
|
||
|
is not yet available. In those cases, we retry getting the metadata
|
||
|
before giving up. The only case where we can move on to wait for more
|
||
|
nic attach events is if the call times out despite retries, which
|
||
|
means the interface is not likely a primary interface, and we should
|
||
|
try for more nic attach events.
|
||
|
---
|
||
|
cloudinit/sources/DataSourceAzure.py | 65 +++++++++++--
|
||
|
tests/unittests/test_datasource/test_azure.py | 95 ++++++++++++++++---
|
||
|
2 files changed, 140 insertions(+), 20 deletions(-)
|
||
|
|
||
|
diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py
|
||
|
index 6d7954ee..d0be6d84 100755
|
||
|
--- a/cloudinit/sources/DataSourceAzure.py
|
||
|
+++ b/cloudinit/sources/DataSourceAzure.py
|
||
|
@@ -17,6 +17,7 @@ from time import sleep
|
||
|
from xml.dom import minidom
|
||
|
import xml.etree.ElementTree as ET
|
||
|
from enum import Enum
|
||
|
+import requests
|
||
|
|
||
|
from cloudinit import dmi
|
||
|
from cloudinit import log as logging
|
||
|
@@ -665,7 +666,9 @@ class DataSourceAzure(sources.DataSource):
|
||
|
self,
|
||
|
fallback_nic,
|
||
|
retries,
|
||
|
- md_type=metadata_type.compute):
|
||
|
+ md_type=metadata_type.compute,
|
||
|
+ exc_cb=retry_on_url_exc,
|
||
|
+ infinite=False):
|
||
|
"""
|
||
|
Wrapper for get_metadata_from_imds so that we can have flexibility
|
||
|
in which IMDS api-version we use. If a particular instance of IMDS
|
||
|
@@ -685,7 +688,8 @@ class DataSourceAzure(sources.DataSource):
|
||
|
fallback_nic=fallback_nic,
|
||
|
retries=0,
|
||
|
md_type=md_type,
|
||
|
- api_version=IMDS_VER_WANT
|
||
|
+ api_version=IMDS_VER_WANT,
|
||
|
+ exc_cb=exc_cb
|
||
|
)
|
||
|
except UrlError as err:
|
||
|
LOG.info(
|
||
|
@@ -708,7 +712,9 @@ class DataSourceAzure(sources.DataSource):
|
||
|
fallback_nic=fallback_nic,
|
||
|
retries=retries,
|
||
|
md_type=md_type,
|
||
|
- api_version=IMDS_VER_MIN
|
||
|
+ api_version=IMDS_VER_MIN,
|
||
|
+ exc_cb=exc_cb,
|
||
|
+ infinite=infinite
|
||
|
)
|
||
|
|
||
|
def device_name_to_device(self, name):
|
||
|
@@ -938,6 +944,9 @@ class DataSourceAzure(sources.DataSource):
|
||
|
is_primary = False
|
||
|
expected_nic_count = -1
|
||
|
imds_md = None
|
||
|
+ metadata_poll_count = 0
|
||
|
+ metadata_logging_threshold = 1
|
||
|
+ metadata_timeout_count = 0
|
||
|
|
||
|
# For now, only a VM's primary NIC can contact IMDS and WireServer. If
|
||
|
# DHCP fails for a NIC, we have no mechanism to determine if the NIC is
|
||
|
@@ -962,14 +971,48 @@ class DataSourceAzure(sources.DataSource):
|
||
|
% (ifname, e), logger_func=LOG.error)
|
||
|
raise
|
||
|
|
||
|
+ # Retry polling network metadata for a limited duration only when the
|
||
|
+ # calls fail due to timeout. This is because the platform drops packets
|
||
|
+ # going towards IMDS when it is not a primary nic. If the calls fail
|
||
|
+ # due to other issues like 410, 503 etc, then it means we are primary
|
||
|
+ # but IMDS service is unavailable at the moment. Retry indefinitely in
|
||
|
+ # those cases since we cannot move on without the network metadata.
|
||
|
+ def network_metadata_exc_cb(msg, exc):
|
||
|
+ nonlocal metadata_timeout_count, metadata_poll_count
|
||
|
+ nonlocal metadata_logging_threshold
|
||
|
+
|
||
|
+ metadata_poll_count = metadata_poll_count + 1
|
||
|
+
|
||
|
+ # Log when needed but back off exponentially to avoid exploding
|
||
|
+ # the log file.
|
||
|
+ if metadata_poll_count >= metadata_logging_threshold:
|
||
|
+ metadata_logging_threshold *= 2
|
||
|
+ report_diagnostic_event(
|
||
|
+ "Ran into exception when attempting to reach %s "
|
||
|
+ "after %d polls." % (msg, metadata_poll_count),
|
||
|
+ logger_func=LOG.error)
|
||
|
+
|
||
|
+ if isinstance(exc, UrlError):
|
||
|
+ report_diagnostic_event("poll IMDS with %s failed. "
|
||
|
+ "Exception: %s and code: %s" %
|
||
|
+ (msg, exc.cause, exc.code),
|
||
|
+ logger_func=LOG.error)
|
||
|
+
|
||
|
+ if exc.cause and isinstance(exc.cause, requests.Timeout):
|
||
|
+ metadata_timeout_count = metadata_timeout_count + 1
|
||
|
+ return (metadata_timeout_count <= 10)
|
||
|
+ return True
|
||
|
+
|
||
|
# Primary nic detection will be optimized in the future. The fact that
|
||
|
# primary nic is being attached first helps here. Otherwise each nic
|
||
|
# could add several seconds of delay.
|
||
|
try:
|
||
|
imds_md = self.get_imds_data_with_api_fallback(
|
||
|
ifname,
|
||
|
- 5,
|
||
|
- metadata_type.network
|
||
|
+ 0,
|
||
|
+ metadata_type.network,
|
||
|
+ network_metadata_exc_cb,
|
||
|
+ True
|
||
|
)
|
||
|
except Exception as e:
|
||
|
LOG.warning(
|
||
|
@@ -2139,7 +2182,9 @@ def _generate_network_config_from_fallback_config() -> dict:
|
||
|
def get_metadata_from_imds(fallback_nic,
|
||
|
retries,
|
||
|
md_type=metadata_type.compute,
|
||
|
- api_version=IMDS_VER_MIN):
|
||
|
+ api_version=IMDS_VER_MIN,
|
||
|
+ exc_cb=retry_on_url_exc,
|
||
|
+ infinite=False):
|
||
|
"""Query Azure's instance metadata service, returning a dictionary.
|
||
|
|
||
|
If network is not up, setup ephemeral dhcp on fallback_nic to talk to the
|
||
|
@@ -2158,7 +2203,7 @@ def get_metadata_from_imds(fallback_nic,
|
||
|
kwargs = {'logfunc': LOG.debug,
|
||
|
'msg': 'Crawl of Azure Instance Metadata Service (IMDS)',
|
||
|
'func': _get_metadata_from_imds,
|
||
|
- 'args': (retries, md_type, api_version,)}
|
||
|
+ 'args': (retries, exc_cb, md_type, api_version, infinite)}
|
||
|
if net.is_up(fallback_nic):
|
||
|
return util.log_time(**kwargs)
|
||
|
else:
|
||
|
@@ -2176,14 +2221,16 @@ def get_metadata_from_imds(fallback_nic,
|
||
|
@azure_ds_telemetry_reporter
|
||
|
def _get_metadata_from_imds(
|
||
|
retries,
|
||
|
+ exc_cb,
|
||
|
md_type=metadata_type.compute,
|
||
|
- api_version=IMDS_VER_MIN):
|
||
|
+ api_version=IMDS_VER_MIN,
|
||
|
+ infinite=False):
|
||
|
url = "{}?api-version={}".format(md_type.value, api_version)
|
||
|
headers = {"Metadata": "true"}
|
||
|
try:
|
||
|
response = readurl(
|
||
|
url, timeout=IMDS_TIMEOUT_IN_SECONDS, headers=headers,
|
||
|
- retries=retries, exception_cb=retry_on_url_exc)
|
||
|
+ retries=retries, exception_cb=exc_cb, infinite=infinite)
|
||
|
except Exception as e:
|
||
|
# pylint:disable=no-member
|
||
|
if isinstance(e, UrlError) and e.code == 400:
|
||
|
diff --git a/tests/unittests/test_datasource/test_azure.py b/tests/unittests/test_datasource/test_azure.py
|
||
|
index d9817d84..c4a8e08d 100644
|
||
|
--- a/tests/unittests/test_datasource/test_azure.py
|
||
|
+++ b/tests/unittests/test_datasource/test_azure.py
|
||
|
@@ -448,7 +448,7 @@ class TestGetMetadataFromIMDS(HttprettyTestCase):
|
||
|
"http://169.254.169.254/metadata/instance?api-version="
|
||
|
"2019-06-01", exception_cb=mock.ANY,
|
||
|
headers=mock.ANY, retries=mock.ANY,
|
||
|
- timeout=mock.ANY)
|
||
|
+ timeout=mock.ANY, infinite=False)
|
||
|
|
||
|
@mock.patch(MOCKPATH + 'readurl', autospec=True)
|
||
|
@mock.patch(MOCKPATH + 'EphemeralDHCPv4')
|
||
|
@@ -467,7 +467,7 @@ class TestGetMetadataFromIMDS(HttprettyTestCase):
|
||
|
"http://169.254.169.254/metadata/instance/network?api-version="
|
||
|
"2019-06-01", exception_cb=mock.ANY,
|
||
|
headers=mock.ANY, retries=mock.ANY,
|
||
|
- timeout=mock.ANY)
|
||
|
+ timeout=mock.ANY, infinite=False)
|
||
|
|
||
|
@mock.patch(MOCKPATH + 'readurl', autospec=True)
|
||
|
@mock.patch(MOCKPATH + 'EphemeralDHCPv4')
|
||
|
@@ -486,7 +486,7 @@ class TestGetMetadataFromIMDS(HttprettyTestCase):
|
||
|
"http://169.254.169.254/metadata/instance?api-version="
|
||
|
"2019-06-01", exception_cb=mock.ANY,
|
||
|
headers=mock.ANY, retries=mock.ANY,
|
||
|
- timeout=mock.ANY)
|
||
|
+ timeout=mock.ANY, infinite=False)
|
||
|
|
||
|
@mock.patch(MOCKPATH + 'readurl', autospec=True)
|
||
|
@mock.patch(MOCKPATH + 'EphemeralDHCPv4WithReporting', autospec=True)
|
||
|
@@ -511,7 +511,7 @@ class TestGetMetadataFromIMDS(HttprettyTestCase):
|
||
|
m_readurl.assert_called_with(
|
||
|
self.network_md_url, exception_cb=mock.ANY,
|
||
|
headers={'Metadata': 'true'}, retries=2,
|
||
|
- timeout=dsaz.IMDS_TIMEOUT_IN_SECONDS)
|
||
|
+ timeout=dsaz.IMDS_TIMEOUT_IN_SECONDS, infinite=False)
|
||
|
|
||
|
@mock.patch('cloudinit.url_helper.time.sleep')
|
||
|
@mock.patch(MOCKPATH + 'net.is_up', autospec=True)
|
||
|
@@ -2694,15 +2694,22 @@ class TestPreprovisioningHotAttachNics(CiTestCase):
|
||
|
|
||
|
def nic_attach_ret(nl_sock, nics_found):
|
||
|
nonlocal m_attach_call_count
|
||
|
- if m_attach_call_count == 0:
|
||
|
- m_attach_call_count = m_attach_call_count + 1
|
||
|
+ m_attach_call_count = m_attach_call_count + 1
|
||
|
+ if m_attach_call_count == 1:
|
||
|
return "eth0"
|
||
|
- return "eth1"
|
||
|
+ elif m_attach_call_count == 2:
|
||
|
+ return "eth1"
|
||
|
+ raise RuntimeError("Must have found primary nic by now.")
|
||
|
+
|
||
|
+ # Simulate two NICs by adding the same one twice.
|
||
|
+ md = {
|
||
|
+ "interface": [
|
||
|
+ IMDS_NETWORK_METADATA['interface'][0],
|
||
|
+ IMDS_NETWORK_METADATA['interface'][0]
|
||
|
+ ]
|
||
|
+ }
|
||
|
|
||
|
- def network_metadata_ret(ifname, retries, type):
|
||
|
- # Simulate two NICs by adding the same one twice.
|
||
|
- md = IMDS_NETWORK_METADATA
|
||
|
- md['interface'].append(md['interface'][0])
|
||
|
+ def network_metadata_ret(ifname, retries, type, exc_cb, infinite):
|
||
|
if ifname == "eth0":
|
||
|
return md
|
||
|
raise requests.Timeout('Fake connection timeout')
|
||
|
@@ -2724,6 +2731,72 @@ class TestPreprovisioningHotAttachNics(CiTestCase):
|
||
|
self.assertEqual(1, m_imds.call_count)
|
||
|
self.assertEqual(2, m_link_up.call_count)
|
||
|
|
||
|
+ @mock.patch(MOCKPATH + 'DataSourceAzure.get_imds_data_with_api_fallback')
|
||
|
+ @mock.patch(MOCKPATH + 'EphemeralDHCPv4')
|
||
|
+ def test_check_if_nic_is_primary_retries_on_failures(
|
||
|
+ self, m_dhcpv4, m_imds):
|
||
|
+ """Retry polling for network metadata on all failures except timeout"""
|
||
|
+ dsa = dsaz.DataSourceAzure({}, distro=None, paths=self.paths)
|
||
|
+ lease = {
|
||
|
+ 'interface': 'eth9', 'fixed-address': '192.168.2.9',
|
||
|
+ 'routers': '192.168.2.1', 'subnet-mask': '255.255.255.0',
|
||
|
+ 'unknown-245': '624c3620'}
|
||
|
+
|
||
|
+ eth0Retries = []
|
||
|
+ eth1Retries = []
|
||
|
+ # Simulate two NICs by adding the same one twice.
|
||
|
+ md = {
|
||
|
+ "interface": [
|
||
|
+ IMDS_NETWORK_METADATA['interface'][0],
|
||
|
+ IMDS_NETWORK_METADATA['interface'][0]
|
||
|
+ ]
|
||
|
+ }
|
||
|
+
|
||
|
+ def network_metadata_ret(ifname, retries, type, exc_cb, infinite):
|
||
|
+ nonlocal eth0Retries, eth1Retries
|
||
|
+
|
||
|
+ # Simulate readurl functionality with retries and
|
||
|
+ # exception callbacks so that the callback logic can be
|
||
|
+ # validated.
|
||
|
+ if ifname == "eth0":
|
||
|
+ cause = requests.HTTPError()
|
||
|
+ for _ in range(0, 15):
|
||
|
+ error = url_helper.UrlError(cause=cause, code=410)
|
||
|
+ eth0Retries.append(exc_cb("No goal state.", error))
|
||
|
+ else:
|
||
|
+ cause = requests.Timeout('Fake connection timeout')
|
||
|
+ for _ in range(0, 10):
|
||
|
+ error = url_helper.UrlError(cause=cause)
|
||
|
+ eth1Retries.append(exc_cb("Connection timeout", error))
|
||
|
+ # Should stop retrying after 10 retries
|
||
|
+ eth1Retries.append(exc_cb("Connection timeout", error))
|
||
|
+ raise cause
|
||
|
+ return md
|
||
|
+
|
||
|
+ m_imds.side_effect = network_metadata_ret
|
||
|
+
|
||
|
+ dhcp_ctx = mock.MagicMock(lease=lease)
|
||
|
+ dhcp_ctx.obtain_lease.return_value = lease
|
||
|
+ m_dhcpv4.return_value = dhcp_ctx
|
||
|
+
|
||
|
+ is_primary, expected_nic_count = dsa._check_if_nic_is_primary("eth0")
|
||
|
+ self.assertEqual(True, is_primary)
|
||
|
+ self.assertEqual(2, expected_nic_count)
|
||
|
+
|
||
|
+ # All Eth0 errors are non-timeout errors. So we should have been
|
||
|
+ # retrying indefinitely until success.
|
||
|
+ for i in eth0Retries:
|
||
|
+ self.assertTrue(i)
|
||
|
+
|
||
|
+ is_primary, expected_nic_count = dsa._check_if_nic_is_primary("eth1")
|
||
|
+ self.assertEqual(False, is_primary)
|
||
|
+
|
||
|
+ # All Eth1 errors are timeout errors. Retry happens for a max of 10 and
|
||
|
+ # then we should have moved on assuming it is not the primary nic.
|
||
|
+ for i in range(0, 10):
|
||
|
+ self.assertTrue(eth1Retries[i])
|
||
|
+ self.assertFalse(eth1Retries[10])
|
||
|
+
|
||
|
@mock.patch('cloudinit.distros.networking.LinuxNetworking.try_set_link_up')
|
||
|
def test_wait_for_link_up_returns_if_already_up(
|
||
|
self, m_is_link_up):
|
||
|
--
|
||
|
2.27.0
|
||
|
|