nmstate/SOURCES/BZ_1916073_better-handling-for-timeout.patch
2021-09-09 22:51:59 +00:00

420 lines
14 KiB
Diff

From 803ad90f11eb57221e7805e5cba8c309bafe1de8 Mon Sep 17 00:00:00 2001
From: Gris Ge <fge@redhat.com>
Date: Wed, 13 Jan 2021 23:51:31 +0800
Subject: [PATCH 1/2] nm: Better handling for timeout
When creating 1000 VLAN along with 1000 bridge using each VLAN,
NetworkManager might trigger two timeout:
* The callback raises `Gio.IOErrorEnum.TIMED_OUT` error.
* NetworkManager never call callback, nmstate idle check trigger the
timeout.
To solve above issue:
* Increase the nmstate idle check timeout to 5 minutes.
* For actions like add profile and activate profile, we add a
fallback checker which check whether requested action is already
finished using `GLib.timeout_source_new()`
* When `Gio.IOErrorEnum.TIMED_OUT` happens, ignore the failure and wait
fallback checker.
* The fallback checker is only started 15 seconds after action started,
so this does not impact small desire state.
Test results on RHEL 8.3 i7-8665U 2G RAM:
10m29.212s to create 1000 VLAN and 1000 bridge over each VLAN.
Changed the integration test case to test 500 VLANs + 500 bridges.
Signed-off-by: Gris Ge <fge@redhat.com>
---
libnmstate/nm/connection.py | 67 ++++++++++++++++++++++++++++++++++++-
libnmstate/nm/context.py | 3 +-
2 files changed, 67 insertions(+), 3 deletions(-)
diff --git a/libnmstate/nm/connection.py b/libnmstate/nm/connection.py
index 1f6c734..374a379 100644
--- a/libnmstate/nm/connection.py
+++ b/libnmstate/nm/connection.py
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2018-2020 Red Hat, Inc.
+# Copyright (c) 2018-2021 Red Hat, Inc.
#
# This file is part of nmstate
#
@@ -24,11 +24,14 @@ from libnmstate.error import NmstateLibnmError
from libnmstate.error import NmstateInternalError
from libnmstate.error import NmstateValueError
+from .common import GLib
+from .common import Gio
from .common import NM
from . import ipv4
from . import ipv6
ACTIVATION_TIMEOUT_FOR_BRIDGE = 35 # Bridge STP requires 30 seconds.
+FALLBACK_CHECKER_INTERNAL = 15
class ConnectionProfile:
@@ -40,6 +43,7 @@ class ConnectionProfile:
self._nm_ac = None
self._ac_handlers = set()
self._dev_handlers = set()
+ self._fallback_checker = None
def create(self, settings):
self.profile = NM.SimpleConnection.new()
@@ -102,6 +106,26 @@ class ConnectionProfile:
self._add_connection2_callback,
user_data,
)
+ self._fallback_checker = GLib.timeout_source_new(
+ FALLBACK_CHECKER_INTERNAL * 1000
+ )
+ self._fallback_checker.set_callback(
+ self._profile_add_fallback_checker_callback, action
+ )
+ self._fallback_checker.attach(self._ctx.context)
+
+ def _profile_add_fallback_checker_callback(self, action):
+ for nm_profile in self._ctx.client.get_connections():
+ if nm_profile.get_uuid() == self.profile.get_uuid():
+ self._fallback_checker_cleanup()
+ self._ctx.finish_async(action)
+ return GLib.SOURCE_REMOVE
+ return GLib.SOURCE_CONTINUE
+
+ def _fallback_checker_cleanup(self):
+ if self._fallback_checker:
+ self._fallback_checker.destroy()
+ self._fallback_checker = None
def delete(self):
if not self.profile:
@@ -152,6 +176,26 @@ class ConnectionProfile:
self._active_connection_callback,
user_data,
)
+ self._fallback_checker = GLib.timeout_source_new(
+ FALLBACK_CHECKER_INTERNAL * 1000
+ )
+ self._fallback_checker.set_callback(
+ self._activation_fallback_checker_callback, action
+ )
+ self._fallback_checker.attach(self._ctx.context)
+
+ def _activation_fallback_checker_callback(self, action):
+ if self.devname:
+ self._nm_dev = self._ctx.get_nm_dev(self.devname)
+ if self._nm_dev:
+ self._activation_progress_check(action)
+ return GLib.SOURCE_CONTINUE
+ else:
+ logging.warn(
+ "Failed to get interface name from profile, "
+ "can not perform flalback check on activation"
+ )
+ return GLib.SOURCE_REMOVE
@property
def profile(self):
@@ -213,6 +257,18 @@ class ConnectionProfile:
try:
nm_act_con = src_object.activate_connection_finish(result)
+ except GLib.Error as e:
+ if e.matches(Gio.io_error_quark(), Gio.IOErrorEnum.TIMED_OUT):
+ logging.debug(
+ f"{action} timeout on activation, "
+ "using fallback method to wait activation"
+ )
+ return
+ else:
+ self._ctx.fail(
+ NmstateLibnmError(f"{action} failed: error={e}")
+ )
+ return
except Exception as e:
self._ctx.fail(NmstateLibnmError(f"{action} failed: error={e}"))
return
@@ -366,6 +422,7 @@ class ConnectionProfile:
def _activation_clean_up(self):
self._remove_ac_handlers()
self._remove_dev_handlers()
+ self._fallback_checker_cleanup()
def _is_activating(self):
if not self._nm_ac or not self._nm_dev:
@@ -396,6 +453,13 @@ class ConnectionProfile:
action = user_data
try:
profile = src_object.add_connection2_finish(result)[0]
+ except GLib.Error as e:
+ if e.matches(Gio.io_error_quark(), Gio.IOErrorEnum.TIMED_OUT):
+ logging.debug(
+ f"{action} timeout, using fallback method to "
+ "wait profile creation"
+ )
+ return
except Exception as e:
self._ctx.fail(
NmstateLibnmError(f"{action} failed with error: {e}")
@@ -410,6 +474,7 @@ class ConnectionProfile:
)
)
else:
+ self._fallback_checker_cleanup()
self._ctx.finish_async(action)
def _update2_callback(self, src_object, result, user_data):
diff --git a/libnmstate/nm/context.py b/libnmstate/nm/context.py
index 373ffe8..bc5c41c 100644
--- a/libnmstate/nm/context.py
+++ b/libnmstate/nm/context.py
@@ -31,8 +31,7 @@ from .common import Gio
# last finish async action.
IDLE_CHECK_INTERNAL = 5
-# libnm dbus connection has reply timeout 25 seconds.
-IDLE_TIMEOUT = 25
+IDLE_TIMEOUT = 60 * 5 # 5 minutes
# NetworkManage is using dbus in libnm while the dbus has limitation on
# maximum number of pending replies per connection.(RHEL/CentOS 8 is 1024)
--
2.27.0
From ac82d18f96aa2313583efa1477be441291e2957c Mon Sep 17 00:00:00 2001
From: Fernando Fernandez Mancera <ffmancera@riseup.net>
Date: Sun, 17 Jan 2021 11:18:10 +0800
Subject: [PATCH 2/2] nm: Use fallback checker on profile deactivation and
delete
When NM is under heave loads, NM might raise timeout error when
try to deactivate or delete a profile, to solve that this patch
introduce the same method in 2407f98
to have a fallback checker on whether profile is deactivated/deleted
every 15 seconds.
No test case required as current `test_lot_of_vlans_with_bridges` test
case has `state: absent` which is good enough for testing this patch.
Signed-off-by: Fernando Fernandez Mancera <ffmancera@riseup.net>
Signed-off-by: Gris Ge <fge@redhat.com>
---
libnmstate/nm/active_connection.py | 61 ++++++++++++++++++++++++------
libnmstate/nm/connection.py | 30 +++++++++++++++
2 files changed, 80 insertions(+), 11 deletions(-)
diff --git a/libnmstate/nm/active_connection.py b/libnmstate/nm/active_connection.py
index 062c78a..b235e8b 100644
--- a/libnmstate/nm/active_connection.py
+++ b/libnmstate/nm/active_connection.py
@@ -21,12 +21,14 @@ import logging
from libnmstate.error import NmstateLibnmError
+from .common import Gio
from .common import GLib
from .common import GObject
from .common import NM
NM_AC_STATE_CHANGED_SIGNAL = "state-changed"
+FALLBACK_CHECKER_INTERNAL = 15
class ActivationError(Exception):
@@ -37,6 +39,8 @@ class ActiveConnection:
def __init__(self, context=None, nm_ac_con=None):
self._ctx = context
self._act_con = nm_ac_con
+ self._signal_handler = None
+ self._fallback_checker = None
nmdevs = None
if nm_ac_con:
@@ -75,19 +79,35 @@ class ActiveConnection:
action = f"Deactivate profile: {self.devname}"
self._ctx.register_async(action)
- handler_id = act_connection.connect(
+ self._signal_handler = act_connection.connect(
NM_AC_STATE_CHANGED_SIGNAL,
self._wait_state_changed_callback,
action,
)
if act_connection.props.state != NM.ActiveConnectionState.DEACTIVATING:
- user_data = (handler_id, action)
+ user_data = action
self._ctx.client.deactivate_connection_async(
act_connection,
self._ctx.cancellable,
self._deactivate_connection_callback,
user_data,
)
+ self._fallback_checker = GLib.timeout_source_new(
+ FALLBACK_CHECKER_INTERNAL * 1000
+ )
+ self._fallback_checker.set_callback(
+ self._deactivation_fallback_checker_callback, action
+ )
+ self._fallback_checker.attach(self._ctx.context)
+
+ def _clean_up(self):
+ if self._signal_handler:
+ if self._act_con:
+ self._act_con.handler_disconnect(self._signal_handler)
+ self._signal_handler = None
+ if self._fallback_checker:
+ self._fallback_checker.destroy()
+ self._fallback_checker = None
def _wait_state_changed_callback(self, act_con, state, reason, action):
if self._ctx.is_cancelled():
@@ -96,13 +116,13 @@ class ActiveConnection:
logging.debug(
"Connection deactivation succeeded on %s", self.devname,
)
+ self._clean_up()
self._ctx.finish_async(action)
def _deactivate_connection_callback(self, src_object, result, user_data):
- handler_id, action = user_data
+ action = user_data
if self._ctx.is_cancelled():
- if self._act_con:
- self._act_con.handler_disconnect(handler_id)
+ self._clean_up()
return
try:
@@ -116,16 +136,20 @@ class ActiveConnection:
"Connection is not active on {}, no need to "
"deactivate".format(self.devname)
)
+ elif e.matches(Gio.io_error_quark(), Gio.IOErrorEnum.TIMED_OUT):
+ logging.debug(
+ f"{action} timeout, using fallback method to wait profile "
+ "deactivation"
+ )
+ return
else:
- if self._act_con:
- self._act_con.handler_disconnect(handler_id)
+ self._clean_up()
self._ctx.fail(
NmstateLibnmError(f"{action} failed: error={e}")
)
return
except Exception as e:
- if self._act_con:
- self._act_con.handler_disconnect(handler_id)
+ self._clean_up()
self._ctx.fail(
NmstateLibnmError(
f"BUG: Unexpected error when activating {self.devname} "
@@ -135,8 +159,7 @@ class ActiveConnection:
return
if not success:
- if self._act_con:
- self._act_con.handler_disconnect(handler_id)
+ self._clean_up()
self._ctx.fail(
NmstateLibnmError(
f"{action} failed: error='None returned from "
@@ -144,6 +167,22 @@ class ActiveConnection:
)
)
+ def _deactivation_fallback_checker_callback(self, action):
+ if self.devname:
+ self._nmdev = self._ctx.get_nm_dev(self.devname)
+ if self._nmdev:
+ self._act_con = self._nmdev.get_active_connection()
+ if (
+ self._act_con
+ and self._act_con.props.state
+ != NM.ActiveConnectionState.DEACTIVATED
+ ):
+ return GLib.SOURCE_CONTINUE
+
+ self._clean_up()
+ self._ctx.finish_async(action)
+ return GLib.SOURCE_REMOVE
+
@property
def nm_active_connection(self):
return self._act_con
diff --git a/libnmstate/nm/connection.py b/libnmstate/nm/connection.py
index 374a379..af7296f 100644
--- a/libnmstate/nm/connection.py
+++ b/libnmstate/nm/connection.py
@@ -144,6 +144,13 @@ class ConnectionProfile:
self._delete_connection_callback,
user_data,
)
+ self._fallback_checker = GLib.timeout_source_new(
+ FALLBACK_CHECKER_INTERNAL * 1000
+ )
+ self._fallback_checker.set_callback(
+ self._delete_fallback_checker_callback, action
+ )
+ self._fallback_checker.attach(self._ctx.context)
def activate(self):
if self.con_id:
@@ -504,11 +511,24 @@ class ConnectionProfile:
action = user_data
try:
success = src_object.delete_finish(result)
+ except GLib.Error as e:
+ if e.matches(Gio.io_error_quark(), Gio.IOErrorEnum.TIMED_OUT):
+ logging.debug(
+ f"{action} timeout, using fallback method to wait profile "
+ "deletion"
+ )
+ return
+ else:
+ self._ctx.fail(
+ NmstateLibnmError(f"{action} failed with error: {e}")
+ )
+ return
except Exception as e:
self._ctx.fail(NmstateLibnmError(f"{action} failed: error={e}"))
return
if success:
+ self._fallback_checker_cleanup()
self._ctx.finish_async(action)
else:
self._ctx.fail(
@@ -518,6 +538,16 @@ class ConnectionProfile:
)
)
+ def _delete_fallback_checker_callback(self, action):
+ if self.profile:
+ for nm_profile in self._ctx.client.get_connections():
+ if nm_profile.get_uuid() == self.profile.get_uuid():
+ return GLib.SOURCE_CONTINUE
+
+ self._fallback_checker_cleanup()
+ self._ctx.finish_async(action)
+ return GLib.SOURCE_REMOVE
+
def _reset_profile(self):
self._con_profile = None
--
2.27.0