420 lines
14 KiB
Diff
420 lines
14 KiB
Diff
From 803ad90f11eb57221e7805e5cba8c309bafe1de8 Mon Sep 17 00:00:00 2001
|
|
From: Gris Ge <fge@redhat.com>
|
|
Date: Wed, 13 Jan 2021 23:51:31 +0800
|
|
Subject: [PATCH 1/2] nm: Better handling for timeout
|
|
|
|
When creating 1000 VLAN along with 1000 bridge using each VLAN,
|
|
NetworkManager might trigger two timeout:
|
|
|
|
* The callback raises `Gio.IOErrorEnum.TIMED_OUT` error.
|
|
* NetworkManager never call callback, nmstate idle check trigger the
|
|
timeout.
|
|
|
|
To solve above issue:
|
|
|
|
* Increase the nmstate idle check timeout to 5 minutes.
|
|
|
|
* For actions like add profile and activate profile, we add a
|
|
fallback checker which check whether requested action is already
|
|
finished using `GLib.timeout_source_new()`
|
|
|
|
* When `Gio.IOErrorEnum.TIMED_OUT` happens, ignore the failure and wait
|
|
fallback checker.
|
|
|
|
* The fallback checker is only started 15 seconds after action started,
|
|
so this does not impact small desire state.
|
|
|
|
Test results on RHEL 8.3 i7-8665U 2G RAM:
|
|
|
|
10m29.212s to create 1000 VLAN and 1000 bridge over each VLAN.
|
|
|
|
Changed the integration test case to test 500 VLANs + 500 bridges.
|
|
|
|
Signed-off-by: Gris Ge <fge@redhat.com>
|
|
---
|
|
libnmstate/nm/connection.py | 67 ++++++++++++++++++++++++++++++++++++-
|
|
libnmstate/nm/context.py | 3 +-
|
|
2 files changed, 67 insertions(+), 3 deletions(-)
|
|
|
|
diff --git a/libnmstate/nm/connection.py b/libnmstate/nm/connection.py
|
|
index 1f6c734..374a379 100644
|
|
--- a/libnmstate/nm/connection.py
|
|
+++ b/libnmstate/nm/connection.py
|
|
@@ -1,5 +1,5 @@
|
|
#
|
|
-# Copyright (c) 2018-2020 Red Hat, Inc.
|
|
+# Copyright (c) 2018-2021 Red Hat, Inc.
|
|
#
|
|
# This file is part of nmstate
|
|
#
|
|
@@ -24,11 +24,14 @@ from libnmstate.error import NmstateLibnmError
|
|
from libnmstate.error import NmstateInternalError
|
|
from libnmstate.error import NmstateValueError
|
|
|
|
+from .common import GLib
|
|
+from .common import Gio
|
|
from .common import NM
|
|
from . import ipv4
|
|
from . import ipv6
|
|
|
|
ACTIVATION_TIMEOUT_FOR_BRIDGE = 35 # Bridge STP requires 30 seconds.
|
|
+FALLBACK_CHECKER_INTERNAL = 15
|
|
|
|
|
|
class ConnectionProfile:
|
|
@@ -40,6 +43,7 @@ class ConnectionProfile:
|
|
self._nm_ac = None
|
|
self._ac_handlers = set()
|
|
self._dev_handlers = set()
|
|
+ self._fallback_checker = None
|
|
|
|
def create(self, settings):
|
|
self.profile = NM.SimpleConnection.new()
|
|
@@ -102,6 +106,26 @@ class ConnectionProfile:
|
|
self._add_connection2_callback,
|
|
user_data,
|
|
)
|
|
+ self._fallback_checker = GLib.timeout_source_new(
|
|
+ FALLBACK_CHECKER_INTERNAL * 1000
|
|
+ )
|
|
+ self._fallback_checker.set_callback(
|
|
+ self._profile_add_fallback_checker_callback, action
|
|
+ )
|
|
+ self._fallback_checker.attach(self._ctx.context)
|
|
+
|
|
+ def _profile_add_fallback_checker_callback(self, action):
|
|
+ for nm_profile in self._ctx.client.get_connections():
|
|
+ if nm_profile.get_uuid() == self.profile.get_uuid():
|
|
+ self._fallback_checker_cleanup()
|
|
+ self._ctx.finish_async(action)
|
|
+ return GLib.SOURCE_REMOVE
|
|
+ return GLib.SOURCE_CONTINUE
|
|
+
|
|
+ def _fallback_checker_cleanup(self):
|
|
+ if self._fallback_checker:
|
|
+ self._fallback_checker.destroy()
|
|
+ self._fallback_checker = None
|
|
|
|
def delete(self):
|
|
if not self.profile:
|
|
@@ -152,6 +176,26 @@ class ConnectionProfile:
|
|
self._active_connection_callback,
|
|
user_data,
|
|
)
|
|
+ self._fallback_checker = GLib.timeout_source_new(
|
|
+ FALLBACK_CHECKER_INTERNAL * 1000
|
|
+ )
|
|
+ self._fallback_checker.set_callback(
|
|
+ self._activation_fallback_checker_callback, action
|
|
+ )
|
|
+ self._fallback_checker.attach(self._ctx.context)
|
|
+
|
|
+ def _activation_fallback_checker_callback(self, action):
|
|
+ if self.devname:
|
|
+ self._nm_dev = self._ctx.get_nm_dev(self.devname)
|
|
+ if self._nm_dev:
|
|
+ self._activation_progress_check(action)
|
|
+ return GLib.SOURCE_CONTINUE
|
|
+ else:
|
|
+ logging.warn(
|
|
+ "Failed to get interface name from profile, "
|
|
+ "can not perform flalback check on activation"
|
|
+ )
|
|
+ return GLib.SOURCE_REMOVE
|
|
|
|
@property
|
|
def profile(self):
|
|
@@ -213,6 +257,18 @@ class ConnectionProfile:
|
|
|
|
try:
|
|
nm_act_con = src_object.activate_connection_finish(result)
|
|
+ except GLib.Error as e:
|
|
+ if e.matches(Gio.io_error_quark(), Gio.IOErrorEnum.TIMED_OUT):
|
|
+ logging.debug(
|
|
+ f"{action} timeout on activation, "
|
|
+ "using fallback method to wait activation"
|
|
+ )
|
|
+ return
|
|
+ else:
|
|
+ self._ctx.fail(
|
|
+ NmstateLibnmError(f"{action} failed: error={e}")
|
|
+ )
|
|
+ return
|
|
except Exception as e:
|
|
self._ctx.fail(NmstateLibnmError(f"{action} failed: error={e}"))
|
|
return
|
|
@@ -366,6 +422,7 @@ class ConnectionProfile:
|
|
def _activation_clean_up(self):
|
|
self._remove_ac_handlers()
|
|
self._remove_dev_handlers()
|
|
+ self._fallback_checker_cleanup()
|
|
|
|
def _is_activating(self):
|
|
if not self._nm_ac or not self._nm_dev:
|
|
@@ -396,6 +453,13 @@ class ConnectionProfile:
|
|
action = user_data
|
|
try:
|
|
profile = src_object.add_connection2_finish(result)[0]
|
|
+ except GLib.Error as e:
|
|
+ if e.matches(Gio.io_error_quark(), Gio.IOErrorEnum.TIMED_OUT):
|
|
+ logging.debug(
|
|
+ f"{action} timeout, using fallback method to "
|
|
+ "wait profile creation"
|
|
+ )
|
|
+ return
|
|
except Exception as e:
|
|
self._ctx.fail(
|
|
NmstateLibnmError(f"{action} failed with error: {e}")
|
|
@@ -410,6 +474,7 @@ class ConnectionProfile:
|
|
)
|
|
)
|
|
else:
|
|
+ self._fallback_checker_cleanup()
|
|
self._ctx.finish_async(action)
|
|
|
|
def _update2_callback(self, src_object, result, user_data):
|
|
diff --git a/libnmstate/nm/context.py b/libnmstate/nm/context.py
|
|
index 373ffe8..bc5c41c 100644
|
|
--- a/libnmstate/nm/context.py
|
|
+++ b/libnmstate/nm/context.py
|
|
@@ -31,8 +31,7 @@ from .common import Gio
|
|
# last finish async action.
|
|
IDLE_CHECK_INTERNAL = 5
|
|
|
|
-# libnm dbus connection has reply timeout 25 seconds.
|
|
-IDLE_TIMEOUT = 25
|
|
+IDLE_TIMEOUT = 60 * 5 # 5 minutes
|
|
|
|
# NetworkManage is using dbus in libnm while the dbus has limitation on
|
|
# maximum number of pending replies per connection.(RHEL/CentOS 8 is 1024)
|
|
--
|
|
2.27.0
|
|
|
|
|
|
From ac82d18f96aa2313583efa1477be441291e2957c Mon Sep 17 00:00:00 2001
|
|
From: Fernando Fernandez Mancera <ffmancera@riseup.net>
|
|
Date: Sun, 17 Jan 2021 11:18:10 +0800
|
|
Subject: [PATCH 2/2] nm: Use fallback checker on profile deactivation and
|
|
delete
|
|
|
|
When NM is under heave loads, NM might raise timeout error when
|
|
try to deactivate or delete a profile, to solve that this patch
|
|
introduce the same method in 2407f98
|
|
to have a fallback checker on whether profile is deactivated/deleted
|
|
every 15 seconds.
|
|
|
|
No test case required as current `test_lot_of_vlans_with_bridges` test
|
|
case has `state: absent` which is good enough for testing this patch.
|
|
|
|
Signed-off-by: Fernando Fernandez Mancera <ffmancera@riseup.net>
|
|
Signed-off-by: Gris Ge <fge@redhat.com>
|
|
---
|
|
libnmstate/nm/active_connection.py | 61 ++++++++++++++++++++++++------
|
|
libnmstate/nm/connection.py | 30 +++++++++++++++
|
|
2 files changed, 80 insertions(+), 11 deletions(-)
|
|
|
|
diff --git a/libnmstate/nm/active_connection.py b/libnmstate/nm/active_connection.py
|
|
index 062c78a..b235e8b 100644
|
|
--- a/libnmstate/nm/active_connection.py
|
|
+++ b/libnmstate/nm/active_connection.py
|
|
@@ -21,12 +21,14 @@ import logging
|
|
|
|
from libnmstate.error import NmstateLibnmError
|
|
|
|
+from .common import Gio
|
|
from .common import GLib
|
|
from .common import GObject
|
|
from .common import NM
|
|
|
|
|
|
NM_AC_STATE_CHANGED_SIGNAL = "state-changed"
|
|
+FALLBACK_CHECKER_INTERNAL = 15
|
|
|
|
|
|
class ActivationError(Exception):
|
|
@@ -37,6 +39,8 @@ class ActiveConnection:
|
|
def __init__(self, context=None, nm_ac_con=None):
|
|
self._ctx = context
|
|
self._act_con = nm_ac_con
|
|
+ self._signal_handler = None
|
|
+ self._fallback_checker = None
|
|
|
|
nmdevs = None
|
|
if nm_ac_con:
|
|
@@ -75,19 +79,35 @@ class ActiveConnection:
|
|
|
|
action = f"Deactivate profile: {self.devname}"
|
|
self._ctx.register_async(action)
|
|
- handler_id = act_connection.connect(
|
|
+ self._signal_handler = act_connection.connect(
|
|
NM_AC_STATE_CHANGED_SIGNAL,
|
|
self._wait_state_changed_callback,
|
|
action,
|
|
)
|
|
if act_connection.props.state != NM.ActiveConnectionState.DEACTIVATING:
|
|
- user_data = (handler_id, action)
|
|
+ user_data = action
|
|
self._ctx.client.deactivate_connection_async(
|
|
act_connection,
|
|
self._ctx.cancellable,
|
|
self._deactivate_connection_callback,
|
|
user_data,
|
|
)
|
|
+ self._fallback_checker = GLib.timeout_source_new(
|
|
+ FALLBACK_CHECKER_INTERNAL * 1000
|
|
+ )
|
|
+ self._fallback_checker.set_callback(
|
|
+ self._deactivation_fallback_checker_callback, action
|
|
+ )
|
|
+ self._fallback_checker.attach(self._ctx.context)
|
|
+
|
|
+ def _clean_up(self):
|
|
+ if self._signal_handler:
|
|
+ if self._act_con:
|
|
+ self._act_con.handler_disconnect(self._signal_handler)
|
|
+ self._signal_handler = None
|
|
+ if self._fallback_checker:
|
|
+ self._fallback_checker.destroy()
|
|
+ self._fallback_checker = None
|
|
|
|
def _wait_state_changed_callback(self, act_con, state, reason, action):
|
|
if self._ctx.is_cancelled():
|
|
@@ -96,13 +116,13 @@ class ActiveConnection:
|
|
logging.debug(
|
|
"Connection deactivation succeeded on %s", self.devname,
|
|
)
|
|
+ self._clean_up()
|
|
self._ctx.finish_async(action)
|
|
|
|
def _deactivate_connection_callback(self, src_object, result, user_data):
|
|
- handler_id, action = user_data
|
|
+ action = user_data
|
|
if self._ctx.is_cancelled():
|
|
- if self._act_con:
|
|
- self._act_con.handler_disconnect(handler_id)
|
|
+ self._clean_up()
|
|
return
|
|
|
|
try:
|
|
@@ -116,16 +136,20 @@ class ActiveConnection:
|
|
"Connection is not active on {}, no need to "
|
|
"deactivate".format(self.devname)
|
|
)
|
|
+ elif e.matches(Gio.io_error_quark(), Gio.IOErrorEnum.TIMED_OUT):
|
|
+ logging.debug(
|
|
+ f"{action} timeout, using fallback method to wait profile "
|
|
+ "deactivation"
|
|
+ )
|
|
+ return
|
|
else:
|
|
- if self._act_con:
|
|
- self._act_con.handler_disconnect(handler_id)
|
|
+ self._clean_up()
|
|
self._ctx.fail(
|
|
NmstateLibnmError(f"{action} failed: error={e}")
|
|
)
|
|
return
|
|
except Exception as e:
|
|
- if self._act_con:
|
|
- self._act_con.handler_disconnect(handler_id)
|
|
+ self._clean_up()
|
|
self._ctx.fail(
|
|
NmstateLibnmError(
|
|
f"BUG: Unexpected error when activating {self.devname} "
|
|
@@ -135,8 +159,7 @@ class ActiveConnection:
|
|
return
|
|
|
|
if not success:
|
|
- if self._act_con:
|
|
- self._act_con.handler_disconnect(handler_id)
|
|
+ self._clean_up()
|
|
self._ctx.fail(
|
|
NmstateLibnmError(
|
|
f"{action} failed: error='None returned from "
|
|
@@ -144,6 +167,22 @@ class ActiveConnection:
|
|
)
|
|
)
|
|
|
|
+ def _deactivation_fallback_checker_callback(self, action):
|
|
+ if self.devname:
|
|
+ self._nmdev = self._ctx.get_nm_dev(self.devname)
|
|
+ if self._nmdev:
|
|
+ self._act_con = self._nmdev.get_active_connection()
|
|
+ if (
|
|
+ self._act_con
|
|
+ and self._act_con.props.state
|
|
+ != NM.ActiveConnectionState.DEACTIVATED
|
|
+ ):
|
|
+ return GLib.SOURCE_CONTINUE
|
|
+
|
|
+ self._clean_up()
|
|
+ self._ctx.finish_async(action)
|
|
+ return GLib.SOURCE_REMOVE
|
|
+
|
|
@property
|
|
def nm_active_connection(self):
|
|
return self._act_con
|
|
diff --git a/libnmstate/nm/connection.py b/libnmstate/nm/connection.py
|
|
index 374a379..af7296f 100644
|
|
--- a/libnmstate/nm/connection.py
|
|
+++ b/libnmstate/nm/connection.py
|
|
@@ -144,6 +144,13 @@ class ConnectionProfile:
|
|
self._delete_connection_callback,
|
|
user_data,
|
|
)
|
|
+ self._fallback_checker = GLib.timeout_source_new(
|
|
+ FALLBACK_CHECKER_INTERNAL * 1000
|
|
+ )
|
|
+ self._fallback_checker.set_callback(
|
|
+ self._delete_fallback_checker_callback, action
|
|
+ )
|
|
+ self._fallback_checker.attach(self._ctx.context)
|
|
|
|
def activate(self):
|
|
if self.con_id:
|
|
@@ -504,11 +511,24 @@ class ConnectionProfile:
|
|
action = user_data
|
|
try:
|
|
success = src_object.delete_finish(result)
|
|
+ except GLib.Error as e:
|
|
+ if e.matches(Gio.io_error_quark(), Gio.IOErrorEnum.TIMED_OUT):
|
|
+ logging.debug(
|
|
+ f"{action} timeout, using fallback method to wait profile "
|
|
+ "deletion"
|
|
+ )
|
|
+ return
|
|
+ else:
|
|
+ self._ctx.fail(
|
|
+ NmstateLibnmError(f"{action} failed with error: {e}")
|
|
+ )
|
|
+ return
|
|
except Exception as e:
|
|
self._ctx.fail(NmstateLibnmError(f"{action} failed: error={e}"))
|
|
return
|
|
|
|
if success:
|
|
+ self._fallback_checker_cleanup()
|
|
self._ctx.finish_async(action)
|
|
else:
|
|
self._ctx.fail(
|
|
@@ -518,6 +538,16 @@ class ConnectionProfile:
|
|
)
|
|
)
|
|
|
|
+ def _delete_fallback_checker_callback(self, action):
|
|
+ if self.profile:
|
|
+ for nm_profile in self._ctx.client.get_connections():
|
|
+ if nm_profile.get_uuid() == self.profile.get_uuid():
|
|
+ return GLib.SOURCE_CONTINUE
|
|
+
|
|
+ self._fallback_checker_cleanup()
|
|
+ self._ctx.finish_async(action)
|
|
+ return GLib.SOURCE_REMOVE
|
|
+
|
|
def _reset_profile(self):
|
|
self._con_profile = None
|
|
|
|
--
|
|
2.27.0
|
|
|