be071b020b
Resolves: bz#1459709 bz#1610743 bz#1618221 bz#1619627 bz#1622649
Resolves: bz#1623749 bz#1623874 bz#1624444 bz#1625622 bz#1626780
Resolves: bz#1627098 bz#1627617 bz#1627639 bz#1630688
Signed-off-by: Sunil Kumar Acharya <sheggodu@redhat.com>
From 62fe36178cab588b658b44808cc954a57a1fc452 Mon Sep 17 00:00:00 2001
From: Kotresh HR <khiremat@redhat.com>
Date: Fri, 10 Aug 2018 08:14:14 -0400
Subject: [PATCH 369/385] geo-rep: Fix deadlock during worker start

Analysis:
The monitor process spawns monitor threads (one per brick).
Each monitor thread forks worker and agent processes.
Each monitor thread, while initializing, updates the
monitor status file. It is synchronized using flock.
The race is that one thread can fork a worker while
another thread still has the status file open, so the
worker process ends up holding a reference to that fd.

Cause:
flock gets unlocked either by explicitly unlocking it
or by closing all duplicate fds referring to the file.
The code relied on the fd being closed, so a reference
inherited by the worker/agent process through fork could
cause the deadlock.
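
The behaviour can be reproduced with a minimal standalone Python sketch
(not part of this patch; the file name and sleep time are arbitrary): a
forked child keeps the flock held through the inherited fd even after
the parent closes its own copy.

    import fcntl
    import os
    import time

    with open("status.tmp", "w") as f:
        fcntl.flock(f, fcntl.LOCK_EX)       # parent takes the lock
        if os.fork() == 0:
            # The child inherits a duplicate of the locked fd (like a
            # spawned worker/agent) and keeps it open for a while.
            time.sleep(10)
            os._exit(0)
    # The parent's fd is closed here, but the flock is only dropped once
    # *all* duplicates are closed, so the next locker blocks until the
    # child exits.
    with open("status.tmp", "w") as g:
        fcntl.flock(g, fcntl.LOCK_EX)       # blocks for ~10 seconds
        fcntl.flock(g, fcntl.LOCK_UN)
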
Fix:
1. flock is unlocked explicitly.
2. Also made sure to update the status file in appropriate places so
   that the reference is not leaked to the worker/agent process.

With this fix, both the deadlock and the possible fd
leaks are solved.
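
For reference, a simplified sketch of a LockedOpen-style context manager
with the explicit unlock described in (1); the real class in
gsyncdstatus.py has additional details that are omitted here.

    import fcntl

    class LockedOpen(object):
        def __init__(self, filename, *args, **kwargs):
            self.filename = filename
            self.args = args
            self.kwargs = kwargs
            self.fileobj = None

        def __enter__(self):
            f = open(self.filename, *self.args, **self.kwargs)
            fcntl.flock(f, fcntl.LOCK_EX)   # serialize updates to the file
            self.fileobj = f
            return f

        def __exit__(self, _exc_type, _exc_value, _traceback):
            # Unlock explicitly: a forked worker/agent may still hold a
            # duplicate of this fd, so close() alone would not drop the
            # flock.
            fcntl.flock(self.fileobj, fcntl.LOCK_UN)
            self.fileobj.close()
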
Upstream Patch: https://review.gluster.org/#/c/glusterfs/+/20704/
>fixes: bz#1614799
>Signed-off-by: Kotresh HR <khiremat@redhat.com>

Change-Id: I0d1ce93072dab07d0dbcc7e779287368cd9f093d
BUG: 1623749
Signed-off-by: Sunny Kumar <sunkumar@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/149760
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
---
 geo-replication/syncdaemon/gsyncdstatus.py |  1 +
 geo-replication/syncdaemon/monitor.py      | 17 ++++++++++++++---
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/geo-replication/syncdaemon/gsyncdstatus.py b/geo-replication/syncdaemon/gsyncdstatus.py
index 909c669..67493ca 100644
--- a/geo-replication/syncdaemon/gsyncdstatus.py
+++ b/geo-replication/syncdaemon/gsyncdstatus.py
@@ -99,6 +99,7 @@ class LockedOpen(object):
         return f
 
     def __exit__(self, _exc_type, _exc_value, _traceback):
+        fcntl.flock(self.fileobj, fcntl.LOCK_UN)
         self.fileobj.close()
 
 
diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py
index 9245572..3451fe4 100644
--- a/geo-replication/syncdaemon/monitor.py
+++ b/geo-replication/syncdaemon/monitor.py
@@ -144,9 +144,6 @@ class Monitor(object):
                                                     "%s::%s" % (slave_host,
                                                                 slave_vol))
 
-        set_monitor_status(gconf.state_file, self.ST_STARTED)
-        self.status[w[0]['dir']].set_worker_status(self.ST_INIT)
-
         ret = 0
 
         def nwait(p, o=0):
@@ -196,6 +193,7 @@ class Monitor(object):
         # Spawn the worker and agent in lock to avoid fd leak
         self.lock.acquire()
 
+        self.status[w[0]['dir']].set_worker_status(self.ST_INIT)
         logging.info(lf('starting gsyncd worker',
                         brick=w[0]['dir'],
                         slave_node=remote_host))
@@ -375,6 +373,19 @@ class Monitor(object):
             t = Thread(target=wmon, args=[wx])
             t.start()
             ta.append(t)
+
+        # The monitor status was being updated in each monitor thread. It
+        # should not be done there, as it can cause a deadlock for a worker
+        # start. set_monitor_status uses flock to synchronize multiple
+        # instances updating the file. Since each monitor thread forks
+        # worker and agent, these processes can hold a reference to the fd
+        # of the status file, causing a deadlock for workers which start
+        # later, as the flock is not released until all references to the
+        # same fd are closed. It would also cause fd leaks.
+
+        self.lock.acquire()
+        set_monitor_status(gconf.get("state-file"), self.ST_STARTED)
+        self.lock.release()
         for t in ta:
             t.join()
 
--
1.8.3.1
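
The concurrency pattern that the monitor.py hunks establish can be
summarized in a small self-contained sketch (simplified, with stand-in
names; not code from the patch): the status file is written once, and
the same lock guards both that flock-synchronized write and every fork,
so no thread forks while the status fd is open in another thread.

    import fcntl
    import os
    import threading

    lock = threading.Lock()          # shared by all monitor threads

    def set_status(state_file, status):
        # flock-synchronized update (stand-in for set_monitor_status)
        with open(state_file, "w") as f:
            fcntl.flock(f, fcntl.LOCK_EX)
            f.write(status)
            fcntl.flock(f, fcntl.LOCK_UN)

    def spawn_worker():
        # stand-in for forking a worker/agent process
        pid = os.fork()
        if pid == 0:
            os._exit(0)
        return pid

    # Update the status once, under the same lock used around every fork,
    # so the flocked fd can never leak into a child process.
    with lock:
        set_status("monitor.status", "Started")

    def monitor_thread():
        with lock:
            pid = spawn_worker()
        os.waitpid(pid, 0)           # wait for the worker outside the lock

    threads = [threading.Thread(target=monitor_thread) for _ in range(3)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()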