cf62f1947f
Resolves: bz#1264911 bz#1277924 bz#1286820 bz#1360331 bz#1401969 Resolves: bz#1410719 bz#1419438 bz#1426042 bz#1444820 bz#1459101 Resolves: bz#1464150 bz#1464350 bz#1466122 bz#1466129 bz#1467903 Resolves: bz#1468972 bz#1476876 bz#1484446 bz#1492591 bz#1498391 Resolves: bz#1498730 bz#1499865 bz#1500704 bz#1501345 bz#1505570 Resolves: bz#1507361 bz#1507394 bz#1509102 bz#1509191 bz#1509810 Resolves: bz#1509833 bz#1511766 bz#1512470 bz#1512496 bz#1512963 Resolves: bz#1515051 bz#1519076 bz#1519740 bz#1534253 bz#1534530 Signed-off-by: Milind Changire <mchangir@redhat.com>
189 lines
7.1 KiB
Diff
189 lines
7.1 KiB
Diff
From a4f3087ecbd1979525add83a149acaf2443d8e59 Mon Sep 17 00:00:00 2001
|
|
From: Xavier Hernandez <jahernan@redhat.com>
|
|
Date: Wed, 22 Nov 2017 11:10:32 +0100
|
|
Subject: [PATCH 101/128] cluster/ec: Prevent self-heal from working after
|
|
PARENT_DOWN
|
|
|
|
When the volume is being stopped, PARENT_DOWN event is received.
|
|
This instructs EC to wait until all pending operations are completed
|
|
before declaring itself down. However heal operations are ignored
|
|
and allowed to continue even after having said it was down.
|
|
|
|
This may cause unexpected results and crashes.
|
|
|
|
To solve this, heal operations are considered exactly the same as any
|
|
other operation and EC won't propagate PARENT_DOWN until all
|
|
operations, including healing, are complete. To avoid big delays
|
|
if this happens in the middle of a big heal, a check has been
|
|
added to quit current heal if shutdown is detected.
|
|
|
|
>Change-Id: I26645e236ebd115eb22c7ad4972461111a2d2034
|
|
>BUG: 1515266
|
|
>Signed-off-by: Xavier Hernandez <jahernan@redhat.com>
|
|
Upstream Patch: https://review.gluster.org/#/c/18840/
|
|
|
|
BUG: 1505570
|
|
Change-Id: I26645e236ebd115eb22c7ad4972461111a2d2034
|
|
Signed-off-by: Sunil Kumar Acharya <sheggodu@redhat.com>
|
|
Reviewed-on: https://code.engineering.redhat.com/gerrit/125199
|
|
Tested-by: RHGS Build Bot <nigelb@redhat.com>
|
|
---
|
|
xlators/cluster/ec/src/ec-data.c | 21 ++------------
|
|
xlators/cluster/ec/src/ec-heal.c | 59 +++++++++++++++++++++++++++++++++-------
|
|
2 files changed, 52 insertions(+), 28 deletions(-)
|
|
|
|
diff --git a/xlators/cluster/ec/src/ec-data.c b/xlators/cluster/ec/src/ec-data.c
|
|
index 28bf988..54c708a 100644
|
|
--- a/xlators/cluster/ec/src/ec-data.c
|
|
+++ b/xlators/cluster/ec/src/ec-data.c
|
|
@@ -103,19 +103,6 @@ void ec_cbk_data_destroy(ec_cbk_data_t * cbk)
|
|
mem_put(cbk);
|
|
}
|
|
|
|
-/* PARENT_DOWN will be notified to children only after these fops are complete
|
|
- * when graph switch happens. We do not want graph switch to be waiting on
|
|
- * heal to complete as healing big file/directory could take a while. Which
|
|
- * will lead to hang on the mount.
|
|
- */
|
|
-static gf_boolean_t
|
|
-ec_needs_graceful_completion (ec_fop_data_t *fop)
|
|
-{
|
|
- if ((fop->id != EC_FOP_HEAL) && (fop->id != EC_FOP_FHEAL))
|
|
- return _gf_true;
|
|
- return _gf_false;
|
|
-}
|
|
-
|
|
ec_fop_data_t * ec_fop_data_allocate(call_frame_t * frame, xlator_t * this,
|
|
int32_t id, uint32_t flags,
|
|
uintptr_t target, int32_t minimum,
|
|
@@ -202,13 +189,11 @@ ec_fop_data_t * ec_fop_data_allocate(call_frame_t * frame, xlator_t * this,
|
|
fop->parent = parent;
|
|
}
|
|
|
|
- if (ec_needs_graceful_completion (fop)) {
|
|
- LOCK(&ec->lock);
|
|
+ LOCK(&ec->lock);
|
|
|
|
- list_add_tail(&fop->pending_list, &ec->pending_fops);
|
|
+ list_add_tail(&fop->pending_list, &ec->pending_fops);
|
|
|
|
- UNLOCK(&ec->lock);
|
|
- }
|
|
+ UNLOCK(&ec->lock);
|
|
|
|
return fop;
|
|
}
|
|
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
|
|
index fd8c902..b8518d6 100644
|
|
--- a/xlators/cluster/ec/src/ec-heal.c
|
|
+++ b/xlators/cluster/ec/src/ec-heal.c
|
|
@@ -1418,6 +1418,12 @@ ec_name_heal_handler (xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
|
|
int i = 0;
|
|
int ret = 0;
|
|
|
|
+ if (ec->shutdown) {
|
|
+ gf_msg_debug(this->name, 0, "Cancelling directory heal "
|
|
+ "because EC is stopping.");
|
|
+ return -ENOTCONN;
|
|
+ }
|
|
+
|
|
memcpy (name_on, name_data->participants, ec->nodes);
|
|
ret = ec_heal_name (name_data->frame, ec, parent->inode,
|
|
entry->d_name, name_on);
|
|
@@ -1439,6 +1445,7 @@ ec_heal_names (call_frame_t *frame, ec_t *ec, inode_t *inode,
|
|
int j = 0;
|
|
loc_t loc = {0};
|
|
struct ec_name_data name_data = {0};
|
|
+ int ret = 0;
|
|
|
|
loc.inode = inode_ref (inode);
|
|
gf_uuid_copy (loc.gfid, inode->gfid);
|
|
@@ -1449,18 +1456,23 @@ ec_heal_names (call_frame_t *frame, ec_t *ec, inode_t *inode,
|
|
for (i = 0; i < ec->nodes; i++) {
|
|
if (!participants[i])
|
|
continue;
|
|
- syncop_dir_scan (ec->xl_list[i], &loc,
|
|
- GF_CLIENT_PID_SELF_HEALD, &name_data,
|
|
- ec_name_heal_handler);
|
|
+ ret = syncop_dir_scan (ec->xl_list[i], &loc,
|
|
+ GF_CLIENT_PID_SELF_HEALD, &name_data,
|
|
+ ec_name_heal_handler);
|
|
+ if (ret < 0) {
|
|
+ break;
|
|
+ }
|
|
for (j = 0; j < ec->nodes; j++)
|
|
if (name_data.failed_on[j])
|
|
participants[j] = 0;
|
|
|
|
- if (EC_COUNT (participants, ec->nodes) <= ec->fragments)
|
|
- return -ENOTCONN;
|
|
+ if (EC_COUNT (participants, ec->nodes) <= ec->fragments) {
|
|
+ ret = -ENOTCONN;
|
|
+ break;
|
|
+ }
|
|
}
|
|
loc_wipe (&loc);
|
|
- return 0;
|
|
+ return ret;
|
|
}
|
|
|
|
int
|
|
@@ -1999,6 +2011,17 @@ ec_rebuild_data (call_frame_t *frame, ec_t *ec, fd_t *fd, uint64_t size,
|
|
|
|
for (heal->offset = 0; (heal->offset < size) && !heal->done;
|
|
heal->offset += heal->size) {
|
|
+ /* We immediately abort any heal if a shutdown request has been
|
|
+ * received to avoid delays. The healing of this file will be
|
|
+ * restarted by another SHD or other client that accesses the
|
|
+ * file. */
|
|
+ if (ec->shutdown) {
|
|
+ gf_msg_debug(ec->xl->name, 0, "Cancelling heal because "
|
|
+ "EC is stopping.");
|
|
+ ret = -ENOTCONN;
|
|
+ break;
|
|
+ }
|
|
+
|
|
gf_msg_debug (ec->xl->name, 0, "%s: sources: %d, sinks: "
|
|
"%d, offset: %"PRIu64" bsize: %"PRIu64,
|
|
uuid_utoa (fd->inode->gfid),
|
|
@@ -2595,16 +2618,32 @@ ec_handle_healers_done (ec_fop_data_t *fop)
|
|
return;
|
|
|
|
LOCK (&ec->lock);
|
|
- {
|
|
- list_del_init (&fop->healer);
|
|
+
|
|
+ list_del_init (&fop->healer);
|
|
+
|
|
+ do {
|
|
ec->healers--;
|
|
heal_fop = __ec_dequeue_heals (ec);
|
|
- }
|
|
+
|
|
+ if ((heal_fop != NULL) && ec->shutdown) {
|
|
+ /* This will prevent ec_handle_healers_done() to be
|
|
+ * called recursively. That would be problematic if
|
|
+ * the queue is too big. */
|
|
+ list_del_init(&heal_fop->healer);
|
|
+
|
|
+ UNLOCK(&ec->lock);
|
|
+
|
|
+ ec_fop_set_error(fop, ENOTCONN);
|
|
+ ec_heal_fail(ec, heal_fop);
|
|
+
|
|
+ LOCK(&ec->lock);
|
|
+ }
|
|
+ } while ((heal_fop != NULL) && ec->shutdown);
|
|
+
|
|
UNLOCK (&ec->lock);
|
|
|
|
if (heal_fop)
|
|
ec_launch_heal (ec, heal_fop);
|
|
-
|
|
}
|
|
|
|
void
|
|
--
|
|
1.8.3.1
|
|
|