From 772c9f37aaec75eca4b9ab0c3491e6a044d6b754 Mon Sep 17 00:00:00 2001 From: Milind Changire Date: Mon, 26 Mar 2018 06:38:12 -0400 Subject: [PATCH] autobuild v3.12.2-6 Resolves: bz#1491785 bz#1518710 bz#1523599 bz#1528733 bz#1550474 Resolves: bz#1550982 bz#1551186 bz#1552360 bz#1552414 bz#1552425 Resolves: bz#1554255 bz#1554905 bz#1555261 bz#1556895 bz#1557297 Resolves: bz#1559084 bz#1559788 Signed-off-by: Milind Changire --- 0181-glusterd-get-state-memory-leak-fix.patch | 55 + ...overity-issues-in-glusterd-handler.c.patch | 141 + ...-cluster-afr-Fix-dict-leak-in-pre-op.patch | 144 + ...sterfsd-remove-copyright-information.patch | 65 + ...-rpcsvc-correct-event-thread-scaling.patch | 51 + 0186-cli-Remove-upstream-doc-reference.patch | 39 + ...o-list_del_init-while-list-memory-is.patch | 55 + ...ume-of-geo-replication-with-wrong-us.patch | 70 + ...nable-proper-fgetattr-like-semantics.patch | 73 + ...Adding-option-to-take-full-file-lock.patch | 157 + ...ter-afr-Make-afr_fsync-a-transaction.patch | 362 ++ ...fr-Remove-compound-fops-usage-in-afr.patch | 631 ++++ ...cluster-afr-Remove-unused-code-paths.patch | 1505 +++++++++ ...Make-AFR-eager-locking-similar-to-EC.patch | 3002 +++++++++++++++++ ...dd-active-fd-count-option-in-gluster.patch | 226 ++ ...ch-to-active-fd-count-for-open-fd-ch.patch | 83 + ...-create-remove-export-file-only-from.patch | 72 + ...nge-default-read-policy-to-gfid-hash.patch | 81 + ...cluster-ec-avoid-delays-in-self-heal.patch | 383 +++ ...rd-cache-for-fallocate-zerofill-and-.patch | 331 ++ ...storage.reserve-limit-df-does-not-sh.patch | 62 + glusterfs.spec | 28 +- 22 files changed, 7615 insertions(+), 1 deletion(-) create mode 100644 0181-glusterd-get-state-memory-leak-fix.patch create mode 100644 0182-glusterd-Fix-coverity-issues-in-glusterd-handler.c.patch create mode 100644 0183-cluster-afr-Fix-dict-leak-in-pre-op.patch create mode 100644 0184-cli-glusterfsd-remove-copyright-information.patch create mode 100644 
0185-rpcsvc-correct-event-thread-scaling.patch create mode 100644 0186-cli-Remove-upstream-doc-reference.patch create mode 100644 0187-features-shard-Do-list_del_init-while-list-memory-is.patch create mode 100644 0188-georep-Pause-Resume-of-geo-replication-with-wrong-us.patch create mode 100644 0189-fuse-enable-proper-fgetattr-like-semantics.patch create mode 100644 0190-cluster-afr-Adding-option-to-take-full-file-lock.patch create mode 100644 0191-cluster-afr-Make-afr_fsync-a-transaction.patch create mode 100644 0192-cluster-afr-Remove-compound-fops-usage-in-afr.patch create mode 100644 0193-cluster-afr-Remove-unused-code-paths.patch create mode 100644 0194-cluster-afr-Make-AFR-eager-locking-similar-to-EC.patch create mode 100644 0195-storage-posix-Add-active-fd-count-option-in-gluster.patch create mode 100644 0196-cluster-afr-Switch-to-active-fd-count-for-open-fd-ch.patch create mode 100644 0197-glusterd-ganesha-create-remove-export-file-only-from.patch create mode 100644 0198-cluster-ec-Change-default-read-policy-to-gfid-hash.patch create mode 100644 0199-cluster-ec-avoid-delays-in-self-heal.patch create mode 100644 0200-quick-read-Discard-cache-for-fallocate-zerofill-and-.patch create mode 100644 0201-posix-After-set-storage.reserve-limit-df-does-not-sh.patch diff --git a/0181-glusterd-get-state-memory-leak-fix.patch b/0181-glusterd-get-state-memory-leak-fix.patch new file mode 100644 index 0000000..8f1709b --- /dev/null +++ b/0181-glusterd-get-state-memory-leak-fix.patch @@ -0,0 +1,55 @@ +From ee1c4f7d1303c61725f73870f32afa1bc4f68854 Mon Sep 17 00:00:00 2001 +From: Atin Mukherjee +Date: Thu, 4 Jan 2018 22:07:54 +0530 +Subject: [PATCH 181/201] glusterd: get-state memory leak fix + +>upstream mainline patch : https://review.gluster.org/#/c/19139/ + +Change-Id: Ic4fcf2087f295d3dade944efb8fd08f7e2d7d516 +BUG: 1528733 +Signed-off-by: Atin Mukherjee +Reviewed-on: https://code.engineering.redhat.com/gerrit/132079 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar 
Heggodu Gopala Acharya +--- + xlators/mgmt/glusterd/src/glusterd-handler.c | 16 ++++++++++++++-- + 1 file changed, 14 insertions(+), 2 deletions(-) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c +index cf280a7..81926a8 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-handler.c ++++ b/xlators/mgmt/glusterd/src/glusterd-handler.c +@@ -5180,6 +5180,8 @@ glusterd_print_gsync_status_by_vol (FILE *fp, glusterd_volinfo_t *volinfo) + if (ret) + goto out; + out: ++ if (gsync_rsp_dict) ++ dict_unref (gsync_rsp_dict); + return ret; + } + +@@ -5495,9 +5497,19 @@ glusterd_get_state (rpcsvc_request_t *req, dict_t *dict) + if (odir[odirlen-1] != '/') + strcat (odir, "/"); + +- gf_asprintf (&ofilepath, "%s%s", odir, filename); ++ ret = gf_asprintf (&ofilepath, "%s%s", odir, filename); ++ if (ret < 0) { ++ GF_FREE (odir); ++ GF_FREE (filename); ++ gf_msg (this->name, GF_LOG_ERROR, 0, GD_MSG_DICT_SET_FAILED, ++ "Unable to get the output path"); ++ ret = -1; ++ goto out; ++ } ++ GF_FREE (odir); ++ GF_FREE (filename); + +- ret = dict_set_str (dict, "ofilepath", ofilepath); ++ ret = dict_set_dynstr (dict, "ofilepath", ofilepath); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, + GD_MSG_DICT_SET_FAILED, "Unable to set output path"); +-- +1.8.3.1 + diff --git a/0182-glusterd-Fix-coverity-issues-in-glusterd-handler.c.patch b/0182-glusterd-Fix-coverity-issues-in-glusterd-handler.c.patch new file mode 100644 index 0000000..77a0113 --- /dev/null +++ b/0182-glusterd-Fix-coverity-issues-in-glusterd-handler.c.patch @@ -0,0 +1,141 @@ +From 224af5bc6ea27a617f222cd83da871df85c6b7a1 Mon Sep 17 00:00:00 2001 +From: Samikshan Bairagya +Date: Thu, 9 Nov 2017 15:15:37 +0530 +Subject: [PATCH 182/201] glusterd: Fix coverity issues in glusterd-handler.c + +Fixes get-state CLI related coverity issues 477, 511, 515, 523, +526 and 527 from the report at [1] + +[1] 
https://download.gluster.org/pub/gluster/glusterfs/static-analysis/master/glusterfs-coverity/2017-10-30-9aa574a5/html/ + +>upstream mainline patch : https://review.gluster.org/#/c/18706/ + +Change-Id: Ieb6f64c9035b4d9338d9515de003d607b7a4e9bc +BUG: 1528733 +Signed-off-by: Samikshan Bairagya +Reviewed-on: https://code.engineering.redhat.com/gerrit/132080 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/mgmt/glusterd/src/glusterd-handler.c | 34 ++++++++++++++++++++-------- + 1 file changed, 24 insertions(+), 10 deletions(-) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c +index 81926a8..16a3773 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-handler.c ++++ b/xlators/mgmt/glusterd/src/glusterd-handler.c +@@ -5426,6 +5426,7 @@ glusterd_get_state (rpcsvc_request_t *req, dict_t *dict) + char *odir = NULL; + char *filename = NULL; + char *ofilepath = NULL; ++ char *tmp_str = NULL; + int count = 0; + int count_bkp = 0; + int odirlen = 0; +@@ -5435,6 +5436,7 @@ glusterd_get_state (rpcsvc_request_t *req, dict_t *dict) + uint64_t memtotal = 0; + uint64_t memfree = 0; + int start_index = 0; ++ char id_str[64] = {0,}; + + char *vol_type_str = NULL; + char *hot_tier_type_str = NULL; +@@ -5453,12 +5455,14 @@ glusterd_get_state (rpcsvc_request_t *req, dict_t *dict) + + GF_VALIDATE_OR_GOTO (this->name, dict, out); + +- ret = dict_get_str (dict, "odir", &odir); ++ ret = dict_get_str (dict, "odir", &tmp_str); + if (ret) { + gf_asprintf (&odir, "%s", "/var/run/gluster/"); + gf_msg (this->name, GF_LOG_INFO, 0, + GD_MSG_DICT_GET_FAILED, + "Default output directory: %s", odir); ++ } else { ++ gf_asprintf (&odir, "%s", tmp_str); + } + + dp = sys_opendir (odir); +@@ -5481,7 +5485,7 @@ glusterd_get_state (rpcsvc_request_t *req, dict_t *dict) + goto out; + } + +- ret = dict_get_str (dict, "filename", &filename); ++ ret = dict_get_str (dict, "filename", &tmp_str); + if (ret) { + now = time 
(NULL); + strftime (timestamp, sizeof (timestamp), +@@ -5491,6 +5495,8 @@ glusterd_get_state (rpcsvc_request_t *req, dict_t *dict) + gf_msg (this->name, GF_LOG_INFO, 0, + GD_MSG_DICT_GET_FAILED, + "Default filename: %s", filename); ++ } else { ++ gf_asprintf (&filename, "%s", tmp_str); + } + + odirlen = strlen (odir); +@@ -5509,7 +5515,7 @@ glusterd_get_state (rpcsvc_request_t *req, dict_t *dict) + GF_FREE (odir); + GF_FREE (filename); + +- ret = dict_set_dynstr (dict, "ofilepath", ofilepath); ++ ret = dict_set_str (dict, "ofilepath", ofilepath); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, + GD_MSG_DICT_SET_FAILED, "Unable to set output path"); +@@ -5564,7 +5570,9 @@ glusterd_get_state (rpcsvc_request_t *req, dict_t *dict) + + fprintf (fp, "[Global]\n"); + +- fprintf (fp, "MYUUID: %s\n", gf_strdup (uuid_utoa (priv->uuid))); ++ uuid_utoa_r (priv->uuid, id_str); ++ fprintf (fp, "MYUUID: %s\n", id_str); ++ + fprintf (fp, "op-version: %d\n", priv->op_version); + + fprintf (fp, "\n[Global options]\n"); +@@ -5656,8 +5664,10 @@ glusterd_get_state (rpcsvc_request_t *req, dict_t *dict) + } + + fprintf (fp, "Volume%d.name: %s\n", ++count, volinfo->volname); +- fprintf (fp, "Volume%d.id: %s\n", count, +- gf_strdup (uuid_utoa (volinfo->volume_id))); ++ ++ uuid_utoa_r (volinfo->volume_id, id_str); ++ fprintf (fp, "Volume%d.id: %s\n", count, id_str); ++ + fprintf (fp, "Volume%d.type: %s\n", count, vol_type_str); + fprintf (fp, "Volume%d.transport_type: %s\n", count, + transport_type_str); +@@ -5777,8 +5787,11 @@ glusterd_get_state (rpcsvc_request_t *req, dict_t *dict) + fprintf (fp, "Volume%d.snapd_svc.inited: %s\n", count, + volinfo->snapd.svc.inited ? 
"True" : "False"); + +- fprintf (fp, "Volume%d.rebalance.id: %s\n", count, +- gf_strdup (uuid_utoa (volinfo->rebal.rebalance_id))); ++ uuid_utoa_r (volinfo->rebal.rebalance_id, id_str); ++ char *rebal_data = gf_uint64_2human_readable ( ++ volinfo->rebal.rebalance_data); ++ ++ fprintf (fp, "Volume%d.rebalance.id: %s\n", count, id_str); + fprintf (fp, "Volume%d.rebalance.status: %s\n", count, + rebal_status_str); + fprintf (fp, "Volume%d.rebalance.failures: %"PRIu64"\n", count, +@@ -5789,11 +5802,12 @@ glusterd_get_state (rpcsvc_request_t *req, dict_t *dict) + volinfo->rebal.lookedup_files); + fprintf (fp, "Volume%d.rebalance.files: %"PRIu64"\n", count, + volinfo->rebal.rebalance_files); +- fprintf (fp, "Volume%d.rebalance.data: %s\n", count, +- gf_uint64_2human_readable (volinfo->rebal.rebalance_data)); ++ fprintf (fp, "Volume%d.rebalance.data: %s\n", count, rebal_data); + fprintf (fp, "Volume%d.time_left: %"PRIu64"\n", count, + volinfo->rebal.time_left); + ++ GF_FREE (rebal_data); ++ + if (volinfo->type == GF_CLUSTER_TYPE_TIER) { + ret = glusterd_volume_get_hot_tier_type_str ( + volinfo, &hot_tier_type_str); +-- +1.8.3.1 + diff --git a/0183-cluster-afr-Fix-dict-leak-in-pre-op.patch b/0183-cluster-afr-Fix-dict-leak-in-pre-op.patch new file mode 100644 index 0000000..05908d2 --- /dev/null +++ b/0183-cluster-afr-Fix-dict-leak-in-pre-op.patch @@ -0,0 +1,144 @@ +From 37897f0b72617e442e4799b35ebda94294218e05 Mon Sep 17 00:00:00 2001 +From: Pranith Kumar K +Date: Wed, 28 Feb 2018 17:58:31 +0530 +Subject: [PATCH 183/201] cluster/afr: Fix dict-leak in pre-op + +At the time of pre-op, pre_op_xdata is populted with the xattrs we get from the +disk and at the time of post-op it gets over-written without unreffing the +previous value stored leading to a leak. 
+This is a regression we missed in +https://review.gluster.org/#/q/ba149bac92d169ae2256dbc75202dc9e5d06538e + + >BUG: 1550078 + >Change-Id: I0456f9ad6f77ce6248b747964a037193af3a3da7 + >Signed-off-by: Pranith Kumar K + >Upstream: https://review.gluster.org/19647 + +BUG: 1552360 +Change-Id: I0456f9ad6f77ce6248b747964a037193af3a3da7 +Signed-off-by: Pranith Kumar K +Reviewed-on: https://code.engineering.redhat.com/gerrit/131936 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/afr/src/afr-common.c | 16 ++++++++-------- + xlators/cluster/afr/src/afr-transaction.c | 20 ++++++++++---------- + xlators/cluster/afr/src/afr.h | 4 ++-- + 3 files changed, 20 insertions(+), 20 deletions(-) + +diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c +index 6e6f5fa..855e568 100644 +--- a/xlators/cluster/afr/src/afr-common.c ++++ b/xlators/cluster/afr/src/afr-common.c +@@ -1039,7 +1039,7 @@ afr_readables_fill (call_frame_t *frame, xlator_t *this, inode_t *inode, + xdata = replies[i].xdata; + ia_type = replies[i].poststat.ia_type; + } else {/* pre-op xattrop */ +- xdata = local->transaction.pre_op_xdata[i]; ++ xdata = local->transaction.changelog_xdata[i]; + ia_type = inode->ia_type; + } + +@@ -1757,13 +1757,13 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) + GF_FREE (local->transaction.pre_op); + + GF_FREE (local->transaction.pre_op_sources); +- if (local->transaction.pre_op_xdata) { ++ if (local->transaction.changelog_xdata) { + for (i = 0; i < priv->child_count; i++) { +- if (!local->transaction.pre_op_xdata[i]) ++ if (!local->transaction.changelog_xdata[i]) + continue; +- dict_unref (local->transaction.pre_op_xdata[i]); ++ dict_unref (local->transaction.changelog_xdata[i]); + } +- GF_FREE (local->transaction.pre_op_xdata); ++ GF_FREE (local->transaction.changelog_xdata); + } + + GF_FREE (local->transaction.eager_lock); +@@ -5531,10 +5531,10 @@ afr_transaction_local_init 
(afr_local_t *local, xlator_t *this) + if (!local->transaction.pre_op) + goto out; + +- local->transaction.pre_op_xdata = +- GF_CALLOC (sizeof (*local->transaction.pre_op_xdata), ++ local->transaction.changelog_xdata = ++ GF_CALLOC (sizeof (*local->transaction.changelog_xdata), + priv->child_count, gf_afr_mt_dict_t); +- if (!local->transaction.pre_op_xdata) ++ if (!local->transaction.changelog_xdata) + goto out; + + if (priv->arbiter_count == 1) { +diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c +index 19740e1..97f9dd4 100644 +--- a/xlators/cluster/afr/src/afr-transaction.c ++++ b/xlators/cluster/afr/src/afr-transaction.c +@@ -276,9 +276,9 @@ afr_compute_pre_op_sources (call_frame_t *frame, xlator_t *this) + matrix = ALLOC_MATRIX (priv->child_count, int); + + for (i = 0; i < priv->child_count; i++) { +- if (!local->transaction.pre_op_xdata[i]) ++ if (!local->transaction.changelog_xdata[i]) + continue; +- xdata = local->transaction.pre_op_xdata[i]; ++ xdata = local->transaction.changelog_xdata[i]; + afr_selfheal_fill_matrix (this, matrix, i, idx, xdata); + } + +@@ -295,13 +295,6 @@ afr_compute_pre_op_sources (call_frame_t *frame, xlator_t *this) + for (j = 0; j < priv->child_count; j++) + if (matrix[i][j] != 0) + local->transaction.pre_op_sources[j] = 0; +- +- /*We don't need the xattrs any more. 
*/ +- for (i = 0; i < priv->child_count; i++) +- if (local->transaction.pre_op_xdata[i]) { +- dict_unref (local->transaction.pre_op_xdata[i]); +- local->transaction.pre_op_xdata[i] = NULL; +- } + } + + gf_boolean_t +@@ -1173,7 +1166,7 @@ afr_changelog_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + } + + if (xattr) +- local->transaction.pre_op_xdata[child_index] = dict_ref (xattr); ++ local->transaction.changelog_xdata[child_index] = dict_ref (xattr); + + call_count = afr_frame_return (frame); + +@@ -1605,6 +1598,13 @@ afr_changelog_do (call_frame_t *frame, xlator_t *this, dict_t *xattr, + local = frame->local; + priv = this->private; + ++ for (i = 0; i < priv->child_count; i++) { ++ if (local->transaction.changelog_xdata[i]) { ++ dict_unref (local->transaction.changelog_xdata[i]); ++ local->transaction.changelog_xdata[i] = NULL; ++ } ++ } ++ + ret = afr_changelog_prepare (this, frame, &call_count, changelog_resume, + op, &xdata, &newloc_xdata); + +diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h +index 96fefb1..c822221 100644 +--- a/xlators/cluster/afr/src/afr.h ++++ b/xlators/cluster/afr/src/afr.h +@@ -748,8 +748,8 @@ typedef struct _afr_local { + + unsigned char *pre_op; + +- /* For arbiter configuration only. 
*/ +- dict_t **pre_op_xdata; ++ /* Changelog xattr dict for [f]xattrop*/ ++ dict_t **changelog_xdata; + unsigned char *pre_op_sources; + + /* @failed_subvols: subvolumes on which a pre-op or a +-- +1.8.3.1 + diff --git a/0184-cli-glusterfsd-remove-copyright-information.patch b/0184-cli-glusterfsd-remove-copyright-information.patch new file mode 100644 index 0000000..555acba --- /dev/null +++ b/0184-cli-glusterfsd-remove-copyright-information.patch @@ -0,0 +1,65 @@ +From 0b7fa3bdd3334c70d99d1a1b99c3e37d49fc66e3 Mon Sep 17 00:00:00 2001 +From: Atin Mukherjee +Date: Mon, 12 Mar 2018 19:47:11 +0530 +Subject: [PATCH 184/201] cli/glusterfsd: remove copyright information + +There's no point of dumping upstream copyright information in --version. + +Label: DOWNSTREAM ONLY + +Change-Id: I3a10e30878698e1d53082936bbf22bca560a3896 +BUG: 1550474 +Signed-off-by: Atin Mukherjee +Reviewed-on: https://code.engineering.redhat.com/gerrit/132445 +Tested-by: RHGS Build Bot +Reviewed-by: Milind Changire +--- + cli/src/cli.c | 10 +--------- + glusterfsd/src/glusterfsd.c | 10 +--------- + 2 files changed, 2 insertions(+), 18 deletions(-) + +diff --git a/cli/src/cli.c b/cli/src/cli.c +index ce06366..52c1b67 100644 +--- a/cli/src/cli.c ++++ b/cli/src/cli.c +@@ -66,15 +66,7 @@ extern int connected; + /* using argp for command line parsing */ + + const char *argp_program_version = "" \ +- PACKAGE_NAME" "PACKAGE_VERSION \ +- "\nRepository revision: " GLUSTERFS_REPOSITORY_REVISION "\n" \ +- "Copyright (c) 2006-2016 Red Hat, Inc. 
" \ +- "\n" \ +- "GlusterFS comes with ABSOLUTELY NO WARRANTY.\n" \ +- "It is licensed to you under your choice of the GNU Lesser\n" \ +- "General Public License, version 3 or any later version (LGPLv3\n" \ +- "or later), or the GNU General Public License, version 2 (GPLv2),\n" \ +- "in all cases as published by the Free Software Foundation."; ++ PACKAGE_NAME" "PACKAGE_VERSION; + const char *argp_program_bug_address = "<" PACKAGE_BUGREPORT ">"; + + struct rpc_clnt *global_quotad_rpc; +diff --git a/glusterfsd/src/glusterfsd.c b/glusterfsd/src/glusterfsd.c +index eeffdc5..38b863c 100644 +--- a/glusterfsd/src/glusterfsd.c ++++ b/glusterfsd/src/glusterfsd.c +@@ -87,15 +87,7 @@ static char gf_doc[] = ""; + static char argp_doc[] = "--volfile-server=SERVER [MOUNT-POINT]\n" \ + "--volfile=VOLFILE [MOUNT-POINT]"; + const char *argp_program_version = "" \ +- PACKAGE_NAME" "PACKAGE_VERSION \ +- "\nRepository revision: " GLUSTERFS_REPOSITORY_REVISION "\n" \ +- "Copyright (c) 2006-2016 Red Hat, Inc. 
" \ +- "\n" \ +- "GlusterFS comes with ABSOLUTELY NO WARRANTY.\n" \ +- "It is licensed to you under your choice of the GNU Lesser\n" \ +- "General Public License, version 3 or any later version (LGPLv3\n" \ +- "or later), or the GNU General Public License, version 2 (GPLv2),\n" \ +- "in all cases as published by the Free Software Foundation."; ++ PACKAGE_NAME" "PACKAGE_VERSION; + const char *argp_program_bug_address = "<" PACKAGE_BUGREPORT ">"; + + static error_t parse_opts (int32_t key, char *arg, struct argp_state *_state); +-- +1.8.3.1 + diff --git a/0185-rpcsvc-correct-event-thread-scaling.patch b/0185-rpcsvc-correct-event-thread-scaling.patch new file mode 100644 index 0000000..f51b61c --- /dev/null +++ b/0185-rpcsvc-correct-event-thread-scaling.patch @@ -0,0 +1,51 @@ +From 96146ccae552c648f33a19783fad824cf8101790 Mon Sep 17 00:00:00 2001 +From: Milind Changire +Date: Tue, 13 Mar 2018 12:03:56 +0530 +Subject: [PATCH 185/201] rpcsvc: correct event-thread scaling + +Problem: +Auto thread count derived from the number of attachs and detachs +was reset to 1 when server_reconfigure() was called. + +Solution: +Avoid auto-thread-count reset to 1. 
+ +mainline: +> BUG: 1547888 +> Reviewed-on: https://review.gluster.org/19689 +> Reviewed-by: Raghavendra G +> Signed-off-by: Milind Changire +(cherry picked from commit 0c3d984287d91d3fe1ffeef297252d912c08a410) + +Change-Id: Ic00e86adb81ba3c828e354a6ccb638209ae58b3e +BUG: 1554255 +Signed-off-by: Milind Changire +Reviewed-on: https://code.engineering.redhat.com/gerrit/132509 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/protocol/server/src/server.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/xlators/protocol/server/src/server.c b/xlators/protocol/server/src/server.c +index 4627ea0..89fde39 100644 +--- a/xlators/protocol/server/src/server.c ++++ b/xlators/protocol/server/src/server.c +@@ -978,10 +978,11 @@ do_rpc: + } + + /* +- * Let the event subsystem know that we're auto-scaling, with an +- * initial count of one. ++ * Update: ++ * We don't need to reset auto_thread_count since it has been derived ++ * out of the total bricks attached. We can reconfigure event threads ++ * but not auto threads. + */ +- ((struct event_pool *)(this->ctx->event_pool))->auto_thread_count = 1; + + GF_OPTION_RECONF ("event-threads", new_nthread, options, int32, out); + ret = server_check_event_threads (this, conf, new_nthread); +-- +1.8.3.1 + diff --git a/0186-cli-Remove-upstream-doc-reference.patch b/0186-cli-Remove-upstream-doc-reference.patch new file mode 100644 index 0000000..71c6532 --- /dev/null +++ b/0186-cli-Remove-upstream-doc-reference.patch @@ -0,0 +1,39 @@ +From 51b684d93e2a36dbf7cbded1e117994fddf2a6a9 Mon Sep 17 00:00:00 2001 +From: Ravishankar N +Date: Thu, 15 Mar 2018 12:56:02 +0530 +Subject: [PATCH 186/201] cli: Remove upstream doc reference + +...that is displayed while creating replica 2 volumes. 
+ +Label: DOWNSTREAM ONLY + +Change-Id: I16b45c8ad3a33cdd2a464d84f51d006d8f568b23 +BUG: 1554905 +Signed-off-by: Ravishankar N +Reviewed-on: https://code.engineering.redhat.com/gerrit/132744 +Reviewed-by: Karthik Subrahmanya +Tested-by: RHGS Build Bot +Reviewed-by: Atin Mukherjee +--- + cli/src/cli-cmd-parser.c | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +diff --git a/cli/src/cli-cmd-parser.c b/cli/src/cli-cmd-parser.c +index 54bd57f..e071b7d 100644 +--- a/cli/src/cli-cmd-parser.c ++++ b/cli/src/cli-cmd-parser.c +@@ -533,9 +533,8 @@ cli_cmd_volume_create_parse (struct cli_state *state, const char **words, + question = "Replica 2 volumes are prone" + " to split-brain. Use " + "Arbiter or Replica 3 to " +- "avoid this. See: " +- "http://docs.gluster.org/en/latest/Administrator%20Guide/Split%20brain%20and%20ways%20to%20deal%20with%20it/." +- "\nDo you still want to " ++ "avoid this.\n" ++ "Do you still want to " + "continue?\n"; + answer = cli_cmd_get_confirmation (state, + question); +-- +1.8.3.1 + diff --git a/0187-features-shard-Do-list_del_init-while-list-memory-is.patch b/0187-features-shard-Do-list_del_init-while-list-memory-is.patch new file mode 100644 index 0000000..a344374 --- /dev/null +++ b/0187-features-shard-Do-list_del_init-while-list-memory-is.patch @@ -0,0 +1,55 @@ +From c4ce2bb15a9df0a1b6a999612ccd053e67dc7083 Mon Sep 17 00:00:00 2001 +From: Pranith Kumar K +Date: Mon, 19 Mar 2018 12:09:18 +0530 +Subject: [PATCH 187/201] features/shard: Do list_del_init() while list memory + is valid + +Problem: +shard_post_lookup_fsync_handler() goes over the list of inode-ctx that need to +be fsynced and in cbk it removes each of the inode-ctx from the list. When the +first member of list is removed it tries to modify the list head's memory with +the latest next/prev and when this happens, there is no guarantee that the +list-head which is from stack memory of shard_post_lookup_fsync_handler() is +valid. 
+ +Fix: +Do list_del_init() in the loop before winding fsync. + + >BUG: 1557876 + >Change-Id: If429d3634219e1a435bd0da0ed985c646c59c2ca + >Signed-off-by: Pranith Kumar K + +Upstream-patch: https://review.gluster.org/19737 +BUG: 1556895 +Change-Id: If429d3634219e1a435bd0da0ed985c646c59c2ca +Signed-off-by: Pranith Kumar K +Reviewed-on: https://code.engineering.redhat.com/gerrit/133241 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/features/shard/src/shard.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index a661345..945458e 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -4521,7 +4521,6 @@ out: + if (op_ret == 0) + ctx->fsync_needed -= fsync_count; + GF_ASSERT (ctx->fsync_needed >= 0); +- list_del_init (&ctx->to_fsync_list); + if (ctx->fsync_needed != 0) { + list_add_tail (&ctx->to_fsync_list, + &base_ictx->to_fsync_list); +@@ -4596,6 +4595,7 @@ shard_post_lookup_fsync_handler (call_frame_t *frame, xlator_t *this) + anon_fd = NULL; + + list_for_each_entry_safe (iter, tmp, &copy, to_fsync_list) { ++ list_del_init (&iter->to_fsync_list); + fsync_count = 0; + shard_inode_ctx_get_fsync_count (iter->inode, this, + &fsync_count); +-- +1.8.3.1 + diff --git a/0188-georep-Pause-Resume-of-geo-replication-with-wrong-us.patch b/0188-georep-Pause-Resume-of-geo-replication-with-wrong-us.patch new file mode 100644 index 0000000..c20223d --- /dev/null +++ b/0188-georep-Pause-Resume-of-geo-replication-with-wrong-us.patch @@ -0,0 +1,70 @@ +From 60575494a3b1ef52ea6374d62654693b6ee0d9bd Mon Sep 17 00:00:00 2001 +From: Sunny Kumar +Date: Fri, 16 Mar 2018 17:11:09 +0530 +Subject: [PATCH 188/201] georep : Pause/Resume of geo-replication with wrong + user + +While performing pause/resume on geo-replication with wrong user +(a user other than the one you set up), always returns success. 
Which further +leads to snapshot creation failure as it is detecting active +geo-replication session. + +upstream patch : https://review.gluster.org/#/c/19658/ + +>BUG: 1550936 +>Signed-off-by: Sunny Kumar + +Change-Id: I6e96e8dd3e861348b057475387f0093cb903ae88 +BUG: 1557297 +Signed-off-by: Sunny Kumar +Reviewed-on: https://code.engineering.redhat.com/gerrit/132890 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/mgmt/glusterd/src/glusterd-geo-rep.c | 23 +++++++++++++++++++++++ + 1 file changed, 23 insertions(+) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-geo-rep.c b/xlators/mgmt/glusterd/src/glusterd-geo-rep.c +index 93d4516..dff8065 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-geo-rep.c ++++ b/xlators/mgmt/glusterd/src/glusterd-geo-rep.c +@@ -3641,6 +3641,18 @@ glusterd_op_stage_gsync_set (dict_t *dict, char **op_errstr) + if (path_list) + ret = -1; + } ++ ++ /* Check for geo-rep session is active or not for ++ * configured user.*/ ++ ret = glusterd_gsync_get_uuid (slave, volinfo, uuid); ++ if (ret) { ++ snprintf (errmsg, sizeof(errmsg), ++ "Geo-replication session between %s " ++ "and %s does not exist.", ++ volinfo->volname, slave); ++ ret = -1; ++ goto out; ++ } + } + break; + +@@ -3665,6 +3677,17 @@ glusterd_op_stage_gsync_set (dict_t *dict, char **op_errstr) + } + } + ++ /* Check for geo-rep session is active or not ++ * for configured user.*/ ++ ret = glusterd_gsync_get_uuid (slave, volinfo, uuid); ++ if (ret) { ++ snprintf (errmsg, sizeof(errmsg), "Geo-replication" ++ " session between %s and %s does not exist.", ++ volinfo->volname, slave); ++ ret = -1; ++ goto out; ++ } ++ + if (!is_force) { + ret = gd_pause_resume_validation (type, volinfo, slave, + statefile, op_errstr); +-- +1.8.3.1 + diff --git a/0189-fuse-enable-proper-fgetattr-like-semantics.patch b/0189-fuse-enable-proper-fgetattr-like-semantics.patch new file mode 100644 index 0000000..2dde0bd --- /dev/null +++ 
b/0189-fuse-enable-proper-fgetattr-like-semantics.patch @@ -0,0 +1,73 @@ +From 2be2ed1e0da026c4ae932daa263c1215d23342a9 Mon Sep 17 00:00:00 2001 +From: Csaba Henk +Date: Mon, 5 Mar 2018 13:02:09 +0100 +Subject: [PATCH 189/201] fuse: enable proper "fgetattr"-like semantics + +GETATTR FUSE message can carry a file handle +reference in which case it serves as a hint +for the FUSE server that the stat data is +preferably acquired in context of the given +filehandle (which we call '"fgetattr"-like +semantics'). + +So far FUSE ignored the GETATTR-provided +filehandle and grabbed a file handle +heuristically. This caused confusion in the +caching layers, which has been tracked down +as one of the reasons for the referred BUG. + +As of the BUG, this is just a partial fix. + +> BUG: 1512691 +> Change-Id: I67eebbf5407ca725ed111fbda4181ead10d03f6d +> Reviewed-on: https://review.gluster.org/19673 +> Signed-off-by: Csaba Henk + +BUG: 1518710 +Change-Id: I67eebbf5407ca725ed111fbda4181ead10d03f6d +Signed-off-by: Csaba Henk +Reviewed-on: https://code.engineering.redhat.com/gerrit/133419 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/mount/fuse/src/fuse-bridge.c | 14 +++++++++++++- + 1 file changed, 13 insertions(+), 1 deletion(-) + +diff --git a/xlators/mount/fuse/src/fuse-bridge.c b/xlators/mount/fuse/src/fuse-bridge.c +index 03d26eb..3e31eca 100644 +--- a/xlators/mount/fuse/src/fuse-bridge.c ++++ b/xlators/mount/fuse/src/fuse-bridge.c +@@ -905,7 +905,10 @@ fuse_getattr_resume (fuse_state_t *state) + } + + if (!IA_ISDIR (state->loc.inode->ia_type)) { +- state->fd = fd_lookup (state->loc.inode, 0); ++ if (state->fd == NULL) ++ state->fd = fd_lookup (state->loc.inode, state->finh->pid); ++ if (state->fd == NULL) ++ state->fd = fd_lookup (state->loc.inode, 0); + } + + if (!state->fd) { +@@ -931,9 +934,18 @@ fuse_getattr_resume (fuse_state_t *state) + static void + fuse_getattr (xlator_t *this, fuse_in_header_t *finh, void *msg) + { ++#if 
FUSE_KERNEL_MINOR_VERSION >= 9 ++ struct fuse_getattr_in *fgi = msg; ++ fuse_private_t *priv = NULL; ++#endif + fuse_state_t *state; + + GET_STATE (this, finh, state); ++#if FUSE_KERNEL_MINOR_VERSION >= 9 ++ priv = this->private; ++ if (priv->proto_minor >= 9 && fgi->getattr_flags & FUSE_GETATTR_FH) ++ state->fd = fd_ref ((fd_t *)fgi->fh); ++#endif + + fuse_resolve_inode_init (state, &state->resolve, state->finh->nodeid); + +-- +1.8.3.1 + diff --git a/0190-cluster-afr-Adding-option-to-take-full-file-lock.patch b/0190-cluster-afr-Adding-option-to-take-full-file-lock.patch new file mode 100644 index 0000000..77fa2aa --- /dev/null +++ b/0190-cluster-afr-Adding-option-to-take-full-file-lock.patch @@ -0,0 +1,157 @@ +From 0f2adea7ae377ea2efbab388f3af7e2a048f5f68 Mon Sep 17 00:00:00 2001 +From: karthik-us +Date: Wed, 17 Jan 2018 17:30:06 +0530 +Subject: [PATCH 190/201] cluster/afr: Adding option to take full file lock + +Problem: +In replica 3 volumes there is a possibility of ending up in a split +brain scenario when multiple clients write data on the same file +at non-overlapping regions in parallel. + +Scenario: +- Initially all the copies are good and all the clients get the value + of data readables as all good. +- Client C0 performs write W1 which fails on brick B0 and succeeds on + other two bricks. +- C1 performs write W2 which fails on B1 and succeeds on other two bricks. +- C2 performs write W3 which fails on B2 and succeeds on other two bricks. +- All the 3 writes above happen in parallel and fall on different ranges + so afr takes granular locks and all the writes are performed in parallel. + Since each client had data-readables as good, it does not see + file going into split-brain in the in_flight_split_brain check, hence + performs the post-op marking the pending xattrs. Now all the bricks + are being blamed by each other, ending up in split-brain. 
+ +Fix: +Have an option to take either full lock or range lock on files while +doing data transactions, to prevent the possibility of ending up in +split brains. With this change, by default the files will take full +lock while doing IO. If you want to make use of the old range lock +change the value of "cluster.full-lock" to "no". + +Upstream patch: https://review.gluster.org/#/c/19218/ + +> Change-Id: I7893fa33005328ed63daa2f7c35eeed7c5218962 +> BUG: 1535438 +> Signed-off-by: karthik-us + +Change-Id: I4d8b1c90bfff8f597cf7f7e49a71f5f6eb19f986 +BUG: 1552414 +Signed-off-by: karthik-us +Reviewed-on: https://code.engineering.redhat.com/gerrit/131966 +Tested-by: RHGS Build Bot +Reviewed-by: Pranith Kumar Karampuri +--- + libglusterfs/src/globals.h | 4 +++- + xlators/cluster/afr/src/afr-transaction.c | 2 +- + xlators/cluster/afr/src/afr.c | 8 ++++++++ + xlators/cluster/afr/src/afr.h | 5 +++-- + xlators/mgmt/glusterd/src/glusterd-volume-set.c | 7 +++++++ + 5 files changed, 22 insertions(+), 4 deletions(-) + +diff --git a/libglusterfs/src/globals.h b/libglusterfs/src/globals.h +index 6bbe3e6..8fd3318 100644 +--- a/libglusterfs/src/globals.h ++++ b/libglusterfs/src/globals.h +@@ -43,7 +43,7 @@ + */ + #define GD_OP_VERSION_MIN 1 /* MIN is the fresh start op-version, mostly + should not change */ +-#define GD_OP_VERSION_MAX GD_OP_VERSION_3_13_1 /* MAX VERSION is the maximum ++#define GD_OP_VERSION_MAX GD_OP_VERSION_3_13_2 /* MAX VERSION is the maximum + count in VME table, should + keep changing with + introduction of newer +@@ -107,6 +107,8 @@ + + #define GD_OP_VERSION_3_13_1 31301 /* Op-version for GlusterFS 3.13.1 */ + ++#define GD_OP_VERSION_3_13_2 31302 /* Op-version for GlusterFS 3.13.2 */ ++ + #include "xlator.h" + + /* THIS */ +diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c +index 97f9dd4..1c80c6b 100644 +--- a/xlators/cluster/afr/src/afr-transaction.c ++++ b/xlators/cluster/afr/src/afr-transaction.c +@@ -1991,7 
+1991,7 @@ afr_set_transaction_flock (xlator_t *this, afr_local_t *local) + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + priv = this->private; + +- if (priv->arbiter_count && ++ if ((priv->arbiter_count || priv->full_lock) && + local->transaction.type == AFR_DATA_TRANSACTION) { + /*Lock entire file to avoid network split brains.*/ + inodelk->flock.l_len = 0; +diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c +index d3aee77..9493fbb 100644 +--- a/xlators/cluster/afr/src/afr.c ++++ b/xlators/cluster/afr/src/afr.c +@@ -244,6 +244,7 @@ reconfigure (xlator_t *this, dict_t *options) + out); + GF_OPTION_RECONF ("locking-scheme", priv->locking_scheme, options, str, + out); ++ GF_OPTION_RECONF ("full-lock", priv->full_lock, options, bool, out); + GF_OPTION_RECONF ("use-compound-fops", priv->use_compound_fops, + options, bool, + out); +@@ -534,6 +535,7 @@ init (xlator_t *this) + + GF_OPTION_INIT ("pre-op-compat", priv->pre_op_compat, bool, out); + GF_OPTION_INIT ("locking-scheme", priv->locking_scheme, str, out); ++ GF_OPTION_INIT ("full-lock", priv->full_lock, bool, out); + GF_OPTION_INIT ("use-compound-fops", priv->use_compound_fops, + bool, out); + GF_OPTION_INIT ("granular-entry-heal", priv->esh_granular, bool, out); +@@ -1084,6 +1086,12 @@ struct volume_options options[] = { + "stop being compatible with afr-v1, which helps afr " + "be more granular while self-healing", + }, ++ { .key = {"full-lock"}, ++ .type = GF_OPTION_TYPE_BOOL, ++ .default_value = "yes", ++ .description = "If this option is disabled, then the IOs will take " ++ "range locks same as versions till 3.13.1." 
++ }, + { .key = {"granular-entry-heal"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "no", +diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h +index c822221..b6f5388 100644 +--- a/xlators/cluster/afr/src/afr.h ++++ b/xlators/cluster/afr/src/afr.h +@@ -178,9 +178,10 @@ typedef struct _afr_private { + void *pump_private; + gf_boolean_t use_afr_in_pump; + char *locking_scheme; +- gf_boolean_t esh_granular; ++ gf_boolean_t full_lock; ++ gf_boolean_t esh_granular; + gf_boolean_t consistent_io; +- gf_boolean_t use_compound_fops; ++ gf_boolean_t use_compound_fops; + } afr_private_t; + + +diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c +index b603c7f..8d3407d 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c ++++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c +@@ -1507,6 +1507,13 @@ struct volopt_map_entry glusterd_volopt_map[] = { + .flags = OPT_FLAG_CLIENT_OPT + }, + ++ { .key = "cluster.full-lock", ++ .voltype = "cluster/replicate", ++ .type = NO_DOC, ++ .op_version = GD_OP_VERSION_3_13_2, ++ .flags = OPT_FLAG_CLIENT_OPT ++ }, ++ + /* stripe xlator options */ + { .key = "cluster.stripe-block-size", + .voltype = "cluster/stripe", +-- +1.8.3.1 + diff --git a/0191-cluster-afr-Make-afr_fsync-a-transaction.patch b/0191-cluster-afr-Make-afr_fsync-a-transaction.patch new file mode 100644 index 0000000..92944c8 --- /dev/null +++ b/0191-cluster-afr-Make-afr_fsync-a-transaction.patch @@ -0,0 +1,362 @@ +From 9f670a342ffed3eee7cb91a67dcc2f2a27600983 Mon Sep 17 00:00:00 2001 +From: karthik-us +Date: Fri, 23 Feb 2018 15:12:19 +0530 +Subject: [PATCH 191/201] cluster/afr: Make afr_fsync a transaction + +Upstream patch: https://review.gluster.org/#/c/19621/ + +Change-Id: I713401feb96393f668efb074f2d5b870d19e6fda +BUG: 1552425 +Signed-off-by: karthik-us +Reviewed-on: https://code.engineering.redhat.com/gerrit/131942 +Tested-by: RHGS Build Bot +Reviewed-by: Pranith Kumar Karampuri 
+--- + xlators/cluster/afr/src/afr-common.c | 163 ------------------------------ + xlators/cluster/afr/src/afr-inode-write.c | 108 ++++++++++++++++++++ + xlators/cluster/afr/src/afr-inode-write.h | 4 + + xlators/cluster/afr/src/afr.c | 2 +- + xlators/cluster/afr/src/afr.h | 4 + + 5 files changed, 117 insertions(+), 164 deletions(-) + +diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c +index 855e568..a790402 100644 +--- a/xlators/cluster/afr/src/afr-common.c ++++ b/xlators/cluster/afr/src/afr-common.c +@@ -3435,169 +3435,6 @@ out: + return 0; + } + +-/* }}} */ +- +- +-/* {{{ fsync */ +- +-int +-afr_fsync_unwind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, struct iatt *prebuf, +- struct iatt *postbuf, dict_t *xdata) +-{ +- AFR_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf, +- xdata); +- return 0; +-} +- +-int +-afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, struct iatt *prebuf, +- struct iatt *postbuf, dict_t *xdata) +-{ +- afr_local_t *local = NULL; +- afr_private_t *priv = NULL; +- int i = 0; +- int call_count = -1; +- int child_index = (long) cookie; +- int read_subvol = 0; +- call_stub_t *stub = NULL; +- +- local = frame->local; +- priv = this->private; +- +- LOCK (&frame->lock); +- { +- local->replies[child_index].valid = 1; +- local->replies[child_index].op_ret = op_ret; +- local->replies[child_index].op_errno = op_errno; +- if (op_ret == 0) { +- if (prebuf) +- local->replies[child_index].prestat = *prebuf; +- if (postbuf) +- local->replies[child_index].poststat = *postbuf; +- if (xdata) +- local->replies[child_index].xdata = +- dict_ref (xdata); +- } +- } +- UNLOCK (&frame->lock); +- +- call_count = afr_frame_return (frame); +- +- if (call_count == 0) { +- local->op_ret = -1; +- local->op_errno = afr_final_errno (local, priv); +- read_subvol = afr_data_subvol_get (local->inode, this, NULL, +- 
local->readable, NULL, NULL); +- /* Pick a reply that is valid and readable, with a preference +- * given to read_subvol. */ +- for (i = 0; i < priv->child_count; i++) { +- if (!local->replies[i].valid) +- continue; +- if (local->replies[i].op_ret != 0) +- continue; +- if (!local->readable[i]) +- continue; +- local->op_ret = local->replies[i].op_ret; +- local->op_errno = local->replies[i].op_errno; +- local->cont.inode_wfop.prebuf = +- local->replies[i].prestat; +- local->cont.inode_wfop.postbuf = +- local->replies[i].poststat; +- if (local->replies[i].xdata) { +- if (local->xdata_rsp) +- dict_unref (local->xdata_rsp); +- local->xdata_rsp = +- dict_ref (local->replies[i].xdata); +- } +- if (i == read_subvol) +- break; +- } +- +- /* Make a stub out of the frame, and register it +- with the waking up post-op. When the call-stub resumes, +- we are guaranteed that there was no post-op pending +- (i.e changelogs were unset in the server). This is an +- essential "guarantee", that fsync() returns only after +- completely finishing EVERYTHING, including the delayed +- post-op. This guarantee is expected by FUSE graph switching +- for example. 
+- */ +- stub = fop_fsync_cbk_stub (frame, afr_fsync_unwind_cbk, +- local->op_ret, local->op_errno, +- &local->cont.inode_wfop.prebuf, +- &local->cont.inode_wfop.postbuf, +- local->xdata_rsp); +- if (!stub) { +- AFR_STACK_UNWIND (fsync, frame, -1, ENOMEM, 0, 0, 0); +- return 0; +- } +- +- /* If no new unstable writes happened between the +- time we cleared the unstable write witness flag in afr_fsync +- and now, calling afr_delayed_changelog_wake_up() should +- wake up and skip over the fsync phase and go straight to +- afr_changelog_post_op_now() +- */ +- afr_delayed_changelog_wake_resume (this, local->fd, stub); +- } +- +- return 0; +-} +- +- +-int +-afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, +- dict_t *xdata) +-{ +- afr_private_t *priv = NULL; +- afr_local_t *local = NULL; +- int i = 0; +- int32_t call_count = 0; +- int32_t op_errno = ENOMEM; +- +- priv = this->private; +- +- local = AFR_FRAME_INIT (frame, op_errno); +- if (!local) +- goto out; +- +- local->op = GF_FOP_FSYNC; +- if (!afr_is_consistent_io_possible (local, priv, &op_errno)) +- goto out; +- +- local->fd = fd_ref (fd); +- +- if (afr_fd_has_witnessed_unstable_write (this, fd)) { +- /* don't care. 
we only wanted to CLEAR the bit */ +- } +- +- local->inode = inode_ref (fd->inode); +- +- call_count = local->call_count; +- for (i = 0; i < priv->child_count; i++) { +- if (local->child_up[i]) { +- STACK_WIND_COOKIE (frame, afr_fsync_cbk, +- (void *) (long) i, +- priv->children[i], +- priv->children[i]->fops->fsync, +- fd, datasync, xdata); +- if (!--call_count) +- break; +- } +- } +- +- return 0; +-out: +- AFR_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL); +- +- return 0; +-} +- +-/* }}} */ +- +-/* {{{ fsync */ + + int + afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c +index f0231b7..0e50443 100644 +--- a/xlators/cluster/afr/src/afr-inode-write.c ++++ b/xlators/cluster/afr/src/afr-inode-write.c +@@ -2539,3 +2539,111 @@ out: + AFR_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL); + return 0; + } ++ ++ ++int ++afr_fsync_unwind (call_frame_t *frame, xlator_t *this) ++{ ++ afr_local_t *local = NULL; ++ call_frame_t *main_frame = NULL; ++ ++ local = frame->local; ++ ++ main_frame = afr_transaction_detach_fop_frame (frame); ++ if (!main_frame) ++ return 0; ++ ++ AFR_STACK_UNWIND (fsync, main_frame, local->op_ret, local->op_errno, ++ &local->cont.inode_wfop.prebuf, ++ &local->cont.inode_wfop.postbuf, local->xdata_rsp); ++ ++ return 0; ++} ++ ++ ++int ++afr_fsync_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, struct iatt *prebuf, ++ struct iatt *postbuf, dict_t *xdata) ++{ ++ return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno, ++ prebuf, postbuf, NULL, xdata); ++} ++ ++ ++int ++afr_fsync_wind (call_frame_t *frame, xlator_t *this, int subvol) ++{ ++ afr_local_t *local = NULL; ++ afr_private_t *priv = NULL; ++ ++ local = frame->local; ++ priv = this->private; ++ ++ STACK_WIND_COOKIE (frame, afr_fsync_wind_cbk, (void *)(long) subvol, ++ priv->children[subvol], ++ 
priv->children[subvol]->fops->fsync, ++ local->fd, local->cont.fsync.datasync, ++ local->xdata_req); ++ return 0; ++} ++ ++int ++afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, ++ dict_t *xdata) ++{ ++ afr_local_t *local = NULL; ++ call_frame_t *transaction_frame = NULL; ++ int ret = -1; ++ int32_t op_errno = ENOMEM; ++ ++ transaction_frame = copy_frame (frame); ++ if (!transaction_frame) ++ goto out; ++ ++ local = AFR_FRAME_INIT (transaction_frame, op_errno); ++ if (!local) ++ goto out; ++ ++ if (xdata) ++ local->xdata_req = dict_copy_with_ref (xdata, NULL); ++ else ++ local->xdata_req = dict_new (); ++ ++ if (!local->xdata_req) ++ goto out; ++ ++ local->fd = fd_ref (fd); ++ ret = afr_set_inode_local (this, local, fd->inode); ++ if (ret) ++ goto out; ++ ++ local->op = GF_FOP_FSYNC; ++ local->cont.fsync.datasync = datasync; ++ ++ if (afr_fd_has_witnessed_unstable_write (this, fd)) { ++ /* don't care. we only wanted to CLEAR the bit */ ++ } ++ ++ local->transaction.wind = afr_fsync_wind; ++ local->transaction.fop = __afr_txn_write_fop; ++ local->transaction.done = __afr_txn_write_done; ++ local->transaction.unwind = afr_fsync_unwind; ++ ++ local->transaction.main_frame = frame; ++ ++ ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); ++ if (ret < 0) { ++ op_errno = -ret; ++ goto out; ++ } ++ ++ return 0; ++out: ++ if (transaction_frame) ++ AFR_STACK_DESTROY (transaction_frame); ++ ++ AFR_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL); ++ ++ return 0; ++} +diff --git a/xlators/cluster/afr/src/afr-inode-write.h b/xlators/cluster/afr/src/afr-inode-write.h +index e174cc2..1e8bb5c 100644 +--- a/xlators/cluster/afr/src/afr-inode-write.h ++++ b/xlators/cluster/afr/src/afr-inode-write.h +@@ -87,4 +87,8 @@ afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, + int32_t + afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata); ++ ++int 
++afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, ++ dict_t *xdata); + #endif /* __INODE_WRITE_H__ */ +diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c +index 9493fbb..aa48e76 100644 +--- a/xlators/cluster/afr/src/afr.c ++++ b/xlators/cluster/afr/src/afr.c +@@ -664,7 +664,6 @@ struct xlator_fops fops = { + .lk = afr_lk, + .flush = afr_flush, + .statfs = afr_statfs, +- .fsync = afr_fsync, + .fsyncdir = afr_fsyncdir, + .inodelk = afr_inodelk, + .finodelk = afr_finodelk, +@@ -696,6 +695,7 @@ struct xlator_fops fops = { + .zerofill = afr_zerofill, + .xattrop = afr_xattrop, + .fxattrop = afr_fxattrop, ++ .fsync = afr_fsync, + + /*inode open*/ + .opendir = afr_opendir, +diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h +index b6f5388..11278fb 100644 +--- a/xlators/cluster/afr/src/afr.h ++++ b/xlators/cluster/afr/src/afr.h +@@ -725,6 +725,10 @@ typedef struct _afr_local { + gf_seek_what_t what; + } seek; + ++ struct { ++ int32_t datasync; ++ } fsync; ++ + } cont; + + struct { +-- +1.8.3.1 + diff --git a/0192-cluster-afr-Remove-compound-fops-usage-in-afr.patch b/0192-cluster-afr-Remove-compound-fops-usage-in-afr.patch new file mode 100644 index 0000000..1f77cd1 --- /dev/null +++ b/0192-cluster-afr-Remove-compound-fops-usage-in-afr.patch @@ -0,0 +1,631 @@ +From 18093998f1e5178e707055904b32149c52acc774 Mon Sep 17 00:00:00 2001 +From: Pranith Kumar K +Date: Fri, 2 Mar 2018 10:13:20 +0530 +Subject: [PATCH 192/201] cluster/afr: Remove compound-fops usage in afr + +We are not seeing much improvement with this change. So removing the +feature so that it doesn't need to be maintained anymore. 
+ + > Fixes: #414 +Upstream-patch: https://review.gluster.org/19655 + +BUG: 1559788 +Change-Id: Ic7969b151544daf2547bd262a9fa03f575626411 +Signed-off-by: Pranith Kumar K +Reviewed-on: https://code.engineering.redhat.com/gerrit/131943 +Tested-by: RHGS Build Bot +--- + tests/basic/afr/compounded-write-txns.t | 37 ---- + xlators/cluster/afr/src/afr-common.c | 43 ---- + xlators/cluster/afr/src/afr-transaction.c | 333 +----------------------------- + xlators/cluster/afr/src/afr-transaction.h | 4 +- + xlators/cluster/afr/src/afr.c | 10 +- + xlators/cluster/afr/src/afr.h | 13 -- + 6 files changed, 7 insertions(+), 433 deletions(-) + delete mode 100644 tests/basic/afr/compounded-write-txns.t + +diff --git a/tests/basic/afr/compounded-write-txns.t b/tests/basic/afr/compounded-write-txns.t +deleted file mode 100644 +index 7cecd87..0000000 +--- a/tests/basic/afr/compounded-write-txns.t ++++ /dev/null +@@ -1,37 +0,0 @@ +-#!/bin/bash +-. $(dirname $0)/../../include.rc +-. $(dirname $0)/../../volume.rc +- +-cleanup +- +-TEST glusterd +-TEST pidof glusterd +-TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} +-TEST $CLI volume set $V0 write-behind off +-TEST $CLI volume set $V0 client-io-threads off +-TEST $CLI volume start $V0 +-TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0 +- +-# Create and generate data into a src file +- +-TEST `printf %1024s |tr " " "1" > /tmp/source` +-TEST `printf %1024s |tr " " "2" >> /tmp/source` +- +-TEST dd if=/tmp/source of=$M0/file bs=1024 count=2 2>/dev/null +-md5sum_file=$(md5sum $M0/file | awk '{print $1}') +- +-TEST $CLI volume set $V0 cluster.use-compound-fops on +- +-TEST dd if=$M0/file of=$M0/file-copy bs=1024 count=2 2>/dev/null +- +-EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 +-TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0 +- +-EXPECT "$md5sum_file" echo `md5sum $M0/file-copy | awk '{print $1}'` +- +-EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 +-TEST $CLI volume stop $V0 +-TEST $CLI volume delete $V0 +- 
+-TEST rm -f /tmp/source +-cleanup +diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c +index a790402..04c8613 100644 +--- a/xlators/cluster/afr/src/afr-common.c ++++ b/xlators/cluster/afr/src/afr-common.c +@@ -43,7 +43,6 @@ + #include "afr-self-heal.h" + #include "afr-self-heald.h" + #include "afr-messages.h" +-#include "compound-fop-utils.h" + + int32_t + afr_quorum_errno (afr_private_t *priv) +@@ -5246,7 +5245,6 @@ afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno) + + local->need_full_crawl = _gf_false; + +- local->compound = _gf_false; + INIT_LIST_HEAD (&local->healer); + return 0; + out: +@@ -5393,7 +5391,6 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) + if (!local->pending) + goto out; + +- local->compound = _gf_false; + INIT_LIST_HEAD (&local->transaction.eager_locked); + + ret = 0; +@@ -6142,46 +6139,6 @@ afr_get_msg_id (char *op_type) + return -1; + } + +-gf_boolean_t +-afr_can_compound_pre_op_and_op (afr_private_t *priv, glusterfs_fop_t fop) +-{ +- if (priv->arbiter_count != 0) +- return _gf_false; +- +- if (!priv->use_compound_fops) +- return _gf_false; +- +- switch (fop) { +- case GF_FOP_WRITE: +- return _gf_true; +- default: +- return _gf_false; +- } +-} +- +-afr_compound_cbk_t +-afr_pack_fop_args (call_frame_t *frame, compound_args_t *args, +- glusterfs_fop_t fop, int index) +-{ +- afr_local_t *local = frame->local; +- +- switch (fop) { +- case GF_FOP_WRITE: +- COMPOUND_PACK_ARGS (writev, GF_FOP_WRITE, +- args, index, +- local->fd, local->cont.writev.vector, +- local->cont.writev.count, +- local->cont.writev.offset, +- local->cont.writev.flags, +- local->cont.writev.iobref, +- local->xdata_req); +- return afr_pre_op_writev_cbk; +- default: +- break; +- } +- return NULL; +-} +- + int + afr_fav_child_reset_sink_xattrs_cbk (int ret, call_frame_t *heal_frame, + void *opaque) +diff --git a/xlators/cluster/afr/src/afr-transaction.c 
b/xlators/cluster/afr/src/afr-transaction.c +index 1c80c6b..02fb10a 100644 +--- a/xlators/cluster/afr/src/afr-transaction.c ++++ b/xlators/cluster/afr/src/afr-transaction.c +@@ -17,7 +17,6 @@ + #include "afr-transaction.h" + #include "afr-self-heal.h" + #include "afr-messages.h" +-#include "compound-fop-utils.h" + + #include + +@@ -38,10 +37,6 @@ afr_changelog_call_count (afr_transaction_type type, + unsigned char *failed_subvols, + unsigned int child_count); + int +-afr_post_op_unlock_do (call_frame_t *frame, xlator_t *this, dict_t *xattr, +- afr_changelog_resume_t changelog_resume, +- afr_xattrop_type_t op); +-int + afr_changelog_do (call_frame_t *frame, xlator_t *this, dict_t *xattr, + afr_changelog_resume_t changelog_resume, + afr_xattrop_type_t op); +@@ -847,12 +842,10 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) + afr_private_t *priv = this->private; + afr_local_t *local = NULL; + dict_t *xattr = NULL; +- afr_fd_ctx_t *fd_ctx = NULL; + int i = 0; + int ret = 0; + int idx = 0; + int nothing_failed = 1; +- gf_boolean_t compounded_unlock = _gf_true; + gf_boolean_t need_undirty = _gf_false; + + afr_handle_quorum (frame); +@@ -918,36 +911,8 @@ afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this) + goto out; + } + +- if (local->compound && local->fd) { +- LOCK (&local->fd->lock); +- { +- fd_ctx = __afr_fd_ctx_get (local->fd, this); +- for (i = 0; i < priv->child_count; i++) { +- if (local->transaction.pre_op[i] && +- local->transaction.eager_lock[i]) { +- if (fd_ctx->lock_piggyback[i]) +- compounded_unlock = _gf_false; +- else if (fd_ctx->lock_acquired[i]) +- compounded_unlock = _gf_false; +- } +- if (compounded_unlock == _gf_false) +- break; +- } +- } +- UNLOCK (&local->fd->lock); +- } +- +- /* Do not compound if any brick got piggybacked lock as +- * unlock should not be done for that. 
*/ +- if (local->compound && compounded_unlock) { +- afr_post_op_unlock_do (frame, this, xattr, +- afr_changelog_post_op_done, +- AFR_TRANSACTION_POST_OP); +- } else { +- afr_changelog_do (frame, this, xattr, +- afr_changelog_post_op_done, +- AFR_TRANSACTION_POST_OP); +- } ++ afr_changelog_do (frame, this, xattr, afr_changelog_post_op_done, ++ AFR_TRANSACTION_POST_OP); + out: + if (xattr) + dict_unref (xattr); +@@ -1277,66 +1242,6 @@ out: + } + + int +-afr_pre_op_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +- int op_ret, int op_errno, +- void *data, dict_t *xdata) +-{ +- afr_local_t *local = NULL; +- call_frame_t *fop_frame = NULL; +- default_args_cbk_t *write_args_cbk = NULL; +- compound_args_cbk_t *args_cbk = data; +- int call_count = -1; +- int child_index = -1; +- +- local = frame->local; +- child_index = (long) cookie; +- +- if (local->pre_op_compat) +- afr_changelog_pre_op_update (frame, this); +- +- if (op_ret == -1) { +- local->op_errno = op_errno; +- afr_transaction_fop_failed (frame, this, child_index); +- } +- +- /* If the compound fop failed due to saved_frame_unwind(), then +- * protocol/client fails it even before args_cbk is allocated. +- * Handle that case by passing the op_ret, op_errno values explicitly. 
+- */ +- if ((op_ret == -1) && (args_cbk == NULL)) { +- afr_inode_write_fill (frame, this, child_index, op_ret, +- op_errno, NULL, NULL, NULL); +- } else { +- write_args_cbk = &args_cbk->rsp_list[1]; +- afr_inode_write_fill (frame, this, child_index, +- write_args_cbk->op_ret, +- write_args_cbk->op_errno, +- &write_args_cbk->prestat, +- &write_args_cbk->poststat, +- write_args_cbk->xdata); +- } +- +- call_count = afr_frame_return (frame); +- +- if (call_count == 0) { +- compound_args_cleanup (local->c_args); +- local->c_args = NULL; +- afr_process_post_writev (frame, this); +- if (!afr_txn_nothing_failed (frame, this)) { +- /* Don't unwind until post-op is complete */ +- local->transaction.resume (frame, this); +- } else { +- /* frame change, place frame in post-op delay and unwind */ +- fop_frame = afr_transaction_detach_fop_frame (frame); +- afr_writev_copy_outvars (frame, fop_frame); +- local->transaction.resume (frame, this); +- afr_writev_unwind (fop_frame, this); +- } +- } +- return 0; +-} +- +-int + afr_changelog_prepare (xlator_t *this, call_frame_t *frame, int *call_count, + afr_changelog_resume_t changelog_resume, + afr_xattrop_type_t op, dict_t **xdata, +@@ -1366,223 +1271,6 @@ afr_changelog_prepare (xlator_t *this, call_frame_t *frame, int *call_count, + } + + int +-afr_pre_op_fop_do (call_frame_t *frame, xlator_t *this, dict_t *xattr, +- afr_changelog_resume_t changelog_resume, +- afr_xattrop_type_t op) +-{ +- afr_local_t *local = NULL; +- afr_private_t *priv = NULL; +- dict_t *xdata = NULL; +- dict_t *newloc_xdata = NULL; +- compound_args_t *args = NULL; +- int i = 0, call_count = 0; +- afr_compound_cbk_t compound_cbk; +- int ret = 0; +- int op_errno = ENOMEM; +- +- local = frame->local; +- priv = this->private; +- +- /* If lock failed on all, just unlock and unwind */ +- ret = afr_changelog_prepare (this, frame, &call_count, changelog_resume, +- op, &xdata, &newloc_xdata); +- +- if (ret) +- return 0; +- +- local->call_count = call_count; +- +- 
afr_save_lk_owner (frame); +- frame->root->lk_owner = +- local->transaction.main_frame->root->lk_owner; +- +- args = compound_fop_alloc (2, GF_CFOP_XATTROP_WRITEV, NULL); +- +- if (!args) +- goto err; +- +- /* pack pre-op part */ +- i = 0; +- COMPOUND_PACK_ARGS (fxattrop, GF_FOP_FXATTROP, +- args, i, +- local->fd, GF_XATTROP_ADD_ARRAY, +- xattr, xdata); +- i++; +- /* pack whatever fop needs to be packed +- * @compound_cbk holds the cbk that would need to be called +- */ +- compound_cbk = afr_pack_fop_args (frame, args, local->op, i); +- +- local->c_args = args; +- +- for (i = 0; i < priv->child_count; i++) { +- /* Means lock did not succeed on this brick */ +- if (!local->transaction.pre_op[i] || +- local->transaction.failed_subvols[i]) +- continue; +- +- STACK_WIND_COOKIE (frame, compound_cbk, +- (void *) (long) i, +- priv->children[i], +- priv->children[i]->fops->compound, +- args, +- NULL); +- if (!--call_count) +- break; +- } +- +- if (xdata) +- dict_unref (xdata); +- if (newloc_xdata) +- dict_unref (newloc_xdata); +- return 0; +-err: +- local->internal_lock.lock_cbk = local->transaction.done; +- local->op_ret = -1; +- local->op_errno = op_errno; +- +- afr_restore_lk_owner (frame); +- afr_unlock (frame, this); +- +- if (xdata) +- dict_unref (xdata); +- if (newloc_xdata) +- dict_unref (newloc_xdata); +- return 0; +-} +- +-int +-afr_post_op_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +- int op_ret, int op_errno, +- void *data, dict_t *xdata) +-{ +- afr_local_t *local = NULL; +- int call_count = -1; +- afr_internal_lock_t *int_lock = NULL; +- int32_t child_index = (long)cookie; +- +- local = frame->local; +- child_index = (long) cookie; +- +- local = frame->local; +- int_lock = &local->internal_lock; +- +- afr_update_uninodelk (local, int_lock, child_index); +- +- LOCK (&frame->lock); +- { +- call_count = --int_lock->lk_call_count; +- } +- UNLOCK (&frame->lock); +- +- if (call_count == 0) { +- compound_args_cleanup (local->c_args); +- 
local->c_args = NULL; +- if (local->transaction.resume_stub) { +- call_resume (local->transaction.resume_stub); +- local->transaction.resume_stub = NULL; +- } +- gf_msg_trace (this->name, 0, +- "All internal locks unlocked"); +- int_lock->lock_cbk (frame, this); +- } +- +- return 0; +-} +- +-int +-afr_post_op_unlock_do (call_frame_t *frame, xlator_t *this, dict_t *xattr, +- afr_changelog_resume_t changelog_resume, +- afr_xattrop_type_t op) +-{ +- afr_local_t *local = NULL; +- afr_private_t *priv = NULL; +- dict_t *xdata = NULL; +- dict_t *newloc_xdata = NULL; +- compound_args_t *args = NULL; +- afr_internal_lock_t *int_lock = NULL; +- afr_inodelk_t *inodelk = NULL; +- int i = 0; +- int call_count = 0; +- struct gf_flock flock = {0,}; +- int ret = 0; +- +- local = frame->local; +- priv = this->private; +- int_lock = &local->internal_lock; +- +- if (afr_is_inodelk_transaction(local)) { +- inodelk = afr_get_inodelk (int_lock, int_lock->domain); +- +- flock.l_start = inodelk->flock.l_start; +- flock.l_len = inodelk->flock.l_len; +- flock.l_type = F_UNLCK; +- } +- +- ret = afr_changelog_prepare (this, frame, &call_count, changelog_resume, +- op, &xdata, &newloc_xdata); +- +- if (ret) +- return 0; +- +- int_lock->lk_call_count = call_count; +- +- int_lock->lock_cbk = local->transaction.done; +- +- args = compound_fop_alloc (2, GF_CFOP_XATTROP_UNLOCK, NULL); +- +- if (!args) { +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- afr_changelog_post_op_done (frame, this); +- goto out; +- } +- +- i = 0; +- COMPOUND_PACK_ARGS (fxattrop, GF_FOP_FXATTROP, +- args, i, +- local->fd, GF_XATTROP_ADD_ARRAY, +- xattr, xdata); +- i++; +- +- if (afr_is_inodelk_transaction(local)) { +- if (local->fd) { +- COMPOUND_PACK_ARGS (finodelk, GF_FOP_FINODELK, +- args, i, +- int_lock->domain, local->fd, +- F_SETLK, &flock, NULL); +- } else { +- COMPOUND_PACK_ARGS (inodelk, GF_FOP_INODELK, +- args, i, +- int_lock->domain, &local->loc, +- F_SETLK, &flock, NULL); +- } +- } +- +- local->c_args = 
args; +- +- for (i = 0; i < priv->child_count; i++) { +- if (!local->transaction.pre_op[i] || +- local->transaction.failed_subvols[i]) +- continue; +- STACK_WIND_COOKIE (frame, afr_post_op_unlock_cbk, +- (void *) (long) i, +- priv->children[i], +- priv->children[i]->fops->compound, +- args, +- NULL); +- if (!--call_count) +- break; +- } +-out: +- if (xdata) +- dict_unref (xdata); +- if (newloc_xdata) +- dict_unref (newloc_xdata); +- return 0; +-} +- +-int + afr_changelog_do (call_frame_t *frame, xlator_t *this, dict_t *xattr, + afr_changelog_resume_t changelog_resume, + afr_xattrop_type_t op) +@@ -1793,21 +1481,8 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) + goto next; + } + +- /* Till here we have already decided if pre-op needs to be done, +- * based on various criteria. The only thing that needs to be checked +- * now on is whether compound-fops is enabled or not. +- * If it is, then perform pre-op and fop together for writev op. +- */ +- if (afr_can_compound_pre_op_and_op (priv, local->op)) { +- local->compound = _gf_true; +- afr_pre_op_fop_do (frame, this, xdata_req, +- afr_transaction_perform_fop, +- AFR_TRANSACTION_PRE_OP); +- } else { +- afr_changelog_do (frame, this, xdata_req, +- afr_transaction_perform_fop, +- AFR_TRANSACTION_PRE_OP); +- } ++ afr_changelog_do (frame, this, xdata_req, afr_transaction_perform_fop, ++ AFR_TRANSACTION_PRE_OP); + + if (xdata_req) + dict_unref (xdata_req); +diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h +index dd19e5b..d01e144 100644 +--- a/xlators/cluster/afr/src/afr-transaction.h ++++ b/xlators/cluster/afr/src/afr-transaction.h +@@ -58,7 +58,5 @@ afr_pick_error_xdata (afr_local_t *local, afr_private_t *priv, + inode_t *inode1, unsigned char *readable1, + inode_t *inode2, unsigned char *readable2); + int +-afr_pre_op_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +- int op_ret, int op_errno, +- void *data, dict_t *xdata); 
++afr_transaction_resume (call_frame_t *frame, xlator_t *this); + #endif /* __TRANSACTION_H__ */ +diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c +index aa48e76..9ed0481 100644 +--- a/xlators/cluster/afr/src/afr.c ++++ b/xlators/cluster/afr/src/afr.c +@@ -245,9 +245,6 @@ reconfigure (xlator_t *this, dict_t *options) + GF_OPTION_RECONF ("locking-scheme", priv->locking_scheme, options, str, + out); + GF_OPTION_RECONF ("full-lock", priv->full_lock, options, bool, out); +- GF_OPTION_RECONF ("use-compound-fops", priv->use_compound_fops, +- options, bool, +- out); + GF_OPTION_RECONF ("granular-entry-heal", priv->esh_granular, options, + bool, out); + +@@ -536,8 +533,6 @@ init (xlator_t *this) + GF_OPTION_INIT ("pre-op-compat", priv->pre_op_compat, bool, out); + GF_OPTION_INIT ("locking-scheme", priv->locking_scheme, str, out); + GF_OPTION_INIT ("full-lock", priv->full_lock, bool, out); +- GF_OPTION_INIT ("use-compound-fops", priv->use_compound_fops, +- bool, out); + GF_OPTION_INIT ("granular-entry-heal", priv->esh_granular, bool, out); + + GF_OPTION_INIT ("eager-lock", priv->eager_lock, bool, out); +@@ -1121,9 +1116,8 @@ struct volume_options options[] = { + { .key = {"use-compound-fops"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "no", +- .description = "Use compound fops framework to modify afr " +- "transaction such that network roundtrips are " +- "reduced, thus improving the performance.", ++ .description = "this option exists only for backward compatibility " ++ "and configuring it doesn't have any effect" + }, + { .key = {NULL} }, + }; +diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h +index 11278fb..a5b0d3b 100644 +--- a/xlators/cluster/afr/src/afr.h ++++ b/xlators/cluster/afr/src/afr.h +@@ -44,10 +44,6 @@ typedef int (*afr_inode_refresh_cbk_t) (call_frame_t *frame, xlator_t *this, int + + typedef int (*afr_changelog_resume_t) (call_frame_t *frame, xlator_t *this); + +-typedef int (*afr_compound_cbk_t) 
(call_frame_t *frame, void *cookie, +- xlator_t *this, int op_ret, int op_errno, +- void *data, dict_t *xdata); +- + #define AFR_COUNT(array,max) ({int __i; int __res = 0; for (__i = 0; __i < max; __i++) if (array[__i]) __res++; __res;}) + #define AFR_INTERSECT(dst,src1,src2,max) ({int __i; for (__i = 0; __i < max; __i++) dst[__i] = src1[__i] && src2[__i];}) + #define AFR_CMP(a1,a2,len) ({int __cmp = 0; int __i; for (__i = 0; __i < len; __i++) if (a1[__i] != a2[__i]) { __cmp = 1; break;} __cmp;}) +@@ -181,7 +177,6 @@ typedef struct _afr_private { + gf_boolean_t full_lock; + gf_boolean_t esh_granular; + gf_boolean_t consistent_io; +- gf_boolean_t use_compound_fops; + } afr_private_t; + + +@@ -843,9 +838,7 @@ typedef struct _afr_local { + call_frame_t *heal_frame; + + gf_boolean_t need_full_crawl; +- gf_boolean_t compound; + afr_fop_lock_state_t fop_lock_state; +- compound_args_t *c_args; + + gf_boolean_t is_read_txn; + afr_inode_ctx_t *inode_ctx; +@@ -1252,12 +1245,6 @@ afr_writev_copy_outvars (call_frame_t *src_frame, call_frame_t *dst_frame); + void + afr_update_uninodelk (afr_local_t *local, afr_internal_lock_t *int_lock, + int32_t child_index); +-gf_boolean_t +-afr_can_compound_pre_op_and_op (afr_private_t *priv, glusterfs_fop_t fop); +- +-afr_compound_cbk_t +-afr_pack_fop_args (call_frame_t *frame, compound_args_t *args, +- glusterfs_fop_t fop, int index); + int + afr_is_inodelk_transaction(afr_local_t *local); + +-- +1.8.3.1 + diff --git a/0193-cluster-afr-Remove-unused-code-paths.patch b/0193-cluster-afr-Remove-unused-code-paths.patch new file mode 100644 index 0000000..de7e658 --- /dev/null +++ b/0193-cluster-afr-Remove-unused-code-paths.patch @@ -0,0 +1,1505 @@ +From a33f967c77ccf580db42a9f02708fa39f5d7f310 Mon Sep 17 00:00:00 2001 +From: Pranith Kumar K +Date: Fri, 2 Mar 2018 12:37:42 +0530 +Subject: [PATCH 193/201] cluster/afr: Remove unused code paths + +Removed +1) afr-v1 self-heal locks related code which is not used anymore +2) transaction has some 
data types that are not needed, so removed them +3) Never used lock tracing available in afr as gluster's network tracing does +the job. So removed that as well. +4) Changelog is always enabled and afr is always used with locks, so +__changelog_enabled, afr_lock_server_count etc functions can be deleted. +5) transaction.fop/done/resume always call the same functions, so no need +to have these variables. + + > BUG: 1549606 + +Upstream-patch: https://review.gluster.org/19661 +BUG: 1491785 +Change-Id: I370c146fec2892d40e674d232a5d7256e003c7f1 +Signed-off-by: Pranith Kumar K +Reviewed-on: https://code.engineering.redhat.com/gerrit/131944 +Tested-by: RHGS Build Bot +--- + xlators/cluster/afr/src/afr-common.c | 10 +- + xlators/cluster/afr/src/afr-dir-write.c | 18 +- + xlators/cluster/afr/src/afr-inode-write.c | 39 +-- + xlators/cluster/afr/src/afr-lk-common.c | 508 +----------------------------- + xlators/cluster/afr/src/afr-transaction.c | 137 +------- + xlators/cluster/afr/src/afr-transaction.h | 2 - + xlators/cluster/afr/src/afr.c | 35 +- + xlators/cluster/afr/src/afr.h | 52 +-- + 8 files changed, 41 insertions(+), 760 deletions(-) + +diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c +index 04c8613..06863b6 100644 +--- a/xlators/cluster/afr/src/afr-common.c ++++ b/xlators/cluster/afr/src/afr-common.c +@@ -4526,9 +4526,6 @@ afr_priv_dump (xlator_t *this) + gf_proc_dump_write("data_self_heal", "%s", priv->data_self_heal); + gf_proc_dump_write("metadata_self_heal", "%d", priv->metadata_self_heal); + gf_proc_dump_write("entry_self_heal", "%d", priv->entry_self_heal); +- gf_proc_dump_write("data_change_log", "%d", priv->data_change_log); +- gf_proc_dump_write("metadata_change_log", "%d", priv->metadata_change_log); +- gf_proc_dump_write("entry-change_log", "%d", priv->entry_change_log); + gf_proc_dump_write("read_child", "%d", priv->read_child); + gf_proc_dump_write("favorite_child", "%d", priv->favorite_child); + 
gf_proc_dump_write("wait_count", "%u", priv->wait_count); +@@ -5252,8 +5249,7 @@ out: + } + + int +-afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count, +- transaction_lk_type_t lk_type) ++afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count) + { + int ret = -ENOMEM; + +@@ -5269,7 +5265,6 @@ afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count, + + lk->lock_op_ret = -1; + lk->lock_op_errno = EUCLEAN; +- lk->transaction_lk_type = lk_type; + + ret = 0; + out: +@@ -5336,8 +5331,7 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) + afr_private_t *priv = NULL; + + priv = this->private; +- ret = afr_internal_lock_init (&local->internal_lock, priv->child_count, +- AFR_TRANSACTION_LK); ++ ret = afr_internal_lock_init (&local->internal_lock, priv->child_count); + if (ret < 0) + goto out; + +diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c +index e088ed6..75889de 100644 +--- a/xlators/cluster/afr/src/afr-dir-write.c ++++ b/xlators/cluster/afr/src/afr-dir-write.c +@@ -267,7 +267,7 @@ __afr_dir_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + + afr_mark_entry_pending_changelog (frame, this); + +- local->transaction.resume (frame, this); ++ afr_transaction_resume (frame, this); + } + + return 0; +@@ -496,8 +496,6 @@ afr_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + goto out; + + local->transaction.wind = afr_create_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_create_unwind; + + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, +@@ -626,8 +624,6 @@ afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + goto out; + + local->transaction.wind = afr_mknod_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_mknod_unwind; 
+ + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, +@@ -762,8 +758,6 @@ afr_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + + local->op = GF_FOP_MKDIR; + local->transaction.wind = afr_mkdir_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_mkdir_unwind; + + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, +@@ -891,8 +885,6 @@ afr_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + local->op = GF_FOP_LINK; + + local->transaction.wind = afr_link_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_link_unwind; + + ret = afr_build_parent_loc (&local->transaction.parent_loc, newloc, +@@ -1021,8 +1013,6 @@ afr_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath, + + local->op = GF_FOP_SYMLINK; + local->transaction.wind = afr_symlink_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_symlink_unwind; + + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, +@@ -1156,8 +1146,6 @@ afr_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + + local->op = GF_FOP_RENAME; + local->transaction.wind = afr_rename_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_rename_unwind; + + ret = afr_build_parent_loc (&local->transaction.parent_loc, oldloc, +@@ -1308,8 +1296,6 @@ afr_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + + local->op = GF_FOP_UNLINK; + local->transaction.wind = afr_unlink_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_unlink_unwind; + + ret = afr_build_parent_loc 
(&local->transaction.parent_loc, loc, +@@ -1436,8 +1422,6 @@ afr_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + + local->op = GF_FOP_RMDIR; + local->transaction.wind = afr_rmdir_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_rmdir_unwind; + + ret = afr_build_parent_loc (&local->transaction.parent_loc, loc, +diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c +index 0e50443..2402bb2 100644 +--- a/xlators/cluster/afr/src/afr-inode-write.c ++++ b/xlators/cluster/afr/src/afr-inode-write.c +@@ -207,7 +207,7 @@ __afr_inode_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + local->transaction.unwind (frame, this); + } + +- local->transaction.resume (frame, this); ++ afr_transaction_resume (frame, this); + } + + return 0; +@@ -357,13 +357,10 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) + { +- afr_local_t *local = NULL; + call_frame_t *fop_frame = NULL; + int child_index = (long) cookie; + int call_count = -1; + +- local = frame->local; +- + afr_inode_write_fill (frame, this, child_index, op_ret, op_errno, + prebuf, postbuf, xdata); + +@@ -374,7 +371,7 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + + if (!afr_txn_nothing_failed (frame, this)) { + //Don't unwind until post-op is complete +- local->transaction.resume (frame, this); ++ afr_transaction_resume (frame, this); + } else { + /* + * Generally inode-write fops do transaction.unwind then +@@ -389,7 +386,7 @@ afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + + fop_frame = afr_transaction_detach_fop_frame (frame); + afr_writev_copy_outvars (frame, fop_frame); +- local->transaction.resume (frame, this); ++ afr_transaction_resume (frame, this); + afr_writev_unwind 
(fop_frame, this); + } + } +@@ -462,8 +459,6 @@ afr_do_writev (call_frame_t *frame, xlator_t *this) + local->op = GF_FOP_WRITE; + + local->transaction.wind = afr_writev_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_transaction_writev_unwind; + + local->transaction.main_frame = frame; +@@ -652,8 +647,6 @@ afr_truncate (call_frame_t *frame, xlator_t *this, + goto out; + + local->transaction.wind = afr_truncate_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_truncate_unwind; + + loc_copy (&local->loc, loc); +@@ -780,8 +773,6 @@ afr_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + local->op = GF_FOP_FTRUNCATE; + + local->transaction.wind = afr_ftruncate_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_ftruncate_unwind; + + local->transaction.main_frame = frame; +@@ -888,8 +879,6 @@ afr_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf, + goto out; + + local->transaction.wind = afr_setattr_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_setattr_unwind; + + loc_copy (&local->loc, loc); +@@ -995,8 +984,6 @@ afr_fsetattr (call_frame_t *frame, xlator_t *this, + goto out; + + local->transaction.wind = afr_fsetattr_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_fsetattr_unwind; + + local->fd = fd_ref (fd); +@@ -1639,8 +1626,6 @@ afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + goto out; + + local->transaction.wind = afr_setxattr_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + 
local->transaction.unwind = afr_setxattr_unwind; + + loc_copy (&local->loc, loc); +@@ -1753,8 +1738,6 @@ afr_fsetxattr (call_frame_t *frame, xlator_t *this, + goto out; + + local->transaction.wind = afr_fsetxattr_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_fsetxattr_unwind; + + local->fd = fd_ref (fd); +@@ -1868,8 +1851,6 @@ afr_removexattr (call_frame_t *frame, xlator_t *this, + goto out; + + local->transaction.wind = afr_removexattr_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_removexattr_unwind; + + loc_copy (&local->loc, loc); +@@ -1977,8 +1958,6 @@ afr_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + goto out; + + local->transaction.wind = afr_fremovexattr_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_fremovexattr_unwind; + + local->fd = fd_ref (fd); +@@ -2094,8 +2073,6 @@ afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + local->op = GF_FOP_FALLOCATE; + + local->transaction.wind = afr_fallocate_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_fallocate_unwind; + + local->transaction.main_frame = frame; +@@ -2208,8 +2185,6 @@ afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + local->op = GF_FOP_DISCARD; + + local->transaction.wind = afr_discard_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_discard_unwind; + + local->transaction.main_frame = frame; +@@ -2319,8 +2294,6 @@ afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + local->op = GF_FOP_ZEROFILL; + + local->transaction.wind = afr_zerofill_wind; +- 
local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_zerofill_unwind; + + local->transaction.main_frame = frame; +@@ -2413,8 +2386,6 @@ afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, + local->xdata_req = dict_ref (xdata); + + local->transaction.wind = afr_xattrop_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_xattrop_unwind; + + loc_copy (&local->loc, loc); +@@ -2509,8 +2480,6 @@ afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, + local->xdata_req = dict_ref (xdata); + + local->transaction.wind = afr_fxattrop_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_fxattrop_unwind; + + local->fd = fd_ref (fd); +@@ -2626,8 +2595,6 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + } + + local->transaction.wind = afr_fsync_wind; +- local->transaction.fop = __afr_txn_write_fop; +- local->transaction.done = __afr_txn_write_done; + local->transaction.unwind = afr_fsync_unwind; + + local->transaction.main_frame = frame; +diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c +index f50c7b6..260815f 100644 +--- a/xlators/cluster/afr/src/afr-lk-common.c ++++ b/xlators/cluster/afr/src/afr-lk-common.c +@@ -23,38 +23,6 @@ + #define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path */ + #define LOCKED_LOWER 0x2 /* for lower path */ + +-#define AFR_TRACE_INODELK_IN(frame, this, params ...) \ +- do { \ +- afr_private_t *_priv = this->private; \ +- if (!_priv->inodelk_trace) \ +- break; \ +- afr_trace_inodelk_in (frame, this, params); \ +- } while (0); +- +-#define AFR_TRACE_INODELK_OUT(frame, this, params ...) 
\ +- do { \ +- afr_private_t *_priv = this->private; \ +- if (!_priv->inodelk_trace) \ +- break; \ +- afr_trace_inodelk_out (frame, this, params); \ +- } while (0); +- +-#define AFR_TRACE_ENTRYLK_IN(frame, this, params ...) \ +- do { \ +- afr_private_t *_priv = this->private; \ +- if (!_priv->entrylk_trace) \ +- break; \ +- afr_trace_entrylk_in (frame, this, params); \ +- } while (0); +- +-#define AFR_TRACE_ENTRYLK_OUT(frame, this, params ...) \ +- do { \ +- afr_private_t *_priv = this->private; \ +- if (!_priv->entrylk_trace) \ +- break; \ +- afr_trace_entrylk_out (frame, this, params); \ +- } while (0); +- + int + afr_entry_lockee_cmp (const void *l1, const void *l2) + { +@@ -119,28 +87,6 @@ afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner) + set_lk_owner_from_ptr (&frame->root->lk_owner, lk_owner); + } + +-static int +-is_afr_lock_selfheal (afr_local_t *local) +-{ +- afr_internal_lock_t *int_lock = NULL; +- int ret = -1; +- +- int_lock = &local->internal_lock; +- +- switch (int_lock->selfheal_lk_type) { +- case AFR_DATA_SELF_HEAL_LK: +- case AFR_METADATA_SELF_HEAL_LK: +- ret = 1; +- break; +- case AFR_ENTRY_SELF_HEAL_LK: +- ret = 0; +- break; +- } +- +- return ret; +- +-} +- + int32_t + internal_lock_count (call_frame_t *frame, xlator_t *this) + { +@@ -160,315 +106,12 @@ internal_lock_count (call_frame_t *frame, xlator_t *this) + return call_count; + } + +-static void +-afr_print_inodelk (char *str, int size, int cmd, +- struct gf_flock *flock, gf_lkowner_t *owner) +-{ +- char *cmd_str = NULL; +- char *type_str = NULL; +- +- switch (cmd) { +-#if F_GETLK != F_GETLK64 +- case F_GETLK64: +-#endif +- case F_GETLK: +- cmd_str = "GETLK"; +- break; +- +-#if F_SETLK != F_SETLK64 +- case F_SETLK64: +-#endif +- case F_SETLK: +- cmd_str = "SETLK"; +- break; +- +-#if F_SETLKW != F_SETLKW64 +- case F_SETLKW64: +-#endif +- case F_SETLKW: +- cmd_str = "SETLKW"; +- break; +- +- default: +- cmd_str = ""; +- break; +- } +- +- switch (flock->l_type) { +- case 
F_RDLCK: +- type_str = "READ"; +- break; +- case F_WRLCK: +- type_str = "WRITE"; +- break; +- case F_UNLCK: +- type_str = "UNLOCK"; +- break; +- default: +- type_str = "UNKNOWN"; +- break; +- } +- +- snprintf (str, size, "lock=INODELK, cmd=%s, type=%s, " +- "start=%llu, len=%llu, pid=%llu, lk-owner=%s", +- cmd_str, type_str, (unsigned long long) flock->l_start, +- (unsigned long long) flock->l_len, +- (unsigned long long) flock->l_pid, +- lkowner_utoa (owner)); +- +-} +- +-static void +-afr_print_lockee (char *str, int size, loc_t *loc, fd_t *fd, +- int child_index) +-{ +- snprintf (str, size, "path=%s, fd=%p, child=%d", +- loc->path ? loc->path : "", +- fd ? fd : NULL, +- child_index); +-} +- +-void +-afr_print_entrylk (char *str, int size, const char *basename, +- gf_lkowner_t *owner) +-{ +- snprintf (str, size, "Basename=%s, lk-owner=%s", +- basename ? basename : "", +- lkowner_utoa (owner)); +-} +- +-static void +-afr_print_verdict (int op_ret, int op_errno, char *str) +-{ +- if (op_ret < 0) { +- if (op_errno == EAGAIN) +- strcpy (str, "EAGAIN"); +- else +- strcpy (str, "FAILED"); +- } +- else +- strcpy (str, "GRANTED"); +-} +- +-static void +-afr_set_lock_call_type (afr_lock_call_type_t lock_call_type, +- char *lock_call_type_str, +- afr_internal_lock_t *int_lock) +-{ +- switch (lock_call_type) { +- case AFR_INODELK_TRANSACTION: +- if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) +- strcpy (lock_call_type_str, "AFR_INODELK_TRANSACTION"); +- else +- strcpy (lock_call_type_str, "AFR_INODELK_SELFHEAL"); +- break; +- case AFR_INODELK_NB_TRANSACTION: +- if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) +- strcpy (lock_call_type_str, "AFR_INODELK_NB_TRANSACTION"); +- else +- strcpy (lock_call_type_str, "AFR_INODELK_NB_SELFHEAL"); +- break; +- case AFR_ENTRYLK_TRANSACTION: +- if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) +- strcpy (lock_call_type_str, "AFR_ENTRYLK_TRANSACTION"); +- else +- strcpy (lock_call_type_str, "AFR_ENTRYLK_SELFHEAL"); 
+- break; +- case AFR_ENTRYLK_NB_TRANSACTION: +- if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) +- strcpy (lock_call_type_str, "AFR_ENTRYLK_NB_TRANSACTION"); +- else +- strcpy (lock_call_type_str, "AFR_ENTRYLK_NB_SELFHEAL"); +- break; +- default: +- strcpy (lock_call_type_str, "UNKNOWN"); +- break; +- } +- +-} +- +-static void +-afr_trace_inodelk_out (call_frame_t *frame, xlator_t *this, +- afr_lock_call_type_t lock_call_type, +- afr_lock_op_type_t lk_op_type, struct gf_flock *flock, +- int op_ret, int op_errno, int32_t child_index) +-{ +- afr_internal_lock_t *int_lock = NULL; +- afr_local_t *local = NULL; +- +- char lockee[256]; +- char lock_call_type_str[256]; +- char verdict[16]; +- +- local = frame->local; +- int_lock = &local->internal_lock; +- +- afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); +- +- afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); +- +- afr_print_verdict (op_ret, op_errno, verdict); +- +- gf_msg (this->name, GF_LOG_INFO, 0, AFR_MSG_LOCK_INFO, +- "[%s %s] [%s] lk-owner=%s Lockee={%s} Number={%llu}", +- lock_call_type_str, +- lk_op_type == AFR_LOCK_OP ? 
"LOCK REPLY" : "UNLOCK REPLY", +- verdict, lkowner_utoa (&frame->root->lk_owner), lockee, +- (unsigned long long) int_lock->lock_number); +- +-} +- +-static void +-afr_trace_inodelk_in (call_frame_t *frame, xlator_t *this, +- afr_lock_call_type_t lock_call_type, +- afr_lock_op_type_t lk_op_type, struct gf_flock *flock, +- int32_t cmd, int32_t child_index) +-{ +- afr_local_t *local = NULL; +- afr_internal_lock_t *int_lock = NULL; +- +- char lock[256]; +- char lockee[256]; +- char lock_call_type_str[256]; +- +- local = frame->local; +- int_lock = &local->internal_lock; +- +- afr_print_inodelk (lock, 256, cmd, flock, &frame->root->lk_owner); +- afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index); +- +- afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); +- +- gf_msg (this->name, GF_LOG_INFO, 0, AFR_MSG_LOCK_INFO, +- "[%s %s] Lock={%s} Lockee={%s} Number={%llu}", +- lock_call_type_str, +- lk_op_type == AFR_LOCK_OP ? "LOCK REQUEST" : "UNLOCK REQUEST", +- lock, lockee, +- (unsigned long long) int_lock->lock_number); +- +-} +- +-static void +-afr_trace_entrylk_in (call_frame_t *frame, xlator_t *this, +- afr_lock_call_type_t lock_call_type, +- afr_lock_op_type_t lk_op_type, const char *basename, +- int32_t cookie) +-{ +- afr_local_t *local = NULL; +- afr_internal_lock_t *int_lock = NULL; +- afr_private_t *priv = NULL; +- int child_index = 0; +- int lockee_no = 0; +- +- char lock[256]; +- char lockee[256]; +- char lock_call_type_str[256]; +- +- local = frame->local; +- int_lock = &local->internal_lock; +- priv = this->private; +- +- if (!priv->entrylk_trace) { +- return; +- } +- lockee_no = cookie / priv->child_count; +- child_index = cookie % priv->child_count; +- +- afr_print_entrylk (lock, 256, basename, &frame->root->lk_owner); +- afr_print_lockee (lockee, 256, &int_lock->lockee[lockee_no].loc, local->fd, +- child_index); +- +- afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); +- +- gf_msg (this->name, GF_LOG_INFO, 
0, AFR_MSG_LOCK_INFO, +- "[%s %s] Lock={%s} Lockee={%s} Number={%llu}, Cookie={%d}", +- lock_call_type_str, +- lk_op_type == AFR_LOCK_OP ? "LOCK REQUEST" : "UNLOCK REQUEST", +- lock, lockee, +- (unsigned long long) int_lock->lock_number, +- cookie); +-} +- +-static void +-afr_trace_entrylk_out (call_frame_t *frame, xlator_t *this, +- afr_lock_call_type_t lock_call_type, +- afr_lock_op_type_t lk_op_type, const char *basename, +- int op_ret, int op_errno, int32_t cookie) +-{ +- afr_internal_lock_t *int_lock = NULL; +- afr_local_t *local = NULL; +- afr_private_t *priv = NULL; +- int lockee_no = 0; +- int child_index = 0; +- +- char lock[256]; +- char lockee[256]; +- char lock_call_type_str[256]; +- char verdict[16]; +- +- local = frame->local; +- int_lock = &local->internal_lock; +- priv = this->private; +- +- if (!priv->entrylk_trace) { +- return; +- } +- lockee_no = cookie / priv->child_count; +- child_index = cookie % priv->child_count; +- +- afr_print_entrylk (lock, 256, basename, &frame->root->lk_owner); +- afr_print_lockee (lockee, 256, &int_lock->lockee[lockee_no].loc, local->fd, +- child_index); +- +- afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock); +- +- afr_print_verdict (op_ret, op_errno, verdict); +- +- gf_msg (this->name, GF_LOG_INFO, 0, AFR_MSG_LOCK_INFO, +- "[%s %s] [%s] Lock={%s} Lockee={%s} Number={%llu} Cookie={%d}", +- lock_call_type_str, +- lk_op_type == AFR_LOCK_OP ? 
"LOCK REPLY" : "UNLOCK REPLY", +- verdict, +- lock, lockee, +- (unsigned long long) int_lock->lock_number, +- cookie); +- +-} +- +-static int +-transaction_lk_op (afr_local_t *local) +-{ +- afr_internal_lock_t *int_lock = NULL; +- int ret = -1; +- +- int_lock = &local->internal_lock; +- +- if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) { +- gf_msg_debug (THIS->name, 0, +- "lk op is for a transaction"); +- ret = 1; +- } +- else if (int_lock->transaction_lk_type == AFR_SELFHEAL_LK) { +- gf_msg_debug (THIS->name, 0, +- "lk op is for a self heal"); +- +- ret = 0; +- } +- +- if (ret == -1) +- gf_msg_debug (THIS->name, 0, +- "lk op is not set"); +- +- return ret; +- +-} +- + int +-afr_is_inodelk_transaction(afr_local_t *local) ++afr_is_inodelk_transaction(afr_transaction_type type) + { + int ret = 0; + +- switch (local->transaction.type) { ++ switch (type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + ret = 1; +@@ -664,10 +307,6 @@ afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + local = frame->local; + int_lock = &local->internal_lock; + +- AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_TRANSACTION, +- AFR_UNLOCK_OP, NULL, op_ret, +- op_errno, child_index); +- + priv = this->private; + + if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) { +@@ -764,11 +403,6 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) + + flock_use = &full_flock; + wind: +- AFR_TRACE_INODELK_IN (frame, this, +- AFR_INODELK_TRANSACTION, +- AFR_UNLOCK_OP, flock_use, F_SETLK, +- i); +- + STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, + (void *) (long)i, + priv->children[i], +@@ -780,9 +414,6 @@ afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) + break; + + } else { +- AFR_TRACE_INODELK_IN (frame, this, +- AFR_INODELK_TRANSACTION, +- AFR_UNLOCK_OP, &flock, F_SETLK, i); + + STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, + (void *) (long)i, +@@ -816,11 +447,6 @@ afr_unlock_entrylk_cbk (call_frame_t *frame, void 
*cookie, xlator_t *this, + local = frame->local; + int_lock = &local->internal_lock; + +- AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, +- AFR_UNLOCK_OP, +- int_lock->lockee[lockee_no].basename, op_ret, +- op_errno, (int) ((long)cookie)); +- + if (op_ret < 0) { + gf_msg (this->name, GF_LOG_ERROR, op_errno, + AFR_MSG_ENTRY_UNLOCK_FAIL, +@@ -866,10 +492,6 @@ afr_unlock_entrylk (call_frame_t *frame, xlator_t *this) + lockee_no = i / copies; + index = i % copies; + if (int_lock->lockee[lockee_no].locked_nodes[index] & LOCKED_YES) { +- AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, +- AFR_UNLOCK_OP, +- int_lock->lockee[lockee_no].basename, +- i); + + STACK_WIND_COOKIE (frame, afr_unlock_entrylk_cbk, + (void *) (long) i, +@@ -963,10 +585,6 @@ static int32_t + afr_blocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) + { +- AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_TRANSACTION, +- AFR_LOCK_OP, NULL, op_ret, +- op_errno, (long) cookie); +- + afr_lock_cbk (frame, cookie, this, op_ret, op_errno, xdata); + return 0; + +@@ -976,10 +594,6 @@ static int32_t + afr_blocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) + { +- AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, +- AFR_LOCK_OP, NULL, op_ret, +- op_errno, (long)cookie); +- + afr_lock_cbk (frame, cookie, this, op_ret, op_errno, xdata); + return 0; + } +@@ -1017,27 +631,11 @@ afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this) + } + + static gf_boolean_t +-afr_is_entrylk (afr_internal_lock_t *int_lock, +- afr_transaction_type trans_type) ++afr_is_entrylk (afr_transaction_type trans_type) + { +- gf_boolean_t is_entrylk = _gf_false; +- +- if ((int_lock->transaction_lk_type == AFR_SELFHEAL_LK) && +- int_lock->selfheal_lk_type == AFR_ENTRY_SELF_HEAL_LK) { +- +- is_entrylk = _gf_true; +- +- } else if ((int_lock->transaction_lk_type == 
AFR_TRANSACTION_LK) && +- (trans_type == AFR_ENTRY_TRANSACTION || +- trans_type == AFR_ENTRY_RENAME_TRANSACTION)) { +- +- is_entrylk = _gf_true; +- +- } else { +- is_entrylk = _gf_false; +- } +- +- return is_entrylk; ++ if (afr_is_inodelk_transaction (trans_type)) ++ return _gf_false; ++ return _gf_true; + } + + static gf_boolean_t +@@ -1092,7 +690,7 @@ is_blocking_locks_count_sufficient (call_frame_t *frame, xlator_t *this) + priv = this->private; + int_lock = &local->internal_lock; + lockee_count = int_lock->lockee_count; +- is_entrylk = afr_is_entrylk (int_lock, local->transaction.type); ++ is_entrylk = afr_is_entrylk (local->transaction.type); + + if (!is_entrylk) { + if (int_lock->lock_count == 0) { +@@ -1150,7 +748,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) + priv = this->private; + child_index = cookie % priv->child_count; + lockee_no = cookie / priv->child_count; +- is_entrylk = afr_is_entrylk (int_lock, local->transaction.type); ++ is_entrylk = afr_is_entrylk (local->transaction.type); + + + if (!is_entrylk) { +@@ -1217,10 +815,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) + case AFR_METADATA_TRANSACTION: + + if (local->fd) { +- AFR_TRACE_INODELK_IN (frame, this, +- AFR_INODELK_TRANSACTION, +- AFR_LOCK_OP, &flock, F_SETLKW, +- child_index); + + STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk, + (void *) (long) child_index, +@@ -1230,10 +824,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) + F_SETLKW, &flock, NULL); + + } else { +- AFR_TRACE_INODELK_IN (frame, this, +- AFR_INODELK_TRANSACTION, +- AFR_LOCK_OP, &flock, F_SETLKW, +- child_index); + + STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk, + (void *) (long) child_index, +@@ -1251,10 +841,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) + *and 'fd-less' children */ + + if (local->fd) { +- AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_TRANSACTION, +- AFR_LOCK_OP, +- 
int_lock->lockee[lockee_no].basename, +- cookie); + + STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, + (void *) (long) cookie, +@@ -1264,10 +850,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) + int_lock->lockee[lockee_no].basename, + ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); + } else { +- AFR_TRACE_ENTRYLK_IN (frame, this, +- AFR_ENTRYLK_TRANSACTION, +- AFR_LOCK_OP, local->transaction.basename, +- child_index); + + STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, + (void *) (long) cookie, +@@ -1340,10 +922,6 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + local = frame->local; + int_lock = &local->internal_lock; + +- AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION, +- AFR_LOCK_OP, +- int_lock->lockee[lockee_no].basename, op_ret, +- op_errno, (long) cookie); + + LOCK (&frame->lock); + { +@@ -1453,10 +1031,6 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) + index = i%copies; + lockee_no = i/copies; + if (local->child_up[index]) { +- AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, +- AFR_LOCK_OP, +- int_lock->lockee[lockee_no].basename, +- i); + + STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk, + (void *) (long) i, +@@ -1479,10 +1053,6 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) + index = i%copies; + lockee_no = i/copies; + if (local->child_up[index]) { +- AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION, +- AFR_LOCK_OP, +- int_lock->lockee[lockee_no].basename, +- i); + + STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk, + (void *) (long) i, +@@ -1517,10 +1087,6 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int_lock = &local->internal_lock; + inodelk = afr_get_inodelk (int_lock, int_lock->domain); + +- AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_NB_TRANSACTION, +- AFR_LOCK_OP, NULL, op_ret, +- op_errno, (long) cookie); +- + if (local->fd) + fd_ctx = afr_fd_ctx_get 
(local->fd, this); + +@@ -1691,9 +1257,6 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) + } + flock_use = &full_flock; + wind: +- AFR_TRACE_INODELK_IN (frame, this, +- AFR_INODELK_NB_TRANSACTION, +- AFR_LOCK_OP, flock_use, F_SETLK, i); + + STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk, + (void *) (long) i, +@@ -1713,9 +1276,6 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) + for (i = 0; i < priv->child_count; i++) { + if (!local->child_up[i]) + continue; +- AFR_TRACE_INODELK_IN (frame, this, +- AFR_INODELK_NB_TRANSACTION, +- AFR_LOCK_OP, &flock, F_SETLK, i); + + STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk, + (void *) (long) i, +@@ -1739,54 +1299,10 @@ afr_unlock (call_frame_t *frame, xlator_t *this) + + local = frame->local; + +- if (transaction_lk_op (local)) { +- if (afr_is_inodelk_transaction(local)) +- afr_unlock_inodelk (frame, this); +- else +- afr_unlock_entrylk (frame, this); +- +- } else { +- if (is_afr_lock_selfheal (local)) +- afr_unlock_inodelk (frame, this); +- else +- afr_unlock_entrylk (frame, this); +- } ++ if (afr_is_inodelk_transaction(local->transaction.type)) ++ afr_unlock_inodelk (frame, this); ++ else ++ afr_unlock_entrylk (frame, this); + + return 0; + } +- +-int +-afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, +- unsigned int child_count) +-{ +- afr_local_t *dst_local = NULL; +- afr_local_t *src_local = NULL; +- afr_internal_lock_t *dst_lock = NULL; +- afr_internal_lock_t *src_lock = NULL; +- afr_inodelk_t *dst_inodelk = NULL; +- afr_inodelk_t *src_inodelk = NULL; +- int ret = -1; +- +- src_local = src->local; +- src_lock = &src_local->internal_lock; +- src_inodelk = afr_get_inodelk (src_lock, dom); +- dst_local = dst->local; +- dst_lock = &dst_local->internal_lock; +- dst_inodelk = afr_get_inodelk (dst_lock, dom); +- if (!dst_inodelk || !src_inodelk) +- goto out; +- if (src_inodelk->locked_nodes) { +- memcpy (dst_inodelk->locked_nodes, 
src_inodelk->locked_nodes, +- sizeof (*dst_inodelk->locked_nodes) * child_count); +- memset (src_inodelk->locked_nodes, 0, +- sizeof (*src_inodelk->locked_nodes) * child_count); +- } +- +- dst_lock->transaction_lk_type = src_lock->transaction_lk_type; +- dst_lock->selfheal_lk_type = src_lock->selfheal_lk_type; +- dst_inodelk->lock_count = src_inodelk->lock_count; +- src_inodelk->lock_count = 0; +- ret = 0; +-out: +- return ret; +-} +diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c +index 02fb10a..acbfe1a 100644 +--- a/xlators/cluster/afr/src/afr-transaction.c ++++ b/xlators/cluster/afr/src/afr-transaction.c +@@ -133,7 +133,7 @@ afr_needs_changelog_update (afr_local_t *local) + } + + int +-__afr_txn_write_fop (call_frame_t *frame, xlator_t *this) ++afr_transaction_fop (call_frame_t *frame, xlator_t *this) + { + afr_local_t *local = NULL; + afr_private_t *priv = NULL; +@@ -150,7 +150,7 @@ __afr_txn_write_fop (call_frame_t *frame, xlator_t *this) + priv->child_count); + + if (call_count == 0) { +- local->transaction.resume (frame, this); ++ afr_transaction_resume (frame, this); + return 0; + } + +@@ -170,7 +170,7 @@ __afr_txn_write_fop (call_frame_t *frame, xlator_t *this) + + + int +-__afr_txn_write_done (call_frame_t *frame, xlator_t *this) ++afr_transaction_done (call_frame_t *frame, xlator_t *this) + { + afr_local_t *local = NULL; + afr_private_t *priv = NULL; +@@ -345,13 +345,13 @@ afr_txn_arbitrate_fop (call_frame_t *frame, xlator_t *this) + /* If arbiter is the only source, do not proceed. 
*/ + if (pre_op_sources_count < 2 && + local->transaction.pre_op_sources[ARBITER_BRICK_INDEX]) { +- local->internal_lock.lock_cbk = local->transaction.done; ++ local->internal_lock.lock_cbk = afr_transaction_done; + local->op_ret = -1; + local->op_errno = ENOTCONN; + afr_restore_lk_owner (frame); + afr_unlock (frame, this); + } else { +- local->transaction.fop (frame, this); ++ afr_transaction_fop (frame, this); + } + + return; +@@ -407,75 +407,12 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this) + if (priv->arbiter_count == 1) { + afr_txn_arbitrate_fop (frame, this); + } else { +- local->transaction.fop (frame, this); ++ afr_transaction_fop (frame, this); + } + + return 0; + } + +-static int +-__changelog_enabled (afr_private_t *priv, afr_transaction_type type) +-{ +- int ret = 0; +- +- switch (type) { +- case AFR_DATA_TRANSACTION: +- if (priv->data_change_log) +- ret = 1; +- +- break; +- +- case AFR_METADATA_TRANSACTION: +- if (priv->metadata_change_log) +- ret = 1; +- +- break; +- +- case AFR_ENTRY_TRANSACTION: +- case AFR_ENTRY_RENAME_TRANSACTION: +- if (priv->entry_change_log) +- ret = 1; +- +- break; +- } +- +- return ret; +-} +- +- +-static int +-__fop_changelog_needed (call_frame_t *frame, xlator_t *this) +-{ +- afr_private_t * priv = NULL; +- afr_local_t * local = NULL; +- int op_ret = 0; +- afr_transaction_type type = -1; +- +- priv = this->private; +- local = frame->local; +- type = local->transaction.type; +- +- if (__changelog_enabled (priv, type)) { +- switch (local->op) { +- +- case GF_FOP_WRITE: +- case GF_FOP_FTRUNCATE: +- op_ret = 1; +- break; +- +- case GF_FOP_FLUSH: +- op_ret = 0; +- break; +- +- default: +- op_ret = 1; +- } +- } +- +- return op_ret; +-} +- +- + int + afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int **pending) + { +@@ -496,29 +433,6 @@ afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int **pending) + return ret; + } + +-int +-afr_lock_server_count (afr_private_t *priv, 
afr_transaction_type type) +-{ +- int ret = 0; +- +- switch (type) { +- case AFR_DATA_TRANSACTION: +- ret = priv->child_count; +- break; +- +- case AFR_METADATA_TRANSACTION: +- ret = priv->child_count; +- break; +- +- case AFR_ENTRY_TRANSACTION: +- case AFR_ENTRY_RENAME_TRANSACTION: +- ret = priv->child_count; +- break; +- } +- +- return ret; +-} +- + /* {{{ pending */ + + +@@ -552,11 +466,9 @@ int + afr_changelog_post_op_done (call_frame_t *frame, xlator_t *this) + { + afr_local_t *local = NULL; +- afr_private_t *priv = NULL; + afr_internal_lock_t *int_lock = NULL; + + local = frame->local; +- priv = this->private; + int_lock = &local->internal_lock; + + /* Fail the FOP if post-op did not succeed on quorum no. of bricks. */ +@@ -567,12 +479,8 @@ afr_changelog_post_op_done (call_frame_t *frame, xlator_t *this) + local->transaction.resume_stub = NULL; + } + +- if (afr_lock_server_count (priv, local->transaction.type) == 0) { +- local->transaction.done (frame, this); +- } else { +- int_lock->lock_cbk = local->transaction.done; +- afr_unlock (frame, this); +- } ++ int_lock->lock_cbk = afr_transaction_done; ++ afr_unlock (frame, this); + + return 0; + } +@@ -1496,7 +1404,7 @@ next: + + return 0; + err: +- local->internal_lock.lock_cbk = local->transaction.done; ++ local->internal_lock.lock_cbk = afr_transaction_done; + local->op_ret = -1; + local->op_errno = op_errno; + +@@ -1522,7 +1430,7 @@ afr_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) + gf_msg (this->name, GF_LOG_INFO, + 0, AFR_MSG_BLOCKING_LKS_FAILED, + "Blocking inodelks failed."); +- local->transaction.done (frame, this); ++ afr_transaction_done (frame, this); + } else { + + gf_msg_debug (this->name, 0, +@@ -1573,7 +1481,7 @@ afr_post_blocking_entrylk_cbk (call_frame_t *frame, xlator_t *this) + gf_msg (this->name, GF_LOG_INFO, 0, + AFR_MSG_BLOCKING_LKS_FAILED, + "Blocking entrylks failed."); +- local->transaction.done (frame, this); ++ afr_transaction_done (frame, this); + } else { + + 
gf_msg_debug (this->name, 0, +@@ -1626,7 +1534,7 @@ afr_post_blocking_rename_cbk (call_frame_t *frame, xlator_t *this) + AFR_MSG_BLOCKING_LKS_FAILED, + "Blocking entrylks failed."); + +- local->transaction.done (frame, this); ++ afr_transaction_done (frame, this); + } else { + + gf_msg_debug (this->name, 0, +@@ -1689,7 +1597,6 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this) + local = frame->local; + int_lock = &local->internal_lock; + +- int_lock->transaction_lk_type = AFR_TRANSACTION_LK; + int_lock->domain = this->name; + + switch (local->transaction.type) { +@@ -1738,11 +1645,7 @@ afr_lock (call_frame_t *frame, xlator_t *this) + int + afr_internal_lock_finish (call_frame_t *frame, xlator_t *this) + { +- if (__fop_changelog_needed (frame, this)) { +- afr_changelog_pre_op (frame, this); +- } else { +- afr_transaction_perform_fop (frame, this); +- } ++ afr_changelog_pre_op (frame, this); + + return 0; + } +@@ -2152,11 +2055,7 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this) + with OP */ + afr_changelog_pre_op_update (frame, this); + +- if (__fop_changelog_needed (frame, this)) { +- afr_changelog_post_op (frame, this); +- } else { +- afr_changelog_post_op_done (frame, this); +- } ++ afr_changelog_post_op (frame, this); + + return 0; + } +@@ -2263,7 +2162,6 @@ void + afr_transaction_start (call_frame_t *frame, xlator_t *this) + { + afr_local_t *local = frame->local; +- afr_private_t *priv = this->private; + fd_t *fd = NULL; + + afr_transaction_eager_lock_init (local, this); +@@ -2285,11 +2183,7 @@ afr_transaction_start (call_frame_t *frame, xlator_t *this) + } + } + +- if (afr_lock_server_count (priv, local->transaction.type) == 0) { +- afr_internal_lock_finish (frame, this); +- } else { +- afr_lock (frame, this); +- } ++ afr_lock (frame, this); + } + + int +@@ -2321,7 +2215,6 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) + local = frame->local; + priv = this->private; + +- local->transaction.resume = 
afr_transaction_resume; + local->transaction.type = type; + + if (!afr_is_consistent_io_possible (local, priv, &ret)) { +diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h +index d01e144..ddcb1eb 100644 +--- a/xlators/cluster/afr/src/afr-transaction.h ++++ b/xlators/cluster/afr/src/afr-transaction.h +@@ -46,8 +46,6 @@ int afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode, + + int afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol); + +-int __afr_txn_write_fop (call_frame_t *frame, xlator_t *this); +-int __afr_txn_write_done (call_frame_t *frame, xlator_t *this); + call_frame_t *afr_transaction_detach_fop_frame (call_frame_t *frame); + gf_boolean_t afr_has_quorum (unsigned char *subvols, xlator_t *this); + gf_boolean_t afr_needs_changelog_update (afr_local_t *local); +diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c +index 9ed0481..dec6e60 100644 +--- a/xlators/cluster/afr/src/afr.c ++++ b/xlators/cluster/afr/src/afr.c +@@ -168,15 +168,6 @@ reconfigure (xlator_t *this, dict_t *options) + priv->data_self_heal_window_size, options, + uint32, out); + +- GF_OPTION_RECONF ("data-change-log", priv->data_change_log, options, +- bool, out); +- +- GF_OPTION_RECONF ("metadata-change-log", +- priv->metadata_change_log, options, bool, out); +- +- GF_OPTION_RECONF ("entry-change-log", priv->entry_change_log, options, +- bool, out); +- + GF_OPTION_RECONF ("data-self-heal-algorithm", + priv->data_self_heal_algorithm, options, str, out); + +@@ -516,20 +507,9 @@ init (xlator_t *this) + + GF_OPTION_INIT ("iam-nfs-daemon", priv->nfsd.iamnfsd, bool, out); + +- GF_OPTION_INIT ("data-change-log", priv->data_change_log, bool, out); +- +- GF_OPTION_INIT ("metadata-change-log", priv->metadata_change_log, bool, +- out); +- +- GF_OPTION_INIT ("entry-change-log", priv->entry_change_log, bool, out); +- + GF_OPTION_INIT ("optimistic-change-log", priv->optimistic_change_log, + bool, out); 
+ +- GF_OPTION_INIT ("inodelk-trace", priv->inodelk_trace, bool, out); +- +- GF_OPTION_INIT ("entrylk-trace", priv->entrylk_trace, bool, out); +- + GF_OPTION_INIT ("pre-op-compat", priv->pre_op_compat, bool, out); + GF_OPTION_INIT ("locking-scheme", priv->locking_scheme, str, out); + GF_OPTION_INIT ("full-lock", priv->full_lock, bool, out); +@@ -878,23 +858,20 @@ struct volume_options options[] = { + { .key = {"data-change-log"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", +- .description = "Data fops like write/truncate will not perform " +- "pre/post fop changelog operations in afr transaction " +- "if this option is disabled" ++ .description = "This option exists only for backward compatibility " ++ "and configuring it doesn't have any effect" + }, + { .key = {"metadata-change-log"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", +- .description = "Metadata fops like setattr/setxattr will not perform " +- "pre/post fop changelog operations in afr transaction " +- "if this option is disabled" ++ .description = "This option exists only for backward compatibility " ++ "and configuring it doesn't have any effect" + }, + { .key = {"entry-change-log"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", +- .description = "Entry fops like create/unlink will not perform " +- "pre/post fop changelog operations in afr transaction " +- "if this option is disabled" ++ .description = "This option exists only for backward compatibility " ++ "and configuring it doesn't have any effect" + }, + { .key = {"optimistic-change-log"}, + .type = GF_OPTION_TYPE_BOOL, +diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h +index a5b0d3b..5ff57c0 100644 +--- a/xlators/cluster/afr/src/afr.h ++++ b/xlators/cluster/afr/src/afr.h +@@ -110,10 +110,6 @@ typedef struct _afr_private { + gf_boolean_t metadata_self_heal; /* on/off */ + gf_boolean_t entry_self_heal; /* on/off */ + +- gf_boolean_t data_change_log; /* on/off */ +- gf_boolean_t 
metadata_change_log; /* on/off */ +- gf_boolean_t entry_change_log; /* on/off */ +- + gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */ + int read_child; /* read-subvolume */ + unsigned int hash_mode; /* for when read_child is not set */ +@@ -123,9 +119,6 @@ typedef struct _afr_private { + afr_favorite_child_policy fav_child_policy;/*Policy to use for automatic + resolution of split-brains.*/ + +- gf_boolean_t inodelk_trace; +- gf_boolean_t entrylk_trace; +- + unsigned int wait_count; /* # of servers to wait for success */ + + gf_timer_t *timer; /* launched when parent up is received */ +@@ -187,33 +180,6 @@ typedef enum { + AFR_ENTRY_RENAME_TRANSACTION, /* rename */ + } afr_transaction_type; + +-typedef enum { +- AFR_TRANSACTION_LK, +- AFR_SELFHEAL_LK, +-} transaction_lk_type_t; +- +-typedef enum { +- AFR_LOCK_OP, +- AFR_UNLOCK_OP, +-} afr_lock_op_type_t; +- +-typedef enum { +- AFR_DATA_SELF_HEAL_LK, +- AFR_METADATA_SELF_HEAL_LK, +- AFR_ENTRY_SELF_HEAL_LK, +-}selfheal_lk_type_t; +- +-typedef enum { +- AFR_INODELK_TRANSACTION, +- AFR_INODELK_NB_TRANSACTION, +- AFR_ENTRYLK_TRANSACTION, +- AFR_ENTRYLK_NB_TRANSACTION, +- AFR_INODELK_SELFHEAL, +- AFR_INODELK_NB_SELFHEAL, +- AFR_ENTRYLK_SELFHEAL, +- AFR_ENTRYLK_NB_SELFHEAL, +-} afr_lock_call_type_t; +- + /* + xattr format: trusted.afr.volume = [x y z] + x - data pending +@@ -286,9 +252,6 @@ typedef struct { + unsigned char *locked_nodes; + unsigned char *lower_locked_nodes; + +- selfheal_lk_type_t selfheal_lk_type; +- transaction_lk_type_t transaction_lk_type; +- + int32_t lock_count; + int32_t entrylk_lock_count; + +@@ -809,12 +772,6 @@ typedef struct _afr_local { + + int (*wind) (call_frame_t *frame, xlator_t *this, int subvol); + +- int (*fop) (call_frame_t *frame, xlator_t *this); +- +- int (*done) (call_frame_t *frame, xlator_t *this); +- +- int (*resume) (call_frame_t *frame, xlator_t *this); +- + int (*unwind) (call_frame_t *frame, xlator_t *this); + + /* post-op hook */ +@@ -973,10 +930,6 @@ int + 
afr_internal_lock_finish (call_frame_t *frame, xlator_t *this); + + int +-afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom, +- unsigned int child_count); +- +-int + __afr_fd_ctx_set (xlator_t *this, fd_t *fd); + + afr_fd_ctx_t * +@@ -1100,8 +1053,7 @@ int + afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno); + + int +-afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count, +- transaction_lk_type_t lk_type); ++afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count); + + int + afr_higher_errno (int32_t old_errno, int32_t new_errno); +@@ -1246,7 +1198,7 @@ void + afr_update_uninodelk (afr_local_t *local, afr_internal_lock_t *int_lock, + int32_t child_index); + int +-afr_is_inodelk_transaction(afr_local_t *local); ++afr_is_inodelk_transaction(afr_transaction_type type); + + afr_fd_ctx_t * + __afr_fd_ctx_get (fd_t *fd, xlator_t *this); +-- +1.8.3.1 + diff --git a/0194-cluster-afr-Make-AFR-eager-locking-similar-to-EC.patch b/0194-cluster-afr-Make-AFR-eager-locking-similar-to-EC.patch new file mode 100644 index 0000000..b5e9541 --- /dev/null +++ b/0194-cluster-afr-Make-AFR-eager-locking-similar-to-EC.patch @@ -0,0 +1,3002 @@ +From 30fb0e640ae94d9591e9bb64800b0971e52d5416 Mon Sep 17 00:00:00 2001 +From: Pranith Kumar K +Date: Wed, 31 Jan 2018 16:41:14 +0530 +Subject: [PATCH 194/201] cluster/afr: Make AFR eager-locking similar to EC + +Problem: +1) Afr's eager-lock only works for data transactions. +2) When there are conflicting writes, write with conflicting region initiates +unlock of eager-lock leading to extra pre-ops and post-ops on the file. When +eager-lock goes off, it leads to extra fsyncs for random-write workload in afr. + +Solution (that is modeled after EC): +In EC, when there is a conflicting write, it waits for the current write to +complete before it winds the conflicted write. 
This leads to better utilization +of network and disk, because we will not be doing extra xattrops, FSYNCs and +inodelk/unlock. Moved fd based counters to inode based counters. + +I tried to model the solution on EC's locking, but the AFR implementation is +not identical to EC's because we had to keep backward compatibility. + +Lifecycle of lock: +================== +The first transaction is added to inode->owners list and an inodelk will be sent on +the wire. All subsequent transactions will be put in inode->waiters list until +the first transaction completes inodelk and [f]xattrop completely. Once +[f]xattrop also completes, all the requests in the inode->waiters list are +checked to see whether they conflict with any of the existing locks in the +inode->owners list; if not, they are added to inode->owners list and resumed to +continue the transaction. When these transactions complete the fop phase they will be +moved to inode->post_op list and will resume the transactions that were paused +because of conflicts. Post-op and unlock will not be issued on the wire until +the last transaction on that inode. The last transaction, when it has to +perform post-op, can choose to sleep for the post-op-delay-secs value. During that +time, if any other transaction comes, it will wake up the sleeping transaction +and take over the ownership of the lock, and the cycle continues. If +post-op-delay-secs expires, then the timer thread will wake up the sleeping +transaction, which will set lock->release to true and start doing post-op and +then unlock. During this time, if any other transactions come, they will be put +in inode->frozen list. Once the previous unlock completes, it will move the frozen +list to the waiters list, move the first element from this waiters-list to +owners-list and attempt the lock, and the cycle continues. This is the general +idea.
There is logic at the time of delaying, at the time of a new +transaction, and in the flush fop to wake up existing sleeping transactions or +to choose whether to delay a transaction, which is subject to change based +on future enhancements. + + >Fixes: #418 + >BUG: 1549606 + +Upstream-patch: https://review.gluster.org/19503 +BUG: 1491785 +Change-Id: I88b570bbcf332a27c82d2767dfa82472f60055dc +Signed-off-by: Pranith Kumar K +Reviewed-on: https://code.engineering.redhat.com/gerrit/131945 +Tested-by: RHGS Build Bot +--- + tests/bugs/replicate/bug-966018.t | 36 - + xlators/cluster/afr/src/afr-common.c | 315 ++++----- + xlators/cluster/afr/src/afr-inode-write.c | 6 +- + xlators/cluster/afr/src/afr-lk-common.c | 348 +++------- + xlators/cluster/afr/src/afr-self-heal-common.c | 13 +- + xlators/cluster/afr/src/afr-self-heal-data.c | 14 +- + xlators/cluster/afr/src/afr-self-heal.h | 2 +- + xlators/cluster/afr/src/afr-transaction.c | 913 ++++++++++++++----------- + xlators/cluster/afr/src/afr-transaction.h | 13 +- + xlators/cluster/afr/src/afr.h | 96 ++- + 10 files changed, 813 insertions(+), 943 deletions(-) + delete mode 100644 tests/bugs/replicate/bug-966018.t + +diff --git a/tests/bugs/replicate/bug-966018.t b/tests/bugs/replicate/bug-966018.t +deleted file mode 100644 +index 1b5296b..0000000 +--- a/tests/bugs/replicate/bug-966018.t ++++ /dev/null +@@ -1,36 +0,0 @@ +-#!/bin/bash +- +-. $(dirname $0)/../../include.rc +-. $(dirname $0)/../../volume.rc +-. $(dirname $0)/../../nfs.rc +- +-#This tests if cluster.eager-lock blocks metadata operations on nfs/fuse mounts. +-#If it is not woken up, INODELK from the next command waits +-#for post-op-delay secs.
+- +-cleanup; +-TEST glusterd +-TEST pidof glusterd +- +-TEST $CLI volume create $V0 replica 2 $H0:$B0/r2_0 $H0:$B0/r2_1 +-TEST $CLI volume set $V0 ensure-durability off +-TEST $CLI volume set $V0 cluster.eager-lock on +-TEST $CLI volume set $V0 cluster.post-op-delay-secs 3 +-TEST $CLI volume set $V0 nfs.disable false +- +-TEST $CLI volume start $V0 +-TEST $CLI volume profile $V0 start +-EXPECT_WITHIN $NFS_EXPORT_TIMEOUT "1" is_nfs_export_available; +-TEST mount_nfs $H0:/$V0 $N0 nolock; +-TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id=$V0 $M0 +-echo 1 > $N0/1 && chmod +x $N0/1 +-echo 1 > $M0/1 && chmod +x $M0/1 +- +-#Check that INODELK MAX latency is not in the order of seconds +-#Test if the MAX INODELK fop latency is of the order of seconds. +-inodelk_max_latency=$($CLI volume profile $V0 info | grep INODELK | awk 'BEGIN {max = 0} {if ($6 > max) max=$6;} END {print max}' | cut -d. -f 1 | egrep "[0-9]{7,}") +- +-TEST [ -z $inodelk_max_latency ] +-EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $N0 +- +-cleanup; +diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c +index 06863b6..6025a60 100644 +--- a/xlators/cluster/afr/src/afr-common.c ++++ b/xlators/cluster/afr/src/afr-common.c +@@ -126,37 +126,77 @@ afr_is_possibly_under_txn (afr_transaction_type type, afr_local_t *local, + return _gf_false; + } + ++static void ++afr_inode_ctx_destroy (afr_inode_ctx_t *ctx) ++{ ++ int i = 0; ++ ++ if (!ctx) ++ return; ++ ++ for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { ++ GF_FREE (ctx->pre_op_done[i]); ++ } ++ ++ GF_FREE (ctx); ++} ++ + int + __afr_inode_ctx_get (xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx) + { +- uint64_t ctx_int = 0; +- int ret = -1; +- afr_inode_ctx_t *tmp_ctx = NULL; ++ uint64_t ctx_int = 0; ++ int ret = -1; ++ int i = -1; ++ int num_locks = -1; ++ afr_inode_ctx_t *ictx = NULL; ++ afr_lock_t *lock = NULL; ++ afr_private_t *priv = this->private; + + ret = __inode_ctx_get (inode, this, 
&ctx_int); +- if (ret) { +- tmp_ctx = GF_CALLOC (1, sizeof (afr_inode_ctx_t), +- gf_afr_mt_inode_ctx_t); +- if (!tmp_ctx) +- goto out; ++ if (ret == 0) { ++ *ctx = (afr_inode_ctx_t *)ctx_int; ++ return 0; ++ } + +- ctx_int = (long) tmp_ctx; +- ret = __inode_ctx_set (inode, this, &ctx_int); +- if (ret) { +- GF_FREE (tmp_ctx); ++ ictx = GF_CALLOC (1, sizeof (afr_inode_ctx_t), gf_afr_mt_inode_ctx_t); ++ if (!ictx) ++ goto out; ++ ++ for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { ++ ictx->pre_op_done[i] = GF_CALLOC (sizeof *ictx->pre_op_done[i], ++ priv->child_count, ++ gf_afr_mt_int32_t); ++ if (!ictx->pre_op_done[i]) { ++ ret = -ENOMEM; + goto out; + } +- tmp_ctx->spb_choice = -1; +- tmp_ctx->read_subvol = 0; +- tmp_ctx->write_subvol = 0; +- tmp_ctx->lock_count = 0; +- } else { +- tmp_ctx = (afr_inode_ctx_t *) ctx_int; + } + +- *ctx = tmp_ctx; ++ num_locks = sizeof(ictx->lock)/sizeof(afr_lock_t); ++ for (i = 0; i < num_locks; i++) { ++ lock = &ictx->lock[i]; ++ INIT_LIST_HEAD (&lock->post_op); ++ INIT_LIST_HEAD (&lock->frozen); ++ INIT_LIST_HEAD (&lock->waiting); ++ INIT_LIST_HEAD (&lock->owners); ++ } ++ ++ ctx_int = (uint64_t)ictx; ++ ret = __inode_ctx_set (inode, this, &ctx_int); ++ if (ret) { ++ goto out; ++ } ++ ++ ictx->spb_choice = -1; ++ ictx->read_subvol = 0; ++ ictx->write_subvol = 0; ++ ictx->lock_count = 0; + ret = 0; ++ *ctx = ictx; + out: ++ if (ret) { ++ afr_inode_ctx_destroy (ictx); ++ } + return ret; + } + +@@ -1745,10 +1785,6 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) + + GF_FREE (local->internal_lock.locked_nodes); + +- for (i = 0; local->internal_lock.inodelk[i].domain; i++) { +- GF_FREE (local->internal_lock.inodelk[i].locked_nodes); +- } +- + GF_FREE (local->internal_lock.lower_locked_nodes); + + afr_entry_lockee_cleanup (&local->internal_lock); +@@ -1765,7 +1801,6 @@ afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this) + GF_FREE (local->transaction.changelog_xdata); + } + +- GF_FREE 
(local->transaction.eager_lock); + GF_FREE (local->transaction.failed_subvols); + + GF_FREE (local->transaction.basename); +@@ -1812,16 +1847,6 @@ afr_local_replies_wipe (afr_local_t *local, afr_private_t *priv) + memset (local->replies, 0, sizeof(*local->replies) * priv->child_count); + } + +-void +-afr_remove_eager_lock_stub (afr_local_t *local) +-{ +- LOCK (&local->fd->lock); +- { +- list_del_init (&local->transaction.eager_locked); +- } +- UNLOCK (&local->fd->lock); +-} +- + static gf_boolean_t + afr_fop_lock_is_unlock (call_frame_t *frame) + { +@@ -1926,10 +1951,6 @@ afr_local_cleanup (afr_local_t *local, xlator_t *this) + + syncbarrier_destroy (&local->barrier); + +- if (local->transaction.eager_lock_on && +- !list_empty (&local->transaction.eager_locked)) +- afr_remove_eager_lock_stub (local); +- + afr_local_transaction_cleanup (local, this); + + priv = this->private; +@@ -3160,22 +3181,8 @@ out: + void + _afr_cleanup_fd_ctx (afr_fd_ctx_t *fd_ctx) + { +- int i = 0; +- +- +- for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) +- GF_FREE (fd_ctx->pre_op_done[i]); +- + GF_FREE (fd_ctx->opened_on); +- +- GF_FREE (fd_ctx->lock_piggyback); +- +- GF_FREE (fd_ctx->lock_acquired); +- +- pthread_mutex_destroy (&fd_ctx->delay_lock); +- + GF_FREE (fd_ctx); +- + return; + } + +@@ -3193,15 +3200,7 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd) + fd_ctx = (afr_fd_ctx_t *)(long) ctx; + + if (fd_ctx) { +- /*no need to take any locks*/ +- if (!list_empty (&fd_ctx->eager_locked)) +- gf_msg (this->name, GF_LOG_WARNING, 0, +- AFR_MSG_INVALID_DATA, "%s: Stale " +- "Eager-lock stubs found", +- uuid_utoa (fd->inode->gfid)); +- + _afr_cleanup_fd_ctx (fd_ctx); +- + } + + out: +@@ -3282,23 +3281,6 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd) + goto out; + } + +- ret = pthread_mutex_init (&fd_ctx->delay_lock, NULL); +- if (ret) { +- GF_FREE (fd_ctx); +- fd_ctx = NULL; +- goto out; +- } +- +- for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) { +- fd_ctx->pre_op_done[i] = GF_CALLOC (sizeof 
(*fd_ctx->pre_op_done[i]), +- priv->child_count, +- gf_afr_mt_int32_t); +- if (!fd_ctx->pre_op_done[i]) { +- ret = -ENOMEM; +- goto out; +- } +- } +- + fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on), + priv->child_count, + gf_afr_mt_int32_t); +@@ -3314,26 +3296,8 @@ __afr_fd_ctx_set (xlator_t *this, fd_t *fd) + fd_ctx->opened_on[i] = AFR_FD_NOT_OPENED; + } + +- fd_ctx->lock_piggyback = GF_CALLOC (sizeof (*fd_ctx->lock_piggyback), +- priv->child_count, +- gf_afr_mt_char); +- if (!fd_ctx->lock_piggyback) { +- ret = -ENOMEM; +- goto out; +- } +- +- fd_ctx->lock_acquired = GF_CALLOC (sizeof (*fd_ctx->lock_acquired), +- priv->child_count, +- gf_afr_mt_char); +- if (!fd_ctx->lock_acquired) { +- ret = -ENOMEM; +- goto out; +- } +- + fd_ctx->readdir_subvol = -1; + +- INIT_LIST_HEAD (&fd_ctx->eager_locked); +- + ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx); + if (ret) + gf_msg_debug (this->name, 0, +@@ -3405,12 +3369,70 @@ afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) + return 0; + } + ++afr_local_t* ++afr_wakeup_same_fd_delayed_op (xlator_t *this, afr_lock_t *lock, fd_t *fd) ++{ ++ afr_local_t *local = NULL; ++ ++ if (lock->delay_timer) { ++ local = list_entry(lock->post_op.next, afr_local_t, ++ transaction.owner_list); ++ if (fd == local->fd) { ++ if (gf_timer_call_cancel (this->ctx, ++ lock->delay_timer)) { ++ local = NULL; ++ } else { ++ lock->delay_timer = NULL; ++ } ++ } else { ++ local = NULL; ++ } ++ } ++ ++ return local; ++} ++ ++void ++afr_delayed_changelog_wake_resume (xlator_t *this, inode_t *inode, ++ call_stub_t *stub) ++{ ++ afr_inode_ctx_t *ctx = NULL; ++ afr_lock_t *lock = NULL; ++ afr_local_t *metadata_local = NULL; ++ afr_local_t *data_local = NULL; ++ LOCK (&inode->lock); ++ { ++ (void)__afr_inode_ctx_get (this, inode, &ctx); ++ lock = &ctx->lock[AFR_DATA_TRANSACTION]; ++ data_local = afr_wakeup_same_fd_delayed_op (this, lock, ++ stub->args.fd); ++ lock = &ctx->lock[AFR_METADATA_TRANSACTION]; ++ 
metadata_local = afr_wakeup_same_fd_delayed_op (this, lock, ++ stub->args.fd); ++ } ++ UNLOCK (&inode->lock); ++ ++ if (data_local) { ++ data_local->transaction.resume_stub = stub; ++ } else if (metadata_local) { ++ metadata_local->transaction.resume_stub = stub; ++ } else { ++ call_resume (stub); ++ } ++ if (data_local) { ++ afr_delayed_changelog_wake_up_cbk (data_local); ++ } ++ if (metadata_local) { ++ afr_delayed_changelog_wake_up_cbk (metadata_local); ++ } ++} ++ + int + afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) + { +- afr_local_t *local = NULL; +- call_stub_t *stub = NULL; +- int op_errno = ENOMEM; ++ afr_local_t *local = NULL; ++ call_stub_t *stub = NULL; ++ int op_errno = ENOMEM; + + local = AFR_FRAME_INIT (frame, op_errno); + if (!local) +@@ -3426,7 +3448,7 @@ afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) + if (!stub) + goto out; + +- afr_delayed_changelog_wake_resume (this, fd, stub); ++ afr_delayed_changelog_wake_resume (this, fd->inode, stub); + + return 0; + out: +@@ -3434,7 +3456,6 @@ out: + return 0; + } + +- + int + afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +@@ -4497,7 +4518,7 @@ afr_forget (xlator_t *this, inode_t *inode) + return 0; + + ctx = (afr_inode_ctx_t *)ctx_int; +- GF_FREE (ctx); ++ afr_inode_ctx_destroy (ctx); + return 0; + } + +@@ -5310,21 +5331,6 @@ out: + } + + int +-afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count) +-{ +- int ret = -ENOMEM; +- +- lk->domain = dom; +- lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes), +- child_count, gf_afr_mt_char); +- if (NULL == lk->locked_nodes) +- goto out; +- ret = 0; +-out: +- return ret; +-} +- +-int + afr_transaction_local_init (afr_local_t *local, xlator_t *this) + { + int ret = -ENOMEM; +@@ -5335,25 +5341,9 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) + if (ret < 0) + goto out; + +- if ((local->transaction.type 
== AFR_DATA_TRANSACTION) || +- (local->transaction.type == AFR_METADATA_TRANSACTION)) { +- ret = afr_inodelk_init (&local->internal_lock.inodelk[0], +- this->name, priv->child_count); +- if (ret < 0) +- goto out; +- } +- + ret = -ENOMEM; + local->pre_op_compat = priv->pre_op_compat; + +- local->transaction.eager_lock = +- GF_CALLOC (sizeof (*local->transaction.eager_lock), +- priv->child_count, +- gf_afr_mt_int32_t); +- +- if (!local->transaction.eager_lock) +- goto out; +- + local->transaction.pre_op = GF_CALLOC (sizeof (*local->transaction.pre_op), + priv->child_count, + gf_afr_mt_char); +@@ -5385,9 +5375,9 @@ afr_transaction_local_init (afr_local_t *local, xlator_t *this) + if (!local->pending) + goto out; + +- INIT_LIST_HEAD (&local->transaction.eager_locked); +- + ret = 0; ++ INIT_LIST_HEAD (&local->transaction.wait_list); ++ INIT_LIST_HEAD (&local->transaction.owner_list); + out: + return ret; + } +@@ -5422,24 +5412,6 @@ out: + return; + } + +-void +-afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this) +-{ +- afr_local_t *local = NULL; +- afr_fd_ctx_t *fd_ctx = NULL; +- +- local = frame->local; +- +- if (!local->fd) +- return; +- +- fd_ctx = afr_fd_ctx_get (local->fd, this); +- if (!fd_ctx) +- return; +- +- fd_ctx->open_fd_count = local->open_fd_count; +-} +- + int** + afr_mark_pending_changelog (afr_private_t *priv, unsigned char *pending, + dict_t *xattr, ia_type_t iat) +@@ -5548,7 +5520,7 @@ out: + + int + afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this, +- inode_t *inode, gf_boolean_t *dsh, ++ fd_t *fd, gf_boolean_t *dsh, + gf_boolean_t *pflag) + { + int ret = -1; +@@ -5558,8 +5530,8 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this, + unsigned char *healed_sinks = NULL; + unsigned char *undid_pending = NULL; + afr_private_t *priv = NULL; +- fd_t *fd = NULL; + struct afr_reply *locked_replies = NULL; ++ inode_t *inode = fd->inode; + + priv = this->private; + data_lock = alloca0 (priv->child_count); 
+@@ -5568,18 +5540,6 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this, + healed_sinks = alloca0 (priv->child_count); + undid_pending = alloca0 (priv->child_count); + +- /* Heal-info does an open() on the file being examined so that the +- * current eager-lock holding client, if present, at some point sees +- * open-fd count being > 1 and releases the eager-lock so that heal-info +- * doesn't remain blocked forever until IO completes. +- */ +- ret = afr_selfheal_data_open (this, inode, &fd); +- if (ret < 0) { +- gf_msg_debug (this->name, -ret, "%s: Failed to open", +- uuid_utoa (inode->gfid)); +- goto out; +- } +- + locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count); + + ret = afr_selfheal_inodelk (frame, this, inode, this->name, +@@ -5602,8 +5562,6 @@ afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this, + out: + if (locked_replies) + afr_replies_wipe (locked_replies, priv->child_count); +- if (fd) +- fd_unref (fd); + return ret; + } + +@@ -5688,6 +5646,7 @@ afr_selfheal_locked_inspect (call_frame_t *frame, xlator_t *this, uuid_t gfid, + + { + int ret = -1; ++ fd_t *fd = NULL; + gf_boolean_t dsh = _gf_false; + gf_boolean_t msh = _gf_false; + gf_boolean_t esh = _gf_false; +@@ -5699,6 +5658,21 @@ afr_selfheal_locked_inspect (call_frame_t *frame, xlator_t *this, uuid_t gfid, + + /* For every heal type hold locks and check if it indeed needs heal */ + ++ ++ /* Heal-info does an open() on the file being examined so that the ++ * current eager-lock holding client, if present, at some point sees ++ * open-fd count being > 1 and releases the eager-lock so that heal-info ++ * doesn't remain blocked forever until IO completes. 
++ */ ++ if ((*inode)->ia_type == IA_IFREG) { ++ ret = afr_selfheal_data_open (this, *inode, &fd); ++ if (ret < 0) { ++ gf_msg_debug (this->name, -ret, "%s: Failed to open", ++ uuid_utoa ((*inode)->gfid)); ++ goto out; ++ } ++ } ++ + if (msh) { + ret = afr_selfheal_locked_metadata_inspect (frame, this, + *inode, &msh, +@@ -5708,7 +5682,7 @@ afr_selfheal_locked_inspect (call_frame_t *frame, xlator_t *this, uuid_t gfid, + } + + if (dsh) { +- ret = afr_selfheal_locked_data_inspect (frame, this, *inode, ++ ret = afr_selfheal_locked_data_inspect (frame, this, fd, + &dsh, pending); + if (ret == -EIO || (ret == -EAGAIN)) + goto out; +@@ -5723,6 +5697,8 @@ out: + *data_selfheal = dsh; + *entry_selfheal = esh; + *metadata_selfheal = msh; ++ if (fd) ++ fd_unref (fd); + return ret; + } + +@@ -6352,6 +6328,7 @@ afr_write_subvol_reset (call_frame_t *frame, xlator_t *this) + local = frame->local; + LOCK(&local->inode->lock); + { ++ GF_ASSERT (local->inode_ctx->lock_count > 0); + local->inode_ctx->lock_count--; + + if (!local->inode_ctx->lock_count) +diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c +index 2402bb2..b52b6ca 100644 +--- a/xlators/cluster/afr/src/afr-inode-write.c ++++ b/xlators/cluster/afr/src/afr-inode-write.c +@@ -341,14 +341,14 @@ afr_process_post_writev (call_frame_t *frame, xlator_t *this) + the xattrs are not reliably pointing at + a stale file. 
+ */ +- afr_fd_report_unstable_write (this, local->fd); ++ afr_fd_report_unstable_write (this, local); + + __afr_inode_write_finalize (frame, this); + + afr_writev_handle_short_writes (frame, this); + + if (local->update_open_fd_count) +- afr_handle_open_fd_count (frame, this); ++ local->inode_ctx->open_fd_count = local->open_fd_count; + + } + +@@ -2590,7 +2590,7 @@ afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + local->op = GF_FOP_FSYNC; + local->cont.fsync.datasync = datasync; + +- if (afr_fd_has_witnessed_unstable_write (this, fd)) { ++ if (afr_fd_has_witnessed_unstable_write (this, fd->inode)) { + /* don't care. we only wanted to CLEAR the bit */ + } + +diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c +index 260815f..be3de01 100644 +--- a/xlators/cluster/afr/src/afr-lk-common.c ++++ b/xlators/cluster/afr/src/afr-lk-common.c +@@ -52,31 +52,6 @@ afr_entry_lockee_cmp (const void *l1, const void *l2) + + int afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index); + +-static int +-afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this); +- +-static uint64_t afr_lock_number = 1; +- +-static uint64_t +-get_afr_lock_number () +-{ +- return (++afr_lock_number); +-} +- +-int +-afr_set_lock_number (call_frame_t *frame, xlator_t *this) +-{ +- afr_local_t *local = NULL; +- afr_internal_lock_t *int_lock = NULL; +- +- local = frame->local; +- int_lock = &local->internal_lock; +- +- int_lock->lock_number = get_afr_lock_number (); +- +- return 0; +-} +- + void + afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner) + { +@@ -203,21 +178,16 @@ initialize_inodelk_variables (call_frame_t *frame, xlator_t *this) + afr_local_t *local = NULL; + afr_internal_lock_t *int_lock = NULL; + afr_private_t *priv = NULL; +- afr_inodelk_t *inodelk = NULL; + + priv = this->private; + local = frame->local; + int_lock = &local->internal_lock; + +- inodelk = afr_get_inodelk (int_lock, 
int_lock->domain); +- +- inodelk->lock_count = 0; ++ int_lock->lock_count = 0; + int_lock->lk_attempted_count = 0; + int_lock->lock_op_ret = -1; + int_lock->lock_op_errno = 0; + +- memset (inodelk->locked_nodes, 0, +- sizeof (*inodelk->locked_nodes) * priv->child_count); + memset (int_lock->locked_nodes, 0, + sizeof (*int_lock->locked_nodes) * priv->child_count); + +@@ -286,12 +256,7 @@ void + afr_update_uninodelk (afr_local_t *local, afr_internal_lock_t *int_lock, + int32_t child_index) + { +- afr_inodelk_t *inodelk = NULL; +- +- inodelk = afr_get_inodelk (int_lock, int_lock->domain); +- inodelk->locked_nodes[child_index] &= LOCKED_NO; +- if (local->transaction.eager_lock) +- local->transaction.eager_lock[child_index] = 0; ++ int_lock->locked_nodes[child_index] &= LOCKED_NO; + + } + +@@ -331,35 +296,27 @@ static int + afr_unlock_inodelk (call_frame_t *frame, xlator_t *this) + { + afr_internal_lock_t *int_lock = NULL; +- afr_inodelk_t *inodelk = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct gf_flock flock = {0,}; +- struct gf_flock full_flock = {0,}; +- struct gf_flock *flock_use = NULL; + int call_count = 0; + int i = 0; +- int piggyback = 0; +- afr_fd_ctx_t *fd_ctx = NULL; +- + + local = frame->local; + int_lock = &local->internal_lock; + priv = this->private; + +- inodelk = afr_get_inodelk (int_lock, int_lock->domain); +- +- flock.l_start = inodelk->flock.l_start; +- flock.l_len = inodelk->flock.l_len; ++ flock.l_start = int_lock->flock.l_start; ++ flock.l_len = int_lock->flock.l_len; + flock.l_type = F_UNLCK; + +- full_flock.l_type = F_UNLCK; +- call_count = afr_locked_nodes_count (inodelk->locked_nodes, ++ call_count = afr_locked_nodes_count (int_lock->locked_nodes, + priv->child_count); + + int_lock->lk_call_count = call_count; + + if (!call_count) { ++ GF_ASSERT (!local->transaction.do_eager_unlock); + gf_msg_trace (this->name, 0, + "No internal locks unlocked"); + +@@ -367,64 +324,28 @@ afr_unlock_inodelk (call_frame_t *frame, 
xlator_t *this) + goto out; + } + +- if (local->fd) +- fd_ctx = afr_fd_ctx_get (local->fd, this); +- + for (i = 0; i < priv->child_count; i++) { +- if ((inodelk->locked_nodes[i] & LOCKED_YES) != LOCKED_YES) ++ if ((int_lock->locked_nodes[i] & LOCKED_YES) != LOCKED_YES) + continue; + + if (local->fd) { +- flock_use = &flock; +- if (!local->transaction.eager_lock[i]) { +- goto wind; +- } +- +- piggyback = 0; +- +- LOCK (&local->fd->lock); +- { +- if (fd_ctx->lock_piggyback[i]) { +- fd_ctx->lock_piggyback[i]--; +- piggyback = 1; +- } else { +- fd_ctx->lock_acquired[i]--; +- } +- } +- UNLOCK (&local->fd->lock); +- +- if (piggyback) { +- afr_unlock_inodelk_cbk (frame, (void *) (long) i, +- this, 1, 0, NULL); +- if (!--call_count) +- break; +- continue; +- } +- +- flock_use = &full_flock; +- wind: + STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, + (void *) (long)i, + priv->children[i], + priv->children[i]->fops->finodelk, + int_lock->domain, local->fd, +- F_SETLK, flock_use, NULL); +- +- if (!--call_count) +- break; +- ++ F_SETLK, &flock, NULL); + } else { +- + STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk, + (void *) (long)i, + priv->children[i], + priv->children[i]->fops->inodelk, + int_lock->domain, &local->loc, + F_SETLK, &flock, NULL); +- +- if (!--call_count) +- break; + } ++ ++ if (!--call_count) ++ break; + } + out: + return 0; +@@ -512,6 +433,18 @@ out: + + } + ++int32_t ++afr_unlock_now (call_frame_t *frame, xlator_t *this) ++{ ++ afr_local_t *local = frame->local; ++ ++ if (afr_is_inodelk_transaction(local->transaction.type)) ++ afr_unlock_inodelk (frame, this); ++ else ++ afr_unlock_entrylk (frame, this); ++ return 0; ++} ++ + static int32_t + afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) +@@ -553,7 +486,7 @@ afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + + if ((op_ret == -1) && + (op_errno == ENOSYS)) { +- afr_unlock (frame, this); ++ afr_unlock_now (frame, this); 
+ } else { + if (op_ret == 0) { + if (local->transaction.type == AFR_ENTRY_TRANSACTION || +@@ -598,38 +531,6 @@ afr_blocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + return 0; + } + +-static int +-afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this) +-{ +- afr_internal_lock_t *int_lock = NULL; +- afr_inodelk_t *inodelk = NULL; +- afr_local_t *local = NULL; +- afr_private_t *priv = NULL; +- +- priv = this->private; +- local = frame->local; +- int_lock = &local->internal_lock; +- +- switch (local->transaction.type) { +- case AFR_DATA_TRANSACTION: +- case AFR_METADATA_TRANSACTION: +- inodelk = afr_get_inodelk (int_lock, int_lock->domain); +- memcpy (inodelk->locked_nodes, int_lock->locked_nodes, +- sizeof (*inodelk->locked_nodes) * priv->child_count); +- inodelk->lock_count = int_lock->lock_count; +- break; +- +- case AFR_ENTRY_RENAME_TRANSACTION: +- case AFR_ENTRY_TRANSACTION: +- /*entrylk_count is being used in both non-blocking and blocking +- * modes */ +- break; +- } +- +- return 0; +- +-} +- + static gf_boolean_t + afr_is_entrylk (afr_transaction_type trans_type) + { +@@ -733,7 +634,6 @@ int + afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) + { + afr_internal_lock_t *int_lock = NULL; +- afr_inodelk_t *inodelk = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + struct gf_flock flock = {0,}; +@@ -752,10 +652,9 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) + + + if (!is_entrylk) { +- inodelk = afr_get_inodelk (int_lock, int_lock->domain); +- flock.l_start = inodelk->flock.l_start; +- flock.l_len = inodelk->flock.l_len; +- flock.l_type = inodelk->flock.l_type; ++ flock.l_start = int_lock->flock.l_start; ++ flock.l_len = int_lock->flock.l_len; ++ flock.l_type = int_lock->flock.l_type; + } + + if (local->fd) { +@@ -770,9 +669,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) + local->op_ret = -1; + int_lock->lock_op_ret = -1; + +- afr_copy_locked_nodes 
(frame, this); +- +- afr_unlock (frame, this); ++ afr_unlock_now (frame, this); + + return 0; + } +@@ -784,9 +681,7 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) + local->op_ret = -1; + int_lock->lock_op_ret = -1; + +- afr_copy_locked_nodes (frame, this); +- +- afr_unlock(frame, this); ++ afr_unlock_now(frame, this); + + return 0; + } +@@ -798,8 +693,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) + gf_msg_debug (this->name, 0, + "we're done locking"); + +- afr_copy_locked_nodes (frame, this); +- + int_lock->lock_op_ret = 0; + int_lock->lock_cbk (frame, this); + return 0; +@@ -815,7 +708,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) + case AFR_METADATA_TRANSACTION: + + if (local->fd) { +- + STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk, + (void *) (long) child_index, + priv->children[child_index], +@@ -824,7 +716,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) + F_SETLKW, &flock, NULL); + + } else { +- + STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk, + (void *) (long) child_index, + priv->children[child_index], +@@ -841,7 +732,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) + *and 'fd-less' children */ + + if (local->fd) { +- + STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, + (void *) (long) cookie, + priv->children[child_index], +@@ -850,7 +740,6 @@ afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie) + int_lock->lockee[lockee_no].basename, + ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL); + } else { +- + STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk, + (void *) (long) cookie, + priv->children[child_index], +@@ -922,7 +811,6 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + local = frame->local; + int_lock = &local->internal_lock; + +- + LOCK (&frame->lock); + { + if (op_ret < 0 ) { +@@ -969,7 +857,7 @@ afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, 
+ "with blocking calls", + int_lock->lock_count); + +- afr_unlock(frame, this); ++ afr_unlock_now(frame, this); + } + } + +@@ -1009,7 +897,7 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) + local->op_errno = EINVAL; + int_lock->lock_op_errno = EINVAL; + +- afr_unlock (frame, this); ++ afr_unlock_now (frame, this); + return -1; + } + +@@ -1021,7 +909,7 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) + gf_msg (this->name, GF_LOG_INFO, 0, + AFR_MSG_INFO_COMMON, + "fd not open on any subvolumes. aborting."); +- afr_unlock (frame, this); ++ afr_unlock_now (frame, this); + goto out; + } + +@@ -1031,7 +919,6 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) + index = i%copies; + lockee_no = i/copies; + if (local->child_up[index]) { +- + STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk, + (void *) (long) i, + priv->children[index], +@@ -1053,7 +940,6 @@ afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this) + index = i%copies; + lockee_no = i/copies; + if (local->child_up[index]) { +- + STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk, + (void *) (long) i, + priv->children[index], +@@ -1077,18 +963,12 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) + { + afr_internal_lock_t *int_lock = NULL; +- afr_inodelk_t *inodelk = NULL; + afr_local_t *local = NULL; +- afr_fd_ctx_t *fd_ctx = NULL; + int call_count = 0; + int child_index = (long) cookie; + + local = frame->local; + int_lock = &local->internal_lock; +- inodelk = afr_get_inodelk (int_lock, int_lock->domain); +- +- if (local->fd) +- fd_ctx = afr_fd_ctx_get (local->fd, this); + + LOCK (&frame->lock); + { +@@ -1105,43 +985,27 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int_lock->lock_op_errno = op_errno; + local->op_errno = op_errno; + } +- if (local->transaction.eager_lock) +- local->transaction.eager_lock[child_index] = 0; + } else 
{ +- inodelk->locked_nodes[child_index] |= LOCKED_YES; +- inodelk->lock_count++; +- +- if (local->transaction.eager_lock && +- local->transaction.eager_lock[child_index] && +- local->fd) { +- /* piggybacked */ +- if (op_ret == 1) { +- /* piggybacked */ +- } else if (op_ret == 0) { +- /* lock acquired from server */ +- fd_ctx->lock_acquired[child_index]++; +- } +- } +- +- if (local->transaction.type == AFR_DATA_TRANSACTION && +- op_ret == 0) { +- LOCK(&local->inode->lock); +- { +- local->inode_ctx->lock_count++; +- } +- UNLOCK (&local->inode->lock); +- } ++ int_lock->locked_nodes[child_index] |= LOCKED_YES; ++ int_lock->lock_count++; + } + + call_count = --int_lock->lk_call_count; + } + UNLOCK (&frame->lock); + ++ if (op_ret == 0 && local->transaction.type == AFR_DATA_TRANSACTION) { ++ LOCK (&local->inode->lock); ++ { ++ local->inode_ctx->lock_count++; ++ } ++ UNLOCK (&local->inode->lock); ++ } + if (call_count == 0) { + gf_msg_trace (this->name, 0, + "Last inode locking reply received"); + /* all locks successful. Proceed to call FOP */ +- if (inodelk->lock_count == int_lock->lk_expected_count) { ++ if (int_lock->lock_count == int_lock->lk_expected_count) { + gf_msg_trace (this->name, 0, + "All servers locked. 
Calling the cbk"); + int_lock->lock_op_ret = 0; +@@ -1155,7 +1019,7 @@ afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + "Trying again with blocking calls", + int_lock->lock_count); + +- afr_unlock(frame, this); ++ afr_unlock_now(frame, this); + } + } + +@@ -1166,30 +1030,17 @@ int + afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) + { + afr_internal_lock_t *int_lock = NULL; +- afr_inodelk_t *inodelk = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_fd_ctx_t *fd_ctx = NULL; + int32_t call_count = 0; + int i = 0; + int ret = 0; +- struct gf_flock flock = {0,}; +- struct gf_flock full_flock = {0,}; +- struct gf_flock *flock_use = NULL; +- int piggyback = 0; + + local = frame->local; + int_lock = &local->internal_lock; + priv = this->private; + +- inodelk = afr_get_inodelk (int_lock, int_lock->domain); +- +- flock.l_start = inodelk->flock.l_start; +- flock.l_len = inodelk->flock.l_len; +- flock.l_type = inodelk->flock.l_type; +- +- full_flock.l_type = inodelk->flock.l_type; +- + initialize_inodelk_variables (frame, this); + + if (local->fd) { +@@ -1205,88 +1056,48 @@ afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this) + local->op_errno = EINVAL; + int_lock->lock_op_errno = EINVAL; + +- afr_unlock (frame, this); ++ afr_unlock_now (frame, this); + ret = -1; + goto out; + } ++ } + +- call_count = internal_lock_count (frame, this); +- int_lock->lk_call_count = call_count; +- int_lock->lk_expected_count = call_count; +- +- if (!call_count) { +- gf_msg (this->name, GF_LOG_INFO, 0, +- AFR_MSG_SUBVOLS_DOWN, +- "All bricks are down, aborting."); +- afr_unlock (frame, this); +- goto out; +- } +- +- /* Send non-blocking inodelk calls only on up children +- and where the fd has been opened */ +- for (i = 0; i < priv->child_count; i++) { +- if (!local->child_up[i]) +- continue; +- +- flock_use = &flock; +- if (!local->transaction.eager_lock_on) { +- goto wind; +- } +- +- piggyback = 0; +- 
local->transaction.eager_lock[i] = 1; +- +- afr_set_delayed_post_op (frame, this); ++ call_count = internal_lock_count (frame, this); ++ int_lock->lk_call_count = call_count; ++ int_lock->lk_expected_count = call_count; + +- LOCK (&local->fd->lock); +- { +- if (fd_ctx->lock_acquired[i]) { +- fd_ctx->lock_piggyback[i]++; +- piggyback = 1; +- } +- } +- UNLOCK (&local->fd->lock); ++ if (!call_count) { ++ gf_msg (this->name, GF_LOG_INFO, 0, ++ AFR_MSG_SUBVOLS_DOWN, ++ "All bricks are down, aborting."); ++ afr_unlock_now (frame, this); ++ goto out; ++ } + +- if (piggyback) { +- /* (op_ret == 1) => indicate piggybacked lock */ +- afr_nonblocking_inodelk_cbk (frame, (void *) (long) i, +- this, 1, 0, NULL); +- if (!--call_count) +- break; +- continue; +- } +- flock_use = &full_flock; +- wind: ++ /* Send non-blocking inodelk calls only on up children ++ and where the fd has been opened */ ++ for (i = 0; i < priv->child_count; i++) { ++ if (!local->child_up[i]) ++ continue; + ++ if (local->fd) { + STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->finodelk, + int_lock->domain, local->fd, +- F_SETLK, flock_use, NULL); +- +- if (!--call_count) +- break; +- } +- } else { +- call_count = internal_lock_count (frame, this); +- int_lock->lk_call_count = call_count; +- int_lock->lk_expected_count = call_count; +- +- for (i = 0; i < priv->child_count; i++) { +- if (!local->child_up[i]) +- continue; ++ F_SETLK, &int_lock->flock, NULL); ++ } else { + + STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->inodelk, + int_lock->domain, &local->loc, +- F_SETLK, &flock, NULL); +- +- if (!--call_count) +- break; ++ F_SETLK, &int_lock->flock, NULL); + } ++ if (!--call_count) ++ break; + } + out: + return ret; +@@ -1296,13 +1107,32 @@ int32_t + afr_unlock (call_frame_t *frame, xlator_t *this) + { + afr_local_t *local = NULL; ++ afr_lock_t *lock = NULL; 
+ + local = frame->local; + +- if (afr_is_inodelk_transaction(local->transaction.type)) +- afr_unlock_inodelk (frame, this); +- else +- afr_unlock_entrylk (frame, this); ++ if (!local->transaction.eager_lock_on) ++ goto out; ++ lock = &local->inode_ctx->lock[local->transaction.type]; ++ LOCK (&local->inode->lock); ++ { ++ list_del_init (&local->transaction.owner_list); ++ if (list_empty (&lock->owners) && list_empty (&lock->post_op)) { ++ local->transaction.do_eager_unlock = _gf_true; ++ /*TODO: Need to get metadata use on_disk and inherit/uninherit ++ *GF_ASSERT (!local->inode_ctx->on_disk[local->transaction.type]); ++ *GF_ASSERT (!local->inode_ctx->inherited[local->transaction.type]); ++ */ ++ GF_ASSERT (lock->release); ++ } ++ } ++ UNLOCK (&local->inode->lock); ++ if (!local->transaction.do_eager_unlock) { ++ local->internal_lock.lock_cbk (frame, this); ++ return 0; ++ } + ++out: ++ afr_unlock_now (frame, this); + return 0; + } +diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c +index f61b237..32fd24a 100644 +--- a/xlators/cluster/afr/src/afr-self-heal-common.c ++++ b/xlators/cluster/afr/src/afr-self-heal-common.c +@@ -2463,6 +2463,7 @@ afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid) + int data_ret = 1; + int or_ret = 0; + inode_t *inode = NULL; ++ fd_t *fd = NULL; + gf_boolean_t data_selfheal = _gf_false; + gf_boolean_t metadata_selfheal = _gf_false; + gf_boolean_t entry_selfheal = _gf_false; +@@ -2487,8 +2488,16 @@ afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid) + goto out; + } + ++ if (inode->ia_type == IA_IFREG) { ++ ret = afr_selfheal_data_open (this, inode, &fd); ++ if (!fd) { ++ ret = -EIO; ++ goto out; ++ } ++ } ++ + if (data_selfheal && dataheal_enabled) +- data_ret = afr_selfheal_data (frame, this, inode); ++ data_ret = afr_selfheal_data (frame, this, fd); + + if (metadata_selfheal && priv->metadata_self_heal) + metadata_ret = afr_selfheal_metadata (frame, 
this, inode); +@@ -2510,6 +2519,8 @@ afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid) + out: + if (inode) + inode_unref (inode); ++ if (fd) ++ fd_unref (fd); + return ret; + } + /* +diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c +index bcd0dec..f872a98 100644 +--- a/xlators/cluster/afr/src/afr-self-heal-data.c ++++ b/xlators/cluster/afr/src/afr-self-heal-data.c +@@ -856,22 +856,15 @@ out: + } + + int +-afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode) ++afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd) + { + afr_private_t *priv = NULL; + unsigned char *locked_on = NULL; + int ret = 0; +- fd_t *fd = NULL; ++ inode_t *inode = fd->inode; + + priv = this->private; + +- ret = afr_selfheal_data_open (this, inode, &fd); +- if (!fd) { +- gf_msg_debug (this->name, -ret, "%s: Failed to open", +- uuid_utoa (inode->gfid)); +- return -EIO; +- } +- + locked_on = alloca0 (priv->child_count); + + ret = afr_selfheal_tie_breaker_inodelk (frame, this, inode, +@@ -898,8 +891,5 @@ unlock: + afr_selfheal_uninodelk (frame, this, inode, priv->sh_domain, 0, 0, + locked_on); + +- if (fd) +- fd_unref (fd); +- + return ret; + } +diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h +index 188a334..b015976 100644 +--- a/xlators/cluster/afr/src/afr-self-heal.h ++++ b/xlators/cluster/afr/src/afr-self-heal.h +@@ -102,7 +102,7 @@ afr_selfheal_name (xlator_t *this, uuid_t gfid, const char *name, + void *gfid_req, dict_t *xdata); + + int +-afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode); ++afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd); + + int + afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode); +diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c +index acbfe1a..993029d 100644 +--- a/xlators/cluster/afr/src/afr-transaction.c ++++ 
b/xlators/cluster/afr/src/afr-transaction.c +@@ -25,6 +25,18 @@ typedef enum { + AFR_TRANSACTION_POST_OP, + } afr_xattrop_type_t; + ++static void ++afr_lock_resume_shared (struct list_head *list); ++ ++void ++__afr_transaction_wake_shared (afr_local_t *local, struct list_head *shared); ++ ++void ++afr_changelog_post_op (call_frame_t *frame, xlator_t *this); ++ ++int ++afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this); ++ + gf_boolean_t + afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this); + +@@ -168,13 +180,14 @@ afr_transaction_fop (call_frame_t *frame, xlator_t *this) + return 0; + } + +- + int + afr_transaction_done (call_frame_t *frame, xlator_t *this) + { +- afr_local_t *local = NULL; +- afr_private_t *priv = NULL; +- gf_boolean_t unwind = _gf_false; ++ afr_local_t *local = NULL; ++ afr_private_t *priv = NULL; ++ gf_boolean_t unwind = _gf_false; ++ afr_lock_t *lock = NULL; ++ afr_local_t *lock_local = NULL; + + priv = this->private; + local = frame->local; +@@ -188,6 +201,31 @@ afr_transaction_done (call_frame_t *frame, xlator_t *this) + if (unwind)/*It definitely did post-op*/ + afr_zero_fill_stat (local); + } ++ ++ if (local->transaction.do_eager_unlock) { ++ lock = &local->inode_ctx->lock[local->transaction.type]; ++ LOCK (&local->inode->lock); ++ { ++ lock->acquired = _gf_false; ++ lock->release = _gf_false; ++ list_splice_init (&lock->frozen, ++ &lock->waiting); ++ if (list_empty (&lock->waiting)) ++ goto unlock; ++ lock_local = list_entry (lock->waiting.next, ++ afr_local_t, ++ transaction.wait_list); ++ list_del_init (&lock_local->transaction.wait_list); ++ list_add (&lock_local->transaction.owner_list, ++ &lock->owners); ++ } ++unlock: ++ UNLOCK (&local->inode->lock); ++ } ++ if (lock_local) { ++ afr_lock (lock_local->transaction.frame, ++ lock_local->transaction.frame->this); ++ } + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); +@@ -195,6 +233,52 @@ afr_transaction_done (call_frame_t *frame, 
xlator_t *this) + return 0; + } + ++static void ++afr_lock_fail_shared (afr_local_t *local, struct list_head *list) ++{ ++ afr_local_t *each = NULL; ++ ++ while (!list_empty(list)) { ++ each = list_entry (list->next, afr_local_t, ++ transaction.wait_list); ++ list_del_init(&each->transaction.wait_list); ++ each->op_ret = -1; ++ each->op_errno = local->op_errno; ++ afr_transaction_done (each->transaction.frame, ++ each->transaction.frame->this); ++ } ++} ++ ++static void ++afr_handle_lock_acquire_failure (afr_local_t *local, gf_boolean_t locked) ++{ ++ struct list_head shared; ++ afr_lock_t *lock = NULL; ++ ++ if (!local->transaction.eager_lock_on) ++ goto out; ++ ++ lock = &local->inode_ctx->lock[local->transaction.type]; ++ ++ INIT_LIST_HEAD (&shared); ++ LOCK (&local->inode->lock); ++ { ++ list_splice_init (&lock->waiting, &shared); ++ } ++ UNLOCK (&local->inode->lock); ++ ++ afr_lock_fail_shared (local, &shared); ++ local->transaction.do_eager_unlock = _gf_true; ++out: ++ if (locked) { ++ local->internal_lock.lock_cbk = afr_transaction_done; ++ afr_unlock (local->transaction.frame, ++ local->transaction.frame->this); ++ } else { ++ afr_transaction_done (local->transaction.frame, ++ local->transaction.frame->this); ++ } ++} + + call_frame_t* + afr_transaction_detach_fop_frame (call_frame_t *frame) +@@ -334,6 +418,7 @@ afr_txn_arbitrate_fop (call_frame_t *frame, xlator_t *this) + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + int pre_op_sources_count = 0; ++ int i = 0; + + priv = this->private; + local = frame->local; +@@ -345,11 +430,11 @@ afr_txn_arbitrate_fop (call_frame_t *frame, xlator_t *this) + /* If arbiter is the only source, do not proceed. 
*/ + if (pre_op_sources_count < 2 && + local->transaction.pre_op_sources[ARBITER_BRICK_INDEX]) { +- local->internal_lock.lock_cbk = afr_transaction_done; + local->op_ret = -1; + local->op_errno = ENOTCONN; +- afr_restore_lk_owner (frame); +- afr_unlock (frame, this); ++ for (i = 0; i < priv->child_count; i++) ++ local->transaction.failed_subvols[i] = 1; ++ afr_changelog_post_op (frame, this);/*uninherit should happen*/ + } else { + afr_transaction_fop (frame, this); + } +@@ -362,14 +447,16 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this) + { + afr_local_t *local = NULL; + afr_private_t *priv = NULL; +- fd_t *fd = NULL; + int i = 0; + int ret = 0; ++ int failure_count = 0; ++ struct list_head shared; ++ afr_lock_t *lock = NULL; + + local = frame->local; + priv = this->private; +- fd = local->fd; + ++ INIT_LIST_HEAD (&shared); + if (local->transaction.type == AFR_DATA_TRANSACTION && + !local->transaction.inherited) { + ret = afr_write_subvol_set (frame, this); +@@ -394,22 +481,31 @@ afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this) + just now, before OP */ + afr_changelog_pre_op_update (frame, this); + +- /* The wake up needs to happen independent of +- what type of fop arrives here. If it was +- a write, then it has already inherited the +- lock and changelog. If it was not a write, +- then the presumption of the optimization (of +- optimizing for successive write operations) +- fails. 
+- */ +- if (fd) +- afr_delayed_changelog_wake_up (this, fd); ++ if (!local->transaction.eager_lock_on || ++ local->transaction.inherited) ++ goto fop; ++ failure_count = AFR_COUNT (local->transaction.failed_subvols, ++ priv->child_count); ++ if (failure_count == priv->child_count) { ++ afr_handle_lock_acquire_failure (local, _gf_true); ++ } else { ++ lock = &local->inode_ctx->lock[local->transaction.type]; ++ LOCK (&local->inode->lock); ++ { ++ lock->acquired = _gf_true; ++ __afr_transaction_wake_shared (local, &shared); ++ } ++ UNLOCK (&local->inode->lock); ++ } ++ ++fop: + if (priv->arbiter_count == 1) { + afr_txn_arbitrate_fop (frame, this); + } else { + afr_transaction_fop (frame, this); + } + ++ afr_lock_resume_shared (&shared); + return 0; + } + +@@ -486,30 +582,14 @@ afr_changelog_post_op_done (call_frame_t *frame, xlator_t *this) + } + + +-afr_inodelk_t* +-afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom) +-{ +- afr_inodelk_t *inodelk = NULL; +- int i = 0; +- +- for (i = 0; int_lock->inodelk[i].domain; i++) { +- inodelk = &int_lock->inodelk[i]; +- if (strcmp (dom, inodelk->domain) == 0) +- return inodelk; +- } +- return NULL; +-} +- + unsigned char* + afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock) + { + unsigned char *locked_nodes = NULL; +- afr_inodelk_t *inodelk = NULL; + switch (type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: +- inodelk = afr_get_inodelk (int_lock, int_lock->domain); +- locked_nodes = inodelk->locked_nodes; ++ locked_nodes = int_lock->locked_nodes; + break; + + case AFR_ENTRY_TRANSACTION: +@@ -834,27 +914,19 @@ afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this) + { + afr_local_t *local = NULL; + afr_private_t *priv = NULL; +- fd_t *fd = NULL; ++ afr_inode_ctx_t *ctx = NULL; + int i = 0; + gf_boolean_t ret = _gf_false; +- afr_fd_ctx_t *fd_ctx = NULL; + int type = 0; + + local = frame->local; + priv = this->private; +- fd = local->fd; ++ ctx = 
local->inode_ctx; + + type = afr_index_for_transaction_type (local->transaction.type); + if (type != AFR_DATA_TRANSACTION) + return !local->transaction.dirtied; + +- if (!fd) +- return !local->transaction.dirtied; +- +- fd_ctx = afr_fd_ctx_get (fd, this); +- if (!fd_ctx) +- return _gf_false; +- + if (local->transaction.no_uninherit) + return _gf_false; + +@@ -868,34 +940,34 @@ afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this) + if (local->transaction.uninherit_done) + return local->transaction.uninherit_value; + +- LOCK(&fd->lock); ++ LOCK(&local->inode->lock); + { + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] != +- fd_ctx->pre_op_done[type][i]) { ++ ctx->pre_op_done[type][i]) { + ret = !local->transaction.dirtied; + goto unlock; + } + } + +- if (fd_ctx->inherited[type]) { ++ if (ctx->inherited[type]) { + ret = _gf_true; +- fd_ctx->inherited[type]--; +- } else if (fd_ctx->on_disk[type]) { ++ ctx->inherited[type]--; ++ } else if (ctx->on_disk[type]) { + ret = _gf_false; +- fd_ctx->on_disk[type]--; ++ ctx->on_disk[type]--; + } else { + /* ASSERT */ + ret = _gf_false; + } + +- if (!fd_ctx->inherited[type] && !fd_ctx->on_disk[type]) { ++ if (!ctx->inherited[type] && !ctx->on_disk[type]) { + for (i = 0; i < priv->child_count; i++) +- fd_ctx->pre_op_done[type][i] = 0; ++ ctx->pre_op_done[type][i] = 0; + } + } + unlock: +- UNLOCK(&fd->lock); ++ UNLOCK(&local->inode->lock); + + local->transaction.uninherit_done = _gf_true; + local->transaction.uninherit_value = ret; +@@ -909,31 +981,21 @@ afr_changelog_pre_op_inherit (call_frame_t *frame, xlator_t *this) + { + afr_local_t *local = NULL; + afr_private_t *priv = NULL; +- fd_t *fd = NULL; + int i = 0; + gf_boolean_t ret = _gf_false; +- afr_fd_ctx_t *fd_ctx = NULL; + int type = 0; + + local = frame->local; + priv = this->private; +- fd = local->fd; + + if (local->transaction.type != AFR_DATA_TRANSACTION) + return _gf_false; + + type = afr_index_for_transaction_type 
(local->transaction.type); + +- if (!fd) +- return _gf_false; +- +- fd_ctx = afr_fd_ctx_get (fd, this); +- if (!fd_ctx) +- return _gf_false; +- +- LOCK(&fd->lock); ++ LOCK(&local->inode->lock); + { +- if (!fd_ctx->on_disk[type]) { ++ if (!local->inode_ctx->on_disk[type]) { + /* nothing to inherit yet */ + ret = _gf_false; + goto unlock; +@@ -941,21 +1003,21 @@ afr_changelog_pre_op_inherit (call_frame_t *frame, xlator_t *this) + + for (i = 0; i < priv->child_count; i++) { + if (local->transaction.pre_op[i] != +- fd_ctx->pre_op_done[type][i]) { ++ local->inode_ctx->pre_op_done[type][i]) { + /* either inherit exactly, or don't */ + ret = _gf_false; + goto unlock; + } + } + +- fd_ctx->inherited[type]++; ++ local->inode_ctx->inherited[type]++; + + ret = _gf_true; + + local->transaction.inherited = _gf_true; + } + unlock: +- UNLOCK(&fd->lock); ++ UNLOCK(&local->inode->lock); + + return ret; + } +@@ -966,22 +1028,16 @@ afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this) + { + afr_local_t *local = NULL; + afr_private_t *priv = NULL; +- fd_t *fd = NULL; +- afr_fd_ctx_t *fd_ctx = NULL; + int i = 0; + gf_boolean_t ret = _gf_false; + int type = 0; + + local = frame->local; + priv = this->private; +- fd = local->fd; + +- if (!fd) +- return _gf_false; +- +- fd_ctx = afr_fd_ctx_get (fd, this); +- if (!fd_ctx) +- return _gf_false; ++ if (local->transaction.type == AFR_ENTRY_TRANSACTION || ++ local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) ++ return _gf_false; + + if (local->transaction.inherited) + /* was already inherited in afr_changelog_pre_op */ +@@ -997,26 +1053,26 @@ afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this) + + ret = _gf_false; + +- LOCK(&fd->lock); ++ LOCK(&local->inode->lock); + { +- if (!fd_ctx->on_disk[type]) { ++ if (!local->inode_ctx->on_disk[type]) { + for (i = 0; i < priv->child_count; i++) +- fd_ctx->pre_op_done[type][i] = ++ local->inode_ctx->pre_op_done[type][i] = + (!local->transaction.failed_subvols[i]); + } else { 
+ for (i = 0; i < priv->child_count; i++) +- if (fd_ctx->pre_op_done[type][i] != ++ if (local->inode_ctx->pre_op_done[type][i] != + (!local->transaction.failed_subvols[i])) { + local->transaction.no_uninherit = 1; + goto unlock; + } + } +- fd_ctx->on_disk[type]++; ++ local->inode_ctx->on_disk[type]++; + + ret = _gf_true; + } + unlock: +- UNLOCK(&fd->lock); ++ UNLOCK(&local->inode->lock); + + return ret; + } +@@ -1324,6 +1380,9 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) + + afr_init_optimistic_changelog_for_txn (this, local); + ++ if (afr_changelog_pre_op_inherit (frame, this)) ++ goto next; ++ + /* This condition should not be met with present code, as + * transaction.done will be called if locks are not acquired on even a + * single node. +@@ -1349,9 +1408,6 @@ afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) + goto err; + } + +- if (afr_changelog_pre_op_inherit (frame, this)) +- goto next; +- + if (call_count < priv->child_count) + pre_nop = _gf_false; + +@@ -1408,7 +1464,7 @@ err: + local->op_ret = -1; + local->op_errno = op_errno; + +- afr_unlock (frame, this); ++ afr_handle_lock_acquire_failure (local, _gf_true); + + if (xdata_req) + dict_unref (xdata_req); +@@ -1418,31 +1474,6 @@ err: + + + int +-afr_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) +-{ +- afr_internal_lock_t *int_lock = NULL; +- afr_local_t *local = NULL; +- +- local = frame->local; +- int_lock = &local->internal_lock; +- +- if (int_lock->lock_op_ret < 0) { +- gf_msg (this->name, GF_LOG_INFO, +- 0, AFR_MSG_BLOCKING_LKS_FAILED, +- "Blocking inodelks failed."); +- afr_transaction_done (frame, this); +- } else { +- +- gf_msg_debug (this->name, 0, +- "Blocking inodelks done. 
Proceeding to FOP"); +- afr_internal_lock_finish (frame, this); +- } +- +- return 0; +-} +- +- +-int + afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) + { + afr_internal_lock_t *int_lock = NULL; +@@ -1455,7 +1486,7 @@ afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) + if (int_lock->lock_op_ret < 0) { + gf_msg_debug (this->name, 0, + "Non blocking inodelks failed. Proceeding to blocking"); +- int_lock->lock_cbk = afr_post_blocking_inodelk_cbk; ++ int_lock->lock_cbk = afr_internal_lock_finish; + afr_blocking_lock (frame, this); + } else { + +@@ -1469,31 +1500,6 @@ afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this) + + + int +-afr_post_blocking_entrylk_cbk (call_frame_t *frame, xlator_t *this) +-{ +- afr_internal_lock_t *int_lock = NULL; +- afr_local_t *local = NULL; +- +- local = frame->local; +- int_lock = &local->internal_lock; +- +- if (int_lock->lock_op_ret < 0) { +- gf_msg (this->name, GF_LOG_INFO, 0, +- AFR_MSG_BLOCKING_LKS_FAILED, +- "Blocking entrylks failed."); +- afr_transaction_done (frame, this); +- } else { +- +- gf_msg_debug (this->name, 0, +- "Blocking entrylks done. Proceeding to FOP"); +- afr_internal_lock_finish (frame, this); +- } +- +- return 0; +-} +- +- +-int + afr_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this) + { + afr_internal_lock_t *int_lock = NULL; +@@ -1506,7 +1512,7 @@ afr_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this) + if (int_lock->lock_op_ret < 0) { + gf_msg_debug (this->name, 0, + "Non blocking entrylks failed. 
Proceeding to blocking"); +- int_lock->lock_cbk = afr_post_blocking_entrylk_cbk; ++ int_lock->lock_cbk = afr_internal_lock_finish; + afr_blocking_lock (frame, this); + } else { + +@@ -1567,29 +1573,28 @@ int + afr_set_transaction_flock (xlator_t *this, afr_local_t *local) + { + afr_internal_lock_t *int_lock = NULL; +- afr_inodelk_t *inodelk = NULL; + afr_private_t *priv = NULL; + + int_lock = &local->internal_lock; +- inodelk = afr_get_inodelk (int_lock, int_lock->domain); + priv = this->private; + +- if ((priv->arbiter_count || priv->full_lock) && ++ if ((priv->arbiter_count || local->transaction.eager_lock_on || ++ priv->full_lock) && + local->transaction.type == AFR_DATA_TRANSACTION) { + /*Lock entire file to avoid network split brains.*/ +- inodelk->flock.l_len = 0; +- inodelk->flock.l_start = 0; ++ int_lock->flock.l_len = 0; ++ int_lock->flock.l_start = 0; + } else { +- inodelk->flock.l_len = local->transaction.len; +- inodelk->flock.l_start = local->transaction.start; ++ int_lock->flock.l_len = local->transaction.len; ++ int_lock->flock.l_start = local->transaction.start; + } +- inodelk->flock.l_type = F_WRLCK; ++ int_lock->flock.l_type = F_WRLCK; + + return 0; + } + + int +-afr_lock_rec (call_frame_t *frame, xlator_t *this) ++afr_lock (call_frame_t *frame, xlator_t *this) + { + afr_internal_lock_t *int_lock = NULL; + afr_local_t *local = NULL; +@@ -1630,74 +1635,153 @@ afr_lock_rec (call_frame_t *frame, xlator_t *this) + return 0; + } + ++static gf_boolean_t ++afr_locals_overlap (afr_local_t *local1, afr_local_t *local2) ++{ ++ uint64_t start1 = local1->transaction.start; ++ uint64_t start2 = local2->transaction.start; ++ uint64_t end1 = 0; ++ uint64_t end2 = 0; ++ ++ if (local1->transaction.len) ++ end1 = start1 + local1->transaction.len - 1; ++ else ++ end1 = ULLONG_MAX; ++ ++ if (local2->transaction.len) ++ end2 = start2 + local2->transaction.len - 1; ++ else ++ end2 = ULLONG_MAX; + +-int +-afr_lock (call_frame_t *frame, xlator_t *this) ++ return ((end1 
>= start2) && (end2 >= start1)); ++} ++ ++gf_boolean_t ++afr_has_lock_conflict (afr_local_t *local, gf_boolean_t waitlist_check) + { +- afr_set_lock_number (frame, this); ++ afr_local_t *each = NULL; ++ afr_lock_t *lock = NULL; + +- return afr_lock_rec (frame, this); ++ lock = &local->inode_ctx->lock[local->transaction.type]; ++ /* ++ * Once full file lock is acquired in eager-lock phase, overlapping ++ * writes do not compete for inode-locks, instead are transferred to the ++ * next writes. Because of this overlapping writes are not ordered. ++ * This can cause inconsistencies in replication. ++ * Example: ++ * Two overlapping writes w1, w2 are sent in parallel on same fd ++ * in two threads t1, t2. ++ * Both threads can execute afr_writev_wind in the following manner. ++ * t1 winds w1 on brick-0 ++ * t2 winds w2 on brick-0 ++ * t2 winds w2 on brick-1 ++ * t1 winds w1 on brick-1 ++ * ++ * This check makes sure the locks are not transferred for ++ * overlapping writes. ++ */ ++ list_for_each_entry (each, &lock->owners, transaction.owner_list) { ++ if (afr_locals_overlap (each, local)) { ++ return _gf_true; ++ } ++ } ++ ++ if (!waitlist_check) ++ return _gf_false; ++ list_for_each_entry (each, &lock->waiting, transaction.wait_list) { ++ if (afr_locals_overlap (each, local)) { ++ return _gf_true; ++ } ++ } ++ return _gf_false; + } + + + /* }}} */ +- +-int +-afr_internal_lock_finish (call_frame_t *frame, xlator_t *this) ++static void ++afr_copy_inodelk_vars (afr_internal_lock_t *dst, afr_internal_lock_t *src, ++ xlator_t *this) + { +- afr_changelog_pre_op (frame, this); ++ afr_private_t *priv = this->private; + +- return 0; ++ dst->domain = src->domain; ++ dst->flock.l_len = src->flock.l_len; ++ dst->flock.l_start = src->flock.l_start; ++ dst->flock.l_type = src->flock.l_type; ++ dst->lock_count = src->lock_count; ++ memcpy (dst->locked_nodes, src->locked_nodes, ++ priv->child_count * sizeof (*dst->locked_nodes)); + } + +- + void +-afr_set_delayed_post_op 
(call_frame_t *frame, xlator_t *this) ++__afr_transaction_wake_shared (afr_local_t *local, struct list_head *shared) + { +- afr_local_t *local = NULL; +- afr_private_t *priv = NULL; ++ gf_boolean_t conflict = _gf_false; ++ afr_local_t *each = NULL; ++ afr_lock_t *lock = &local->inode_ctx->lock[local->transaction.type]; + +- /* call this function from any of the related optimizations +- which benefit from delaying post op are enabled, namely: +- +- - changelog piggybacking +- - eager locking +- */ ++ while (!conflict) { ++ if (list_empty (&lock->waiting)) ++ return; ++ each = list_entry(lock->waiting.next, afr_local_t, ++ transaction.wait_list); ++ if (afr_has_lock_conflict (each, _gf_false)) { ++ conflict = _gf_true; ++ } ++ if (conflict && !list_empty (&lock->owners)) ++ return; ++ afr_copy_inodelk_vars (&each->internal_lock, ++ &local->internal_lock, ++ each->transaction.frame->this); ++ list_move_tail (&each->transaction.wait_list, shared); ++ list_add_tail(&each->transaction.owner_list, &lock->owners); ++ } ++} + +- priv = this->private; +- if (!priv) +- return; ++static void ++afr_lock_resume_shared (struct list_head *list) ++{ ++ afr_local_t *each = NULL; + +- if (!priv->post_op_delay_secs) +- return; ++ while (!list_empty(list)) { ++ each = list_entry(list->next, afr_local_t, ++ transaction.wait_list); ++ list_del_init(&each->transaction.wait_list); ++ afr_changelog_pre_op (each->transaction.frame, ++ each->transaction.frame->this); ++ } ++} + +- local = frame->local; +- if (!local) +- return; ++int ++afr_internal_lock_finish (call_frame_t *frame, xlator_t *this) ++{ ++ afr_local_t *local = frame->local; ++ afr_lock_t *lock = NULL; + +- if (!local->transaction.eager_lock_on) +- return; + +- if (!local->fd) +- return; ++ local->internal_lock.lock_cbk = NULL; ++ if (!local->transaction.eager_lock_on) { ++ if (local->internal_lock.lock_op_ret < 0) { ++ afr_transaction_done (frame, this); ++ return 0; ++ } ++ afr_changelog_pre_op (frame, this); ++ } else { ++ 
lock = &local->inode_ctx->lock[local->transaction.type]; ++ if (local->internal_lock.lock_op_ret < 0) { ++ afr_handle_lock_acquire_failure (local, _gf_false); ++ } else { ++ lock->event_generation = local->event_generation; ++ afr_changelog_pre_op (frame, this); ++ } ++ } + +- if (local->op == GF_FOP_WRITE) +- local->delayed_post_op = _gf_true; ++ return 0; + } + + gf_boolean_t +-afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this) ++afr_are_multiple_fds_opened (afr_local_t *local, xlator_t *this) + { +- afr_fd_ctx_t *fd_ctx = NULL; +- +- if (!fd) { +- /* If false is returned, it may keep on taking eager-lock +- * which may lead to starvation, so return true to avoid that. +- */ +- gf_msg_callingfn (this->name, GF_LOG_ERROR, EBADF, +- AFR_MSG_INVALID_ARG, "Invalid fd"); +- return _gf_true; +- } + /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock + * is taken mount2 opened the same file, it won't be able to + * perform any data operations until mount1 releases eager-lock. 
+@@ -1705,11 +1789,7 @@ afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this) + * if open-fd-count is > 1 + */ + +- fd_ctx = afr_fd_ctx_get (fd, this); +- if (!fd_ctx) +- return _gf_true; +- +- if (fd_ctx->open_fd_count > 1) ++ if (local->inode_ctx->open_fd_count > 1) + return _gf_true; + + return _gf_false; +@@ -1717,24 +1797,45 @@ afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this) + + + gf_boolean_t +-is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this) ++afr_is_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this, ++ int delay) + { +- afr_local_t *local = NULL; +- gf_boolean_t res = _gf_false; ++ afr_local_t *local = NULL; ++ afr_lock_t *lock = NULL; ++ gf_boolean_t res = _gf_false; + + local = frame->local; +- if (!local) ++ lock = &local->inode_ctx->lock[local->transaction.type]; ++ ++ if (!afr_txn_nothing_failed (frame, this)) { ++ lock->release = _gf_true; + goto out; ++ } + +- if (!local->delayed_post_op) ++ if (afr_are_multiple_fds_opened (local, this)) { ++ lock->release = _gf_true; + goto out; ++ } + +- //Mark pending changelog ASAP +- if (!afr_txn_nothing_failed (frame, this)) ++ if (!list_empty (&lock->owners)) ++ goto out; ++ else ++ GF_ASSERT (list_empty (&lock->waiting)); ++ ++ if (lock->release) { ++ goto out; ++ } ++ ++ if (!delay) { + goto out; ++ } + +- if (local->fd && afr_are_multiple_fds_opened (local->fd, this)) ++ if ((local->op != GF_FOP_WRITE) && ++ (local->op != GF_FOP_FXATTROP)) { ++ /*Only allow writes but shard does [f]xattrops on writes, so ++ * they are fine too*/ + goto out; ++ } + + res = _gf_true; + out: +@@ -1745,50 +1846,61 @@ out: + void + afr_delayed_changelog_wake_up_cbk (void *data) + { +- fd_t *fd = NULL; ++ afr_lock_t *lock = NULL; ++ afr_local_t *local = data; ++ afr_local_t *timer_local = NULL; ++ struct list_head shared; + +- fd = data; +- +- afr_delayed_changelog_wake_up (THIS, fd); ++ INIT_LIST_HEAD (&shared); ++ lock = &local->inode_ctx->lock[local->transaction.type]; 
++ LOCK (&local->inode->lock); ++ { ++ timer_local = list_entry(lock->post_op.next, ++ afr_local_t, ++ transaction.owner_list); ++ if (list_empty (&lock->owners) && (local == timer_local)) { ++ GF_ASSERT (list_empty (&lock->waiting)); ++ /*Last owner*/ ++ lock->release = _gf_true; ++ lock->delay_timer = NULL; ++ } ++ } ++ UNLOCK (&local->inode->lock); ++ afr_changelog_post_op_now (local->transaction.frame, ++ local->transaction.frame->this); + } + + + /* SET operation */ + int +-afr_fd_report_unstable_write (xlator_t *this, fd_t *fd) ++afr_fd_report_unstable_write (xlator_t *this, afr_local_t *local) + { +- afr_fd_ctx_t *fdctx = NULL; +- +- fdctx = afr_fd_ctx_get (fd, this); +- +- LOCK(&fd->lock); ++ LOCK(&local->inode->lock); + { +- fdctx->witnessed_unstable_write = _gf_true; ++ local->inode_ctx->witnessed_unstable_write = _gf_true; + } +- UNLOCK(&fd->lock); ++ UNLOCK(&local->inode->lock); + + return 0; + } + + /* TEST and CLEAR operation */ + gf_boolean_t +-afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd) ++afr_fd_has_witnessed_unstable_write (xlator_t *this, inode_t *inode) + { +- afr_fd_ctx_t *fdctx = NULL; ++ afr_inode_ctx_t *ctx = NULL; + gf_boolean_t witness = _gf_false; + +- fdctx = afr_fd_ctx_get (fd, this); +- if (!fdctx) +- return _gf_true; +- +- LOCK(&fd->lock); ++ LOCK(&inode->lock); + { +- if (fdctx->witnessed_unstable_write) { ++ (void)__afr_inode_ctx_get (this, inode, &ctx); ++ ++ if (ctx->witnessed_unstable_write) { + witness = _gf_true; +- fdctx->witnessed_unstable_write = _gf_false; ++ ctx->witnessed_unstable_write = _gf_false; + } + } +- UNLOCK (&fd->lock); ++ UNLOCK (&inode->lock); + + return witness; + } +@@ -1931,7 +2043,7 @@ afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this) + mark a flag in the fdctx whenever an unstable write is witnessed. 
+ */ + +- if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) { ++ if (!afr_fd_has_witnessed_unstable_write (this, local->inode)) { + afr_changelog_post_op_now (frame, this); + return 0; + } +@@ -1949,87 +2061,64 @@ afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this) + return 0; + } + +- + void +-afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd, +- call_stub_t *stub) ++afr_changelog_post_op (call_frame_t *frame, xlator_t *this) + { +- afr_fd_ctx_t *fd_ctx = NULL; +- call_frame_t *prev_frame = NULL; +- struct timespec delta = {0, }; +- afr_private_t *priv = NULL; +- afr_local_t *local = NULL; ++ struct timespec delta = {0, }; ++ afr_private_t *priv = NULL; ++ afr_local_t *local = frame->local; ++ afr_lock_t *lock = NULL; ++ gf_boolean_t post_op = _gf_true; ++ struct list_head shared; + + priv = this->private; +- +- fd_ctx = afr_fd_ctx_get (fd, this); +- if (!fd_ctx) +- goto out; +- + delta.tv_sec = priv->post_op_delay_secs; + delta.tv_nsec = 0; + +- pthread_mutex_lock (&fd_ctx->delay_lock); +- { +- prev_frame = fd_ctx->delay_frame; +- fd_ctx->delay_frame = NULL; +- if (fd_ctx->delay_timer) +- gf_timer_call_cancel (this->ctx, fd_ctx->delay_timer); +- fd_ctx->delay_timer = NULL; +- if (!frame) +- goto unlock; +- fd_ctx->delay_timer = gf_timer_call_after (this->ctx, delta, +- afr_delayed_changelog_wake_up_cbk, +- fd); +- fd_ctx->delay_frame = frame; +- } +-unlock: +- pthread_mutex_unlock (&fd_ctx->delay_lock); +- +-out: +- if (prev_frame) { +- local = prev_frame->local; +- local->transaction.resume_stub = stub; +- afr_changelog_post_op_now (prev_frame, this); +- } else if (stub) { +- call_resume (stub); +- } +-} +- +- +-void +-afr_changelog_post_op (call_frame_t *frame, xlator_t *this) +-{ +- afr_local_t *local = NULL; +- +- local = frame->local; +- +- if (is_afr_delayed_changelog_post_op_needed (frame, this)) +- afr_delayed_changelog_post_op (this, frame, local->fd, NULL); +- else +- afr_changelog_post_op_safe (frame, 
this); +-} +- ++ INIT_LIST_HEAD (&shared); ++ if (!local->transaction.eager_lock_on) ++ goto out; + ++ lock = &local->inode_ctx->lock[local->transaction.type]; ++ LOCK (&local->inode->lock); ++ { ++ list_del_init (&local->transaction.owner_list); ++ list_add (&local->transaction.owner_list, &lock->post_op); ++ __afr_transaction_wake_shared (local, &shared); ++ ++ if (!afr_is_delayed_changelog_post_op_needed (frame, this, ++ delta.tv_sec)) { ++ if (list_empty (&lock->owners)) ++ lock->release = _gf_true; ++ goto unlock; ++ } + +-/* Wake up the sleeping/delayed post-op, and also register +- a stub to have it resumed after this transaction +- completely finishes. ++ GF_ASSERT (lock->delay_timer == NULL); ++ lock->delay_timer = gf_timer_call_after (this->ctx, delta, ++ afr_delayed_changelog_wake_up_cbk, ++ local); ++ if (!lock->delay_timer) { ++ lock->release = _gf_true; ++ } else { ++ post_op = _gf_false; ++ } + +- The @stub gets saved in @local and gets resumed in +- afr_local_cleanup() +- */ +-void +-afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub) +-{ +- afr_delayed_changelog_post_op (this, NULL, fd, stub); +-} ++ } ++unlock: ++ UNLOCK (&local->inode->lock); + ++ if (!list_empty (&shared)) { ++ afr_lock_resume_shared (&shared); ++ } + +-void +-afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd) +-{ +- afr_delayed_changelog_post_op (this, NULL, fd, NULL); ++out: ++ if (post_op) { ++ if (!local->transaction.eager_lock_on || lock->release) { ++ afr_changelog_post_op_safe (frame, this); ++ } else { ++ afr_changelog_post_op_now (frame, this); ++ } ++ } + } + + int +@@ -2039,13 +2128,6 @@ afr_transaction_resume (call_frame_t *frame, xlator_t *this) + + local = frame->local; + +- if (local->transaction.eager_lock_on) { +- /* We don't need to retain "local" in the +- fd list anymore, writes to all subvols +- are finished by now */ +- afr_remove_eager_lock_stub (local); +- } +- + afr_restore_lk_owner (frame); + + 
afr_handle_symmetric_errors (frame, this); +@@ -2076,114 +2158,149 @@ afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, + local->transaction.failed_subvols[child_index] = 1; + } + +- +- + static gf_boolean_t +-afr_locals_overlap (afr_local_t *local1, afr_local_t *local2) ++__need_previous_lock_unlocked (afr_local_t *local) + { +- uint64_t start1 = local1->transaction.start; +- uint64_t start2 = local2->transaction.start; +- uint64_t end1 = 0; +- uint64_t end2 = 0; +- +- if (local1->transaction.len) +- end1 = start1 + local1->transaction.len - 1; +- else +- end1 = ULLONG_MAX; ++ afr_lock_t *lock = NULL; + +- if (local2->transaction.len) +- end2 = start2 + local2->transaction.len - 1; +- else +- end2 = ULLONG_MAX; ++ if (!local->transaction.eager_lock_on) ++ return _gf_true; + +- return ((end1 >= start2) && (end2 >= start1)); ++ lock = &local->inode_ctx->lock[local->transaction.type]; ++ if (!lock->acquired) ++ return _gf_false; ++ if (lock->acquired && lock->event_generation != local->event_generation) ++ return _gf_true; ++ return _gf_false; + } + + void +-afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this) ++__afr_eager_lock_handle (afr_local_t *local, gf_boolean_t *take_lock, ++ gf_boolean_t *do_pre_op, afr_local_t **timer_local) + { +- afr_private_t *priv = NULL; +- afr_fd_ctx_t *fdctx = NULL; +- afr_local_t *each = NULL; ++ afr_lock_t *lock = NULL; ++ afr_local_t *owner_local = NULL; ++ xlator_t *this = local->transaction.frame->this; + +- priv = this->private; +- +- if (!local->fd) +- return; +- +- if (local->transaction.type != AFR_DATA_TRANSACTION) +- return; ++ if (local->fd && !afr_are_multiple_fds_opened (local, this)) { ++ local->transaction.eager_lock_on = _gf_true; ++ } + +- if (!priv->eager_lock) +- return; ++ lock = &local->inode_ctx->lock[local->transaction.type]; ++ if (__need_previous_lock_unlocked (local)) { ++ if (!list_empty (&lock->owners)) { ++ lock->release = _gf_true; ++ } else if (lock->delay_timer) { ++ 
lock->release = _gf_true; ++ if (gf_timer_call_cancel (this->ctx, ++ lock->delay_timer)) { ++ /* It will be put in frozen list ++ * in the code flow below*/ ++ } else { ++ *timer_local = list_entry(lock->post_op.next, ++ afr_local_t, ++ transaction.owner_list); ++ lock->delay_timer = NULL; ++ } ++ } ++ if (!local->transaction.eager_lock_on) ++ goto out; ++ } + +- fdctx = afr_fd_ctx_get (local->fd, this); +- if (!fdctx) +- return; ++ if (lock->release) { ++ list_add_tail (&local->transaction.wait_list, ++ &lock->frozen); ++ *take_lock = _gf_false; ++ goto out; ++ } + +- if (afr_are_multiple_fds_opened (local->fd, this)) +- return; +- /* +- * Once full file lock is acquired in eager-lock phase, overlapping +- * writes do not compete for inode-locks, instead are transferred to the +- * next writes. Because of this overlapping writes are not ordered. +- * This can cause inconsistencies in replication. +- * Example: +- * Two overlapping writes w1, w2 are sent in parallel on same fd +- * in two threads t1, t2. +- * Both threads can execute afr_writev_wind in the following manner. +- * t1 winds w1 on brick-0 +- * t2 winds w2 on brick-0 +- * t2 winds w2 on brick-1 +- * t1 winds w1 on brick-1 +- * +- * This check makes sure the locks are not transferred for +- * overlapping writes. 
+- */ +- LOCK (&local->fd->lock); +- { +- list_for_each_entry (each, &fdctx->eager_locked, +- transaction.eager_locked) { +- if (afr_locals_overlap (each, local)) { +- local->transaction.eager_lock_on = _gf_false; +- goto unlock; +- } ++ if (lock->delay_timer) { ++ *take_lock = _gf_false; ++ if (gf_timer_call_cancel (this->ctx, ++ lock->delay_timer)) { ++ list_add_tail (&local->transaction.wait_list, ++ &lock->frozen); ++ } else { ++ *timer_local = list_entry(lock->post_op.next, ++ afr_local_t, ++ transaction.owner_list); ++ afr_copy_inodelk_vars (&local->internal_lock, ++ &(*timer_local)->internal_lock, ++ this); ++ lock->delay_timer = NULL; ++ *do_pre_op = _gf_true; ++ list_add_tail (&local->transaction.owner_list, ++ &lock->owners); + } ++ goto out; ++ } + +- local->transaction.eager_lock_on = _gf_true; +- list_add_tail (&local->transaction.eager_locked, +- &fdctx->eager_locked); ++ if (!list_empty (&lock->owners)) { ++ if (!lock->acquired || ++ afr_has_lock_conflict (local, _gf_true)) { ++ list_add_tail (&local->transaction.wait_list, ++ &lock->waiting); ++ *take_lock = _gf_false; ++ goto out; ++ } ++ owner_local = list_entry (lock->owners.next, ++ afr_local_t, ++ transaction.owner_list); ++ afr_copy_inodelk_vars (&local->internal_lock, ++ &owner_local->internal_lock, ++ this); ++ *take_lock = _gf_false; ++ *do_pre_op = _gf_true; + } +-unlock: +- UNLOCK (&local->fd->lock); ++ ++ if (lock->acquired) ++ GF_ASSERT (!(*take_lock)); ++ list_add_tail (&local->transaction.owner_list, &lock->owners); ++out: ++ return; + } + + void +-afr_transaction_start (call_frame_t *frame, xlator_t *this) ++afr_transaction_start (afr_local_t *local, xlator_t *this) + { +- afr_local_t *local = frame->local; +- fd_t *fd = NULL; ++ afr_private_t *priv = NULL; ++ gf_boolean_t take_lock = _gf_true; ++ gf_boolean_t do_pre_op = _gf_false; ++ afr_local_t *timer_local = NULL; + +- afr_transaction_eager_lock_init (local, this); ++ priv = this->private; + +- if (local->fd && 
local->transaction.eager_lock_on) +- afr_set_lk_owner (frame, this, local->fd); +- else +- afr_set_lk_owner (frame, this, frame->root); ++ if (local->transaction.type != AFR_DATA_TRANSACTION && ++ local->transaction.type != AFR_METADATA_TRANSACTION) ++ goto lock_phase; + +- if (!local->transaction.eager_lock_on && local->loc.inode) { +- fd = fd_lookup (local->loc.inode, frame->root->pid); +- if (fd == NULL) +- fd = fd_lookup_anonymous (local->loc.inode, +- GF_ANON_FD_FLAGS); ++ if (!priv->eager_lock) ++ goto lock_phase; + +- if (fd) { +- afr_delayed_changelog_wake_up (this, fd); +- fd_unref (fd); +- } ++ LOCK (&local->inode->lock); ++ { ++ __afr_eager_lock_handle (local, &take_lock, &do_pre_op, ++ &timer_local); + } ++ UNLOCK (&local->inode->lock); ++lock_phase: ++ if (!local->transaction.eager_lock_on) { ++ afr_set_lk_owner (local->transaction.frame, this, ++ local->transaction.frame->root); ++ } else { ++ afr_set_lk_owner (local->transaction.frame, this, local->inode); ++ } ++ + +- afr_lock (frame, this); ++ if (take_lock) { ++ afr_lock (local->transaction.frame, this); ++ } else if (do_pre_op) { ++ afr_changelog_pre_op (local->transaction.frame, this); ++ } ++ /*Always call delayed_changelog_wake_up_cbk after calling pre-op above ++ * so that any inheriting can happen*/ ++ if (timer_local) ++ afr_delayed_changelog_wake_up_cbk (timer_local); + } + + int +@@ -2196,7 +2313,7 @@ afr_write_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err) + goto fail; + } + +- afr_transaction_start (frame, this); ++ afr_transaction_start (local, this); + return 0; + fail: + local->transaction.unwind (frame, this); +@@ -2214,6 +2331,7 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) + + local = frame->local; + priv = this->private; ++ local->transaction.frame = frame; + + local->transaction.type = type; + +@@ -2226,11 +2344,9 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) + if (ret < 0) + goto out; + 
+- if (type == AFR_ENTRY_TRANSACTION || +- type == AFR_ENTRY_RENAME_TRANSACTION) { +- afr_transaction_start (frame, this); +- ret = 0; +- goto out; ++ ++ if (type != AFR_METADATA_TRANSACTION) { ++ goto txn_start; + } + + ret = afr_inode_get_readable (frame, local->inode, this, +@@ -2240,10 +2356,13 @@ afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) + event_generation)) { + afr_inode_refresh (frame, this, local->inode, local->loc.gfid, + afr_write_txn_refresh_done); +- } else { +- afr_transaction_start (frame, this); ++ ret = 0; ++ goto out; + } ++ ++txn_start: + ret = 0; ++ afr_transaction_start (local, this); + out: + return ret; + } +diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h +index ddcb1eb..a27e9a3 100644 +--- a/xlators/cluster/afr/src/afr-transaction.h ++++ b/xlators/cluster/afr/src/afr-transaction.h +@@ -17,12 +17,6 @@ void + afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this, + int child_index); + +-int +-afr_lock_server_count (afr_private_t *priv, afr_transaction_type type); +- +-afr_inodelk_t* +-afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom); +- + int32_t + afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type); + +@@ -30,9 +24,6 @@ int + afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending); + + void +-afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this); +- +-void + afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd); + + void +@@ -57,4 +48,8 @@ afr_pick_error_xdata (afr_local_t *local, afr_private_t *priv, + inode_t *inode2, unsigned char *readable2); + int + afr_transaction_resume (call_frame_t *frame, xlator_t *this); ++int ++afr_lock (call_frame_t *frame, xlator_t *this); ++void ++afr_delayed_changelog_wake_up_cbk (void *data); + #endif /* __TRANSACTION_H__ */ +diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h +index 5ff57c0..6be59dc 100644 +--- 
a/xlators/cluster/afr/src/afr.h ++++ b/xlators/cluster/afr/src/afr.h +@@ -230,19 +230,12 @@ int + afr_entry_lockee_cmp (const void *l1, const void *l2); + + typedef struct { +- char *domain; /* Domain on which inodelk is taken */ +- struct gf_flock flock; +- unsigned char *locked_nodes; +- int32_t lock_count; +-} afr_inodelk_t; +- +-typedef struct { + loc_t *lk_loc; + + int lockee_count; + afr_entry_lockee_t lockee[AFR_LOCKEE_COUNT_MAX]; + +- afr_inodelk_t inodelk[AFR_DOM_COUNT_MAX]; ++ struct gf_flock flock; + const char *lk_basename; + const char *lower_basename; + const char *higher_basename; +@@ -255,7 +248,6 @@ typedef struct { + int32_t lock_count; + int32_t entrylk_lock_count; + +- uint64_t lock_number; + int32_t lk_call_count; + int32_t lk_expected_count; + int32_t lk_attempted_count; +@@ -292,37 +284,9 @@ typedef enum { + } afr_fd_open_status_t; + + typedef struct { +- unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS]; +- int inherited[AFR_NUM_CHANGE_LOGS]; +- int on_disk[AFR_NUM_CHANGE_LOGS]; + afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */ +- +- unsigned int *lock_piggyback; +- unsigned int *lock_acquired; +- + int flags; + +- /* used for delayed-post-op optimization */ +- pthread_mutex_t delay_lock; +- gf_timer_t *delay_timer; +- call_frame_t *delay_frame; +- +- /* set if any write on this fd was a non stable write +- (i.e, without O_SYNC or O_DSYNC) +- */ +- gf_boolean_t witnessed_unstable_write; +- +- /* @open_fd_count: +- Number of open FDs queried from the server, as queried through +- xdata in FOPs. Currently, used to decide if eager-locking must be +- temporarily disabled. +- */ +- uint32_t open_fd_count; +- +- +- /* list of frames currently in progress */ +- struct list_head eager_locked; +- + /* the subvolume on which the latest sequence of readdirs (starting + at offset 0) has begun. Till the next readdir request with 0 offset + arrives, we continue to read off this subvol. 
+@@ -336,6 +300,20 @@ typedef enum { + AFR_FOP_LOCK_QUORUM_FAILED, + } afr_fop_lock_state_t; + ++typedef struct _afr_inode_lock_t { ++ unsigned int event_generation; ++ gf_boolean_t release; ++ gf_boolean_t acquired; ++ gf_timer_t *delay_timer; ++ struct list_head owners; /*Transactions that are performing fop*/ ++ struct list_head post_op;/*Transactions that are done with the fop ++ *So can not conflict with the fops*/ ++ struct list_head waiting;/*Transaction that are waiting for ++ *conflicting transactions to complete*/ ++ struct list_head frozen;/*Transactions that need to go as part of ++ * next batch of eager-lock*/ ++} afr_lock_t; ++ + typedef struct _afr_inode_ctx { + uint64_t read_subvol; + uint64_t write_subvol; +@@ -343,6 +321,23 @@ typedef struct _afr_inode_ctx { + int spb_choice; + gf_timer_t *timer; + gf_boolean_t need_refresh; ++ unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS]; ++ int inherited[AFR_NUM_CHANGE_LOGS]; ++ int on_disk[AFR_NUM_CHANGE_LOGS]; ++ ++ /* set if any write on this fd was a non stable write ++ (i.e, without O_SYNC or O_DSYNC) ++ */ ++ gf_boolean_t witnessed_unstable_write; ++ ++ /* @open_fd_count: ++ Number of open FDs queried from the server, as queried through ++ xdata in FOPs. Currently, used to decide if eager-locking must be ++ temporarily disabled. ++ */ ++ uint32_t open_fd_count; ++ /*Only 2 types of transactions support eager-locks now. DATA/METADATA*/ ++ afr_lock_t lock[2]; + } afr_inode_ctx_t; + + +@@ -457,7 +452,6 @@ typedef struct _afr_local { + dict_t *dict; + + int optimistic_change_log; +- gf_boolean_t delayed_post_op; + + /* Is the current writev() going to perform a stable write? 
+ i.e, is fd->flags or @flags writev param have O_SYNC or +@@ -693,7 +687,7 @@ typedef struct _afr_local { + off_t start, len; + + gf_boolean_t eager_lock_on; +- int *eager_lock; ++ gf_boolean_t do_eager_unlock; + + char *basename; + char *new_basename; +@@ -707,7 +701,8 @@ typedef struct _afr_local { + of the transaction frame */ + call_stub_t *resume_stub; + +- struct list_head eager_locked; ++ struct list_head owner_list; ++ struct list_head wait_list; + + unsigned char *pre_op; + +@@ -768,7 +763,8 @@ typedef struct _afr_local { + */ + afr_changelog_resume_t changelog_resume; + +- call_frame_t *main_frame; ++ call_frame_t *main_frame; /*Fop frame*/ ++ call_frame_t *frame; /*Transaction frame*/ + + int (*wind) (call_frame_t *frame, xlator_t *this, int subvol); + +@@ -1009,7 +1005,7 @@ afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd); + afr_local_cleanup (frame->local, THIS); \ + mem_put (frame->local); \ + frame->local = NULL; }; \ +- frame->local;}) ++ frame->local; }) + + #define AFR_STACK_RESET(frame) \ + do { \ +@@ -1096,22 +1092,10 @@ afr_filter_xattrs (dict_t *xattr); + #define AFR_QUORUM_AUTO INT_MAX + + int +-afr_fd_report_unstable_write (xlator_t *this, fd_t *fd); ++afr_fd_report_unstable_write (xlator_t *this, afr_local_t *local); + + gf_boolean_t +-afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd); +- +-void +-afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub); +- +-int +-afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count); +- +-void +-afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this); +- +-void +-afr_remove_eager_lock_stub (afr_local_t *local); ++afr_fd_has_witnessed_unstable_write (xlator_t *this, inode_t *inode); + + void + afr_reply_wipe (struct afr_reply *reply); +-- +1.8.3.1 + diff --git a/0195-storage-posix-Add-active-fd-count-option-in-gluster.patch b/0195-storage-posix-Add-active-fd-count-option-in-gluster.patch new file mode 100644 index 0000000..3b9d53a --- /dev/null +++ 
b/0195-storage-posix-Add-active-fd-count-option-in-gluster.patch @@ -0,0 +1,226 @@ +From 0ce89d9d2bb0b162ecd4dc47c663569815acdb7b Mon Sep 17 00:00:00 2001 +From: Pranith Kumar K +Date: Mon, 19 Mar 2018 15:12:14 +0530 +Subject: [PATCH 195/201] storage/posix: Add active-fd-count option in gluster + +Problem: +when dd happens on sharded replicate volume all the writes on shards happen +through anon-fd. When the writes don't come quick enough, old anon-fd closes +and new fd gets created to serve the new writes. open-fd-count is decremented +only after the fd is closed as part of fd_destroy(). So even when one fd is on +the way to be closed a new fd will be created and during this short period it +appears as though there are multiple fds opened on the file. AFR thinks another +application opened the same file and switches off eager-lock leading to +extra latency. + +Fix: +Have a different option called active-fd whose life cycle starts at +fd_bind() and ends just before fd_destroy() + + >BUG: 1557932 + +Upstream-patch: https://review.gluster.org/19740 +BUG: 1491785 +Change-Id: I2e221f6030feeedf29fbb3bd6554673b8a5b9c94 +Signed-off-by: Pranith Kumar K +Reviewed-on: https://code.engineering.redhat.com/gerrit/133659 +Tested-by: RHGS Build Bot +--- + libglusterfs/src/fd.c | 2 ++ + libglusterfs/src/glusterfs.h | 1 + + libglusterfs/src/inode.c | 2 ++ + libglusterfs/src/inode.h | 1 + + tests/volume.rc | 14 ++++++++- + xlators/storage/posix/src/posix-helpers.c | 52 ++++++++++++------------------- + xlators/storage/posix/src/posix.c | 12 +++++++ + 7 files changed, 51 insertions(+), 33 deletions(-) + +diff --git a/libglusterfs/src/fd.c b/libglusterfs/src/fd.c +index a824db7..45b0d32 100644 +--- a/libglusterfs/src/fd.c ++++ b/libglusterfs/src/fd.c +@@ -557,6 +557,7 @@ fd_unref (fd_t *fd) + if (refcount == 0) { + if (!list_empty (&fd->inode_list)) { + list_del_init (&fd->inode_list); ++ fd->inode->active_fd_count--; + bound = _gf_true; + } + } +@@ -578,6 +579,7 @@ __fd_bind (fd_t 
*fd) + list_del_init (&fd->inode_list); + list_add (&fd->inode_list, &fd->inode->fd_list); + fd->inode->fd_count++; ++ fd->inode->active_fd_count++; + + return fd; + } +diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h +index c8835d9..5abfafa 100644 +--- a/libglusterfs/src/glusterfs.h ++++ b/libglusterfs/src/glusterfs.h +@@ -164,6 +164,7 @@ + #define GLUSTERFS_WRITE_IS_APPEND "glusterfs.write-is-append" + #define GLUSTERFS_WRITE_UPDATE_ATOMIC "glusterfs.write-update-atomic" + #define GLUSTERFS_OPEN_FD_COUNT "glusterfs.open-fd-count" ++#define GLUSTERFS_ACTIVE_FD_COUNT "glusterfs.open-active-fd-count" + #define GLUSTERFS_INODELK_COUNT "glusterfs.inodelk-count" + #define GLUSTERFS_ENTRYLK_COUNT "glusterfs.entrylk-count" + #define GLUSTERFS_POSIXLK_COUNT "glusterfs.posixlk-count" +diff --git a/libglusterfs/src/inode.c b/libglusterfs/src/inode.c +index b7b5ac6..ffba1bf 100644 +--- a/libglusterfs/src/inode.c ++++ b/libglusterfs/src/inode.c +@@ -2344,6 +2344,8 @@ inode_dump (inode_t *inode, char *prefix) + gf_proc_dump_write("gfid", "%s", uuid_utoa (inode->gfid)); + gf_proc_dump_write("nlookup", "%ld", inode->nlookup); + gf_proc_dump_write("fd-count", "%u", inode->fd_count); ++ gf_proc_dump_write("active-fd-count", "%u", ++ inode->active_fd_count); + gf_proc_dump_write("ref", "%u", inode->ref); + gf_proc_dump_write("ia_type", "%d", inode->ia_type); + if (inode->_ctx) { +diff --git a/libglusterfs/src/inode.h b/libglusterfs/src/inode.h +index b82b6ba..7a87748 100644 +--- a/libglusterfs/src/inode.h ++++ b/libglusterfs/src/inode.h +@@ -93,6 +93,7 @@ struct _inode { + gf_lock_t lock; + uint64_t nlookup; + uint32_t fd_count; /* Open fd count */ ++ uint32_t active_fd_count; /* Active open fd count */ + uint32_t ref; /* reference count on this inode */ + ia_type_t ia_type; /* what kind of file */ + struct list_head fd_list; /* list of open files on this inode */ +diff --git a/tests/volume.rc b/tests/volume.rc +index a15c8e5..d57aa93 100644 +--- 
a/tests/volume.rc ++++ b/tests/volume.rc +@@ -804,7 +804,19 @@ function get_fd_count { + local fname=$4 + local gfid_str=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $brick/$fname)) + local statedump=$(generate_brick_statedump $vol $host $brick) +- local count=$(grep "gfid=$gfid_str" $statedump -A2 | grep fd-count | cut -f2 -d'=' | tail -1) ++ local count=$(grep "gfid=$gfid_str" $statedump -A2 -B1 | grep $brick -A3 | grep -w fd-count | cut -f2 -d'=' | tail -1) ++ rm -f $statedump ++ echo $count ++} ++ ++function get_active_fd_count { ++ local vol=$1 ++ local host=$2 ++ local brick=$3 ++ local fname=$4 ++ local gfid_str=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $brick/$fname)) ++ local statedump=$(generate_brick_statedump $vol $host $brick) ++ local count=$(grep "gfid=$gfid_str" $statedump -A2 -B1 | grep $brick -A3 | grep -w active-fd-count | cut -f2 -d'=' | tail -1) + rm -f $statedump + echo $count + } +diff --git a/xlators/storage/posix/src/posix-helpers.c b/xlators/storage/posix/src/posix-helpers.c +index bc97206..ba1d8c3 100644 +--- a/xlators/storage/posix/src/posix-helpers.c ++++ b/xlators/storage/posix/src/posix-helpers.c +@@ -388,27 +388,6 @@ _get_filler_inode (posix_xattr_filler_t *filler) + } + + static int +-_posix_filler_get_openfd_count (posix_xattr_filler_t *filler, char *key) +-{ +- inode_t *inode = NULL; +- int ret = -1; +- +- inode = _get_filler_inode (filler); +- if (!inode || gf_uuid_is_null (inode->gfid)) +- goto out; +- +- ret = dict_set_uint32 (filler->xattr, key, inode->fd_count); +- if (ret < 0) { +- gf_msg (filler->this->name, GF_LOG_WARNING, 0, +- P_MSG_DICT_SET_FAILED, +- "Failed to set dictionary value for %s", key); +- goto out; +- } +-out: +- return ret; +-} +- +-static int + _posix_xattr_get_set (dict_t *xattr_req, char *key, data_t *data, + void *xattrargs) + { +@@ -416,11 +395,11 @@ _posix_xattr_get_set (dict_t *xattr_req, char *key, data_t *data, + int ret = -1; + char *databuf = NULL; + int _fd = -1; +- loc_t *loc = NULL; + ssize_t 
req_size = 0; + int32_t list_offset = 0; + ssize_t remaining_size = 0; + char *xattr = NULL; ++ inode_t *inode = NULL; + + if (posix_xattr_ignorable (key)) + goto out; +@@ -496,16 +475,25 @@ _posix_xattr_get_set (dict_t *xattr_req, char *key, data_t *data, + GF_FREE (databuf); + } + } else if (!strcmp (key, GLUSTERFS_OPEN_FD_COUNT)) { +- ret = _posix_filler_get_openfd_count (filler, key); +- loc = filler->loc; +- if (loc) { +- ret = dict_set_uint32 (filler->xattr, key, +- loc->inode->fd_count); +- if (ret < 0) +- gf_msg (filler->this->name, GF_LOG_WARNING, 0, +- P_MSG_XDATA_GETXATTR, +- "Failed to set dictionary value for %s", +- key); ++ inode = _get_filler_inode (filler); ++ if (!inode || gf_uuid_is_null (inode->gfid)) ++ goto out; ++ ret = dict_set_uint32 (filler->xattr, key, inode->fd_count); ++ if (ret < 0) { ++ gf_msg (filler->this->name, GF_LOG_WARNING, 0, ++ P_MSG_DICT_SET_FAILED, ++ "Failed to set dictionary value for %s", key); ++ } ++ } else if (!strcmp (key, GLUSTERFS_ACTIVE_FD_COUNT)) { ++ inode = _get_filler_inode (filler); ++ if (!inode || gf_uuid_is_null (inode->gfid)) ++ goto out; ++ ret = dict_set_uint32 (filler->xattr, key, ++ inode->active_fd_count); ++ if (ret < 0) { ++ gf_msg (filler->this->name, GF_LOG_WARNING, 0, ++ P_MSG_DICT_SET_FAILED, ++ "Failed to set dictionary value for %s", key); + } + } else if (!strcmp (key, GET_ANCESTRY_PATH_KEY)) { + /* As of now, the only consumers of POSIX_ANCESTRY_PATH attempt +diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c +index a412e6d..6856e5e 100644 +--- a/xlators/storage/posix/src/posix.c ++++ b/xlators/storage/posix/src/posix.c +@@ -3554,6 +3554,18 @@ _fill_writev_xdata (fd_t *fd, dict_t *xdata, xlator_t *this, int is_append) + } + } + ++ if (dict_get (xdata, GLUSTERFS_ACTIVE_FD_COUNT)) { ++ ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_ACTIVE_FD_COUNT, ++ fd->inode->active_fd_count); ++ if (ret < 0) { ++ gf_msg (this->name, GF_LOG_WARNING, 0, ++ P_MSG_DICT_SET_FAILED, 
"%s: Failed to set " ++ "dictionary value for %s", ++ uuid_utoa (fd->inode->gfid), ++ GLUSTERFS_ACTIVE_FD_COUNT); ++ } ++ } ++ + if (dict_get (xdata, GLUSTERFS_WRITE_IS_APPEND)) { + ret = dict_set_uint32 (rsp_xdata, GLUSTERFS_WRITE_IS_APPEND, + is_append); +-- +1.8.3.1 + diff --git a/0196-cluster-afr-Switch-to-active-fd-count-for-open-fd-ch.patch b/0196-cluster-afr-Switch-to-active-fd-count-for-open-fd-ch.patch new file mode 100644 index 0000000..7040dba --- /dev/null +++ b/0196-cluster-afr-Switch-to-active-fd-count-for-open-fd-ch.patch @@ -0,0 +1,83 @@ +From 5b32d4317968581f217c82e893822852d63eeae8 Mon Sep 17 00:00:00 2001 +From: Pranith Kumar K +Date: Mon, 19 Mar 2018 15:26:40 +0530 +Subject: [PATCH 196/201] cluster/afr: Switch to active-fd-count for open-fd + checks + + >BUG: 1557932 + +Upstream-patch: https://review.gluster.org/19741 +BUG: 1491785 +Change-Id: I3783e41b3812267bc10c0d05d062a31396ce135b +Signed-off-by: Pranith Kumar K +Reviewed-on: https://code.engineering.redhat.com/gerrit/133660 +Tested-by: RHGS Build Bot +--- + tests/basic/afr/afr-no-fsync.t | 20 ++++++++++++++++++++ + xlators/cluster/afr/src/afr-inode-write.c | 16 ++++++++-------- + 2 files changed, 28 insertions(+), 8 deletions(-) + create mode 100644 tests/basic/afr/afr-no-fsync.t + +diff --git a/tests/basic/afr/afr-no-fsync.t b/tests/basic/afr/afr-no-fsync.t +new file mode 100644 +index 0000000..0966d9b +--- /dev/null ++++ b/tests/basic/afr/afr-no-fsync.t +@@ -0,0 +1,20 @@ ++#!/bin/bash ++#Tests that sequential write workload doesn't lead to FSYNCs ++ ++. $(dirname $0)/../../include.rc ++. 
$(dirname $0)/../../volume.rc ++ ++cleanup; ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 replica 3 $H0:$B0/brick{0,1,3} ++TEST $CLI volume set $V0 features.shard on ++TEST $CLI volume set $V0 performance.flush-behind off ++TEST $CLI volume start $V0 ++TEST $CLI volume profile $V0 start ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0; ++TEST dd if=/dev/zero of=$M0/a bs=1M count=500 ++TEST ! "$CLI volume profile $V0 info incremental | grep FSYNC" ++ ++cleanup; +diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c +index b52b6ca..9e6ba35 100644 +--- a/xlators/cluster/afr/src/afr-inode-write.c ++++ b/xlators/cluster/afr/src/afr-inode-write.c +@@ -314,10 +314,10 @@ afr_inode_write_fill (call_frame_t *frame, xlator_t *this, int child_index, + if (ret || !write_is_append) + local->append_write = _gf_false; + +- ret = dict_get_uint32 (xdata, GLUSTERFS_OPEN_FD_COUNT, +- &open_fd_count); +- if (ret == -1) +- goto unlock; ++ ret = dict_get_uint32 (xdata, GLUSTERFS_ACTIVE_FD_COUNT, ++ &open_fd_count); ++ if (ret < 0) ++ goto unlock; + if (open_fd_count > local->open_fd_count) { + local->open_fd_count = open_fd_count; + local->update_open_fd_count = _gf_true; +@@ -529,10 +529,10 @@ afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + if (ret) + goto out; + +- if (dict_set_uint32 (local->xdata_req, GLUSTERFS_OPEN_FD_COUNT, 4)) { +- op_errno = ENOMEM; +- goto out; +- } ++ if (dict_set_uint32 (local->xdata_req, GLUSTERFS_ACTIVE_FD_COUNT, 4)) { ++ op_errno = ENOMEM; ++ goto out; ++ } + + if (dict_set_uint32 (local->xdata_req, GLUSTERFS_WRITE_IS_APPEND, 4)) { + op_errno = ENOMEM; +-- +1.8.3.1 + diff --git a/0197-glusterd-ganesha-create-remove-export-file-only-from.patch b/0197-glusterd-ganesha-create-remove-export-file-only-from.patch new file mode 100644 index 0000000..7d76c06 --- /dev/null +++ b/0197-glusterd-ganesha-create-remove-export-file-only-from.patch @@ -0,0 +1,72 @@ +From 
80810495522f2e0f484c2b593d66318e34bfabb9 Mon Sep 17 00:00:00 2001 +From: Jiffin Tony Thottan +Date: Wed, 14 Mar 2018 12:01:30 +0530 +Subject: [PATCH 197/201] glusterd/ganesha : create/remove export file only + from the node which performs ganesha.enable + +As part of volume set ganesha.enable on the ganesha export configuration file will be created/removed +using "create-export-ganesha.sh". This is performed from the nodes which are part of ganesha cluster. +But it is not needed since the file is saved in shared storage and consumed by the nodes in the ganesha cluster. + +Label: BACKPORT FROM UPSTREAM 3.10 + +Upstream Reference : +>patch link : https://review.gluster.org/#/c/19716/ +>Change-Id: I2583899972b47d451a90711940293004a3af4690 +>BUG: 1555195 +>Signed-off-by: Jiffin Tony Thottan + +Change-Id: I2583899972b47d451a90711940293004a3af4690 +BUG: 1551186 +Signed-off-by: Jiffin Tony Thottan +Reviewed-on: https://code.engineering.redhat.com/gerrit/133592 +Tested-by: RHGS Build Bot +Reviewed-by: Atin Mukherjee +--- + xlators/mgmt/glusterd/src/glusterd-ganesha.c | 2 +- + xlators/mgmt/glusterd/src/glusterd-op-sm.c | 2 +- + xlators/mgmt/glusterd/src/glusterd-volume-ops.c | 2 +- + 3 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-ganesha.c b/xlators/mgmt/glusterd/src/glusterd-ganesha.c +index 7ba25ee..b130d5e 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-ganesha.c ++++ b/xlators/mgmt/glusterd/src/glusterd-ganesha.c +@@ -543,7 +543,7 @@ ganesha_manage_export (dict_t *dict, char *value, + * Create the export file from the node where ganesha.enable "on" + * is executed + * */ +- if (option) { ++ if (option && is_origin_glusterd (dict)) { + ret = manage_export_config (volname, "on", op_errstr); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, +diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c +index 0a21e02..a02a0b3 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c 
++++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c +@@ -2364,7 +2364,7 @@ glusterd_op_reset_volume (dict_t *dict, char **op_rspstr) + } + + if (!strcmp(key, "ganesha.enable") || !strcmp (key, "all")) { +- if (glusterd_check_ganesha_export (volinfo)) { ++ if (glusterd_check_ganesha_export (volinfo) && is_origin_glusterd (dict)) { + ret = manage_export_config (volname, "off", op_rspstr); + if (ret) + gf_msg (this->name, GF_LOG_WARNING, 0, +diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c +index 414f9ba..e34d58a 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c ++++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c +@@ -2858,7 +2858,7 @@ glusterd_op_delete_volume (dict_t *dict) + volname); + goto out; + } +- if (glusterd_check_ganesha_export (volinfo)) { ++ if (glusterd_check_ganesha_export (volinfo) && is_origin_glusterd (dict)) { + ret = manage_export_config (volname, "off", NULL); + if (ret) + gf_msg (this->name, GF_LOG_WARNING, 0, 0, +-- +1.8.3.1 + diff --git a/0198-cluster-ec-Change-default-read-policy-to-gfid-hash.patch b/0198-cluster-ec-Change-default-read-policy-to-gfid-hash.patch new file mode 100644 index 0000000..845feaf --- /dev/null +++ b/0198-cluster-ec-Change-default-read-policy-to-gfid-hash.patch @@ -0,0 +1,81 @@ +From 8217d00a0a54457961e7ec7d3afb24e953923c7d Mon Sep 17 00:00:00 2001 +From: Ashish Pandey +Date: Tue, 13 Mar 2018 14:03:20 +0530 +Subject: [PATCH 198/201] cluster/ec: Change default read policy to gfid-hash + +Problem: +Whenever we read data from file over NFS, NFS reads +more data than requested and caches it. Based on the +stat information it makes sure that the cached/pre-read +data is valid or not. + +Consider 4 + 2 EC volume and all the bricks are on +different nodes. + +In EC, with round-robin read policy, reads are sent on +different set of data bricks. 
This way, it balances the +read fops to go on all the bricks and avoid heating UP +(overloading) same set of bricks. + +Due to small difference in clock speed, it is possible +that we get minor difference for atime, mtime or ctime +for different bricks. That might cause a different stat +returned to NFS based on which NFS will discard +cached/pre-read data which is actually not changed and +could be used. + +Solution: +Change read policy for EC as gfid-hash. That will force +all the read to go to same set of bricks. + +>Change-Id: I825441cc519e94bf3dc3aa0bd4cb7c6ae6392c84 +>BUG: 1554743 +>Signed-off-by: Ashish Pandey + +upstream patch: https://review.gluster.org/#/c/19703/ + +Change-Id: I43e95717980ca52c228fdcb7863c58bd4d14151c +BUG: 1559084 +Signed-off-by: Ashish Pandey +Reviewed-on: https://code.engineering.redhat.com/gerrit/133746 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/basic/ec/ec-read-policy.t | 7 +++---- + xlators/cluster/ec/src/ec.c | 2 +- + 2 files changed, 4 insertions(+), 5 deletions(-) + +diff --git a/tests/basic/ec/ec-read-policy.t b/tests/basic/ec/ec-read-policy.t +index e4390aa..fe6fe65 100644 +--- a/tests/basic/ec/ec-read-policy.t ++++ b/tests/basic/ec/ec-read-policy.t +@@ -20,10 +20,9 @@ TEST $CLI volume start $V0 + TEST glusterfs --direct-io-mode=yes --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0 + EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0 + #TEST volume operations work fine +-EXPECT "round-robin" mount_get_option_value $M0 $V0-disperse-0 read-policy +-TEST $CLI volume set $V0 disperse.read-policy gfid-hash +-EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "gfid-hash" mount_get_option_value $M0 $V0-disperse-0 read-policy +-TEST $CLI volume reset $V0 disperse.read-policy ++ ++EXPECT "gfid-hash" mount_get_option_value $M0 $V0-disperse-0 read-policy ++TEST $CLI volume set $V0 disperse.read-policy round-robin + EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "round-robin" 
mount_get_option_value $M0 $V0-disperse-0 read-policy + + #TEST if the option gives the intended behavior. The way we perform this test +diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c +index 13ce7fb..bfdca64 100644 +--- a/xlators/cluster/ec/src/ec.c ++++ b/xlators/cluster/ec/src/ec.c +@@ -1447,7 +1447,7 @@ struct volume_options options[] = + { .key = {"read-policy" }, + .type = GF_OPTION_TYPE_STR, + .value = {"round-robin", "gfid-hash"}, +- .default_value = "round-robin", ++ .default_value = "gfid-hash", + .description = "inode-read fops happen only on 'k' number of bricks in" + " n=k+m disperse subvolume. 'round-robin' selects the read" + " subvolume using round-robin algo. 'gfid-hash' selects read" +-- +1.8.3.1 + diff --git a/0199-cluster-ec-avoid-delays-in-self-heal.patch b/0199-cluster-ec-avoid-delays-in-self-heal.patch new file mode 100644 index 0000000..ab3dcc1 --- /dev/null +++ b/0199-cluster-ec-avoid-delays-in-self-heal.patch @@ -0,0 +1,383 @@ +From 09698d53b91786c990a0f7bc067e5c13551b0b12 Mon Sep 17 00:00:00 2001 +From: Xavi Hernandez +Date: Wed, 21 Feb 2018 17:47:37 +0100 +Subject: [PATCH 199/201] cluster/ec: avoid delays in self-heal + +Self-heal creates a thread per brick to sweep the index looking for +files that need to be healed. These threads are started before the +volume comes online, so nothing is done but waiting for the next +sweep. This happens once per minute. + +When a replace brick command is executed, the new graph is loaded and +all index sweeper threads started. When all bricks have reported, a +getxattr request is sent to the root directory of the volume. This +causes a heal on it (because the new brick doesn't have good data), +and marks its contents as pending to be healed. This is done by the +index sweeper thread on the next round, one minute later. + +This patch solves this problem by waking all index sweeper threads +after a successful check on the root directory. 
+ +Additionally, the index sweep thread scans the index directory +sequentially, but it might happen that after healing a directory entry +more index entries are created but skipped by the current directory +scan. This causes the remaining entries to be processed on the next +round, one minute later. The same can happen in the next round, so +the heal is running in bursts and taking a lot to finish, specially +on volumes with many directory levels. + +This patch solves this problem by immediately restarting the index +sweep if a directory has been healed. + +> Upstream patch: https://review.gluster.org/19718 +> master patch: https://review.gluster.org/#/c/19609/ + +Change-Id: I58d9ab6ef17b30f704dc322e1d3d53b904e5f30e +BUG: 1555261 +Signed-off-by: Xavi Hernandez +Reviewed-on: https://code.engineering.redhat.com/gerrit/133570 +Reviewed-by: Ashish Pandey +Tested-by: Ashish Pandey +Tested-by: RHGS Build Bot +--- + tests/bugs/ec/bug-1547662.t | 41 ++++++++++++++++ + xlators/cluster/ec/src/ec-heal.c | 9 ++++ + xlators/cluster/ec/src/ec-heald.c | 27 +++++++--- + xlators/cluster/ec/src/ec-heald.h | 4 +- + xlators/cluster/ec/src/ec.c | 101 ++++++++++++++++++++++---------------- + 5 files changed, 134 insertions(+), 48 deletions(-) + create mode 100644 tests/bugs/ec/bug-1547662.t + +diff --git a/tests/bugs/ec/bug-1547662.t b/tests/bugs/ec/bug-1547662.t +new file mode 100644 +index 0000000..5748218 +--- /dev/null ++++ b/tests/bugs/ec/bug-1547662.t +@@ -0,0 +1,41 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. 
$(dirname $0)/../../volume.rc ++ ++# Immediately after replace-brick, trusted.ec.version will be absent, so if it ++# is present we can assume that heal was started on root ++function root_heal_attempted { ++ if [ -z $(get_hex_xattr trusted.ec.version $1) ]; then ++ echo "N" ++ else ++ echo "Y" ++ fi ++} ++ ++cleanup ++ ++TEST glusterd ++TEST pidof glusterd ++TEST ${CLI} volume create ${V0} disperse 6 redundancy 2 ${H0}:${B0}/${V0}{0..5} ++TEST ${CLI} volume start ${V0} ++TEST ${GFS} --volfile-server ${H0} --volfile-id ${V0} ${M0} ++EXPECT_WITHIN ${CHILD_UP_TIMEOUT} "6" ec_child_up_count ${V0} 0 ++ ++TEST mkdir ${M0}/base ++TEST mkdir ${M0}/base/dir.{1,2} ++TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2} ++TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}/dir.{1,2} ++TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2} ++TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2} ++TEST mkdir ${M0}/base/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2}/dir.{1,2} ++ ++TEST ${CLI} volume replace-brick ${V0} ${H0}:${B0}/${V0}5 ${H0}:${B0}/${V0}6 commit force ++EXPECT_WITHIN ${CHILD_UP_TIMEOUT} "6" ec_child_up_count ${V0} 0 ++EXPECT_WITHIN ${PROCESS_UP_TIMEOUT} "Y" glustershd_up_status ++EXPECT_WITHIN ${CHILD_UP_TIMEOUT} "6" ec_child_up_count_shd ${V0} 0 ++EXPECT_WITHIN ${HEAL_TIMEOUT} "Y" root_heal_attempted ${B0}/${V0}6 ++EXPECT_WITHIN ${HEAL_TIMEOUT} "^0$" get_pending_heal_count ${V0} ++EXPECT "^127$" echo $(find ${B0}/${V0}6/base -type d | wc -l) ++ ++cleanup; +diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c +index b8518d6..8e02986 100644 +--- a/xlators/cluster/ec/src/ec-heal.c ++++ b/xlators/cluster/ec/src/ec-heal.c +@@ -25,6 +25,7 @@ + #include "ec-combine.h" + #include "ec-method.h" + #include "ec-fops.h" ++#include "ec-heald.h" + + #define alloca0(size) ({void *__ptr; __ptr = alloca(size); memset(__ptr, 0, size); __ptr; }) + #define EC_COUNT(array, max) ({int __i; int __res = 0; for (__i = 0; __i < max; __i++) if 
(array[__i]) __res++; __res; }) +@@ -2752,6 +2753,14 @@ ec_replace_heal (ec_t *ec, inode_t *inode) + gf_msg_debug (ec->xl->name, 0, + "Heal failed for replace brick ret = %d", ret); + ++ /* Once the root inode has been checked, it might have triggered a ++ * self-heal on it after a replace brick command or for some other ++ * reason. It can also happen that the volume already had damaged ++ * files in the index, even if the heal on the root directory failed. ++ * In both cases we need to wake all index healers to continue ++ * healing remaining entries that are marked as dirty. */ ++ ec_shd_index_healer_wake(ec); ++ + loc_wipe (&loc); + return ret; + } +diff --git a/xlators/cluster/ec/src/ec-heald.c b/xlators/cluster/ec/src/ec-heald.c +index b4fa6f8..a703379 100644 +--- a/xlators/cluster/ec/src/ec-heald.c ++++ b/xlators/cluster/ec/src/ec-heald.c +@@ -184,8 +184,19 @@ ec_shd_index_purge (xlator_t *subvol, inode_t *inode, char *name) + int + ec_shd_selfheal (struct subvol_healer *healer, int child, loc_t *loc) + { +- return syncop_getxattr (healer->this, loc, NULL, EC_XATTR_HEAL, NULL, +- NULL); ++ int32_t ret; ++ ++ ret = syncop_getxattr (healer->this, loc, NULL, EC_XATTR_HEAL, NULL, ++ NULL); ++ if ((ret >= 0) && (loc->inode->ia_type == IA_IFDIR)) { ++ /* If we have just healed a directory, it's possible that ++ * other index entries have appeared to be healed. We put a ++ * mark so that we can check it later and restart a scan ++ * without delay. 
*/ ++ healer->rerun = _gf_true; ++ } ++ ++ return ret; + } + + +@@ -472,11 +483,15 @@ ec_shd_index_healer_spawn (xlator_t *this, int subvol) + } + + void +-ec_selfheal_childup (ec_t *ec, int child) ++ec_shd_index_healer_wake(ec_t *ec) + { +- if (!ec->shd.iamshd) +- return; +- ec_shd_index_healer_spawn (ec->xl, child); ++ int32_t i; ++ ++ for (i = 0; i < ec->nodes; i++) { ++ if (((ec->xl_up >> i) & 1) != 0) { ++ ec_shd_index_healer_spawn(ec->xl, i); ++ } ++ } + } + + int +diff --git a/xlators/cluster/ec/src/ec-heald.h b/xlators/cluster/ec/src/ec-heald.h +index 4ae02e2..2a84881 100644 +--- a/xlators/cluster/ec/src/ec-heald.h ++++ b/xlators/cluster/ec/src/ec-heald.h +@@ -20,6 +20,8 @@ ec_xl_op (xlator_t *this, dict_t *input, dict_t *output); + + int + ec_selfheal_daemon_init (xlator_t *this); +-void ec_selfheal_childup (ec_t *ec, int child); ++ ++void ++ec_shd_index_healer_wake(ec_t *ec); + + #endif /* __EC_HEALD_H__ */ +diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c +index bfdca64..956b45b 100644 +--- a/xlators/cluster/ec/src/ec.c ++++ b/xlators/cluster/ec/src/ec.c +@@ -322,7 +322,7 @@ ec_get_event_from_state (ec_t *ec) + /* If ec is up but some subvolumes are yet to notify, give + * grace time for other subvols to notify to prevent start of + * I/O which may result in self-heals */ +- if (ec->timer && ec->xl_notify_count < ec->nodes) ++ if (ec->xl_notify_count < ec->nodes) + return GF_EVENT_MAXVAL; + + return GF_EVENT_CHILD_UP; +@@ -344,8 +344,8 @@ ec_up (xlator_t *this, ec_t *ec) + } + + ec->up = 1; +- gf_msg (this->name, GF_LOG_INFO, 0, +- EC_MSG_EC_UP, "Going UP"); ++ gf_msg (this->name, GF_LOG_INFO, 0, EC_MSG_EC_UP, "Going UP"); ++ + gf_event (EVENT_EC_MIN_BRICKS_UP, "subvol=%s", this->name); + } + +@@ -358,8 +358,8 @@ ec_down (xlator_t *this, ec_t *ec) + } + + ec->up = 0; +- gf_msg (this->name, GF_LOG_INFO, 0, +- EC_MSG_EC_DOWN, "Going DOWN"); ++ gf_msg (this->name, GF_LOG_INFO, 0, EC_MSG_EC_DOWN, "Going DOWN"); ++ + gf_event 
(EVENT_EC_MIN_BRICKS_NOT_UP, "subvol=%s", this->name); + } + +@@ -383,31 +383,38 @@ ec_notify_cbk (void *data) + gf_timer_call_cancel (ec->xl->ctx, ec->timer); + ec->timer = NULL; + ++ /* The timeout has expired, so any subvolume that has not ++ * already reported its state, will be considered to be down. ++ * We mark as if all bricks had reported. */ ++ ec->xl_notify = (1ULL << ec->nodes) - 1ULL; ++ ec->xl_notify_count = ec->nodes; ++ ++ /* Since we have marked all subvolumes as notified, it's ++ * guaranteed that ec_get_event_from_state() will return ++ * CHILD_UP or CHILD_DOWN, but not MAXVAL. */ + event = ec_get_event_from_state (ec); +- /* If event is still MAXVAL then enough subvolumes didn't +- * notify, treat it as CHILD_DOWN. */ +- if (event == GF_EVENT_MAXVAL) { +- event = GF_EVENT_CHILD_DOWN; +- ec->xl_notify = (1ULL << ec->nodes) - 1ULL; +- ec->xl_notify_count = ec->nodes; +- } else if (event == GF_EVENT_CHILD_UP) { +- /* Rest of the bricks are still not coming up, +- * notify that ec is up. Files/directories will be +- * healed as in when they come up. */ ++ if (event == GF_EVENT_CHILD_UP) { ++ /* We are ready to bring the volume up. If there are ++ * still bricks DOWN, they will be healed when they ++ * come up. */ + ec_up (ec->xl, ec); + } + +- /* CHILD_DOWN should not come here as no grace period is given +- * for notifying CHILD_DOWN. */ +- + propagate = _gf_true; + } + unlock: + UNLOCK(&ec->lock); + + if (propagate) { ++ if ((event == GF_EVENT_CHILD_UP) && ec->shd.iamshd) { ++ /* We have just brought the volume UP, so we trigger ++ * a self-heal check on the root directory. 
*/ ++ ec_launch_replace_heal (ec); ++ } ++ + default_notify (ec->xl, event, NULL); + } ++ + } + + void +@@ -442,7 +449,7 @@ ec_pending_fops_completed(ec_t *ec) + } + } + +-static void ++static gf_boolean_t + ec_set_up_state(ec_t *ec, uintptr_t index_mask, uintptr_t new_state) + { + uintptr_t current_state = 0; +@@ -455,23 +462,28 @@ ec_set_up_state(ec_t *ec, uintptr_t index_mask, uintptr_t new_state) + if (current_state != new_state) { + ec->xl_up ^= index_mask; + ec->xl_up_count += (current_state ? -1 : 1); ++ ++ return _gf_true; + } ++ ++ return _gf_false; + } + + int32_t + ec_notify (xlator_t *this, int32_t event, void *data, void *data2) + { +- ec_t *ec = this->private; +- int32_t idx = 0; +- int32_t error = 0; +- glusterfs_event_t old_event = GF_EVENT_MAXVAL; +- dict_t *input = NULL; +- dict_t *output = NULL; +- gf_boolean_t propagate = _gf_true; +- int32_t orig_event = event; ++ ec_t *ec = this->private; ++ int32_t idx = 0; ++ int32_t error = 0; ++ glusterfs_event_t old_event = GF_EVENT_MAXVAL; ++ dict_t *input = NULL; ++ dict_t *output = NULL; ++ gf_boolean_t propagate = _gf_true; ++ gf_boolean_t needs_shd_check = _gf_false; ++ int32_t orig_event = event; + struct gf_upcall *up_data = NULL; + struct gf_upcall_cache_invalidation *up_ci = NULL; +- uintptr_t mask = 0; ++ uintptr_t mask = 0; + + gf_msg_trace (this->name, 0, "NOTIFY(%d): %p, %p", + event, data, data2); +@@ -498,8 +510,6 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2) + + for (idx = 0; idx < ec->nodes; idx++) { + if (ec->xl_list[idx] == data) { +- if (event == GF_EVENT_CHILD_UP) +- ec_selfheal_childup (ec, idx); + break; + } + } +@@ -525,17 +535,27 @@ ec_notify (xlator_t *this, int32_t event, void *data, void *data2) + + mask = 1ULL << idx; + if (event == GF_EVENT_CHILD_UP) { +- ec_set_up_state(ec, mask, mask); ++ /* We need to trigger a selfheal if a brick changes ++ * to UP state. 
*/ ++ needs_shd_check = ec_set_up_state(ec, mask, mask); + } else if (event == GF_EVENT_CHILD_DOWN) { +- ec_set_up_state(ec, mask, 0); ++ ec_set_up_state(ec, mask, 0); + } + + event = ec_get_event_from_state (ec); + +- if (event == GF_EVENT_CHILD_UP && !ec->up) { +- ec_up (this, ec); +- } else if (event == GF_EVENT_CHILD_DOWN && ec->up) { +- ec_down (this, ec); ++ if (event == GF_EVENT_CHILD_UP) { ++ if (!ec->up) { ++ ec_up (this, ec); ++ } ++ } else { ++ /* If the volume is not UP, it's irrelevant if one ++ * brick has come up. We cannot heal anything. */ ++ needs_shd_check = _gf_false; ++ ++ if ((event == GF_EVENT_CHILD_DOWN) && ec->up) { ++ ec_down (this, ec); ++ } + } + + if (event != GF_EVENT_MAXVAL) { +@@ -554,14 +574,13 @@ unlock: + + done: + if (propagate) { ++ if (needs_shd_check && ec->shd.iamshd) { ++ ec_launch_replace_heal (ec); ++ } ++ + error = default_notify (this, event, data); + } + +- if (ec->shd.iamshd && +- ec->xl_notify_count == ec->nodes && +- event == GF_EVENT_CHILD_UP) { +- ec_launch_replace_heal (ec); +- } + out: + return error; + } +-- +1.8.3.1 + diff --git a/0200-quick-read-Discard-cache-for-fallocate-zerofill-and-.patch b/0200-quick-read-Discard-cache-for-fallocate-zerofill-and-.patch new file mode 100644 index 0000000..3d24fa7 --- /dev/null +++ b/0200-quick-read-Discard-cache-for-fallocate-zerofill-and-.patch @@ -0,0 +1,331 @@ +From 2c8b94fb5359424a17dc0380b86cb17058f07bf6 Mon Sep 17 00:00:00 2001 +From: Sachin Prabhu +Date: Wed, 14 Feb 2018 10:36:27 +0530 +Subject: [PATCH 200/201] quick-read: Discard cache for fallocate, zerofill and + discard ops + +The fallocate, zerofill and discard modify file data on the server thus +rendering stale any cache held by the xlator on the client. 
+ +mainline: +> BUG: 1524252 +> Reviewed-on: https://review.gluster.org/19018 +> Reviewed-by: Raghavendra G +> Signed-off-by: Sachin Prabhu +(cherry picked from commit 429f2436b33793136836042ccc43ce4cfd7f89f3) + +BUG: 1523599 +Change-Id: I432146c6390a0cd5869420c373f598da43915f3f +Signed-off-by: Sachin Prabhu +Reviewed-on: https://code.engineering.redhat.com/gerrit/130229 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/quick-read/bz1523599/bz1523599.t | 32 ++++ + tests/bugs/quick-read/bz1523599/test_bz1523599.c | 196 +++++++++++++++++++++++ + xlators/performance/quick-read/src/quick-read.c | 40 ++++- + 3 files changed, 267 insertions(+), 1 deletion(-) + create mode 100755 tests/bugs/quick-read/bz1523599/bz1523599.t + create mode 100644 tests/bugs/quick-read/bz1523599/test_bz1523599.c + +diff --git a/tests/bugs/quick-read/bz1523599/bz1523599.t b/tests/bugs/quick-read/bz1523599/bz1523599.t +new file mode 100755 +index 0000000..5027efe +--- /dev/null ++++ b/tests/bugs/quick-read/bz1523599/bz1523599.t +@@ -0,0 +1,32 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../../include.rc ++. $(dirname $0)/../../../volume.rc ++. 
$(dirname $0)/../../../fileio.rc ++ ++cleanup; ++ ++TEST glusterd ++TEST pidof glusterd ++ ++TEST $CLI volume create $V0 $H0:$B0/brick1; ++EXPECT 'Created' volinfo_field $V0 'Status'; ++ ++TEST $CLI volume start $V0; ++EXPECT 'Started' volinfo_field $V0 'Status'; ++ ++logdir=`gluster --print-logdir` ++ ++TEST build_tester $(dirname $0)/test_bz1523599.c -lgfapi -o $(dirname $0)/test_bz1523599 ++TEST ./$(dirname $0)/test_bz1523599 0 $H0 $V0 test_bz1523599 $logdir/bz1523599.log ++TEST ./$(dirname $0)/test_bz1523599 1 $H0 $V0 test_bz1523599 $logdir/bz1523599.log ++TEST ./$(dirname $0)/test_bz1523599 0 $H0 $V0 test_bz1523599 $logdir/bz1523599.log ++TEST ./$(dirname $0)/test_bz1523599 2 $H0 $V0 test_bz1523599 $logdir/bz1523599.log ++ ++cleanup_tester $(dirname $0)/test_bz1523599 ++ ++TEST $CLI volume stop $V0 ++TEST $CLI volume delete $V0 ++ ++cleanup; ++ +diff --git a/tests/bugs/quick-read/bz1523599/test_bz1523599.c b/tests/bugs/quick-read/bz1523599/test_bz1523599.c +new file mode 100644 +index 0000000..f0166e1 +--- /dev/null ++++ b/tests/bugs/quick-read/bz1523599/test_bz1523599.c +@@ -0,0 +1,196 @@ ++/* ++ * ./test_bz1523599 0 vm140-111 gv0 test211 log ++ * ./test_bz1523599 1 vm140-111 gv0 test211 log ++ * Open - Discard - Read - Then check read information to see if the initial TEST_STR_LEN/2 bytes read zero ++ */ ++ ++#define _GNU_SOURCE ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define TEST_STR_LEN 2048 ++ ++enum fallocate_flag { ++ TEST_WRITE, ++ TEST_DISCARD, ++ TEST_ZEROFILL, ++}; ++ ++void print_str(char *str, int len) ++{ ++ int i, addr; ++ ++ printf("%07x\t", 0); ++ for (i = 0; i < len; i++) { ++ printf("%02x", str[i]); ++ if (i) { ++ if ((i + 1) % 16 == 0) ++ printf("\n%07x\t", i+1); ++ else if ((i + 1) % 4 == 0) ++ printf(" "); ++ } ++ } ++ printf("\n"); ++} ++ ++int ++test_read(char *str, int total_length, int len_zero) ++{ ++ int i; ++ int ret = 0; ++ ++ for (i = 0; i < len_zero; i++) { ++ if (str[i]) { ++ fprintf(stderr, "char 
at position %d not zeroed out\n", ++ i); ++ ret = -EIO; ++ goto out; ++ } ++ } ++ ++ for (i = len_zero; i < total_length; i++) { ++ if (str[i] != 0x11) { ++ fprintf(stderr, ++ "char at position %d does not contain pattern\n", ++ i); ++ ret = -EIO; ++ goto out; ++ } ++ } ++out: ++ return ret; ++} ++ ++int main(int argc, char *argv[]) ++{ ++ int opcode; ++ char *host_name, *volume_name, *file_path, *glfs_log_path; ++ glfs_t *fs = NULL; ++ glfs_fd_t *fd = NULL; ++ off_t offset = 0; ++ size_t len_zero = TEST_STR_LEN / 2; ++ char writestr[TEST_STR_LEN]; ++ char readstr[TEST_STR_LEN]; ++ struct iovec iov = {&readstr, TEST_STR_LEN}; ++ int i; ++ int ret = 1; ++ ++ for (i = 0; i < TEST_STR_LEN; i++) ++ writestr[i] = 0x11; ++ for (i = 0; i < TEST_STR_LEN; i++) ++ readstr[i] = 0x22; ++ ++ if (argc != 6) { ++ fprintf(stderr, ++ "Syntax: %s \n", ++ argv[0]); ++ return 1; ++ } ++ ++ opcode = atoi(argv[1]); ++ host_name = argv[2]; ++ volume_name = argv[3]; ++ file_path = argv[4]; ++ glfs_log_path = argv[5]; ++ ++ fs = glfs_new(volume_name); ++ if (!fs) { ++ perror("glfs_new"); ++ return 1; ++ } ++ ++ ret = glfs_set_volfile_server(fs, "tcp", host_name, 24007); ++ if (ret != 0) { ++ perror("glfs_set_volfile_server"); ++ goto out; ++ } ++ ++ ret = glfs_set_logging(fs, glfs_log_path, 7); ++ if (ret != 0) { ++ perror("glfs_set_logging"); ++ goto out; ++ } ++ ++ ret = glfs_init(fs); ++ if (ret != 0) { ++ perror("glfs_init"); ++ goto out; ++ } ++ ++ fd = glfs_creat(fs, file_path, O_RDWR, 0777); ++ if (fd == NULL) { ++ perror("glfs_creat"); ++ ret = -1; ++ goto out; ++ } ++ ++ switch (opcode) { ++ case TEST_WRITE: ++ fprintf(stderr, "Test Write\n"); ++ ret = glfs_write(fd, writestr, TEST_STR_LEN, 0); ++ if (ret < 0) { ++ perror("glfs_write"); ++ goto out; ++ } else if (ret != TEST_STR_LEN) { ++ fprintf(stderr, "insufficient data written %d \n", ret); ++ ret = -EIO; ++ goto out; ++ } ++ ret = 0; ++ goto out; ++ case TEST_DISCARD: ++ fprintf(stderr, "Test Discard\n"); ++ ret = 
glfs_discard(fd, offset, len_zero); ++ if (ret < 0) { ++ if (errno == EOPNOTSUPP) { ++ fprintf(stderr, "Operation not supported\n"); ++ ret = 0; ++ goto out; ++ } ++ perror("glfs_discard"); ++ goto out; ++ } ++ goto test_read; ++ case TEST_ZEROFILL: ++ fprintf(stderr, "Test Zerofill\n"); ++ ret = glfs_zerofill(fd, offset, len_zero); ++ if (ret < 0) { ++ if (errno == EOPNOTSUPP) { ++ fprintf(stderr, "Operation not supported\n"); ++ ret = 0; ++ goto out; ++ } ++ perror("glfs_zerofill"); ++ goto out; ++ } ++ goto test_read; ++ default: ++ ret = -1; ++ fprintf(stderr, "Incorrect test code %d\n", opcode); ++ goto out; ++ } ++ ++test_read: ++ ret = glfs_readv(fd, &iov, 1, 0); ++ if (ret < 0) { ++ perror("glfs_readv"); ++ goto out; ++ } ++ ++ /* printf("Read str\n"); print_str(readstr, TEST_STR_LEN); printf("\n"); */ ++ ret = test_read(readstr, TEST_STR_LEN, len_zero); ++ ++out: ++ if (fd) ++ glfs_close(fd); ++ glfs_fini(fs); ++ ++ if (ret) ++ return -1; ++ ++ return 0; ++} +diff --git a/xlators/performance/quick-read/src/quick-read.c b/xlators/performance/quick-read/src/quick-read.c +index 92b2f82..61232c1 100644 +--- a/xlators/performance/quick-read/src/quick-read.c ++++ b/xlators/performance/quick-read/src/quick-read.c +@@ -668,6 +668,41 @@ qr_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + return 0; + } + ++static int ++qr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int keep_size, ++ off_t offset, size_t len, dict_t *xdata) ++{ ++ qr_inode_prune (this, fd->inode); ++ ++ STACK_WIND (frame, default_fallocate_cbk, ++ FIRST_CHILD (this), FIRST_CHILD (this)->fops->fallocate, ++ fd, keep_size, offset, len, xdata); ++ return 0; ++} ++ ++static int ++qr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, ++ size_t len, dict_t *xdata) ++{ ++ qr_inode_prune (this, fd->inode); ++ ++ STACK_WIND (frame, default_discard_cbk, ++ FIRST_CHILD (this), FIRST_CHILD (this)->fops->discard, ++ fd, offset, len, xdata); ++ return 0; 
++} ++ ++static int ++qr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, ++ off_t len, dict_t *xdata) ++{ ++ qr_inode_prune (this, fd->inode); ++ ++ STACK_WIND (frame, default_zerofill_cbk, ++ FIRST_CHILD (this), FIRST_CHILD (this)->fops->zerofill, ++ fd, offset, len, xdata); ++ return 0; ++} + + int + qr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, +@@ -1128,7 +1163,10 @@ struct xlator_fops fops = { + .readv = qr_readv, + .writev = qr_writev, + .truncate = qr_truncate, +- .ftruncate = qr_ftruncate ++ .ftruncate = qr_ftruncate, ++ .fallocate = qr_fallocate, ++ .discard = qr_discard, ++ .zerofill = qr_zerofill + }; + + struct xlator_cbks cbks = { +-- +1.8.3.1 + diff --git a/0201-posix-After-set-storage.reserve-limit-df-does-not-sh.patch b/0201-posix-After-set-storage.reserve-limit-df-does-not-sh.patch new file mode 100644 index 0000000..2e391fb --- /dev/null +++ b/0201-posix-After-set-storage.reserve-limit-df-does-not-sh.patch @@ -0,0 +1,62 @@ +From 78918cf18ead4637c5aea20025c319d845518733 Mon Sep 17 00:00:00 2001 +From: moagrawa +Date: Mon, 26 Mar 2018 11:00:22 +0530 +Subject: [PATCH 201/201] posix: After set storage.reserve limit df does not + show correct output + +Problem: After set storage.reserve limit df does not show correct + output on client + +Solution: Update code in posix_statfs to reflect the disk usage + correctly on client + +> BUG: 1533736 +> Change-Id: I2c5feda0303d0f4abe5af22fac903011792b2dc8 +> Reviewed on https://review.gluster.org/#/c/19186/ +> Signed-off-by: Mohit Agrawal +> (cherry pick from commit c494445c886e16ddc6a960b9074a68fe9621ee09) + +BUG: 1550982 +Change-Id: I5444fad40f2df2fdcf4ab80c2641fc9cc56b18fe +Signed-off-by: moagrawa +Reviewed-on: https://code.engineering.redhat.com/gerrit/133745 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/storage/posix/src/posix.c | 13 ++++++++++++- + 1 file changed, 12 insertions(+), 1 deletion(-) + +diff --git 
a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c +index 6856e5e..56a2ca9 100644 +--- a/xlators/storage/posix/src/posix.c ++++ b/xlators/storage/posix/src/posix.c +@@ -3751,6 +3751,7 @@ posix_statfs (call_frame_t *frame, xlator_t *this, + struct posix_private * priv = NULL; + int shared_by = 1; + int percent = 0; ++ uint64_t reserved_blocks = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); +@@ -3776,7 +3777,17 @@ posix_statfs (call_frame_t *frame, xlator_t *this, + } + + percent = priv->disk_reserve; +- buf.f_bfree = (buf.f_bfree - ((buf.f_blocks * percent) / 100)); ++ reserved_blocks = (buf.f_blocks * percent) / 100; ++ ++ if (buf.f_bfree > reserved_blocks) { ++ buf.f_bfree = (buf.f_bfree - reserved_blocks); ++ if (buf.f_bavail > buf.f_bfree) { ++ buf.f_bavail = buf.f_bfree; ++ } ++ } else { ++ buf.f_bfree = 0; ++ buf.f_bavail = 0; ++ } + + shared_by = priv->shared_brick_count; + if (shared_by > 1) { +-- +1.8.3.1 + diff --git a/glusterfs.spec b/glusterfs.spec index 8ad0f8f..69179e2 100644 --- a/glusterfs.spec +++ b/glusterfs.spec @@ -192,7 +192,7 @@ Release: 0.1%{?prereltag:.%{prereltag}}%{?dist} %else Name: glusterfs Version: 3.12.2 -Release: 5%{?dist} +Release: 6%{?dist} %endif License: GPLv2 or LGPLv3+ Group: System Environment/Base @@ -445,6 +445,27 @@ Patch0177: 0177-hooks-fix-workdir-in-S13create-subdir-mounts.sh.patch Patch0178: 0178-cluster-ec-Do-lock-conflict-check-correctly-for-wait.patch Patch0179: 0179-packaging-adding-missed-part-from-5eed664-while-back.patch Patch0180: 0180-packaging-adding-missed-part-from-5eed664-while-back.patch +Patch0181: 0181-glusterd-get-state-memory-leak-fix.patch +Patch0182: 0182-glusterd-Fix-coverity-issues-in-glusterd-handler.c.patch +Patch0183: 0183-cluster-afr-Fix-dict-leak-in-pre-op.patch +Patch0184: 0184-cli-glusterfsd-remove-copyright-information.patch +Patch0185: 0185-rpcsvc-correct-event-thread-scaling.patch +Patch0186: 0186-cli-Remove-upstream-doc-reference.patch 
+Patch0187: 0187-features-shard-Do-list_del_init-while-list-memory-is.patch +Patch0188: 0188-georep-Pause-Resume-of-geo-replication-with-wrong-us.patch +Patch0189: 0189-fuse-enable-proper-fgetattr-like-semantics.patch +Patch0190: 0190-cluster-afr-Adding-option-to-take-full-file-lock.patch +Patch0191: 0191-cluster-afr-Make-afr_fsync-a-transaction.patch +Patch0192: 0192-cluster-afr-Remove-compound-fops-usage-in-afr.patch +Patch0193: 0193-cluster-afr-Remove-unused-code-paths.patch +Patch0194: 0194-cluster-afr-Make-AFR-eager-locking-similar-to-EC.patch +Patch0195: 0195-storage-posix-Add-active-fd-count-option-in-gluster.patch +Patch0196: 0196-cluster-afr-Switch-to-active-fd-count-for-open-fd-ch.patch +Patch0197: 0197-glusterd-ganesha-create-remove-export-file-only-from.patch +Patch0198: 0198-cluster-ec-Change-default-read-policy-to-gfid-hash.patch +Patch0199: 0199-cluster-ec-avoid-delays-in-self-heal.patch +Patch0200: 0200-quick-read-Discard-cache-for-fallocate-zerofill-and-.patch +Patch0201: 0201-posix-After-set-storage.reserve-limit-df-does-not-sh.patch %description GlusterFS is a distributed file-system capable of scaling to several @@ -2388,6 +2409,11 @@ fi %endif %changelog +* Mon Mar 26 2018 Milind Changire - 3.12.2-6 +- fixes bugs bz#1491785 bz#1518710 bz#1523599 bz#1528733 bz#1550474 + bz#1550982 bz#1551186 bz#1552360 bz#1552414 bz#1552425 bz#1554255 bz#1554905 + bz#1555261 bz#1556895 bz#1557297 bz#1559084 bz#1559788 + * Wed Mar 07 2018 Milind Changire - 3.12.2-5 - fixes bugs bz#1378371 bz#1384983 bz#1472445 bz#1493085 bz#1508999 bz#1516638 bz#1518260 bz#1529072 bz#1530519 bz#1537357 bz#1540908 bz#1541122