From 44012ad58070f16ea1295e19a305a661a1c1430f Mon Sep 17 00:00:00 2001 From: Milind Changire Date: Wed, 7 Mar 2018 08:56:57 -0500 Subject: [PATCH] autobuild v3.12.2-5 Resolves: bz#1378371 bz#1384983 bz#1472445 bz#1493085 bz#1508999 Resolves: bz#1516638 bz#1518260 bz#1529072 bz#1530519 bz#1537357 Resolves: bz#1540908 bz#1541122 bz#1541932 bz#1543068 bz#1544382 Resolves: bz#1544852 bz#1545570 bz#1546075 bz#1546945 bz#1546960 Resolves: bz#1547012 bz#1549497 Signed-off-by: Milind Changire --- ...-DISCARD-doesn-t-punch-hole-properly.patch | 81 ++ ..._up_subvol-before-use-in-dht_opendir.patch | 76 ++ ...-Improve-geo-rep-pre-validation-logs.patch | 73 ++ ...d-up-gfid-lookup-100x-by-using-an-SQ.patch | 67 ++ 0153-afr-add-quorum-checks-in-post-op.patch | 71 ++ ...correct-errno-in-post-op-quorum-chec.patch | 66 ++ ...all-cases-all-bricks-being-blamed-as.patch | 323 +++++++ ...e-behind-fix-bug-while-handling-shor.patch | 72 ++ ...ve-unnecessary-child_up-initializati.patch | 45 + ...e-eager-lock-option-for-non-regular-.patch | 207 ++++ ...-S10selinux-label-brick.sh-hook-scri.patch | 107 +++ ...-and-disable-selinux-ganesha_use_fus.patch | 84 ++ 0161-cluster-dht-Fixed-a-typo.patch | 42 + ...andle-single-dht-child-in-dht_lookup.patch | 77 ++ ...-uuid-instead-of-hostname-while-find.patch | 37 + ...lazy-umount-and-use-mount-namespaces.patch | 278 ++++++ ...re-ENODATA-from-getxattr-for-posix-a.patch | 54 ++ ...scale-rpcsvc_request_handler-threads.patch | 361 +++++++ ...-change-voltype-for-ganesha.enable-i.patch | 44 + ...ass-the-correct-block-num-to-store-i.patch | 43 + ...everage-block_num-info-in-inode-ctx-.patch | 80 ++ ...ix-shard-inode-refcount-when-it-s-pa.patch | 150 +++ ...pon-FSYNC-from-upper-layers-wind-fsy.patch | 887 ++++++++++++++++++ ...dd-profile_enabled-flag-in-get-state.patch | 110 +++ ...-missed-part-from-5eed664-while-back.patch | 53 ++ ...ipt-to-stat-the-subdirs-in-add-brick.patch | 192 ++++ 0175-rpc-make-actor-search-parallel.patch | 282 ++++++ ...get-fixes-for-client-io-threads-quor.patch | 210 +++++ ...orkdir-in-S13create-subdir-mounts.sh.patch | 35 + ...ck-conflict-check-correctly-for-wait.patch | 90 ++ ...-missed-part-from-5eed664-while-back.patch | 38 + ...-missed-part-from-5eed664-while-back.patch | 39 + glusterfs.spec | 72 +- 33 files changed, 4437 insertions(+), 9 deletions(-) create mode 100644 0149-cluster-ec-EC-DISCARD-doesn-t-punch-hole-properly.patch create mode 100644 0150-dht-Fill-first_up_subvol-before-use-in-dht_opendir.patch create mode 100644 0151-geo-rep-Improve-geo-rep-pre-validation-logs.patch create mode 100644 0152-glusterfind-Speed-up-gfid-lookup-100x-by-using-an-SQ.patch create mode 100644 0153-afr-add-quorum-checks-in-post-op.patch create mode 100644 0154-afr-capture-the-correct-errno-in-post-op-quorum-chec.patch create mode 100644 0155-afr-don-t-treat-all-cases-all-bricks-being-blamed-as.patch create mode 100644 0156-performance-write-behind-fix-bug-while-handling-shor.patch create mode 100644 0157-cluster-afr-remove-unnecessary-child_up-initializati.patch create mode 100644 0158-cluster-ec-create-eager-lock-option-for-non-regular-.patch create mode 100644 0159-extras-hooks-Fix-S10selinux-label-brick.sh-hook-scri.patch create mode 100644 0160-common-ha-enable-and-disable-selinux-ganesha_use_fus.patch create mode 100644 0161-cluster-dht-Fixed-a-typo.patch create mode 100644 0162-cluster-dht-Handle-single-dht-child-in-dht_lookup.patch create mode 100644 0163-glusterd-compare-uuid-instead-of-hostname-while-find.patch create mode 100644 
0164-geo-rep-Remove-lazy-umount-and-use-mount-namespaces.patch create mode 100644 0165-cluster-dht-Ignore-ENODATA-from-getxattr-for-posix-a.patch create mode 100644 0166-rpcsvc-scale-rpcsvc_request_handler-threads.patch create mode 100644 0167-glusterd-ganesha-change-voltype-for-ganesha.enable-i.patch create mode 100644 0168-features-shard-Pass-the-correct-block-num-to-store-i.patch create mode 100644 0169-features-shard-Leverage-block_num-info-in-inode-ctx-.patch create mode 100644 0170-features-shard-Fix-shard-inode-refcount-when-it-s-pa.patch create mode 100644 0171-features-shard-Upon-FSYNC-from-upper-layers-wind-fsy.patch create mode 100644 0172-glusterd-add-profile_enabled-flag-in-get-state.patch create mode 100644 0173-packaging-adding-missed-part-from-5eed664-while-back.patch create mode 100644 0174-hooks-add-a-script-to-stat-the-subdirs-in-add-brick.patch create mode 100644 0175-rpc-make-actor-search-parallel.patch create mode 100644 0176-glusterd-volume-get-fixes-for-client-io-threads-quor.patch create mode 100644 0177-hooks-fix-workdir-in-S13create-subdir-mounts.sh.patch create mode 100644 0178-cluster-ec-Do-lock-conflict-check-correctly-for-wait.patch create mode 100644 0179-packaging-adding-missed-part-from-5eed664-while-back.patch create mode 100644 0180-packaging-adding-missed-part-from-5eed664-while-back.patch diff --git a/0149-cluster-ec-EC-DISCARD-doesn-t-punch-hole-properly.patch b/0149-cluster-ec-EC-DISCARD-doesn-t-punch-hole-properly.patch new file mode 100644 index 0000000..ec648c3 --- /dev/null +++ b/0149-cluster-ec-EC-DISCARD-doesn-t-punch-hole-properly.patch @@ -0,0 +1,81 @@ +From 5b28188f7a970ccea32d6ed65b75c38703f25045 Mon Sep 17 00:00:00 2001 +From: Sunil Kumar Acharya +Date: Wed, 22 Nov 2017 15:12:26 +0530 +Subject: [PATCH 149/180] cluster/ec: EC DISCARD doesn't punch hole properly + +Problem: +DISCARD operation on EC volume was punching hole of lesser +size than the specified size in some cases. + +Solution: +EC was not handling punch hole for tail part in some cases. +Updated the code to handle it appropriately. 
+ +>BUG: 1516206 +>Change-Id: If3e69e417c3e5034afee04e78f5f78855e65f932 +>Signed-off-by: Sunil Kumar Acharya + +Upstream patch: https://review.gluster.org/#/c/18838/ + +BUG: 1518260 +Change-Id: If3e69e417c3e5034afee04e78f5f78855e65f932 +Signed-off-by: Sunil Kumar Acharya +Reviewed-on: https://code.engineering.redhat.com/gerrit/124648 +Tested-by: RHGS Build Bot +Reviewed-by: Atin Mukherjee +--- + tests/basic/ec/ec-discard.t | 10 +++++++++- + xlators/cluster/ec/src/ec-inode-write.c | 6 ++++-- + 2 files changed, 13 insertions(+), 3 deletions(-) + +diff --git a/tests/basic/ec/ec-discard.t b/tests/basic/ec/ec-discard.t +index 4a44cec..001f449 100644 +--- a/tests/basic/ec/ec-discard.t ++++ b/tests/basic/ec/ec-discard.t +@@ -137,6 +137,15 @@ TEST md5_sum=`get_md5_sum $B0/test_file` + EXPECT $md5_sum get_md5_sum $M0/test_file + TEST rm -f $B0/test_file $M0/test_file + ++#Offset and Size not at boundary covering a stripe ++TEST dd if=/dev/urandom of=$B0/test_file bs=1024 count=8 ++TEST cp $B0/test_file $M0/test_file ++TEST fallocate -p -o 1500 -l 3000 $B0/test_file ++TEST fallocate -p -o 1500 -l 3000 $M0/test_file ++TEST md5_sum=`get_md5_sum $B0/test_file` ++EXPECT $md5_sum get_md5_sum $M0/test_file ++TEST rm -f $B0/test_file $M0/test_file ++ + #Offset and Size not at boundary + TEST dd if=/dev/urandom of=$B0/test_file bs=1024 count=8 + TEST cp $B0/test_file $M0/test_file +@@ -144,7 +153,6 @@ TEST fallocate -p -o 1000 -l 3072 $B0/test_file + TEST fallocate -p -o 1000 -l 3072 $M0/test_file + TEST md5_sum=`get_md5_sum $B0/test_file` + EXPECT $md5_sum get_md5_sum $M0/test_file +-#TEST rm -f $B0/test_file $M0/test_file + + #Data Corruption Tests + #Kill brick1 and brick2 +diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c +index ae51202..2c1165b 100644 +--- a/xlators/cluster/ec/src/ec-inode-write.c ++++ b/xlators/cluster/ec/src/ec-inode-write.c +@@ -1144,11 +1144,13 @@ void ec_update_discard_write(ec_fop_data_t *fop, uintptr_t mask) + error = ec_update_write (fop, mask, off_head, fop->user_size); + } else { + size_head = fop->int32; +- size_tail = (fop->user_size - fop->int32) % ec->stripe_size; ++ size_tail = (off_head + fop->user_size) % ec->stripe_size; + off_tail = off_head + fop->user_size - size_tail; + if (size_head) { + error = ec_update_write (fop, mask, off_head, size_head); +- goto out; ++ if (error) { ++ goto out; ++ } + } + if (size_tail) { + error = ec_update_write (fop, mask, off_tail, size_tail); +-- +1.8.3.1 + diff --git a/0150-dht-Fill-first_up_subvol-before-use-in-dht_opendir.patch b/0150-dht-Fill-first_up_subvol-before-use-in-dht_opendir.patch new file mode 100644 index 0000000..9154833 --- /dev/null +++ b/0150-dht-Fill-first_up_subvol-before-use-in-dht_opendir.patch @@ -0,0 +1,76 @@ +From fd0a8f99f853e923d485acfcdda556156105c8d1 Mon Sep 17 00:00:00 2001 +From: Poornima G +Date: Mon, 13 Nov 2017 12:55:06 +0530 +Subject: [PATCH 150/180] dht: Fill first_up_subvol before use in dht_opendir + +Reported by: Sam McLeod + +> Upstream patch: https://review.gluster.org/#/c/18723/ + +Change-Id: Ic8f9b46b173796afd70aff1042834b03ac3e80b2 +BUG: 1529072 +Signed-off-by: Poornima G +Reviewed-on: https://code.engineering.redhat.com/gerrit/130059 +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +Tested-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/readdir-ahead/bug-1512437.t | 23 +++++++++++++++++++++++ + xlators/cluster/dht/src/dht-common.c | 5 +++++ + 2 files changed, 28 insertions(+) + create mode 100755 tests/bugs/readdir-ahead/bug-1512437.t 
+ +diff --git a/tests/bugs/readdir-ahead/bug-1512437.t b/tests/bugs/readdir-ahead/bug-1512437.t +new file mode 100755 +index 0000000..50eaa7d +--- /dev/null ++++ b/tests/bugs/readdir-ahead/bug-1512437.t +@@ -0,0 +1,23 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++ ++cleanup; ++ ++TEST glusterd ++ ++TEST $CLI volume create $V0 $H0:$B0/${V0}1 ++TEST $CLI volume start $V0 ++ ++TEST $CLI volume set $V0 parallel-readdir on ++TEST $CLI volume set $V0 readdir-optimize on ++ ++TEST glusterfs --volfile-server=$H0 --volfile-id=$V0 $M0 ++TEST mkdir -p $M0/subdir1/subdir2; ++umount $M0 ++TEST glusterfs --volfile-server=$H0 --volfile-id=$V0 $M0 ++count=`ls -1 $M0/subdir1 | wc -l` ++TEST [ $count -eq 1 ] ++ ++cleanup; +diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c +index 727a47b..1262732 100644 +--- a/xlators/cluster/dht/src/dht-common.c ++++ b/xlators/cluster/dht/src/dht-common.c +@@ -5043,6 +5043,7 @@ dht_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + op_errno = ENOMEM; + goto err; + } ++ local->first_up_subvol = dht_first_up_subvol (this); + + if (!xdata) { + xdata = dict_new (); +@@ -5070,6 +5071,10 @@ dht_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd, + subvolumes = conf->local_subvols; + } + ++ /* In case of parallel-readdir, the readdir-ahead will be loaded ++ * below dht, in this case, if we want to enable or disable SKIP_DIRs ++ * it has to be done in opendir, so that prefetching logic in ++ * readdir-ahead, honors it */ + for (i = 0; i < call_count; i++) { + if (conf->readdir_optimize == _gf_true) { + if (subvolumes[i] != local->first_up_subvol) { +-- +1.8.3.1 + diff --git a/0151-geo-rep-Improve-geo-rep-pre-validation-logs.patch b/0151-geo-rep-Improve-geo-rep-pre-validation-logs.patch new file mode 100644 index 0000000..b1fd005 --- /dev/null +++ b/0151-geo-rep-Improve-geo-rep-pre-validation-logs.patch @@ -0,0 +1,73 @@ +From f34c2d392ad7e115dac0d146ec466fc218e060e3 Mon Sep 17 00:00:00 2001 +From: Kotresh HR +Date: Thu, 18 Jan 2018 22:59:00 -0500 +Subject: [PATCH 151/180] geo-rep: Improve geo-rep pre-validation logs + +Geo-rep runs gverify.sh which does pre-validation. +As part of it, master and slave volume is mounted +to verify the size. If for some reason, the mount +fails, the error message does not point out the +mount log file location. Also both master and +slave mount logs are same. + +Patch does following improvements. + +1. Master and slave mount logs are separated and + error message points the log file to be looked for. +2. The log location is changed to /var/log/glusterfs/geo-replication + instead of /var/log/glusterfs/geo-replication-slaves +3. 
The log file name is changed to "gverify-mastermnt.log" and + "gverify-slavemnt.log" for master and slave mount respectively + +Upstream Patch: https://review.gluster.org/19242 +BUG: 1541122 +Change-Id: Ia644ec0afebbdaae92e01adf03c635e5f8866a02 +Signed-off-by: Kotresh HR +Reviewed-on: https://code.engineering.redhat.com/gerrit/130065 +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +Tested-by: Sunil Kumar Heggodu Gopala Acharya +--- + geo-replication/src/gverify.sh | 9 +++++---- + 1 file changed, 5 insertions(+), 4 deletions(-) + +diff --git a/geo-replication/src/gverify.sh b/geo-replication/src/gverify.sh +index 9b1328a..728ce07 100755 +--- a/geo-replication/src/gverify.sh ++++ b/geo-replication/src/gverify.sh +@@ -7,7 +7,8 @@ + # Considering buffer_size 100MB + BUFFER_SIZE=104857600; + SSH_PORT=$5; +-slave_log_file=`gluster --print-logdir`/geo-replication-slaves/slave.log ++master_log_file=`gluster --print-logdir`/geo-replication/gverify-mastermnt.log ++slave_log_file=`gluster --print-logdir`/geo-replication/gverify-slavemnt.log + + function SSHM() + { +@@ -93,7 +94,7 @@ function master_stats() + local m_status; + + d=$(mktemp -d -t ${0##*/}.XXXXXX 2>/dev/null); +- glusterfs -s localhost --xlator-option="*dht.lookup-unhashed=off" --volfile-id $MASTERVOL -l $slave_log_file $d; ++ glusterfs -s localhost --xlator-option="*dht.lookup-unhashed=off" --volfile-id $MASTERVOL -l $master_log_file $d; + i=$(get_inode_num $d); + if [[ "$i" -ne "1" ]]; then + echo 0:0; +@@ -190,12 +191,12 @@ function main() + slave_no_of_files=$(echo $slave_data | cut -f4 -d':'); + + if [[ "x$master_disk_size" = "x" || "x$master_version" = "x" || "$master_disk_size" -eq "0" ]]; then +- echo "FORCE_BLOCKER|Unable to fetch master volume details. Please check the master cluster and master volume." > $log_file; ++ echo "FORCE_BLOCKER|Unable to mount and fetch master volume details. Please check the log: $master_log_file" > $log_file; + exit 1; + fi; + + if [[ "x$slave_disk_size" = "x" || "x$slave_version" = "x" || "$slave_disk_size" -eq "0" ]]; then +- echo "FORCE_BLOCKER|Unable to fetch slave volume details. Please check the slave cluster and slave volume." > $log_file; ++ echo "FORCE_BLOCKER|Unable to mount and fetch slave volume details. Please check the log: $slave_log_file" > $log_file; + exit 1; + fi; + +-- +1.8.3.1 + diff --git a/0152-glusterfind-Speed-up-gfid-lookup-100x-by-using-an-SQ.patch b/0152-glusterfind-Speed-up-gfid-lookup-100x-by-using-an-SQ.patch new file mode 100644 index 0000000..05304ec --- /dev/null +++ b/0152-glusterfind-Speed-up-gfid-lookup-100x-by-using-an-SQ.patch @@ -0,0 +1,67 @@ +From d41cb3f53614dcf514d96717b5bde67b8d4c1335 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Niklas=20Hamb=C3=BCchen?= +Date: Mon, 12 Feb 2018 17:58:48 +0530 +Subject: [PATCH 152/180] glusterfind: Speed up gfid lookup 100x by using an + SQL index +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Fixes #1529883. + +This fixes some bits of `glusterfind`'s horrible performance, +making it 100x faster. + +Until now, glusterfind was, for each line in each CHANGELOG.* file, +linearly reading the entire contents of the sqlite database in +4096-bytes-sized pread64() syscalls when executing the + + SELECT COUNT(1) FROM %s WHERE 1=1 AND gfid = ? 
+ +query through the code path: + + get_changes() + parse_changelog_to_db() + when_data_meta() + gfidpath_exists() + _exists() + +In a quick benchmark on my laptop, doing one such `SELECT` query +took ~75ms on a 10MB-sized sqlite DB, while doing the same query +with an index took < 1ms. + +mainline: +> BUG: 1529883 +> Reviewed-on: https://review.gluster.org/19114 +> Reviewed-by: Aravinda VK +> Signed-off-by: Niklas Hambüchen +(cherry picked from commit 14dbd5da1cae64e6d4d2c69966e19844d090ce98) + +Change-Id: I8e7fe60f1f45a06c102f56b54d2ead9e0377794e +Signed-off-by: Niklas Hambüchen +Reviewed-on: https://code.engineering.redhat.com/gerrit/130064 +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +Tested-by: Sunil Kumar Heggodu Gopala Acharya +--- + tools/glusterfind/src/changelogdata.py | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/tools/glusterfind/src/changelogdata.py b/tools/glusterfind/src/changelogdata.py +index 3140d94..641593c 100644 +--- a/tools/glusterfind/src/changelogdata.py ++++ b/tools/glusterfind/src/changelogdata.py +@@ -112,6 +112,11 @@ class ChangelogData(object): + """ + self.cursor.execute(create_table) + ++ create_index = """ ++ CREATE INDEX gfid_index ON gfidpath(gfid); ++ """ ++ self.cursor.execute(create_index) ++ + def _create_table_inodegfid(self): + drop_table = "DROP TABLE IF EXISTS inodegfid" + self.cursor.execute(drop_table) +-- +1.8.3.1 + diff --git a/0153-afr-add-quorum-checks-in-post-op.patch b/0153-afr-add-quorum-checks-in-post-op.patch new file mode 100644 index 0000000..c44b75e --- /dev/null +++ b/0153-afr-add-quorum-checks-in-post-op.patch @@ -0,0 +1,71 @@ +From a1da6900ac8030dd9c156b38373837a00dbb37c0 Mon Sep 17 00:00:00 2001 +From: Ravishankar N +Date: Thu, 18 Jan 2018 14:21:57 +0530 +Subject: [PATCH 153/180] afr: add quorum checks in post-op + +Backport of https://review.gluster.org/#/c/18571/ + +afr relies on pending changelog xattrs to identify source and sinks and the +setting of these xattrs happen in post-op. So if post-op fails, we need to +unwind the write txn with a failure. + +Change-Id: I0f019ac03890108324ee7672883d774918b20be1 +BUG: 1384983 +Signed-off-by: Ravishankar N +Reviewed-on: https://code.engineering.redhat.com/gerrit/129219 +Reviewed-by: Pranith Kumar Karampuri +Tested-by: RHGS Build Bot +--- + xlators/cluster/afr/src/afr-transaction.c | 29 +++++++++++++++++++++++++++++ + 1 file changed, 29 insertions(+) + +diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c +index 7e40bba..18d2ded 100644 +--- a/xlators/cluster/afr/src/afr-transaction.c ++++ b/xlators/cluster/afr/src/afr-transaction.c +@@ -534,6 +534,29 @@ afr_lock_server_count (afr_private_t *priv, afr_transaction_type type) + /* {{{ pending */ + + ++gf_boolean_t ++afr_post_op_has_quorum (afr_local_t *local, xlator_t *this) ++{ ++ afr_private_t *priv = NULL; ++ int i = 0; ++ unsigned char *post_op_children = NULL; ++ ++ priv = this->private; ++ post_op_children = alloca0 (priv->child_count); ++ ++ for (i = 0; i < priv->child_count; i++) { ++ if (!local->transaction.failed_subvols[i]) { ++ post_op_children[i] = 1; ++ } ++ } ++ ++ if (afr_has_quorum (post_op_children, this)) { ++ return _gf_true; ++ } ++ ++ return _gf_false; ++} ++ + int + afr_changelog_post_op_done (call_frame_t *frame, xlator_t *this) + { +@@ -545,6 +568,12 @@ afr_changelog_post_op_done (call_frame_t *frame, xlator_t *this) + priv = this->private; + int_lock = &local->internal_lock; + ++ /* Fail the FOP if post-op did not succeed on quorum no. of bricks. 
*/ ++ if (!afr_post_op_has_quorum (local, this)) { ++ local->op_ret = -1; ++ local->op_errno = ENOTCONN; ++ } ++ + if (local->transaction.resume_stub) { + call_resume (local->transaction.resume_stub); + local->transaction.resume_stub = NULL; +-- +1.8.3.1 + diff --git a/0154-afr-capture-the-correct-errno-in-post-op-quorum-chec.patch b/0154-afr-capture-the-correct-errno-in-post-op-quorum-chec.patch new file mode 100644 index 0000000..aa33640 --- /dev/null +++ b/0154-afr-capture-the-correct-errno-in-post-op-quorum-chec.patch @@ -0,0 +1,66 @@ +From 9b0122cbe61047c9591de447bb19a6028b69861c Mon Sep 17 00:00:00 2001 +From: Ravishankar N +Date: Tue, 30 Jan 2018 20:59:23 +0530 +Subject: [PATCH 154/180] afr: capture the correct errno in post-op quorum + check + +Backport of https://review.gluster.org/#/c/19366/ + +If the post-op phase of txn did not meet quorm checks, use that errno to +unwind the FOP rather than blindly setting ENOTCONN. + +Change-Id: I0cb0c8771ec75a45f9a25ad4cd8601103deddf0c +BUG: 1384983 +Signed-off-by: Ravishankar N +Reviewed-on: https://code.engineering.redhat.com/gerrit/129221 +Tested-by: RHGS Build Bot +--- + xlators/cluster/afr/src/afr-transaction.c | 16 ++++++++-------- + 1 file changed, 8 insertions(+), 8 deletions(-) + +diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c +index 18d2ded..19740e1 100644 +--- a/xlators/cluster/afr/src/afr-transaction.c ++++ b/xlators/cluster/afr/src/afr-transaction.c +@@ -534,8 +534,8 @@ afr_lock_server_count (afr_private_t *priv, afr_transaction_type type) + /* {{{ pending */ + + +-gf_boolean_t +-afr_post_op_has_quorum (afr_local_t *local, xlator_t *this) ++void ++afr_handle_post_op_quorum (afr_local_t *local, xlator_t *this) + { + afr_private_t *priv = NULL; + int i = 0; +@@ -551,10 +551,13 @@ afr_post_op_has_quorum (afr_local_t *local, xlator_t *this) + } + + if (afr_has_quorum (post_op_children, this)) { +- return _gf_true; ++ return; + } + +- return _gf_false; ++ local->op_ret = -1; ++ /*local->op_errno is already captured in post-op callback.*/ ++ ++ return; + } + + int +@@ -569,10 +572,7 @@ afr_changelog_post_op_done (call_frame_t *frame, xlator_t *this) + int_lock = &local->internal_lock; + + /* Fail the FOP if post-op did not succeed on quorum no. of bricks. */ +- if (!afr_post_op_has_quorum (local, this)) { +- local->op_ret = -1; +- local->op_errno = ENOTCONN; +- } ++ afr_handle_post_op_quorum (local, this); + + if (local->transaction.resume_stub) { + call_resume (local->transaction.resume_stub); +-- +1.8.3.1 + diff --git a/0155-afr-don-t-treat-all-cases-all-bricks-being-blamed-as.patch b/0155-afr-don-t-treat-all-cases-all-bricks-being-blamed-as.patch new file mode 100644 index 0000000..8d4b8e3 --- /dev/null +++ b/0155-afr-don-t-treat-all-cases-all-bricks-being-blamed-as.patch @@ -0,0 +1,323 @@ +From 6229320bc25ff24bb76f990c8e5411b6f1aa476c Mon Sep 17 00:00:00 2001 +From: Ravishankar N +Date: Sun, 28 Jan 2018 13:50:47 +0530 +Subject: [PATCH 155/180] afr: don't treat all cases all bricks being blamed as + split-brain + +Backport of https://review.gluster.org/#/c/19349/ + +Problem: +We currently don't have a roll-back/undoing of post-ops if quorum is not +met. Though the FOP is still unwound with failure, the xattrs remain on +the disk. Due to these partial post-ops and partial heals (healing only when +2 bricks are up), we can end up in split-brain purely from the afr +xattrs point of view i.e each brick is blamed by atleast one of the +others. 
These scenarios are hit when there is frequent +connect/disconnect of the client/shd to the bricks while I/O or heal +are in progress. + +Fix: +Instead of undoing the post-op, pick a source based on the xattr values. +If 2 bricks blame one, the blamed one must be treated as sink. +If there is no majority, all are sources. Once we pick a source, +self-heal will then do the heal instead of erroring out due to +split-brain. + +Change-Id: I3d0224b883eb0945785ade0e9697a1c828aec0ae +BUG: 1384983 +Signed-off-by: Ravishankar N +Reviewed-on: https://code.engineering.redhat.com/gerrit/129245 +Tested-by: RHGS Build Bot +--- + tests/basic/afr/arbiter-add-brick.t | 16 ++++ + .../replicate/bug-1539358-split-brain-detection.t | 89 ++++++++++++++++++++++ + tests/bugs/replicate/bug-802417.t | 12 +++ + xlators/cluster/afr/src/afr-self-heal-common.c | 51 +++++++++++-- + xlators/cluster/afr/src/afr-self-heal-data.c | 6 +- + 5 files changed, 165 insertions(+), 9 deletions(-) + create mode 100755 tests/bugs/replicate/bug-1539358-split-brain-detection.t + +diff --git a/tests/basic/afr/arbiter-add-brick.t b/tests/basic/afr/arbiter-add-brick.t +index fe919de..77b93d9 100644 +--- a/tests/basic/afr/arbiter-add-brick.t ++++ b/tests/basic/afr/arbiter-add-brick.t +@@ -12,6 +12,8 @@ TEST $CLI volume set $V0 performance.stat-prefetch off + TEST $CLI volume start $V0 + TEST $CLI volume set $V0 self-heal-daemon off + TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0; ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1 + TEST mkdir $M0/dir1 + TEST dd if=/dev/urandom of=$M0/file1 bs=1024 count=1 + +@@ -24,6 +26,7 @@ TEST dd if=/dev/urandom of=$M0/file1 bs=1024 count=1024 + #convert replica 2 to arbiter volume + TEST $CLI volume start $V0 force + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1 + + #syntax check for add-brick. + TEST ! $CLI volume add-brick $V0 replica 2 arbiter 1 $H0:$B0/${V0}2 +@@ -31,6 +34,19 @@ TEST ! $CLI volume add-brick $V0 replica 3 arbiter 2 $H0:$B0/${V0}2 + + TEST $CLI volume add-brick $V0 replica 3 arbiter 1 $H0:$B0/${V0}2 + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 2 ++ ++#Trigger name heals from client. If we just rely on index heal, the first index ++#crawl on B0 fails for /, dir2 and /file either due to lock collision or files ++#not being present on the other 2 bricks yet. It is getting healed only in the ++#next crawl after priv->shd.timeout (600 seconds) or by manually launching ++#index heal again. ++TEST $CLI volume set $V0 data-self-heal off ++TEST $CLI volume set $V0 metadata-self-heal off ++TEST $CLI volume set $V0 entry-self-heal off ++TEST stat $M0/dir1 ++TEST stat $M0/dir2 ++TEST stat $M0/file1 + + #Heal files + TEST $CLI volume set $V0 self-heal-daemon on +diff --git a/tests/bugs/replicate/bug-1539358-split-brain-detection.t b/tests/bugs/replicate/bug-1539358-split-brain-detection.t +new file mode 100755 +index 0000000..7b71a7a +--- /dev/null ++++ b/tests/bugs/replicate/bug-1539358-split-brain-detection.t +@@ -0,0 +1,89 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++. 
$(dirname $0)/../../afr.rc ++ ++cleanup; ++ ++## Start and create a volume ++TEST glusterd; ++TEST pidof glusterd; ++TEST $CLI volume info; ++ ++TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}; ++TEST $CLI volume start $V0; ++EXPECT 'Started' volinfo_field $V0 'Status'; ++TEST $CLI volume set $V0 self-heal-daemon off ++TEST glusterfs --volfile-id=/$V0 --volfile-server=$H0 $M0 --attribute-timeout=0 --entry-timeout=0 ++ ++###############################################################################yy ++# Case of 2 bricks blaming the third and the third blaming the other two. ++ ++TEST `echo "hello" >> $M0/file` ++ ++# B0 and B2 must blame B1 ++TEST kill_brick $V0 $H0 $B0/$V0"1" ++TEST `echo "append" >> $M0/file` ++EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}0/file trusted.afr.$V0-client-1 data ++EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}2/file trusted.afr.$V0-client-1 data ++CLIENT_MD5=$(md5sum $M0/file | cut -d\ -f1) ++ ++# B1 must blame B0 and B2 ++setfattr -n trusted.afr.$V0-client-0 -v 0x000000010000000000000000 $B0/$V0"1"/file ++setfattr -n trusted.afr.$V0-client-2 -v 0x000000010000000000000000 $B0/$V0"1"/file ++ ++# Launch heal ++TEST $CLI volume start $V0 force ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}1 ++TEST $CLI volume set $V0 self-heal-daemon on ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2 ++TEST $CLI volume heal $V0 ++EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 ++B0_MD5=$(md5sum $B0/${V0}0/file | cut -d\ -f1) ++B1_MD5=$(md5sum $B0/${V0}1/file | cut -d\ -f1) ++B2_MD5=$(md5sum $B0/${V0}2/file | cut -d\ -f1) ++TEST [ "$CLIENT_MD5" == "$B0_MD5" ] ++TEST [ "$CLIENT_MD5" == "$B1_MD5" ] ++TEST [ "$CLIENT_MD5" == "$B2_MD5" ] ++ ++TEST rm $M0/file ++ ++###############################################################################yy ++# Case of each brick blaming the next one in a cyclic manner ++ ++TEST `echo "hello" >> $M0/file` ++# Mark cyclic xattrs and modify file content directly on the bricks. ++TEST $CLI volume set $V0 self-heal-daemon off ++setfattr -n trusted.afr.$V0-client-1 -v 0x000000010000000000000000 $B0/$V0"0"/file ++setfattr -n trusted.afr.dirty -v 0x000000010000000000000000 $B0/$V0"0"/file ++setfattr -n trusted.afr.$V0-client-2 -v 0x000000010000000000000000 $B0/$V0"1"/file ++setfattr -n trusted.afr.dirty -v 0x000000010000000000000000 $B0/$V0"1"/file ++setfattr -n trusted.afr.$V0-client-0 -v 0x000000010000000000000000 $B0/$V0"2"/file ++setfattr -n trusted.afr.dirty -v 0x000000010000000000000000 $B0/$V0"2"/file ++ ++TEST `echo "ab" >> $B0/$V0"0"/file` ++TEST `echo "cdef" >> $B0/$V0"1"/file` ++TEST `echo "ghi" >> $B0/$V0"2"/file` ++ ++# Add entry to xattrop dir to trigger index heal. 
++xattrop_dir0=$(afr_get_index_path $B0/$V0"0") ++base_entry_b0=`ls $xattrop_dir0` ++gfid_str=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/$V0"0"/file)) ++ln $xattrop_dir0/$base_entry_b0 $xattrop_dir0/$gfid_str ++EXPECT_WITHIN $HEAL_TIMEOUT "^1$" get_pending_heal_count $V0 ++ ++# Launch heal ++TEST $CLI volume set $V0 self-heal-daemon on ++TEST $CLI volume heal $V0 ++EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0 ++B0_MD5=$(md5sum $B0/${V0}0/file | cut -d\ -f1) ++B1_MD5=$(md5sum $B0/${V0}1/file | cut -d\ -f1) ++B2_MD5=$(md5sum $B0/${V0}2/file | cut -d\ -f1) ++TEST [ "$B0_MD5" == "$B1_MD5" ] ++TEST [ "$B0_MD5" == "$B2_MD5" ] ++###############################################################################yy ++cleanup +diff --git a/tests/bugs/replicate/bug-802417.t b/tests/bugs/replicate/bug-802417.t +index c5ba98b..f213439 100755 +--- a/tests/bugs/replicate/bug-802417.t ++++ b/tests/bugs/replicate/bug-802417.t +@@ -10,6 +10,18 @@ function write_file() + } + + cleanup; ++ ++##################################################### ++# We are currently not triggering data heal unless all bricks of the replica are ++# up. We will need to modify this .t once the fix for preventing stale reads ++# being served to clients for files in spurious split-brains is done. Spurious ++# split-brains here means afr xattrs indicates sbrain but it is actually not. ++# Self-heal will heal such files automatically but before the heal completes, ++# reads can be served which needs fixing. ++#G_TESTDEF_TEST_STATUS_NETBSD7=BAD_TEST,BUG=000000 ++#G_TESTDEF_TEST_STATUS_CENTOS6=BAD_TEST,BUG=000000 ++###################################################### ++ + TEST glusterd + TEST pidof glusterd + TEST $CLI volume info; +diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c +index 26d3860..f61b237 100644 +--- a/xlators/cluster/afr/src/afr-self-heal-common.c ++++ b/xlators/cluster/afr/src/afr-self-heal-common.c +@@ -1455,6 +1455,36 @@ afr_does_witness_exist (xlator_t *this, uint64_t *witness) + return _gf_false; + } + ++unsigned int ++afr_get_quorum_count (afr_private_t *priv) ++{ ++ if (priv->quorum_count == AFR_QUORUM_AUTO) { ++ return priv->child_count/2 + 1; ++ } else { ++ return priv->quorum_count; ++ } ++} ++ ++void ++afr_selfheal_post_op_failure_accounting (afr_private_t *priv, char *accused, ++ unsigned char *sources, ++ unsigned char *locked_on) ++{ ++ int i = 0; ++ unsigned int quorum_count = 0; ++ ++ if (AFR_COUNT (sources, priv->child_count) != 0) ++ return; ++ ++ quorum_count = afr_get_quorum_count (priv); ++ for (i = 0; i < priv->child_count; i++) { ++ if ((accused[i] < quorum_count) && locked_on[i]) { ++ sources[i] = 1; ++ } ++ } ++ return; ++} ++ + /* + * This function determines if a self-heal is required for a given inode, + * and if needed, in what direction. 
+@@ -1490,6 +1520,7 @@ afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this, + char *accused = NULL;/* Accused others without any self-accusal */ + char *pending = NULL;/* Have pending operations on others */ + char *self_accused = NULL; /* Accused itself */ ++ int min_participants = -1; + + priv = this->private; + +@@ -1513,8 +1544,13 @@ afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this, + } + } + ++ if (type == AFR_DATA_TRANSACTION) { ++ min_participants = priv->child_count; ++ } else { ++ min_participants = AFR_SH_MIN_PARTICIPANTS; ++ } + if (afr_success_count (replies, +- priv->child_count) < AFR_SH_MIN_PARTICIPANTS) { ++ priv->child_count) < min_participants) { + /* Treat this just like locks not being acquired */ + return -ENOTCONN; + } +@@ -1530,11 +1566,10 @@ afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this, + for (i = 0; i < priv->child_count; i++) { + for (j = 0; j < priv->child_count; j++) { + if (matrix[i][j]) { +- if (!self_accused[i]) +- accused[j] = 1; +- +- if (i != j) +- pending[i] = 1; ++ if (!self_accused[i]) ++ accused[j] += 1; ++ if (i != j) ++ pending[i] += 1; + } + } + } +@@ -1575,6 +1610,10 @@ afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this, + } + } + ++ if (type == AFR_DATA_TRANSACTION) ++ afr_selfheal_post_op_failure_accounting (priv, accused, ++ sources, locked_on); ++ + /* If no sources, all locked nodes are sinks - split brain */ + if (AFR_COUNT (sources, priv->child_count) == 0) { + for (i = 0; i < priv->child_count; i++) { +diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c +index 8cf43f2..bcd0dec 100644 +--- a/xlators/cluster/afr/src/afr-self-heal-data.c ++++ b/xlators/cluster/afr/src/afr-self-heal-data.c +@@ -684,7 +684,7 @@ __afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd, + ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name, 0, 0, + data_lock); + { +- if (ret < AFR_SH_MIN_PARTICIPANTS) { ++ if (ret < priv->child_count) { + gf_msg_debug (this->name, 0, "%s: Skipping " + "self-heal as only %d number " + "of subvolumes " +@@ -749,7 +749,7 @@ restore_time: + if (!is_arbiter_the_only_sink) { + ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name, + 0, 0, data_lock); +- if (ret < AFR_SH_MIN_PARTICIPANTS) { ++ if (ret < priv->child_count) { + ret = -ENOTCONN; + did_sh = _gf_false; + goto skip_undo_pending; +@@ -878,7 +878,7 @@ afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode) + priv->sh_domain, 0, 0, + locked_on); + { +- if (ret < AFR_SH_MIN_PARTICIPANTS) { ++ if (ret < priv->child_count) { + gf_msg_debug (this->name, 0, "%s: Skipping " + "self-heal as only %d number of " + "subvolumes could be locked", +-- +1.8.3.1 + diff --git a/0156-performance-write-behind-fix-bug-while-handling-shor.patch b/0156-performance-write-behind-fix-bug-while-handling-shor.patch new file mode 100644 index 0000000..7401eda --- /dev/null +++ b/0156-performance-write-behind-fix-bug-while-handling-shor.patch @@ -0,0 +1,72 @@ +From 430ff66f69074063dd824b0cde8808ee3d2c7ca8 Mon Sep 17 00:00:00 2001 +From: Raghavendra G +Date: Fri, 22 Dec 2017 12:02:09 +0530 +Subject: [PATCH 156/180] performance/write-behind: fix bug while handling + short writes + +The variabled "fulfilled" in wb_fulfill_short_write is not reset to 0 +while handling every member of the list. 
+ +This has some interesting consequences: + +* If we break from the loop while processing last member of the list + head->winds, req is reset to head as the list is a circular + one. However, head is already fulfilled and can potentially be + freed. So, we end up adding a freed request to wb_inode->todo + list. This is the RCA for the crash tracked by the bug associated + with this patch (Note that we saw "holder" which is freed in todo + list). + +* If we break from the loop while processing any of the last but one + member of the list head->winds, req is set to next member in the + list, skipping the current request, even though it is not entirely + synced. This can lead to data corruption. + +The fix is very simple and we've to change the code to make sure +"fulfilled" reflects whether the current request is fulfilled or not +and it doesn't carry history of previous requests in the list. + +Change-Id: Ia3d6988175a51c9e08efdb521a7b7938b01f93c8 +BUG: 1516638 +Signed-off-by: Raghavendra G +upstream patch: https://review.gluster.org/19064 +Reviewed-on: https://code.engineering.redhat.com/gerrit/126512 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/performance/write-behind/src/write-behind.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c +index d1a95c9..7104eb9 100644 +--- a/xlators/performance/write-behind/src/write-behind.c ++++ b/xlators/performance/write-behind/src/write-behind.c +@@ -964,6 +964,7 @@ __wb_fulfill_short_write (wb_request_t *req, int size, gf_boolean_t *fulfilled) + } else { + accounted_size = size; + __wb_modify_write_request (req, size); ++ *fulfilled = 0; + } + + out: +@@ -1005,7 +1006,7 @@ wb_fulfill_short_write (wb_request_t *head, int size) + size -= accounted_size; + + if (size == 0) { +- if (fulfilled) ++ if (fulfilled && (next != head)) + req = next; + + goto done; +@@ -1017,7 +1018,7 @@ wb_fulfill_short_write (wb_request_t *head, int size) + size -= accounted_size; + + if (size == 0) { +- if (fulfilled) ++ if (fulfilled && (next != head)) + req = next; + break; + } +-- +1.8.3.1 + diff --git a/0157-cluster-afr-remove-unnecessary-child_up-initializati.patch b/0157-cluster-afr-remove-unnecessary-child_up-initializati.patch new file mode 100644 index 0000000..26a1f67 --- /dev/null +++ b/0157-cluster-afr-remove-unnecessary-child_up-initializati.patch @@ -0,0 +1,45 @@ +From d4f44782105268a9f1780f8ed53a98a28ba09053 Mon Sep 17 00:00:00 2001 +From: Xavier Hernandez +Date: Thu, 1 Feb 2018 16:06:32 +0100 +Subject: [PATCH 157/180] cluster/afr: remove unnecessary child_up + initialization + +The child_up array was initialized with all elements being -1 to +allow afr_notify() to differentiate down bricks from bricks that +haven't reported yet. With current implementation this is not needed +anymore and it was causing unexpected results when other parts of +the code considered that if child_up[i] != 0, it meant that it was up. 
+ +> Upstream patch: https://review.gluster.org/19440 + +Change-Id: I2a9d712ee64c512f24bd5cd3a48dcb37e3139472 +BUG: 1541932 +Signed-off-by: Xavier Hernandez +Reviewed-on: https://code.engineering.redhat.com/gerrit/130431 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/afr/src/afr.c | 7 ------- + 1 file changed, 7 deletions(-) + +diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c +index 84dbcc0..d3aee77 100644 +--- a/xlators/cluster/afr/src/afr.c ++++ b/xlators/cluster/afr/src/afr.c +@@ -583,13 +583,6 @@ init (xlator_t *this) + goto out; + } + +- for (i = 0; i < child_count; i++) +- priv->child_up[i] = -1; /* start with unknown state. +- this initialization needed +- for afr_notify() to work +- reliably +- */ +- + priv->children = GF_CALLOC (sizeof (xlator_t *), child_count, + gf_afr_mt_xlator_t); + if (!priv->children) { +-- +1.8.3.1 + diff --git a/0158-cluster-ec-create-eager-lock-option-for-non-regular-.patch b/0158-cluster-ec-create-eager-lock-option-for-non-regular-.patch new file mode 100644 index 0000000..f57c8e0 --- /dev/null +++ b/0158-cluster-ec-create-eager-lock-option-for-non-regular-.patch @@ -0,0 +1,207 @@ +From 76b5366d4f346d5010bd153d20668f8860262c4e Mon Sep 17 00:00:00 2001 +From: Xavier Hernandez +Date: Mon, 16 Oct 2017 13:57:59 +0200 +Subject: [PATCH 158/180] cluster/ec: create eager-lock option for non-regular + files + +A new option is added to allow independent configuration of eager +locking for regular files and non-regular files. + +> Upstream patch: https://review.gluster.org/18530 + +Change-Id: I8f80e46d36d8551011132b15c0fac549b7fb1c60 +BUG: 1530519 +Signed-off-by: Xavier Hernandez +Reviewed-on: https://code.engineering.redhat.com/gerrit/130432 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + libglusterfs/src/globals.h | 2 ++ + tests/basic/ec/ec-background-heals.t | 1 + + tests/basic/ec/ec-optimistic-changelog.t | 1 + + tests/bugs/cli/bug-1320388.t | 1 + + xlators/cluster/ec/src/ec-common.c | 22 +++++++++++++++- + xlators/cluster/ec/src/ec-types.h | 1 + + xlators/cluster/ec/src/ec.c | 34 ++++++++++++++++--------- + xlators/mgmt/glusterd/src/glusterd-volume-set.c | 5 ++++ + 8 files changed, 54 insertions(+), 13 deletions(-) + +diff --git a/libglusterfs/src/globals.h b/libglusterfs/src/globals.h +index 692d49d..6bbe3e6 100644 +--- a/libglusterfs/src/globals.h ++++ b/libglusterfs/src/globals.h +@@ -101,6 +101,8 @@ + + #define GD_OP_VERSION_3_12_2 31202 /* Op-version for GlusterFS 3.12.2 */ + ++#define GD_OP_VERSION_3_12_3 31203 /* Op-version for GlusterFS 3.12.3 */ ++ + #define GD_OP_VERSION_3_13_0 31300 /* Op-version for GlusterFS 3.13.0 */ + + #define GD_OP_VERSION_3_13_1 31301 /* Op-version for GlusterFS 3.13.1 */ +diff --git a/tests/basic/ec/ec-background-heals.t b/tests/basic/ec/ec-background-heals.t +index b9291bc..29778a4 100644 +--- a/tests/basic/ec/ec-background-heals.t ++++ b/tests/basic/ec/ec-background-heals.t +@@ -17,6 +17,7 @@ TEST $CLI volume set $V0 performance.read-ahead off + TEST $CLI volume set $V0 performance.io-cache off + TEST $CLI volume set $V0 disperse.background-heals 0 + TEST $CLI volume set $V0 disperse.eager-lock off ++TEST $CLI volume set $V0 disperse.other-eager-lock off + TEST $CLI volume start $V0 + + TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0; +diff --git a/tests/basic/ec/ec-optimistic-changelog.t b/tests/basic/ec/ec-optimistic-changelog.t +index 1277da6..a372cd3 100644 +--- 
a/tests/basic/ec/ec-optimistic-changelog.t ++++ b/tests/basic/ec/ec-optimistic-changelog.t +@@ -19,6 +19,7 @@ TEST $CLI volume set $V0 performance.io-cache off + TEST $CLI volume set $V0 disperse.background-heals 0 + TEST $CLI volume set $V0 disperse.optimistic-change-log off + TEST $CLI volume set $V0 disperse.eager-lock off ++TEST $CLI volume set $V0 disperse.other-eager-lock off + TEST $CLI volume start $V0 + + TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0; +diff --git a/tests/bugs/cli/bug-1320388.t b/tests/bugs/cli/bug-1320388.t +index ca23ab8..bed1392 100755 +--- a/tests/bugs/cli/bug-1320388.t ++++ b/tests/bugs/cli/bug-1320388.t +@@ -29,6 +29,7 @@ TEST glusterd + TEST pidof glusterd + TEST $CLI volume create $V0 disperse 6 redundancy 2 $H0:$B0/${V0}{0..5} + TEST $CLI volume set $V0 disperse.eager-lock off ++TEST $CLI volume set $V0 disperse.other-eager-lock off + TEST $CLI volume start $V0 + TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0 + EXPECT_WITHIN $CHILD_UP_TIMEOUT "^6$" ec_child_up_count $V0 0 +diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c +index 18ed274..051fff6 100644 +--- a/xlators/cluster/ec/src/ec-common.c ++++ b/xlators/cluster/ec/src/ec-common.c +@@ -2553,6 +2553,26 @@ void ec_flush_size_version(ec_fop_data_t * fop) + ec_update_info(&fop->locks[0]); + } + ++static gf_boolean_t ++ec_use_eager_lock(ec_t *ec, ec_fop_data_t *fop) ++{ ++ /* Fops with no locks at this point mean that they are sent as sub-fops ++ * of other higher level fops. In this case we simply assume that the ++ * parent fop will take correct care of the eager lock. */ ++ if (fop->lock_count == 0) { ++ return _gf_true; ++ } ++ ++ /* We may have more than one lock, but this only happens in the rename ++ * fop, and both locks will reference an inode of the same type (a ++ * directory in this case), so we only need to check the first lock. 
*/ ++ if (fop->locks[0].lock->loc.inode->ia_type == IA_IFREG) { ++ return ec->eager_lock; ++ } ++ ++ return ec->other_eager_lock; ++} ++ + void ec_lock_reuse(ec_fop_data_t *fop) + { + ec_cbk_data_t *cbk; +@@ -2562,7 +2582,7 @@ void ec_lock_reuse(ec_fop_data_t *fop) + ec = fop->xl->private; + cbk = fop->answer; + +- if (ec->eager_lock && cbk != NULL) { ++ if (ec_use_eager_lock(ec, fop) && cbk != NULL) { + if (cbk->xdata != NULL) { + if ((dict_get_int32(cbk->xdata, GLUSTERFS_INODELK_COUNT, + &count) == 0) && (count > 1)) { +diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h +index 3129586..f6e2cd9 100644 +--- a/xlators/cluster/ec/src/ec-types.h ++++ b/xlators/cluster/ec/src/ec-types.h +@@ -594,6 +594,7 @@ struct _ec { + gf_timer_t *timer; + gf_boolean_t shutdown; + gf_boolean_t eager_lock; ++ gf_boolean_t other_eager_lock; + gf_boolean_t optimistic_changelog; + gf_boolean_t parallel_writes; + uint32_t background_heals; +diff --git a/xlators/cluster/ec/src/ec.c b/xlators/cluster/ec/src/ec.c +index 09c5fa8..13ce7fb 100644 +--- a/xlators/cluster/ec/src/ec.c ++++ b/xlators/cluster/ec/src/ec.c +@@ -276,6 +276,8 @@ reconfigure (xlator_t *this, dict_t *options) + bool, failed); + GF_OPTION_RECONF ("eager-lock", ec->eager_lock, options, + bool, failed); ++ GF_OPTION_RECONF ("other-eager-lock", ec->other_eager_lock, options, ++ bool, failed); + GF_OPTION_RECONF ("background-heals", background_heals, options, + uint32, failed); + GF_OPTION_RECONF ("heal-wait-qlength", heal_wait_qlen, options, +@@ -654,6 +656,7 @@ init (xlator_t *this) + GF_OPTION_INIT ("self-heal-daemon", ec->shd.enabled, bool, failed); + GF_OPTION_INIT ("iam-self-heal-daemon", ec->shd.iamshd, bool, failed); + GF_OPTION_INIT ("eager-lock", ec->eager_lock, bool, failed); ++ GF_OPTION_INIT ("other-eager-lock", ec->other_eager_lock, bool, failed); + GF_OPTION_INIT ("background-heals", ec->background_heals, uint32, failed); + GF_OPTION_INIT ("heal-wait-qlength", ec->heal_wait_qlen, uint32, failed); + GF_OPTION_INIT ("self-heal-window-size", ec->self_heal_window_size, uint32, +@@ -1397,18 +1400,25 @@ struct volume_options options[] = + { .key = {"eager-lock"}, + .type = GF_OPTION_TYPE_BOOL, + .default_value = "on", +- .description = "Enable/Disable eager lock for disperse volume. " +- "If a fop takes a lock and completes its operation, " +- "it waits for next 1 second before releasing the lock, " +- "to see if the lock can be reused for next fop from " +- "the same client. If ec finds any lock contention within " +- "1 second it releases the lock immediately before time " +- "expires. This improves the performance of file operations." +- "However, as it takes lock on first brick, for few operations " +- "like read, discovery of lock contention might take long time " +- "and can actually degrade the performance. " +- "If eager lock is disabled, lock will be released as soon as fop " +- "completes. " ++ .description = "Enable/Disable eager lock for regular files on a " ++ "disperse volume. If a fop takes a lock and completes " ++ "its operation, it waits for next 1 second before " ++ "releasing the lock, to see if the lock can be reused " ++ "for next fop from the same client. If ec finds any lock " ++ "contention within 1 second it releases the lock " ++ "immediately before time expires. This improves the " ++ "performance of file operations. 
However, as it takes " ++ "lock on first brick, for few operations like read, " ++ "discovery of lock contention might take long time and " ++ "can actually degrade the performance. If eager lock is " ++ "disabled, lock will be released as soon as fop " ++ "completes." ++ }, ++ { .key = {"other-eager-lock"}, ++ .type = GF_OPTION_TYPE_BOOL, ++ .default_value = "on", ++ .description = "It's equivalent to the eager-lock option but for non " ++ "regular files." + }, + { .key = {"background-heals"}, + .type = GF_OPTION_TYPE_INT, +diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c +index 693c917..af0a982 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c ++++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c +@@ -1445,6 +1445,11 @@ struct volopt_map_entry glusterd_volopt_map[] = { + .op_version = GD_OP_VERSION_3_7_10, + .flags = OPT_FLAG_CLIENT_OPT + }, ++ { .key = "disperse.other-eager-lock", ++ .voltype = "cluster/disperse", ++ .op_version = GD_OP_VERSION_3_12_2, ++ .flags = OPT_FLAG_CLIENT_OPT ++ }, + { .key = "cluster.quorum-type", + .voltype = "cluster/replicate", + .option = "quorum-type", +-- +1.8.3.1 + diff --git a/0159-extras-hooks-Fix-S10selinux-label-brick.sh-hook-scri.patch b/0159-extras-hooks-Fix-S10selinux-label-brick.sh-hook-scri.patch new file mode 100644 index 0000000..18984d3 --- /dev/null +++ b/0159-extras-hooks-Fix-S10selinux-label-brick.sh-hook-scri.patch @@ -0,0 +1,107 @@ +From 994f4d8922f45e298aa6c048614319f353994550 Mon Sep 17 00:00:00 2001 +From: Milan Zink +Date: Wed, 10 Jan 2018 13:04:42 +0100 +Subject: [PATCH 159/180] extras/hooks: Fix S10selinux-label-brick.sh hook + script + +* script was failng due to syntax error +* shellcheck issues fixed +* improved performance: semanage & restorecon is being run on unique path + +>upstream patch : https://review.gluster.org/#/c/19177/ + +Change-Id: I58b357d9fd37586004a2a518f7a5d1c5c9ddd7e3 +BUG: 1546075 +Signed-off-by: Milan Zink +Reviewed-on: https://code.engineering.redhat.com/gerrit/130591 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + .../create/post/S10selinux-label-brick.sh | 57 +++++++++++----------- + 1 file changed, 29 insertions(+), 28 deletions(-) + +diff --git a/extras/hook-scripts/create/post/S10selinux-label-brick.sh b/extras/hook-scripts/create/post/S10selinux-label-brick.sh +index 6be4072..de242d2 100755 +--- a/extras/hook-scripts/create/post/S10selinux-label-brick.sh ++++ b/extras/hook-scripts/create/post/S10selinux-label-brick.sh +@@ -14,48 +14,49 @@ OPTSPEC="volname:" + VOL= + + parse_args () { +- ARGS=$(getopt -o '' -l $OPTSPEC -n $PROGNAME -- "$@") +- eval set -- "$ARGS" +- +- while true; do +- case $1 in +- --volname) +- shift +- VOL=$1 +- ;; +- *) +- shift +- break +- ;; +- esac ++ ARGS=$(getopt -o '' -l ${OPTSPEC} -n ${PROGNAME} -- "$@") ++ eval set -- "${ARGS}" ++ ++ while true; do ++ case ${1} in ++ --volname) ++ shift ++ VOL=${1} ++ ;; ++ *) + shift +- done ++ break ++ ;; ++ esac ++ shift ++ done + } + + set_brick_labels() + { +- volname=$1 ++ volname=${1} + +- # grab the path for each local brick +- brickdirs=$(grep '^path=' /var/lib/glusterd/vols/${volname}/bricks/* | cut -d= -f 2) ++ # grab the path for each local brick ++ brickpath="/var/lib/glusterd/vols/${volname}/bricks/*" ++ brickdirs=$(grep '^path=' "${brickpath}" | cut -d= -f 2 | sort -u) + +- for b in $brickdirs +- do +- # Add a file context for each brick path and associate with the +- # glusterd_brick_t SELinux type. 
+- semanage fcontext --add -t glusterd_brick_t -r s0 $b(/.*)? ++ for b in ${brickdirs}; do ++ # Add a file context for each brick path and associate with the ++ # glusterd_brick_t SELinux type. ++ pattern="${b}\(/.*\)?" ++ semanage fcontext --add -t glusterd_brick_t -r s0 "${pattern}" + +- # Set the labels on the new brick path. +- restorecon -R $b +- done ++ # Set the labels on the new brick path. ++ restorecon -R "${b}" ++ done + } + + SELINUX_STATE=$(which getenforce && getenforce) + [ "${SELINUX_STATE}" = 'Disabled' ] && exit 0 + + parse_args "$@" +-[ -z "$VOL" ] && exit 1 ++[ -z "${VOL}" ] && exit 1 + +-set_brick_labels $VOL ++set_brick_labels "${VOL}" + + exit 0 +-- +1.8.3.1 + diff --git a/0160-common-ha-enable-and-disable-selinux-ganesha_use_fus.patch b/0160-common-ha-enable-and-disable-selinux-ganesha_use_fus.patch new file mode 100644 index 0000000..b362977 --- /dev/null +++ b/0160-common-ha-enable-and-disable-selinux-ganesha_use_fus.patch @@ -0,0 +1,84 @@ +From b8dab559ef4c4a4a08b060ccd77e68d002e1236b Mon Sep 17 00:00:00 2001 +From: Jiffin Tony Thottan +Date: Tue, 20 Feb 2018 11:50:33 +0530 +Subject: [PATCH 160/180] common-ha: enable and disable selinux + ganesha_use_fusefs + +Adding missing changes in a downstream backport(https://code.engineering.redhat.com/gerrit/#/c/109845/) + +Label: DOWNSTREAM ONLY + +Change-Id: I59fd2fc2228ded9547c2d1e08c22f7a10c35f86f +BUG: 1544852 +Signed-off-by: Jiffin Tony Thottan +Reviewed-on: https://code.engineering.redhat.com/gerrit/130583 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +Reviewed-by: Milind Changire +Reviewed-by: Atin Mukherjee +--- + glusterfs.spec.in | 16 +++++++++++----- + 1 file changed, 11 insertions(+), 5 deletions(-) + +diff --git a/glusterfs.spec.in b/glusterfs.spec.in +index a230b24..3181d72 100644 +--- a/glusterfs.spec.in ++++ b/glusterfs.spec.in +@@ -409,11 +409,17 @@ Requires: pcs, dbus + %if ( 0%{?rhel} && 0%{?rhel} == 6 ) + Requires: cman, pacemaker, corosync + %endif +-%if ( 0%{?fedora} && 0%{?fedora} > 25 ) ++%if ( ( 0%{?fedora} && 0%{?fedora} > 25 ) || ( 0%{?rhel} && 0%{?rhel} > 6 ) ) ++%if ( 0%{?rhel} ) + Requires: selinux-policy >= 3.13.1-160 ++Requires(post): policycoreutils-python ++Requires(postun): policycoreutils-python ++%else + Requires(post): policycoreutils-python-utils + Requires(postun): policycoreutils-python-utils + %endif ++%endif ++ + %if ( 0%{?fedora} ) || ( 0%{?rhel} && 0%{?rhel} > 5 ) + # we need portblock resource-agent in 3.9.5 and later. 
+ Requires: resource-agents >= 3.9.5 +@@ -885,7 +891,7 @@ exit 0 + %endif + + %if ( 0%{?_build_server} ) +-%if ( 0%{?fedora} && 0%{?fedora} > 25 ) ++%if ( 0%{?fedora} && 0%{?fedora} > 25 || ( 0%{?rhel} && 0%{?rhel} > 6 ) ) + %post ganesha + semanage boolean -m ganesha_use_fusefs --on + exit 0 +@@ -1015,7 +1021,7 @@ fi + /sbin/ldconfig + + %if ( 0%{?_build_server} ) +-%if ( 0%{?fedora} && 0%{?fedora} > 25 ) ++%if ( 0%{?fedora} && 0%{?fedora} > 25 || ( 0%{?rhel} && 0%{?rhel} > 6 ) ) + %postun ganesha + semanage boolean -m ganesha_use_fusefs --off + exit 0 +@@ -1038,7 +1044,7 @@ exit 0 + ## All %%trigger should be placed here and keep them sorted + ## + %if ( 0%{?_build_server} ) +-%if ( 0%{?fedora} && 0%{?fedora} > 25 ) ++%if ( 0%{?fedora} && 0%{?fedora} > 25 || ( 0%{?rhel} && 0%{?rhel} > 6 ) ) + %trigger ganesha -- selinux-policy-targeted + semanage boolean -m ganesha_use_fusefs --on + exit 0 +@@ -1049,7 +1055,7 @@ exit 0 + ## All %%triggerun should be placed here and keep them sorted + ## + %if ( 0%{?_build_server} ) +-%if ( 0%{?fedora} && 0%{?fedora} > 25 ) ++%if ( 0%{?fedora} && 0%{?fedora} > 25 || ( 0%{?rhel} && 0%{?rhel} > 6 ) ) + %triggerun ganesha -- selinux-policy-targeted + semanage boolean -m ganesha_use_fusefs --off + exit 0 +-- +1.8.3.1 + diff --git a/0161-cluster-dht-Fixed-a-typo.patch b/0161-cluster-dht-Fixed-a-typo.patch new file mode 100644 index 0000000..54c5300 --- /dev/null +++ b/0161-cluster-dht-Fixed-a-typo.patch @@ -0,0 +1,42 @@ +From 313b38d5d8819191b7e5adf04396251a49f0a652 Mon Sep 17 00:00:00 2001 +From: N Balachandran +Date: Thu, 22 Feb 2018 10:50:04 +0530 +Subject: [PATCH 161/180] cluster/dht: Fixed a typo + +Replaced "then" with "than" + +upstream patch: https://review.gluster.org/#/c/19604/ + +> Change-Id: I73090e8c1a639befd7c5458e8d63bd173248bc7d +> BUG: 1547128 +> Signed-off-by: N Balachandran + +Change-Id: Id7b1cfdcf460164b4f1bc81da4dfca306db502e8 +BUG: 1546960 +Signed-off-by: N Balachandran +Reviewed-on: https://code.engineering.redhat.com/gerrit/130850 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/dht/src/dht-rebalance.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c +index f9a25fb..e620005 100644 +--- a/xlators/cluster/dht/src/dht-rebalance.c ++++ b/xlators/cluster/dht/src/dht-rebalance.c +@@ -954,9 +954,9 @@ __dht_check_free_space (xlator_t *this, xlator_t *to, xlator_t *from, loc_t *loc + gf_msg (this->name, GF_LOG_WARNING, 0, + DHT_MSG_MIGRATE_FILE_FAILED, + "data movement of file " +- "{blocks:%"PRIu64" name:(%s) } would result in " ++ "{blocks:%"PRIu64" name:(%s)} would result in " + "dst node (%s:%"PRIu64") having lower disk " +- "space then the source node (%s:%"PRIu64")" ++ "space than the source node (%s:%"PRIu64")" + ".Skipping file.", stbuf->ia_blocks, loc->path, + to->name, dst_statfs_blocks, from->name, + src_statfs_blocks); +-- +1.8.3.1 + diff --git a/0162-cluster-dht-Handle-single-dht-child-in-dht_lookup.patch b/0162-cluster-dht-Handle-single-dht-child-in-dht_lookup.patch new file mode 100644 index 0000000..97b2f73 --- /dev/null +++ b/0162-cluster-dht-Handle-single-dht-child-in-dht_lookup.patch @@ -0,0 +1,77 @@ +From 90dc8d7cee42eeacc3214fef8bb45cbffc4c8de5 Mon Sep 17 00:00:00 2001 +From: N Balachandran +Date: Mon, 19 Feb 2018 09:44:29 +0530 +Subject: [PATCH 162/180] cluster/dht: Handle single dht child in dht_lookup + +This patch limits itself to only handling the case +where no 
file (data or linkto) exists on the subvol. + +Additional cases to be handled: +1. A linkto file was found on the only child subvol. This currently +calls dht_lookup_everywhere which eventually deletes it. It can be +deleted directly as it will not be pointing to a valid subvol. +2. Directory lookups - locking might be unnecessary in some cases. + +upstream patch: https://review.gluster.org/19581 + +> Change-Id: I940ba34531f2aaee1d36fd9ca45ecfd46be662a4 +> BUG: 1546620 +> Signed-off-by: N Balachandran + +Change-Id: I1cc1b9866fa18fe825847585e6a9b8c92898951a +BUG: 1545570 +Signed-off-by: N Balachandran +Reviewed-on: https://code.engineering.redhat.com/gerrit/130976 +Tested-by: RHGS Build Bot +Reviewed-by: Shyam Ranganathan +--- + xlators/cluster/dht/src/dht-common.c | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c +index 1262732..5641330 100644 +--- a/xlators/cluster/dht/src/dht-common.c ++++ b/xlators/cluster/dht/src/dht-common.c +@@ -2352,6 +2352,12 @@ dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + } + + if (ENTRY_MISSING (op_ret, op_errno)) { ++ ++ if (1 == conf->subvolume_cnt) { ++ /* No need to lookup again */ ++ goto out; ++ } ++ + gf_msg_debug (this->name, 0, + "Entry %s missing on subvol %s", + loc->path, prev->name); +@@ -2581,6 +2587,8 @@ dht_lookup (call_frame_t *frame, xlator_t *this, + local->xattr_req = dict_new (); + } + ++ /* Nameless lookup */ ++ + if (gf_uuid_is_null (loc->pargfid) && !gf_uuid_is_null (loc->gfid) && + !__is_root_gfid (loc->inode->gfid)) { + local->cached_subvol = NULL; +@@ -2598,6 +2606,9 @@ dht_lookup (call_frame_t *frame, xlator_t *this, + hashed_subvol = dht_subvol_get_hashed (this, loc); + local->hashed_subvol = hashed_subvol; + ++ ++ /* The entry has been looked up before and has an inode_ctx set ++ */ + if (is_revalidate (loc)) { + layout = local->layout; + if (!layout) { +@@ -2638,6 +2649,7 @@ dht_lookup (call_frame_t *frame, xlator_t *this, + "path %s", conf->xattr_name, loc->path); + goto err; + } ++ + /* need it in case file is not found on cached file + * on revalidate path and we may encounter linkto files on + * with dht_lookup_everywhere*/ +-- +1.8.3.1 + diff --git a/0163-glusterd-compare-uuid-instead-of-hostname-while-find.patch b/0163-glusterd-compare-uuid-instead-of-hostname-while-find.patch new file mode 100644 index 0000000..37657d4 --- /dev/null +++ b/0163-glusterd-compare-uuid-instead-of-hostname-while-find.patch @@ -0,0 +1,37 @@ +From 5138cf57c7a61eed4bb33c4fa2e21f6fb7bd56fd Mon Sep 17 00:00:00 2001 +From: Atin Mukherjee +Date: Tue, 20 Feb 2018 18:37:56 +0530 +Subject: [PATCH 163/180] glusterd: compare uuid instead of hostname while + finding compatible brick + +If the above is not done, bricks created with different IP/hostname will +not be compatible with brick multiplexing. 
+ +>upstream mainline patch : https://review.gluster.org/#/c/19601/ + +Change-Id: I508eb59b0632df4b48466cca411c7ec6cc6bd577 +BUG: 1547012 +Signed-off-by: Atin Mukherjee +Reviewed-on: https://code.engineering.redhat.com/gerrit/131110 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/mgmt/glusterd/src/glusterd-utils.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c +index 5deacde..9ccd718 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-utils.c ++++ b/xlators/mgmt/glusterd/src/glusterd-utils.c +@@ -5656,7 +5656,7 @@ find_compat_brick_in_vol (glusterd_conf_t *conf, + if (other_brick == brickinfo) { + continue; + } +- if (strcmp (brickinfo->hostname, other_brick->hostname) != 0) { ++ if (gf_uuid_compare (brickinfo->uuid, other_brick->uuid)) { + continue; + } + if (other_brick->status != GF_BRICK_STARTED && +-- +1.8.3.1 + diff --git a/0164-geo-rep-Remove-lazy-umount-and-use-mount-namespaces.patch b/0164-geo-rep-Remove-lazy-umount-and-use-mount-namespaces.patch new file mode 100644 index 0000000..06f4d74 --- /dev/null +++ b/0164-geo-rep-Remove-lazy-umount-and-use-mount-namespaces.patch @@ -0,0 +1,278 @@ +From 9f4564e55b1e515743a1f80d18989681d5d3b59f Mon Sep 17 00:00:00 2001 +From: Kotresh HR +Date: Thu, 15 Feb 2018 01:46:29 -0500 +Subject: [PATCH 164/180] geo-rep: Remove lazy umount and use mount namespaces + +Lazy umounting the master volume by worker causes +issues with rsync's usage of getcwd. Henc removing +the lazy umount and using private mount namespace +for the same. On the slave, the lazy umount is +retained as we can't use private namespace in non +root geo-rep setup because gsyncd is spawned as +non-privileged user. 
+ +Backport of https://review.gluster.org/#/c/19544/ + +Change-Id: I851e8dc2b8523dc5668a97e87ef619ab70471dfd +BUG: 1544382 +Signed-off-by: Kotresh HR +Reviewed-on: https://code.engineering.redhat.com/gerrit/130468 +Tested-by: RHGS Build Bot +Reviewed-by: Aravinda Vishwanathapura Krishna Murthy +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + geo-replication/syncdaemon/gconf.py | 3 +++ + geo-replication/syncdaemon/gsyncd.py | 13 ++++++----- + geo-replication/syncdaemon/monitor.py | 38 ++++++++++++++++++++------------ + geo-replication/syncdaemon/resource.py | 20 ++++++++++++----- + geo-replication/syncdaemon/syncdutils.py | 12 +++++----- + glusterfs.spec.in | 4 ++++ + 6 files changed, 59 insertions(+), 31 deletions(-) + +diff --git a/geo-replication/syncdaemon/gconf.py b/geo-replication/syncdaemon/gconf.py +index 97395b4..2280f44 100644 +--- a/geo-replication/syncdaemon/gconf.py ++++ b/geo-replication/syncdaemon/gconf.py +@@ -28,5 +28,8 @@ class GConf(object): + active_earlier = False + passive_earlier = False + mgmt_lock_fd = None ++ mountbroker = False ++ mount_point = None ++ mbr_umount_cmd = [] + + gconf = GConf() +diff --git a/geo-replication/syncdaemon/gsyncd.py b/geo-replication/syncdaemon/gsyncd.py +index d77b90f..629e8b7 100644 +--- a/geo-replication/syncdaemon/gsyncd.py ++++ b/geo-replication/syncdaemon/gsyncd.py +@@ -276,6 +276,7 @@ def main_i(): + op.add_option('--georep-session-working-dir', metavar='STATF', + type=str, action='callback', callback=store_abs) + op.add_option('--access-mount', default=False, action='store_true') ++ op.add_option('--slave-access-mount', default=False, action='store_true') + op.add_option('--ignore-deletes', default=False, action='store_true') + op.add_option('--isolated-slave', default=False, action='store_true') + op.add_option('--use-rsync-xattrs', default=False, action='store_true') +@@ -431,7 +432,7 @@ def main_i(): + o.get_opt_string() not in ('--version', '--help'))] + remote_tunables = ['listen', 'go_daemon', 'timeout', + 'session_owner', 'config_file', 'use_rsync_xattrs', +- 'local_id', 'local_node', 'access_mount'] ++ 'local_id', 'local_node', 'slave_access_mount'] + rq_remote_tunables = {'listen': True} + + # precedence for sources of values: 1) commandline, 2) cfg file, 3) +@@ -768,15 +769,15 @@ def main_i(): + else: + log_file = gconf.log_file + if be_monitor: +- label = 'monitor' ++ gconf.label = 'monitor' + elif be_agent: +- label = gconf.local_path ++ gconf.label = gconf.local_path + elif remote: + # master +- label = gconf.local_path ++ gconf.label = gconf.local_path + else: +- label = 'slave' +- startup(go_daemon=go_daemon, log_file=log_file, label=label) ++ gconf.label = 'slave' ++ startup(go_daemon=go_daemon, log_file=log_file, label=gconf.label) + resource.Popen.init_errhandler() + + if be_agent: +diff --git a/geo-replication/syncdaemon/monitor.py b/geo-replication/syncdaemon/monitor.py +index 4da9330..0f43c4f 100644 +--- a/geo-replication/syncdaemon/monitor.py ++++ b/geo-replication/syncdaemon/monitor.py +@@ -24,7 +24,7 @@ import random + from gconf import gconf + from syncdutils import select, waitpid, errno_wrap, lf + from syncdutils import set_term_handler, is_host_local, GsyncdError +-from syncdutils import escape, Thread, finalize, memoize ++from syncdutils import escape, Thread, finalize, memoize, boolify + from syncdutils import gf_event, EVENT_GEOREP_FAULTY + + from gsyncdstatus import GeorepStatus, set_monitor_status +@@ -306,19 +306,29 @@ class Monitor(object): + os.close(pr) + os.close(ra) + os.close(wa) +- 
os.execv(sys.executable, argv + ['--feedback-fd', str(pw), +- '--local-path', w[0]['dir'], +- '--local-node', w[0]['host'], +- '--local-node-id', +- w[0]['uuid'], +- '--local-id', +- '.' + escape(w[0]['dir']), +- '--rpc-fd', +- ','.join([str(rw), str(ww), +- str(ra), str(wa)]), +- '--subvol-num', str(w[2])] + +- (['--is-hottier'] if w[3] else []) + +- ['--resource-remote', remote_host]) ++ args_to_worker = argv + ['--feedback-fd', str(pw), ++ '--local-path', w[0]['dir'], ++ '--local-node', w[0]['host'], ++ '--local-node-id', ++ w[0]['uuid'], ++ '--local-id', ++ '.' + escape(w[0]['dir']), ++ '--rpc-fd', ++ ','.join([str(rw), str(ww), ++ str(ra), str(wa)]), ++ '--subvol-num', str(w[2])] ++ ++ if w[3]: ++ args_to_worker.append('--is-hottier') ++ args_to_worker += ['--resource-remote', remote_host] ++ ++ access_mount = boolify(gconf.access_mount) ++ if access_mount: ++ os.execv(sys.executable, args_to_worker) ++ else: ++ unshare_cmd = ['unshare', '-m', '--propagation', 'private'] ++ cmd = unshare_cmd + args_to_worker ++ os.execvp("unshare", cmd) + + cpids.add(cpid) + agents.add(apid) +diff --git a/geo-replication/syncdaemon/resource.py b/geo-replication/syncdaemon/resource.py +index 5ad5b97..4b2a266 100644 +--- a/geo-replication/syncdaemon/resource.py ++++ b/geo-replication/syncdaemon/resource.py +@@ -43,7 +43,7 @@ from syncdutils import CHANGELOG_AGENT_CLIENT_VERSION + from syncdutils import GX_GFID_CANONICAL_LEN + from gsyncdstatus import GeorepStatus + from syncdutils import get_master_and_slave_data_from_args +-from syncdutils import mntpt_list, lf ++from syncdutils import lf + from syncdutils import Xattr, matching_disk_gfid, get_gfid_from_mnt + + UrlRX = re.compile('\A(\w+)://([^ *?[]*)\Z') +@@ -1047,8 +1047,8 @@ class SlaveRemote(object): + extra_opts += ['--local-node', ln] + if boolify(gconf.use_rsync_xattrs): + extra_opts.append('--use-rsync-xattrs') +- if boolify(gconf.access_mount): +- extra_opts.append('--access-mount') ++ if boolify(gconf.slave_access_mount): ++ extra_opts.append('--slave-access-mount') + po = Popen(rargs + gconf.remote_gsyncd.split() + extra_opts + + ['-N', '--listen', '--timeout', str(gconf.timeout), + slave], +@@ -1333,6 +1333,7 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + def __init__(self, params): + self.params = params + self.mntpt = None ++ self.umount_cmd = [] + + @classmethod + def get_glusterprog(cls): +@@ -1424,13 +1425,15 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + assert(mntdata[-1] == '\0') + mntpt = mntdata[:-1] + assert(mntpt) +- if mounted and not boolify(gconf.access_mount): ++ if mounted and gconf.label == 'slave' \ ++ and not boolify(gconf.slave_access_mount): + po = self.umount_l(mntpt) + po.terminate_geterr(fail_on_err=False) + if po.returncode != 0: + po.errlog() + rv = po.returncode +- if not boolify(gconf.access_mount): ++ if gconf.label == 'slave' \ ++ and not boolify(gconf.slave_access_mount): + self.cleanup_mntpt(mntpt) + except: + logging.exception('mount cleanup failure:') +@@ -1451,7 +1454,7 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + + def make_mount_argv(self): + self.mntpt = tempfile.mkdtemp(prefix='gsyncd-aux-mount-') +- mntpt_list.append(self.mntpt) ++ gconf.mount_point = self.mntpt + return [self.get_glusterprog()] + \ + ['--' + p for p in self.params] + [self.mntpt] + +@@ -1483,6 +1486,11 @@ class GLUSTER(AbstractUrl, SlaveLocal, SlaveRemote): + + def handle_mounter(self, po): + self.mntpt = po.stdout.readline()[:-1] ++ gconf.mount_point = self.mntpt ++ gconf.mountbroker = True ++ 
self.umount_cmd = self.make_cli_argv() + ['umount'] ++ gconf.mbr_umount_cmd = self.umount_cmd ++ + po.stdout.close() + sup(self, po) + if po.returncode != 0: +diff --git a/geo-replication/syncdaemon/syncdutils.py b/geo-replication/syncdaemon/syncdutils.py +index 269f301..2b57f83 100644 +--- a/geo-replication/syncdaemon/syncdutils.py ++++ b/geo-replication/syncdaemon/syncdutils.py +@@ -23,7 +23,6 @@ from errno import EINTR, ENOENT, EPERM, ESTALE, EBUSY, errorcode + from signal import signal, SIGTERM + import select as oselect + from os import waitpid as owaitpid +-import subprocess + + from conf import GLUSTERFS_LIBEXECDIR, UUID_FILE + sys.path.insert(1, GLUSTERFS_LIBEXECDIR) +@@ -209,7 +208,6 @@ def grabpidfile(fname=None, setpid=True): + + final_lock = Lock() + +-mntpt_list = [] + def finalize(*a, **kw): + """all those messy final steps we go trough upon termination + +@@ -256,12 +254,16 @@ def finalize(*a, **kw): + pass + + """ Unmount if not done """ +- for mnt in mntpt_list: +- p0 = subprocess.Popen (["umount", "-l", mnt], stderr=subprocess.PIPE) ++ if gconf.mount_point: ++ if gconf.mountbroker: ++ umount_cmd = gconf.mbr_umount_cmd + [gconf.mount_point, 'lazy'] ++ else: ++ umount_cmd = ['umount', '-l', gconf.mount_point] ++ p0 = subprocess.Popen(umount_cmd, stderr=subprocess.PIPE) + _, errdata = p0.communicate() + if p0.returncode == 0: + try: +- os.rmdir(mnt) ++ os.rmdir(gconf.mount_point) + except OSError: + pass + else: +diff --git a/glusterfs.spec.in b/glusterfs.spec.in +index 3181d72..8379f64 100644 +--- a/glusterfs.spec.in ++++ b/glusterfs.spec.in +@@ -453,6 +453,7 @@ BuildRequires: python-ctypes + %endif + Requires: python2-gluster = %{version}-%{release} + Requires: rsync ++Requires: util-linux + + %description geo-replication + GlusterFS is a distributed file-system capable of scaling to several +@@ -2147,6 +2148,9 @@ fi + %endif + + %changelog ++* Thu Feb 22 2018 Kotresh HR ++- Added util-linux as dependency to georeplication rpm (#1544382) ++ + * Wed Jan 17 2018 Milind Changire + - DOWNSTREAM ONLY - Removed pretrans script for glusterfs-ganesha - (#1410719) + +-- +1.8.3.1 + diff --git a/0165-cluster-dht-Ignore-ENODATA-from-getxattr-for-posix-a.patch b/0165-cluster-dht-Ignore-ENODATA-from-getxattr-for-posix-a.patch new file mode 100644 index 0000000..2375061 --- /dev/null +++ b/0165-cluster-dht-Ignore-ENODATA-from-getxattr-for-posix-a.patch @@ -0,0 +1,54 @@ +From cba83bafc87af2b635c24d1be2d7224a01901c73 Mon Sep 17 00:00:00 2001 +From: N Balachandran +Date: Tue, 20 Feb 2018 20:08:11 +0530 +Subject: [PATCH 165/180] cluster/dht: Ignore ENODATA from getxattr for posix + acls + +dht_migrate_file no longer prints an error if getxattr for +posix acls fails with ENODATA/ENOATTR. 
+ +upstream: https://review.gluster.org/#/c/19603/ + +> Change-Id: Id9ecf6852cb5294c1c154b28d609889ea3420e1c +> BUG: 1546954 +> Signed-off-by: N Balachandran + +Change-Id: Id9ecf6852cb5294c1c154b28d609889ea3420e1c +BUG: 1546945 +Signed-off-by: N Balachandran +Reviewed-on: https://code.engineering.redhat.com/gerrit/130975 +Tested-by: RHGS Build Bot +Reviewed-by: Amar Tumballi +--- + xlators/cluster/dht/src/dht-rebalance.c | 15 ++++++++------- + 1 file changed, 8 insertions(+), 7 deletions(-) + +diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c +index e620005..9770359 100644 +--- a/xlators/cluster/dht/src/dht-rebalance.c ++++ b/xlators/cluster/dht/src/dht-rebalance.c +@@ -2029,13 +2029,14 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, + ret = syncop_getxattr (from, loc, &xattr, POSIX_ACL_ACCESS_XATTR, + NULL, NULL); + if (ret < 0) { +- gf_msg (this->name, GF_LOG_WARNING, -ret, +- DHT_MSG_MIGRATE_FILE_FAILED, +- "Migrate file failed:" +- "%s: failed to get xattr from %s", +- loc->path, from->name); +- *fop_errno = -ret; +- ret = -1; ++ if ((-ret != ENODATA) && (-ret != ENOATTR)) { ++ gf_msg (this->name, GF_LOG_WARNING, -ret, ++ DHT_MSG_MIGRATE_FILE_FAILED, ++ "Migrate file failed:" ++ "%s: failed to get xattr from %s", ++ loc->path, from->name); ++ *fop_errno = -ret; ++ } + } else { + ret = syncop_setxattr (to, loc, xattr, 0, NULL, NULL); + if (ret < 0) { +-- +1.8.3.1 + diff --git a/0166-rpcsvc-scale-rpcsvc_request_handler-threads.patch b/0166-rpcsvc-scale-rpcsvc_request_handler-threads.patch new file mode 100644 index 0000000..5e5ff60 --- /dev/null +++ b/0166-rpcsvc-scale-rpcsvc_request_handler-threads.patch @@ -0,0 +1,361 @@ +From 8503ed9b94777d47352f19ebfa844e151352b87f Mon Sep 17 00:00:00 2001 +From: Milind Changire +Date: Fri, 2 Mar 2018 15:39:27 +0530 +Subject: [PATCH 166/180] rpcsvc: scale rpcsvc_request_handler threads + +Scale rpcsvc_request_handler threads to match the scaling of event +handler threads. + +Please refer to https://bugzilla.redhat.com/show_bug.cgi?id=1467614#c51 +for a discussion about why we need multi-threaded rpcsvc request +handlers. 
+ +mainline: +> Reviewed-on: https://review.gluster.org/19337 +> Reviewed-by: Raghavendra G +> Signed-off-by: Milind Changire +(cherry picked from commit 7d641313f46789ec0a7ba0cc04f504724c780855) + +Change-Id: Ib6838fb8b928e15602a3d36fd66b7ba08999430b +BUG: 1549497 +Signed-off-by: Milind Changire +Reviewed-on: https://code.engineering.redhat.com/gerrit/131596 +Tested-by: RHGS Build Bot +Reviewed-by: Raghavendra Gowdappa +--- + glusterfsd/src/Makefile.am | 1 + + glusterfsd/src/glusterfsd-mgmt.c | 16 ++++- + glusterfsd/src/glusterfsd.h | 2 +- + libglusterfs/src/event-poll.c | 7 ++ + rpc/rpc-lib/src/rpcsvc.c | 129 +++++++++++++++++++++++++++++++---- + rpc/rpc-lib/src/rpcsvc.h | 8 +++ + xlators/protocol/server/src/server.c | 10 ++- + 7 files changed, 153 insertions(+), 20 deletions(-) + +diff --git a/glusterfsd/src/Makefile.am b/glusterfsd/src/Makefile.am +index 0196204..8ab585c 100644 +--- a/glusterfsd/src/Makefile.am ++++ b/glusterfsd/src/Makefile.am +@@ -22,6 +22,7 @@ AM_CPPFLAGS = $(GF_CPPFLAGS) \ + -I$(top_srcdir)/rpc/xdr/src \ + -I$(top_builddir)/rpc/xdr/src \ + -I$(top_srcdir)/xlators/nfs/server/src \ ++ -I$(top_srcdir)/xlators/protocol/server/src \ + -I$(top_srcdir)/api/src + + AM_CFLAGS = -Wall $(GF_CFLAGS) +diff --git a/glusterfsd/src/glusterfsd-mgmt.c b/glusterfsd/src/glusterfsd-mgmt.c +index ca706d1..69d93f5 100644 +--- a/glusterfsd/src/glusterfsd-mgmt.c ++++ b/glusterfsd/src/glusterfsd-mgmt.c +@@ -33,6 +33,7 @@ + #include "syncop.h" + #include "xlator.h" + #include "syscall.h" ++#include "server.h" + + static gf_boolean_t is_mgmt_rpc_reconnect = _gf_false; + int need_emancipate = 0; +@@ -185,12 +186,15 @@ glusterfs_terminate_response_send (rpcsvc_request_t *req, int op_ret) + } + + void +-glusterfs_autoscale_threads (glusterfs_ctx_t *ctx, int incr) ++glusterfs_autoscale_threads (glusterfs_ctx_t *ctx, int incr, xlator_t *this) + { + struct event_pool *pool = ctx->event_pool; ++ server_conf_t *conf = this->private; ++ int thread_count = pool->eventthreadcount; + + pool->auto_thread_count += incr; +- (void) event_reconfigure_threads (pool, pool->eventthreadcount+incr); ++ (void) event_reconfigure_threads (pool, thread_count+incr); ++ rpcsvc_ownthread_reconf (conf->rpc, pool->eventthreadcount); + } + + int +@@ -839,6 +843,7 @@ glusterfs_handle_attach (rpcsvc_request_t *req) + xlator_t *nextchild = NULL; + glusterfs_graph_t *newgraph = NULL; + glusterfs_ctx_t *ctx = NULL; ++ xlator_t *protocol_server = NULL; + + GF_ASSERT (req); + this = THIS; +@@ -876,7 +881,12 @@ glusterfs_handle_attach (rpcsvc_request_t *req) + nextchild->name); + goto out; + } +- glusterfs_autoscale_threads (this->ctx, 1); ++ /* we need a protocol/server xlator as ++ * nextchild ++ */ ++ protocol_server = this->ctx->active->first; ++ glusterfs_autoscale_threads (this->ctx, 1, ++ protocol_server); + } + } else { + gf_log (this->name, GF_LOG_WARNING, +diff --git a/glusterfsd/src/glusterfsd.h b/glusterfsd/src/glusterfsd.h +index 6d1e165..43cef52 100644 +--- a/glusterfsd/src/glusterfsd.h ++++ b/glusterfsd/src/glusterfsd.h +@@ -124,7 +124,7 @@ int glusterfs_volume_top_read_perf (uint32_t blk_size, uint32_t blk_count, + char *brick_path, double *throughput, + double *time); + void +-glusterfs_autoscale_threads (glusterfs_ctx_t *ctx, int incr); ++glusterfs_autoscale_threads (glusterfs_ctx_t *ctx, int incr, xlator_t *this); + + extern glusterfs_ctx_t *glusterfsd_ctx; + #endif /* __GLUSTERFSD_H__ */ +diff --git a/libglusterfs/src/event-poll.c b/libglusterfs/src/event-poll.c +index 3bffc47..b1aca82 100644 +--- 
a/libglusterfs/src/event-poll.c ++++ b/libglusterfs/src/event-poll.c +@@ -173,6 +173,13 @@ event_pool_new_poll (int count, int eventthreadcount) + "thread count (%d) ignored", eventthreadcount); + } + ++ /* although, eventhreadcount for poll implementaiton is always ++ * going to be 1, eventthreadcount needs to be set to 1 so that ++ * rpcsvc_request_handler() thread scaling works flawlessly in ++ * both epoll and poll models ++ */ ++ event_pool->eventthreadcount = 1; ++ + return event_pool; + } + +diff --git a/rpc/rpc-lib/src/rpcsvc.c b/rpc/rpc-lib/src/rpcsvc.c +index 68e27ab..31b5eb5 100644 +--- a/rpc/rpc-lib/src/rpcsvc.c ++++ b/rpc/rpc-lib/src/rpcsvc.c +@@ -1877,39 +1877,105 @@ rpcsvc_request_handler (void *arg) + goto unlock; + } + +- while (list_empty (&program->request_queue)) ++ while (list_empty (&program->request_queue) && ++ (program->threadcount <= ++ program->eventthreadcount)) { + pthread_cond_wait (&program->queue_cond, + &program->queue_lock); ++ } + +- req = list_entry (program->request_queue.next, +- typeof (*req), request_list); +- +- list_del_init (&req->request_list); ++ if (program->threadcount > program->eventthreadcount) { ++ done = 1; ++ program->threadcount--; ++ ++ gf_log (GF_RPCSVC, GF_LOG_INFO, ++ "program '%s' thread terminated; " ++ "total count:%d", ++ program->progname, ++ program->threadcount); ++ } else if (!list_empty (&program->request_queue)) { ++ req = list_entry (program->request_queue.next, ++ typeof (*req), request_list); ++ ++ list_del_init (&req->request_list); ++ } + } + unlock: + pthread_mutex_unlock (&program->queue_lock); + ++ if (req) { ++ THIS = req->svc->xl; ++ actor = rpcsvc_program_actor (req); ++ ret = actor->actor (req); ++ ++ if (ret != 0) { ++ rpcsvc_check_and_reply_error (ret, NULL, req); ++ } ++ req = NULL; ++ } ++ + if (done) + break; ++ } + +- THIS = req->svc->xl; ++ return NULL; ++} + +- actor = rpcsvc_program_actor (req); ++int ++rpcsvc_spawn_threads (rpcsvc_t *svc, rpcsvc_program_t *program) ++{ ++ int ret = 0, delta = 0, creates = 0; + +- ret = actor->actor (req); ++ if (!program || !svc) ++ goto out; + +- if (ret != 0) { +- rpcsvc_check_and_reply_error (ret, NULL, req); ++ pthread_mutex_lock (&program->queue_lock); ++ { ++ delta = program->eventthreadcount - program->threadcount; ++ ++ if (delta >= 0) { ++ while (delta--) { ++ ret = gf_thread_create (&program->thread, NULL, ++ rpcsvc_request_handler, ++ program, "rpcrqhnd"); ++ if (!ret) { ++ program->threadcount++; ++ creates++; ++ } ++ } ++ ++ if (creates) { ++ gf_log (GF_RPCSVC, GF_LOG_INFO, ++ "spawned %d threads for program '%s'; " ++ "total count:%d", ++ creates, ++ program->progname, ++ program->threadcount); ++ } ++ } else { ++ gf_log (GF_RPCSVC, GF_LOG_INFO, ++ "terminating %d threads for program '%s'", ++ -delta, program->progname); ++ ++ /* this signal is to just wake up the threads so they ++ * test for the change in eventthreadcount and kill ++ * themselves until the program thread count becomes ++ * equal to the event thread count ++ */ ++ pthread_cond_broadcast (&program->queue_cond); + } + } ++ pthread_mutex_unlock (&program->queue_lock); + +- return NULL; ++out: ++ return creates; + } + + int + rpcsvc_program_register (rpcsvc_t *svc, rpcsvc_program_t *program) + { + int ret = -1; ++ int creates = -1; + rpcsvc_program_t *newprog = NULL; + char already_registered = 0; + +@@ -1957,9 +2023,12 @@ rpcsvc_program_register (rpcsvc_t *svc, rpcsvc_program_t *program) + newprog->ownthread = _gf_false; + + if (newprog->ownthread) { +- gf_thread_create (&newprog->thread, 
NULL, +- rpcsvc_request_handler, +- newprog, "rpcsvcrh"); ++ newprog->eventthreadcount = 1; ++ creates = rpcsvc_spawn_threads (svc, newprog); ++ ++ if (creates < 1) { ++ goto out; ++ } + } + + pthread_mutex_lock (&svc->rpclock); +@@ -2816,6 +2885,38 @@ out: + return ret; + } + ++/* During reconfigure, Make sure to call this function after event-threads are ++ * reconfigured as programs' threadcount will be made equal to event threads. ++ */ ++ ++int ++rpcsvc_ownthread_reconf (rpcsvc_t *svc, int new_eventthreadcount) ++{ ++ int ret = -1; ++ rpcsvc_program_t *program = NULL; ++ ++ if (!svc) { ++ ret = 0; ++ goto out; ++ } ++ ++ pthread_rwlock_wrlock (&svc->rpclock); ++ { ++ list_for_each_entry (program, &svc->programs, program) { ++ if (program->ownthread) { ++ program->eventthreadcount = ++ new_eventthreadcount; ++ rpcsvc_spawn_threads (svc, program); ++ } ++ } ++ } ++ pthread_rwlock_unlock (&svc->rpclock); ++ ++ ret = 0; ++out: ++ return ret; ++} ++ + + rpcsvc_actor_t gluster_dump_actors[GF_DUMP_MAXVALUE] = { + [GF_DUMP_NULL] = {"NULL", GF_DUMP_NULL, NULL, NULL, 0, DRC_NA}, +diff --git a/rpc/rpc-lib/src/rpcsvc.h b/rpc/rpc-lib/src/rpcsvc.h +index 73507b6..4ae2350 100644 +--- a/rpc/rpc-lib/src/rpcsvc.h ++++ b/rpc/rpc-lib/src/rpcsvc.h +@@ -412,6 +412,12 @@ struct rpcsvc_program { + pthread_mutex_t queue_lock; + pthread_cond_t queue_cond; + pthread_t thread; ++ int threadcount; ++ /* eventthreadcount is just a readonly copy of the actual value ++ * owned by the event sub-system ++ * It is used to control the scaling of rpcsvc_request_handler threads ++ */ ++ int eventthreadcount; + }; + + typedef struct rpcsvc_cbk_program { +@@ -623,4 +629,6 @@ rpcsvc_auth_array (rpcsvc_t *svc, char *volname, int *autharr, int arrlen); + rpcsvc_vector_sizer + rpcsvc_get_program_vector_sizer (rpcsvc_t *svc, uint32_t prognum, + uint32_t progver, int procnum); ++extern int ++rpcsvc_ownthread_reconf (rpcsvc_t *svc, int new_eventthreadcount); + #endif +diff --git a/xlators/protocol/server/src/server.c b/xlators/protocol/server/src/server.c +index 6dc9d0f..4627ea0 100644 +--- a/xlators/protocol/server/src/server.c ++++ b/xlators/protocol/server/src/server.c +@@ -990,6 +990,12 @@ do_rpc: + + ret = server_init_grace_timer (this, options, conf); + ++ /* rpcsvc thread reconfigure should be after events thread ++ * reconfigure ++ */ ++ new_nthread = ++ ((struct event_pool *)(this->ctx->event_pool))->eventthreadcount; ++ ret = rpcsvc_ownthread_reconf (rpc_conf, new_nthread); + out: + THIS = oldTHIS; + gf_msg_debug ("", 0, "returning %d", ret); +@@ -1569,9 +1575,9 @@ notify (xlator_t *this, int32_t event, void *data, ...) + (*trav_p) = (*trav_p)->next; + glusterfs_mgmt_pmap_signout (ctx, + victim->name); +- glusterfs_autoscale_threads (THIS->ctx, -1); ++ /* we need the protocol/server xlator here as 'this' */ ++ glusterfs_autoscale_threads (ctx, -1, this); + default_notify (victim, GF_EVENT_CLEANUP, data); +- + } + break; + +-- +1.8.3.1 + diff --git a/0167-glusterd-ganesha-change-voltype-for-ganesha.enable-i.patch b/0167-glusterd-ganesha-change-voltype-for-ganesha.enable-i.patch new file mode 100644 index 0000000..e887bb4 --- /dev/null +++ b/0167-glusterd-ganesha-change-voltype-for-ganesha.enable-i.patch @@ -0,0 +1,44 @@ +From 48201511990f4677e634f236bb81ad03a23fc52a Mon Sep 17 00:00:00 2001 +From: Jiffin Tony Thottan +Date: Tue, 27 Feb 2018 15:35:30 +0530 +Subject: [PATCH 167/180] glusterd/ganesha : change voltype for ganesha.enable + in volume option table + +The voltype defined for ganesha.enable is features/ganesha. 
But ganesha xlator +was removed from client stack long back. Now it is defined as part of glusterd. +So reflecting the same on the volume option table. + +Label: BACKPORT FROM UPSTREAM 3.10 + +Upstream reference : +>patch link https://review.gluster.org/19639 +>Change-Id: Ifedd7493020b77bd54edfdbdd9c799d93b24d0aa +>BUG: 1486542 +>Signed-off-by: Jiffin Tony Thottan + +Change-Id: Ifedd7493020b77bd54edfdbdd9c799d93b24d0aa +BUG: 1378371 +Signed-off-by: Jiffin Tony Thottan +Reviewed-on: https://code.engineering.redhat.com/gerrit/131343 +Tested-by: RHGS Build Bot +Reviewed-by: Atin Mukherjee +--- + xlators/mgmt/glusterd/src/glusterd-volume-set.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c +index af0a982..b603c7f 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c ++++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c +@@ -3232,7 +3232,7 @@ struct volopt_map_entry glusterd_volopt_map[] = { + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = "ganesha.enable", +- .voltype = "features/ganesha", ++ .voltype = "mgmt/glusterd", + .value = "off", + .option = "ganesha.enable", + .op_version = GD_OP_VERSION_3_7_0, +-- +1.8.3.1 + diff --git a/0168-features-shard-Pass-the-correct-block-num-to-store-i.patch b/0168-features-shard-Pass-the-correct-block-num-to-store-i.patch new file mode 100644 index 0000000..0f115dc --- /dev/null +++ b/0168-features-shard-Pass-the-correct-block-num-to-store-i.patch @@ -0,0 +1,43 @@ +From 79eccaf175f9beb5b1bbda8a8e8cfa84829879ca Mon Sep 17 00:00:00 2001 +From: Krutika Dhananjay +Date: Mon, 26 Feb 2018 15:22:58 +0530 +Subject: [PATCH 168/180] features/shard: Pass the correct block-num to store + in inode ctx + +> Upstream: https://review.gluster.org/19630 +> BUG: 1468483 +> Change-Id: Icf3a5d0598a081adb7d234a60bd15250a5ce1532 + +Change-Id: I790fddb241765663361139673f8ecaf19a3ff7fb +BUG: 1493085 +Signed-off-by: Krutika Dhananjay +Reviewed-on: https://code.engineering.redhat.com/gerrit/131734 +Tested-by: RHGS Build Bot +Reviewed-by: Pranith Kumar Karampuri +--- + xlators/features/shard/src/shard.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index 3b6b41b..20a0608 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -641,7 +641,6 @@ shard_common_resolve_shards (call_frame_t *frame, xlator_t *this, + gf_msg_debug (this->name, 0, "Shard %d already " + "present. gfid=%s. Saving inode for future.", + shard_idx_iter, uuid_utoa(inode->gfid)); +- shard_idx_iter++; + local->inode_list[i] = inode; + /* Let the ref on the inodes that are already present + * in inode table still be held so that they don't get +@@ -655,6 +654,7 @@ shard_common_resolve_shards (call_frame_t *frame, xlator_t *this, + shard_idx_iter); + } + UNLOCK(&priv->lock); ++ shard_idx_iter++; + + continue; + } else { +-- +1.8.3.1 + diff --git a/0169-features-shard-Leverage-block_num-info-in-inode-ctx-.patch b/0169-features-shard-Leverage-block_num-info-in-inode-ctx-.patch new file mode 100644 index 0000000..ac07e26 --- /dev/null +++ b/0169-features-shard-Leverage-block_num-info-in-inode-ctx-.patch @@ -0,0 +1,80 @@ +From 347219cb249806a47d88f1de118dd3ddf9f5fbdd Mon Sep 17 00:00:00 2001 +From: Krutika Dhananjay +Date: Mon, 26 Feb 2018 15:58:13 +0530 +Subject: [PATCH 169/180] features/shard: Leverage block_num info in inode-ctx + in read callback + +... 
instead of adding this information in fd_ctx in call path and +retrieving it again in the callback. + +> Upstream: https://review.gluster.org/19633 +> BUG: 1468483 +> Change-Id: Ibbddbbe85baadb7e24aacf5ec8a1250d493d7800 + +Change-Id: I384c1c12c1b39c36524761f45d5fbcc8608d96e3 +BUG: 1493085 +Signed-off-by: Krutika Dhananjay +Reviewed-on: https://code.engineering.redhat.com/gerrit/131735 +Reviewed-by: Pranith Kumar Karampuri +Tested-by: Pranith Kumar Karampuri +Tested-by: RHGS Build Bot +--- + xlators/features/shard/src/shard.c | 21 +++------------------ + 1 file changed, 3 insertions(+), 18 deletions(-) + +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index 20a0608..7677a14 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -3101,6 +3101,7 @@ shard_readv_do_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + struct iovec vec = {0,}; + shard_local_t *local = NULL; + fd_t *anon_fd = cookie; ++ shard_inode_ctx_t *ctx = NULL; + + local = frame->local; + +@@ -3119,7 +3120,8 @@ shard_readv_do_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + if (local->op_ret >= 0) + local->op_ret += op_ret; + +- fd_ctx_get (anon_fd, this, &block_num); ++ shard_inode_ctx_get (anon_fd->inode, this, &ctx); ++ block_num = ctx->block_num; + + if (block_num == local->first_block) { + address = local->iobuf->ptr; +@@ -3172,7 +3174,6 @@ int + shard_readv_do (call_frame_t *frame, xlator_t *this) + { + int i = 0; +- int ret = 0; + int call_count = 0; + int last_block = 0; + int cur_block = 0; +@@ -3229,22 +3230,6 @@ shard_readv_do (call_frame_t *frame, xlator_t *this) + } + } + +- ret = fd_ctx_set (anon_fd, this, cur_block); +- if (ret) { +- gf_msg (this->name, GF_LOG_ERROR, 0, +- SHARD_MSG_FD_CTX_SET_FAILED, +- "Failed to set fd ctx for block %d, gfid=%s", +- cur_block, +- uuid_utoa (local->inode_list[i]->gfid)); +- local->op_ret = -1; +- local->op_errno = ENOMEM; +- wind_failed = _gf_true; +- shard_readv_do_cbk (frame, (void *) (long) anon_fd, +- this, -1, ENOMEM, NULL, 0, NULL, +- NULL, NULL); +- goto next; +- } +- + STACK_WIND_COOKIE (frame, shard_readv_do_cbk, anon_fd, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, anon_fd, +-- +1.8.3.1 + diff --git a/0170-features-shard-Fix-shard-inode-refcount-when-it-s-pa.patch b/0170-features-shard-Fix-shard-inode-refcount-when-it-s-pa.patch new file mode 100644 index 0000000..bb00159 --- /dev/null +++ b/0170-features-shard-Fix-shard-inode-refcount-when-it-s-pa.patch @@ -0,0 +1,150 @@ +From c1e17987f2399999707f464b13fcbd316b17af59 Mon Sep 17 00:00:00 2001 +From: Krutika Dhananjay +Date: Thu, 15 Feb 2018 16:12:12 +0530 +Subject: [PATCH 170/180] features/shard: Fix shard inode refcount when it's + part of priv->lru_list. + +For as long as a shard's inode is in priv->lru_list, it should have a non-zero +ref-count. This patch achieves it by taking a ref on the inode when it +is added to lru list. When it's time for the inode to be evicted +from the lru list, a corresponding unref is done. 
+ +> Upstream: https://review.gluster.org/19608 +> BUG: 1468483 +> Change-Id: I289ffb41e7be5df7489c989bc1bbf53377433c86 + +Change-Id: Id540d44643b24a1be2198c48e35e478081368676 +BUG: 1493085 +Signed-off-by: Krutika Dhananjay +Reviewed-on: https://code.engineering.redhat.com/gerrit/131736 +Tested-by: RHGS Build Bot +Reviewed-by: Atin Mukherjee +--- + tests/bugs/shard/shard-inode-refcount-test.t | 27 +++++++++++++++++++++++++++ + tests/volume.rc | 18 ++++++++++++++++++ + xlators/features/shard/src/shard.c | 26 +++++++++++++++++--------- + 3 files changed, 62 insertions(+), 9 deletions(-) + create mode 100644 tests/bugs/shard/shard-inode-refcount-test.t + +diff --git a/tests/bugs/shard/shard-inode-refcount-test.t b/tests/bugs/shard/shard-inode-refcount-test.t +new file mode 100644 +index 0000000..6358097 +--- /dev/null ++++ b/tests/bugs/shard/shard-inode-refcount-test.t +@@ -0,0 +1,27 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++ ++cleanup ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 $H0:$B0/${V0}0 ++TEST $CLI volume set $V0 features.shard on ++TEST $CLI volume set $V0 features.shard-block-size 4MB ++TEST $CLI volume start $V0 ++ ++TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0 ++ ++TEST dd if=/dev/zero of=$M0/one-plus-five-shards bs=1M count=23 ++ ++ACTIVE_INODES_BEFORE=$(get_mount_active_size_value $V0) ++TEST rm -f $M0/one-plus-five-shards ++EXPECT `expr $ACTIVE_INODES_BEFORE - 5` get_mount_active_size_value $V0 ++ ++EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 ++TEST $CLI volume stop $V0 ++TEST $CLI volume delete $V0 ++ ++cleanup +diff --git a/tests/volume.rc b/tests/volume.rc +index 1ca17ab..a15c8e5 100644 +--- a/tests/volume.rc ++++ b/tests/volume.rc +@@ -808,3 +808,21 @@ function get_fd_count { + rm -f $statedump + echo $count + } ++ ++function get_mount_active_size_value { ++ local vol=$1 ++ local statedump=$(generate_mount_statedump $vol) ++ sleep 1 ++ local val=$(grep "active_size" $statedump | cut -f2 -d'=' | tail -1) ++ rm -f $statedump ++ echo $val ++} ++ ++function get_mount_lru_size_value { ++ local vol=$1 ++ local statedump=$(generate_mount_statedump $vol) ++ sleep 1 ++ local val=$(grep "lru_size" $statedump | cut -f2 -d'=' | tail -1) ++ rm -f $statedump ++ echo $val ++} +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index 7677a14..49cf04a 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -502,6 +502,10 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this, + * by empty list), and if there is still space in the priv list, + * add this ctx to the tail of the list. + */ ++ /* For as long as an inode is in lru list, we try to ++ * keep it alive by holding a ref on it. ++ */ ++ inode_ref (linked_inode); + gf_uuid_copy (ctx->base_gfid, base_inode->gfid); + ctx->block_num = block_num; + list_add_tail (&ctx->ilist, &priv->ilist_head); +@@ -527,8 +531,16 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this, + /* The following unref corresponds to the ref held by + * inode_find() above. + */ +- inode_forget (lru_inode, 0); + inode_unref (lru_inode); ++ /* The following unref corresponds to the ref held at ++ * the time the shard was created or looked up ++ */ ++ inode_unref (lru_inode); ++ inode_forget (lru_inode, 0); ++ /* For as long as an inode is in lru list, we try to ++ * keep it alive by holding a ref on it. 
++ */ ++ inode_ref (linked_inode); + gf_uuid_copy (ctx->base_gfid, base_inode->gfid); + ctx->block_num = block_num; + list_add_tail (&ctx->ilist, &priv->ilist_head); +@@ -1658,11 +1670,6 @@ shard_link_block_inode (shard_local_t *local, int block_num, inode_t *inode, + buf); + inode_lookup (linked_inode); + list_index = block_num - local->first_block; +- +- /* Defer unref'ing the inodes until write is complete. These inodes are +- * unref'd in the event of a failure or after successful fop completion +- * in shard_local_wipe(). +- */ + local->inode_list[list_index] = linked_inode; + + LOCK(&priv->lock); +@@ -2520,10 +2527,11 @@ shard_unlink_block_inode (shard_local_t *local, int shard_block_num) + if (!list_empty (&ctx->ilist)) { + list_del_init (&ctx->ilist); + priv->inode_count--; ++ GF_ASSERT (priv->inode_count >= 0); ++ inode_unlink (inode, priv->dot_shard_inode, block_bname); ++ inode_unref (inode); ++ inode_forget (inode, 0); + } +- GF_ASSERT (priv->inode_count >= 0); +- inode_unlink (inode, priv->dot_shard_inode, block_bname); +- inode_forget (inode, 0); + } + UNLOCK(&priv->lock); + +-- +1.8.3.1 + diff --git a/0171-features-shard-Upon-FSYNC-from-upper-layers-wind-fsy.patch b/0171-features-shard-Upon-FSYNC-from-upper-layers-wind-fsy.patch new file mode 100644 index 0000000..ac3a4f5 --- /dev/null +++ b/0171-features-shard-Upon-FSYNC-from-upper-layers-wind-fsy.patch @@ -0,0 +1,887 @@ +From 1c094d1dc70dbc5d08181ef64fb300c95f331aec Mon Sep 17 00:00:00 2001 +From: Krutika Dhananjay +Date: Wed, 6 Dec 2017 16:55:33 +0530 +Subject: [PATCH 171/180] features/shard: Upon FSYNC from upper layers, wind + fsync on all changed shards + +> Upstream: https://review.gluster.org/19566 +> BUG: 1468483 +> Change-Id: Ib74354f57a18569762ad45a51f182822a2537421 + +Change-Id: I93797a60e3449d02413d171babfb8e4292e3f2f6 +BUG: 1493085 +Signed-off-by: Krutika Dhananjay +Reviewed-on: https://code.engineering.redhat.com/gerrit/131737 +Reviewed-by: Atin Mukherjee +Tested-by: RHGS Build Bot +Reviewed-by: Pranith Kumar Karampuri +--- + tests/bugs/shard/bug-1468483.t | 58 +++ + tests/bugs/shard/shard-inode-refcount-test.t | 2 +- + xlators/features/shard/src/shard-messages.h | 9 +- + xlators/features/shard/src/shard.c | 533 +++++++++++++++++++++++++-- + xlators/features/shard/src/shard.h | 6 + + 5 files changed, 569 insertions(+), 39 deletions(-) + create mode 100644 tests/bugs/shard/bug-1468483.t + +diff --git a/tests/bugs/shard/bug-1468483.t b/tests/bugs/shard/bug-1468483.t +new file mode 100644 +index 0000000..e462b8d +--- /dev/null ++++ b/tests/bugs/shard/bug-1468483.t +@@ -0,0 +1,58 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++. $(dirname $0)/../../common-utils.rc ++ ++cleanup ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 $H0:$B0/${V0}0 ++TEST $CLI volume set $V0 performance.write-behind off ++TEST $CLI volume set $V0 features.shard on ++TEST $CLI volume set $V0 features.shard-block-size 16MB ++TEST $CLI volume start $V0 ++TEST $CLI volume profile $V0 start ++ ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0 ++TEST dd if=/dev/zero conv=fsync of=$M0/foo bs=1M count=100 ++ ++#This should ensure /.shard is created on the bricks. 
++TEST stat $B0/${V0}0/.shard ++ ++gfid_foo=$(get_gfid_string $M0/foo) ++ ++TEST stat $B0/${V0}0/.shard/$gfid_foo.1 ++TEST stat $B0/${V0}0/.shard/$gfid_foo.2 ++TEST stat $B0/${V0}0/.shard/$gfid_foo.3 ++TEST stat $B0/${V0}0/.shard/$gfid_foo.4 ++TEST stat $B0/${V0}0/.shard/$gfid_foo.5 ++TEST stat $B0/${V0}0/.shard/$gfid_foo.6 ++ ++# For a file with 7 shards, there should be 7 fsyncs on the brick. Without this ++# fix, I was seeing only 1 fsync (on the base shard alone). ++ ++EXPECT "7" echo `$CLI volume profile $V0 info incremental | grep -w FSYNC | awk '{print $8}'` ++ ++useradd -M test_user 2>/dev/null ++ ++TEST touch $M0/bar ++ ++# Change ownership to non-root on bar. ++TEST chown test_user:test_user $M0/bar ++ ++TEST $CLI volume profile $V0 stop ++TEST $CLI volume profile $V0 start ++ ++# Write 100M of data on bar as non-root. ++TEST run_cmd_as_user test_user "dd if=/dev/zero conv=fsync of=$M0/bar bs=1M count=100" ++ ++EXPECT "7" echo `$CLI volume profile $V0 info incremental | grep -w FSYNC | awk '{print $8}'` ++ ++EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 ++userdel test_user ++TEST $CLI volume stop $V0 ++TEST $CLI volume delete $V0 ++ ++cleanup +diff --git a/tests/bugs/shard/shard-inode-refcount-test.t b/tests/bugs/shard/shard-inode-refcount-test.t +index 6358097..03e0cc9 100644 +--- a/tests/bugs/shard/shard-inode-refcount-test.t ++++ b/tests/bugs/shard/shard-inode-refcount-test.t +@@ -14,7 +14,7 @@ TEST $CLI volume start $V0 + + TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0 + +-TEST dd if=/dev/zero of=$M0/one-plus-five-shards bs=1M count=23 ++TEST dd if=/dev/zero conv=fsync of=$M0/one-plus-five-shards bs=1M count=23 + + ACTIVE_INODES_BEFORE=$(get_mount_active_size_value $V0) + TEST rm -f $M0/one-plus-five-shards +diff --git a/xlators/features/shard/src/shard-messages.h b/xlators/features/shard/src/shard-messages.h +index 588cb68..8e61630 100644 +--- a/xlators/features/shard/src/shard-messages.h ++++ b/xlators/features/shard/src/shard-messages.h +@@ -40,7 +40,7 @@ + */ + + #define GLFS_COMP_BASE_SHARD GLFS_MSGID_COMP_SHARD +-#define GLFS_NUM_MESSAGES 18 ++#define GLFS_NUM_MESSAGES 19 + #define GLFS_MSGID_END (GLFS_COMP_BASE_SHARD + GLFS_NUM_MESSAGES + 1) + + #define glfs_msg_start_x GLFS_COMP_BASE_SHARD, "Invalid: Start of messages" +@@ -180,5 +180,12 @@ + */ + #define SHARD_MSG_INVALID_FOP (GLFS_COMP_BASE_SHARD + 18) + ++/*! 
++ * @messageid 133019 ++ * @diagnosis ++ * @recommendedaction ++*/ ++#define SHARD_MSG_MEMALLOC_FAILED (GLFS_COMP_BASE_SHARD + 19) ++ + #define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages" + #endif /* !_SHARD_MESSAGES_H_ */ +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index 49cf04a..a661345 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -76,6 +76,7 @@ __shard_inode_ctx_get (inode_t *inode, xlator_t *this, shard_inode_ctx_t **ctx) + return ret; + + INIT_LIST_HEAD (&ctx_p->ilist); ++ INIT_LIST_HEAD (&ctx_p->to_fsync_list); + + ret = __inode_ctx_set (inode, this, (uint64_t *)&ctx_p); + if (ret < 0) { +@@ -205,6 +206,65 @@ shard_inode_ctx_set_refreshed_flag (inode_t *inode, xlator_t *this) + return ret; + } + ++int ++__shard_inode_ctx_add_to_fsync_list (inode_t *base_inode, xlator_t *this, ++ inode_t *shard_inode) ++{ ++ int ret = -1; ++ shard_inode_ctx_t *base_ictx = NULL; ++ shard_inode_ctx_t *shard_ictx = NULL; ++ ++ ret = __shard_inode_ctx_get (base_inode, this, &base_ictx); ++ if (ret) ++ return ret; ++ ++ ret = __shard_inode_ctx_get (shard_inode, this, &shard_ictx); ++ if (ret) ++ return ret; ++ ++ if (shard_ictx->fsync_needed) { ++ shard_ictx->fsync_needed++; ++ return 1; ++ } ++ ++ list_add_tail (&shard_ictx->to_fsync_list, &base_ictx->to_fsync_list); ++ shard_ictx->inode = shard_inode; ++ shard_ictx->fsync_needed++; ++ base_ictx->fsync_count++; ++ shard_ictx->base_inode = base_inode; ++ ++ return 0; ++} ++ ++int ++shard_inode_ctx_add_to_fsync_list (inode_t *base_inode, xlator_t *this, ++ inode_t *shard_inode) ++{ ++ int ret = -1; ++ ++ /* This ref acts as a refkeepr on the base inode. We ++ * need to keep this inode alive as it holds the head ++ * of the to_fsync_list. ++ */ ++ inode_ref (base_inode); ++ ++ LOCK (&base_inode->lock); ++ LOCK (&shard_inode->lock); ++ { ++ ret = __shard_inode_ctx_add_to_fsync_list (base_inode, this, ++ shard_inode); ++ } ++ UNLOCK (&shard_inode->lock); ++ UNLOCK (&base_inode->lock); ++ ++ /* Unref the base inode corresponding to the ref above, if the shard is ++ * found to be already part of the fsync list. 
++ */ ++ if (ret != 0) ++ inode_unref (base_inode); ++ return ret; ++} ++ + gf_boolean_t + __shard_inode_ctx_needs_lookup (inode_t *inode, xlator_t *this) + { +@@ -301,6 +361,40 @@ shard_inode_ctx_get_block_size (inode_t *inode, xlator_t *this, + } + + int ++__shard_inode_ctx_get_fsync_count (inode_t *inode, xlator_t *this, ++ int *fsync_count) ++{ ++ int ret = -1; ++ uint64_t ctx_uint = 0; ++ shard_inode_ctx_t *ctx = NULL; ++ ++ ret = __inode_ctx_get (inode, this, &ctx_uint); ++ if (ret < 0) ++ return ret; ++ ++ ctx = (shard_inode_ctx_t *) ctx_uint; ++ ++ *fsync_count = ctx->fsync_needed; ++ ++ return 0; ++} ++ ++int ++shard_inode_ctx_get_fsync_count (inode_t *inode, xlator_t *this, ++ int *fsync_count) ++{ ++ int ret = -1; ++ ++ LOCK (&inode->lock); ++ { ++ ret = __shard_inode_ctx_get_fsync_count (inode, this, ++ fsync_count); ++ } ++ UNLOCK (&inode->lock); ++ ++ return ret; ++} ++int + __shard_inode_ctx_get_all (inode_t *inode, xlator_t *this, + shard_inode_ctx_t *ctx_out) + { +@@ -482,15 +576,19 @@ out: + return ret; + } + +-void ++inode_t * + __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this, + inode_t *base_inode, int block_num) + { +- char block_bname[256] = {0,}; +- inode_t *lru_inode = NULL; +- shard_priv_t *priv = NULL; +- shard_inode_ctx_t *ctx = NULL; +- shard_inode_ctx_t *lru_inode_ctx = NULL; ++ char block_bname[256] = {0,}; ++ inode_t *lru_inode = NULL; ++ shard_priv_t *priv = NULL; ++ shard_inode_ctx_t *ctx = NULL; ++ shard_inode_ctx_t *lru_inode_ctx = NULL; ++ shard_inode_ctx_t *lru_base_inode_ctx = NULL; ++ inode_t *fsync_inode = NULL; ++ inode_t *lru_base_inode = NULL; ++ gf_boolean_t do_fsync = _gf_false; + + priv = this->private; + +@@ -510,6 +608,7 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this, + ctx->block_num = block_num; + list_add_tail (&ctx->ilist, &priv->ilist_head); + priv->inode_count++; ++ ctx->base_inode = base_inode; + } else { + /*If on the other hand there is no available slot for this inode + * in the list, delete the lru inode from the head of the list, +@@ -519,30 +618,56 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this, + shard_inode_ctx_t, + ilist); + GF_ASSERT (lru_inode_ctx->block_num > 0); ++ lru_base_inode = lru_inode_ctx->base_inode; + list_del_init (&lru_inode_ctx->ilist); + lru_inode = inode_find (linked_inode->table, + lru_inode_ctx->stat.ia_gfid); +- shard_make_block_bname (lru_inode_ctx->block_num, +- lru_inode_ctx->base_gfid, +- block_bname, +- sizeof (block_bname)); +- inode_unlink (lru_inode, priv->dot_shard_inode, +- block_bname); +- /* The following unref corresponds to the ref held by +- * inode_find() above. ++ /* If the lru inode was part of the pending-fsync list, ++ * the base inode needs to be unref'd, the lru inode ++ * deleted from fsync list and fsync'd in a new frame, ++ * and then unlinked in memory and forgotten. 
+ */ +- inode_unref (lru_inode); ++ LOCK (&lru_base_inode->lock); ++ LOCK (&lru_inode->lock); ++ { ++ if (!list_empty(&lru_inode_ctx->to_fsync_list)) { ++ list_del_init (&lru_inode_ctx->to_fsync_list); ++ lru_inode_ctx->fsync_needed = 0; ++ do_fsync = _gf_true; ++ __shard_inode_ctx_get (lru_base_inode, this, &lru_base_inode_ctx); ++ lru_base_inode_ctx->fsync_count--; ++ } ++ } ++ UNLOCK (&lru_inode->lock); ++ UNLOCK (&lru_base_inode->lock); ++ ++ if (!do_fsync) { ++ shard_make_block_bname (lru_inode_ctx->block_num, ++ lru_inode_ctx->base_gfid, ++ block_bname, ++ sizeof (block_bname)); + /* The following unref corresponds to the ref held at +- * the time the shard was created or looked up ++ * the time the shard was added to the lru list. ++ */ ++ inode_unref (lru_inode); ++ inode_unlink (lru_inode, priv->dot_shard_inode, ++ block_bname); ++ inode_forget (lru_inode, 0); ++ } else { ++ fsync_inode = lru_inode; ++ inode_unref (lru_base_inode); ++ } ++ /* The following unref corresponds to the ref ++ * held by inode_find() above. + */ + inode_unref (lru_inode); +- inode_forget (lru_inode, 0); + /* For as long as an inode is in lru list, we try to + * keep it alive by holding a ref on it. + */ + inode_ref (linked_inode); + gf_uuid_copy (ctx->base_gfid, base_inode->gfid); + ctx->block_num = block_num; ++ ctx->base_inode = base_inode; + list_add_tail (&ctx->ilist, &priv->ilist_head); + } + } else { +@@ -551,6 +676,7 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this, + */ + list_move_tail (&ctx->ilist, &priv->ilist_head); + } ++ return fsync_inode; + } + + int +@@ -617,6 +743,85 @@ shard_common_inode_write_success_unwind (glusterfs_fop_t fop, + } + + int ++shard_evicted_inode_fsync_cbk (call_frame_t *frame, void *cookie, ++ xlator_t *this, int32_t op_ret, int32_t op_errno, ++ struct iatt *prebuf, struct iatt *postbuf, ++ dict_t *xdata) ++{ ++ char block_bname[256] = {0,}; ++ fd_t *anon_fd = cookie; ++ inode_t *shard_inode = NULL; ++ shard_inode_ctx_t *ctx = NULL; ++ shard_priv_t *priv = NULL; ++ ++ priv = this->private; ++ shard_inode = anon_fd->inode; ++ ++ if (op_ret < 0) { ++ gf_msg (this->name, GF_LOG_WARNING, op_errno, ++ SHARD_MSG_MEMALLOC_FAILED, "fsync failed on shard"); ++ goto out; ++ } ++ ++ LOCK (&priv->lock); ++ LOCK(&shard_inode->lock); ++ { ++ __shard_inode_ctx_get (shard_inode, this, &ctx); ++ if ((list_empty(&ctx->to_fsync_list)) && ++ (list_empty(&ctx->ilist))) { ++ shard_make_block_bname (ctx->block_num, ++ shard_inode->gfid, block_bname, ++ sizeof (block_bname)); ++ inode_unlink (shard_inode, priv->dot_shard_inode, ++ block_bname); ++ /* The following unref corresponds to the ref held by ++ * inode_link() at the time the shard was created or ++ * looked up ++ */ ++ inode_unref (shard_inode); ++ inode_forget (shard_inode, 0); ++ } ++ } ++ UNLOCK(&shard_inode->lock); ++ UNLOCK(&priv->lock); ++ ++out: ++ if (anon_fd) ++ fd_unref (anon_fd); ++ STACK_DESTROY (frame->root); ++ return 0; ++} ++ ++int ++shard_initiate_evicted_inode_fsync (xlator_t *this, inode_t *inode) ++{ ++ fd_t *anon_fd = NULL; ++ call_frame_t *fsync_frame = NULL; ++ ++ fsync_frame = create_frame (this, this->ctx->pool); ++ if (!fsync_frame) { ++ gf_msg (this->name, GF_LOG_WARNING, ENOMEM, ++ SHARD_MSG_MEMALLOC_FAILED, "Failed to create new frame " ++ "to fsync shard"); ++ return -1; ++ } ++ ++ anon_fd = fd_anonymous (inode); ++ if (!anon_fd) { ++ gf_msg (this->name, GF_LOG_WARNING, ENOMEM, ++ SHARD_MSG_MEMALLOC_FAILED, "Failed to create anon fd to" ++ " fsync shard"); ++ STACK_DESTROY 
(fsync_frame->root); ++ return -1; ++ } ++ ++ STACK_WIND_COOKIE (fsync_frame, shard_evicted_inode_fsync_cbk, anon_fd, ++ FIRST_CHILD(this), FIRST_CHILD(this)->fops->fsync, ++ anon_fd, 1, NULL); ++ return 0; ++} ++ ++int + shard_common_resolve_shards (call_frame_t *frame, xlator_t *this, + shard_post_resolve_fop_handler_t post_res_handler) + { +@@ -625,6 +830,7 @@ shard_common_resolve_shards (call_frame_t *frame, xlator_t *this, + char path[PATH_MAX] = {0,}; + inode_t *inode = NULL; + inode_t *res_inode = NULL; ++ inode_t *fsync_inode = NULL; + shard_priv_t *priv = NULL; + shard_local_t *local = NULL; + +@@ -661,20 +867,22 @@ shard_common_resolve_shards (call_frame_t *frame, xlator_t *this, + */ + LOCK(&priv->lock); + { +- __shard_update_shards_inode_list (inode, this, ++ fsync_inode = __shard_update_shards_inode_list (inode, ++ this, + res_inode, + shard_idx_iter); + } + UNLOCK(&priv->lock); + shard_idx_iter++; +- ++ if (fsync_inode) ++ shard_initiate_evicted_inode_fsync (this, ++ fsync_inode); + continue; + } else { + local->call_count++; + shard_idx_iter++; + } + } +- + out: + post_res_handler (frame, this); + return 0; +@@ -1657,6 +1865,7 @@ shard_link_block_inode (shard_local_t *local, int block_num, inode_t *inode, + char block_bname[256] = {0,}; + inode_t *linked_inode = NULL; + xlator_t *this = NULL; ++ inode_t *fsync_inode = NULL; + shard_priv_t *priv = NULL; + + this = THIS; +@@ -1674,10 +1883,14 @@ shard_link_block_inode (shard_local_t *local, int block_num, inode_t *inode, + + LOCK(&priv->lock); + { +- __shard_update_shards_inode_list (linked_inode, this, +- local->loc.inode, block_num); ++ fsync_inode = __shard_update_shards_inode_list (linked_inode, ++ this, ++ local->loc.inode, ++ block_num); + } + UNLOCK(&priv->lock); ++ if (fsync_inode) ++ shard_initiate_evicted_inode_fsync (this, fsync_inode); + } + + int +@@ -2120,6 +2333,7 @@ shard_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + local->xattr_req = (xdata) ? 
dict_ref (xdata) : dict_new (); + if (!local->xattr_req) + goto err; ++ local->resolver_base_inode = loc->inode; + + shard_lookup_base_file (frame, this, &local->loc, + shard_post_lookup_truncate_handler); +@@ -2172,6 +2386,7 @@ shard_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + + local->loc.inode = inode_ref (fd->inode); + gf_uuid_copy (local->loc.gfid, fd->inode->gfid); ++ local->resolver_base_inode = fd->inode; + + shard_lookup_base_file (frame, this, &local->loc, + shard_post_lookup_truncate_handler); +@@ -2509,32 +2724,48 @@ shard_unlink_block_inode (shard_local_t *local, int shard_block_num) + { + char block_bname[256] = {0,}; + inode_t *inode = NULL; ++ inode_t *base_inode = NULL; + xlator_t *this = NULL; + shard_priv_t *priv = NULL; + shard_inode_ctx_t *ctx = NULL; ++ shard_inode_ctx_t *base_ictx = NULL; ++ gf_boolean_t unlink_unref_forget = _gf_false; + + this = THIS; + priv = this->private; + + inode = local->inode_list[shard_block_num - local->first_block]; ++ base_inode = local->resolver_base_inode; + + shard_make_block_bname (shard_block_num, (local->loc.inode)->gfid, + block_bname, sizeof (block_bname)); + + LOCK(&priv->lock); ++ LOCK(&base_inode->lock); ++ LOCK(&inode->lock); + { +- shard_inode_ctx_get (inode, this, &ctx); ++ __shard_inode_ctx_get (inode, this, &ctx); + if (!list_empty (&ctx->ilist)) { + list_del_init (&ctx->ilist); + priv->inode_count--; + GF_ASSERT (priv->inode_count >= 0); +- inode_unlink (inode, priv->dot_shard_inode, block_bname); +- inode_unref (inode); +- inode_forget (inode, 0); ++ unlink_unref_forget = _gf_true; ++ } ++ if (ctx->fsync_needed) { ++ inode_unref (base_inode); ++ list_del_init (&ctx->to_fsync_list); ++ __shard_inode_ctx_get (base_inode, this, &base_ictx); ++ base_ictx->fsync_count--; + } + } ++ UNLOCK(&inode->lock); ++ UNLOCK(&base_inode->lock); ++ if (unlink_unref_forget) { ++ inode_unlink (inode, priv->dot_shard_inode, block_bname); ++ inode_unref (inode); ++ inode_forget (inode, 0); ++ } + UNLOCK(&priv->lock); +- + } + + int +@@ -2752,6 +2983,7 @@ shard_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + local->xflag = xflag; + local->xattr_req = (xdata) ? dict_ref (xdata) : dict_new (); + local->block_size = block_size; ++ local->resolver_base_inode = loc->inode; + local->fop = GF_FOP_UNLINK; + if (!this->itable) + this->itable = (local->loc.inode)->table; +@@ -2988,6 +3220,7 @@ shard_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + frame->local = local; + loc_copy (&local->loc, oldloc); + loc_copy (&local->loc2, newloc); ++ local->resolver_base_inode = newloc->inode; + local->fop = GF_FOP_RENAME; + local->xattr_req = (xdata) ? 
dict_ref (xdata) : dict_new(); + if (!local->xattr_req) +@@ -3754,6 +3987,10 @@ shard_common_inode_write_do_cbk (call_frame_t *frame, void *cookie, + local->delta_size += (post->ia_size - pre->ia_size); + shard_inode_ctx_set (local->fd->inode, this, post, 0, + SHARD_MASK_TIMES); ++ if (local->fd->inode != anon_fd->inode) ++ shard_inode_ctx_add_to_fsync_list (local->fd->inode, ++ this, ++ anon_fd->inode); + } + } + UNLOCK (&frame->lock); +@@ -4204,18 +4441,198 @@ shard_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) + } + + int +-shard_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, struct iatt *prebuf, +- struct iatt *postbuf, dict_t *xdata) ++__shard_get_timestamps_from_inode_ctx (shard_local_t *local, inode_t *inode, ++ xlator_t *this) + { +- if (op_ret < 0) ++ int ret = -1; ++ uint64_t ctx_uint = 0; ++ shard_inode_ctx_t *ctx = NULL; ++ ++ ret = __inode_ctx_get (inode, this, &ctx_uint); ++ if (ret < 0) ++ return ret; ++ ++ ctx = (shard_inode_ctx_t *) ctx_uint; ++ ++ local->postbuf.ia_ctime = ctx->stat.ia_ctime; ++ local->postbuf.ia_ctime_nsec = ctx->stat.ia_ctime_nsec; ++ local->postbuf.ia_atime = ctx->stat.ia_atime; ++ local->postbuf.ia_atime_nsec = ctx->stat.ia_atime_nsec; ++ local->postbuf.ia_mtime = ctx->stat.ia_mtime; ++ local->postbuf.ia_mtime_nsec = ctx->stat.ia_mtime_nsec; ++ ++ return 0; ++} ++ ++int ++shard_get_timestamps_from_inode_ctx (shard_local_t *local, inode_t *inode, ++ xlator_t *this) ++{ ++ int ret = 0; ++ ++ LOCK (&inode->lock); ++ { ++ ret = __shard_get_timestamps_from_inode_ctx (local, inode, ++ this); ++ } ++ UNLOCK (&inode->lock); ++ ++ return ret; ++} ++ ++int ++shard_fsync_shards_cbk (call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, struct iatt *prebuf, ++ struct iatt *postbuf, dict_t *xdata) ++{ ++ int call_count = 0; ++ uint64_t fsync_count = 0; ++ fd_t *anon_fd = cookie; ++ shard_local_t *local = NULL; ++ shard_inode_ctx_t *ctx = NULL; ++ shard_inode_ctx_t *base_ictx = NULL; ++ inode_t *base_inode = NULL; ++ ++ local = frame->local; ++ base_inode = local->fd->inode; ++ ++ if (local->op_ret < 0) + goto out; + +- /* To-Do: Wind fsync on all shards of the file */ +- postbuf->ia_ctime = 0; ++ LOCK (&frame->lock); ++ { ++ if (op_ret < 0) { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ goto out; ++ } ++ shard_inode_ctx_set (local->fd->inode, this, postbuf, 0, ++ SHARD_MASK_TIMES); ++ } ++ UNLOCK (&frame->lock); ++ fd_ctx_get (anon_fd, this, &fsync_count); + out: +- SHARD_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf, +- xdata); ++ if (base_inode != anon_fd->inode) { ++ LOCK (&base_inode->lock); ++ LOCK (&anon_fd->inode->lock); ++ { ++ __shard_inode_ctx_get (anon_fd->inode, this, &ctx); ++ __shard_inode_ctx_get (base_inode, this, &base_ictx); ++ if (op_ret == 0) ++ ctx->fsync_needed -= fsync_count; ++ GF_ASSERT (ctx->fsync_needed >= 0); ++ list_del_init (&ctx->to_fsync_list); ++ if (ctx->fsync_needed != 0) { ++ list_add_tail (&ctx->to_fsync_list, ++ &base_ictx->to_fsync_list); ++ base_ictx->fsync_count++; ++ } ++ } ++ UNLOCK (&anon_fd->inode->lock); ++ UNLOCK (&base_inode->lock); ++ } ++ if (anon_fd) ++ fd_unref (anon_fd); ++ ++ call_count = shard_call_count_return (frame); ++ if (call_count != 0) ++ return 0; ++ ++ if (local->op_ret < 0) { ++ SHARD_STACK_UNWIND (fsync, frame, local->op_ret, ++ local->op_errno, NULL, NULL, NULL); ++ } else { ++ shard_get_timestamps_from_inode_ctx (local, base_inode, this); ++ 
SHARD_STACK_UNWIND (fsync, frame, local->op_ret, ++ local->op_errno, &local->prebuf, ++ &local->postbuf, local->xattr_rsp); ++ } ++ return 0; ++} ++ ++int ++shard_post_lookup_fsync_handler (call_frame_t *frame, xlator_t *this) ++{ ++ int ret = 0; ++ int call_count = 0; ++ int fsync_count = 0; ++ fd_t *anon_fd = NULL; ++ inode_t *base_inode = NULL; ++ shard_local_t *local = NULL; ++ shard_inode_ctx_t *ctx = NULL; ++ shard_inode_ctx_t *iter = NULL; ++ struct list_head copy = {0,}; ++ shard_inode_ctx_t *tmp = NULL; ++ ++ local = frame->local; ++ base_inode = local->fd->inode; ++ local->postbuf = local->prebuf; ++ INIT_LIST_HEAD (&copy); ++ ++ if (local->op_ret < 0) { ++ SHARD_STACK_UNWIND (fsync, frame, local->op_ret, ++ local->op_errno, NULL, NULL, NULL); ++ return 0; ++ } ++ ++ LOCK (&base_inode->lock); ++ { ++ __shard_inode_ctx_get (base_inode, this, &ctx); ++ list_splice_init (&ctx->to_fsync_list, &copy); ++ call_count = ctx->fsync_count; ++ ctx->fsync_count = 0; ++ } ++ UNLOCK (&base_inode->lock); ++ ++ local->call_count = ++call_count; ++ ++ /* Send fsync() on the base shard first */ ++ anon_fd = fd_ref (local->fd); ++ STACK_WIND_COOKIE (frame, shard_fsync_shards_cbk, anon_fd, ++ FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fsync, anon_fd, ++ local->datasync, local->xattr_req); ++ call_count--; ++ anon_fd = NULL; ++ ++ list_for_each_entry_safe (iter, tmp, &copy, to_fsync_list) { ++ fsync_count = 0; ++ shard_inode_ctx_get_fsync_count (iter->inode, this, ++ &fsync_count); ++ GF_ASSERT (fsync_count > 0); ++ anon_fd = fd_anonymous (iter->inode); ++ if (!anon_fd) { ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ gf_msg (this->name, GF_LOG_WARNING, ENOMEM, ++ SHARD_MSG_MEMALLOC_FAILED, "Failed to create " ++ "anon fd to fsync shard"); ++ shard_fsync_shards_cbk (frame, (void *) (long) anon_fd, ++ this, -1, ENOMEM, NULL, NULL, ++ NULL); ++ continue; ++ } ++ ++ ret = fd_ctx_set (anon_fd, this, fsync_count); ++ if (ret) { ++ gf_msg (this->name, GF_LOG_ERROR, 0, ++ SHARD_MSG_FD_CTX_SET_FAILED, "Failed to set fd " ++ "ctx for shard inode gfid=%s", ++ uuid_utoa (iter->inode->gfid)); ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ shard_fsync_shards_cbk (frame, (void *) (long) anon_fd, ++ this, -1, ENOMEM, NULL, NULL, ++ NULL); ++ continue; ++ } ++ STACK_WIND_COOKIE (frame, shard_fsync_shards_cbk, anon_fd, ++ FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fsync, anon_fd, ++ local->datasync, local->xattr_req); ++ call_count--; ++ } ++ + return 0; + } + +@@ -4223,8 +4640,50 @@ int + shard_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + dict_t *xdata) + { +- STACK_WIND (frame, shard_fsync_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->fsync, fd, datasync, xdata); ++ int ret = 0; ++ uint64_t block_size = 0; ++ shard_local_t *local = NULL; ++ ++ ret = shard_inode_ctx_get_block_size (fd->inode, this, &block_size); ++ if (ret) { ++ gf_msg (this->name, GF_LOG_ERROR, 0, ++ SHARD_MSG_INODE_CTX_GET_FAILED, "Failed to get block " ++ "size for %s from its inode ctx", ++ uuid_utoa (fd->inode->gfid)); ++ goto err; ++ } ++ ++ if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { ++ STACK_WIND (frame, default_fsync_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fsync, fd, datasync, ++ xdata); ++ return 0; ++ } ++ ++ if (!this->itable) ++ this->itable = fd->inode->table; ++ ++ local = mem_get0 (this->local_pool); ++ if (!local) ++ goto err; ++ ++ frame->local = local; ++ ++ local->fd = fd_ref (fd); ++ local->fop = GF_FOP_FSYNC; ++ local->datasync = datasync; ++ 
local->xattr_req = (xdata) ? dict_ref (xdata) : dict_new (); ++ if (!local->xattr_req) ++ goto err; ++ ++ local->loc.inode = inode_ref (fd->inode); ++ gf_uuid_copy (local->loc.gfid, fd->inode->gfid); ++ ++ shard_lookup_base_file (frame, this, &local->loc, ++ shard_post_lookup_fsync_handler); ++ return 0; ++err: ++ SHARD_STACK_UNWIND (fsync, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; + } + +diff --git a/xlators/features/shard/src/shard.h b/xlators/features/shard/src/shard.h +index 7319598..75d39a1 100644 +--- a/xlators/features/shard/src/shard.h ++++ b/xlators/features/shard/src/shard.h +@@ -215,6 +215,7 @@ typedef struct shard_local { + uint32_t gid; + uint64_t block_size; + uint64_t dst_block_size; ++ int32_t datasync; + off_t offset; + size_t total_size; + size_t written_size; +@@ -270,6 +271,11 @@ typedef struct shard_inode_ctx { + uuid_t base_gfid; + int block_num; + gf_boolean_t refreshed; ++ struct list_head to_fsync_list; ++ int fsync_needed; ++ inode_t *inode; ++ int fsync_count; ++ inode_t *base_inode; + } shard_inode_ctx_t; + + #endif /* __SHARD_H__ */ +-- +1.8.3.1 + diff --git a/0172-glusterd-add-profile_enabled-flag-in-get-state.patch b/0172-glusterd-add-profile_enabled-flag-in-get-state.patch new file mode 100644 index 0000000..d6abf70 --- /dev/null +++ b/0172-glusterd-add-profile_enabled-flag-in-get-state.patch @@ -0,0 +1,110 @@ +From ab7ff0b569aea5284108fc07dce78f30ac342b1d Mon Sep 17 00:00:00 2001 +From: Atin Mukherjee +Date: Tue, 23 Jan 2018 08:23:11 +0530 +Subject: [PATCH 172/180] glusterd: add profile_enabled flag in get-state + +>upstream mainline patch : https://review.gluster.org/#/c/19286 + +Change-Id: I09f348ed7ae6cd481f8c4d8b4f65f2f2f6aad84e +BUG: 1537357 +Signed-off-by: Atin Mukherjee +Reviewed-on: https://code.engineering.redhat.com/gerrit/131835 +Tested-by: RHGS Build Bot +--- + xlators/mgmt/glusterd/src/glusterd-handler.c | 2 ++ + xlators/mgmt/glusterd/src/glusterd-op-sm.c | 22 ---------------------- + xlators/mgmt/glusterd/src/glusterd-utils.c | 22 ++++++++++++++++++++++ + xlators/mgmt/glusterd/src/glusterd-utils.h | 3 +++ + 4 files changed, 27 insertions(+), 22 deletions(-) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c +index a3e1fdc..8fc665d 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-handler.c ++++ b/xlators/mgmt/glusterd/src/glusterd-handler.c +@@ -5650,6 +5650,8 @@ glusterd_get_state (rpcsvc_request_t *req, dict_t *dict) + fprintf (fp, "Volume%d.transport_type: %s\n", count, + transport_type_str); + fprintf (fp, "Volume%d.status: %s\n", count, vol_status_str); ++ fprintf (fp, "Volume%d.profile_enabled: %d\n", count, ++ glusterd_is_profile_on (volinfo)); + fprintf (fp, "Volume%d.brickcount: %d\n", count, + volinfo->brick_count); + +diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c +index 81cde21..0cf76dd 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c ++++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c +@@ -1978,28 +1978,6 @@ glusterd_op_stage_status_volume (dict_t *dict, char **op_errstr) + return ret; + } + +- +-static gf_boolean_t +-glusterd_is_profile_on (glusterd_volinfo_t *volinfo) +-{ +- int ret = -1; +- gf_boolean_t is_latency_on = _gf_false; +- gf_boolean_t is_fd_stats_on = _gf_false; +- +- GF_ASSERT (volinfo); +- +- ret = glusterd_volinfo_get_boolean (volinfo, VKEY_DIAG_CNT_FOP_HITS); +- if (ret != -1) +- is_fd_stats_on = ret; +- ret = glusterd_volinfo_get_boolean (volinfo, VKEY_DIAG_LAT_MEASUREMENT); +- if (ret != 
-1) +- is_latency_on = ret; +- if ((_gf_true == is_latency_on) && +- (_gf_true == is_fd_stats_on)) +- return _gf_true; +- return _gf_false; +-} +- + static int + glusterd_op_stage_stats_volume (dict_t *dict, char **op_errstr) + { +diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c +index 9ccd718..9a67cfd 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-utils.c ++++ b/xlators/mgmt/glusterd/src/glusterd-utils.c +@@ -13988,3 +13988,25 @@ gd_rb_op_to_str (char *op) + return "replace-brick commit force"; + return NULL; + } ++ ++gf_boolean_t ++glusterd_is_profile_on (glusterd_volinfo_t *volinfo) ++{ ++ int ret = -1; ++ gf_boolean_t is_latency_on = _gf_false; ++ gf_boolean_t is_fd_stats_on = _gf_false; ++ ++ GF_ASSERT (volinfo); ++ ++ ret = glusterd_volinfo_get_boolean (volinfo, VKEY_DIAG_CNT_FOP_HITS); ++ if (ret != -1) ++ is_fd_stats_on = ret; ++ ret = glusterd_volinfo_get_boolean (volinfo, VKEY_DIAG_LAT_MEASUREMENT); ++ if (ret != -1) ++ is_latency_on = ret; ++ if ((_gf_true == is_latency_on) && ++ (_gf_true == is_fd_stats_on)) ++ return _gf_true; ++ return _gf_false; ++} ++ +diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.h b/xlators/mgmt/glusterd/src/glusterd-utils.h +index 3b82b1e..6c525e5 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-utils.h ++++ b/xlators/mgmt/glusterd/src/glusterd-utils.h +@@ -892,4 +892,7 @@ glusterd_get_index_basepath (glusterd_brickinfo_t *brickinfo, char *buffer, + + } + ++gf_boolean_t ++glusterd_is_profile_on (glusterd_volinfo_t *volinfo); ++ + #endif +-- +1.8.3.1 + diff --git a/0173-packaging-adding-missed-part-from-5eed664-while-back.patch b/0173-packaging-adding-missed-part-from-5eed664-while-back.patch new file mode 100644 index 0000000..69b7938 --- /dev/null +++ b/0173-packaging-adding-missed-part-from-5eed664-while-back.patch @@ -0,0 +1,53 @@ +From 8b9f3413dfc9f319f5d7476132bbbfa390819f26 Mon Sep 17 00:00:00 2001 +From: Jiffin Tony Thottan +Date: Wed, 28 Feb 2018 09:17:09 +0530 +Subject: [PATCH 173/180] packaging : adding missed part from 5eed664 while + backporting to downstream + +Label : DOWNSTREAM ONLY + +Change-Id: I0ece0adb3b2c85a5ced9c1850ee783d8952a8dec +BUG: 1472445 +Signed-off-by: Jiffin Tony Thottan +Reviewed-on: https://code.engineering.redhat.com/gerrit/131344 +Tested-by: RHGS Build Bot +Reviewed-by: Milind Changire +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + glusterfs.spec.in | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +diff --git a/glusterfs.spec.in b/glusterfs.spec.in +index 8379f64..ca36e65 100644 +--- a/glusterfs.spec.in ++++ b/glusterfs.spec.in +@@ -826,8 +826,13 @@ install -D -p -m 0644 extras/glusterfs-logrotate \ + %{buildroot}%{_sysconfdir}/logrotate.d/glusterfs + + # ganesha ghosts ++%if ( ! 
0%{?_build_server} ) + mkdir -p %{buildroot}%{_sysconfdir}/ganesha + touch %{buildroot}%{_sysconfdir}/ganesha/ganesha-ha.conf ++mkdir -p %{buildroot}%{_localstatedir}/run/gluster/shared_storage/nfs-ganesha/exports ++touch %{buildroot}%{_localstatedir}/run/gluster/shared_storage/nfs-ganesha/ganesha.conf ++touch %{buildroot}%{_localstatedir}/run/gluster/shared_storage/nfs-ganesha/ganesha-ha.conf ++%endif + + %if ( 0%{!?_without_georeplication:1} ) + mkdir -p %{buildroot}%{_sharedstatedir}/glusterd/geo-replication +@@ -1342,7 +1347,11 @@ exit 0 + %{_prefix}/lib/ocf/resource.d/heartbeat/* + %{_sharedstatedir}/glusterd/hooks/1/start/post/S31ganesha-start.sh + %{_sysconfdir}/ganesha/ganesha-ha.conf.sample +-%ghost %config(noreplace) %{_sysconfdir}/ganesha/ganesha-ha.conf ++%ghost %attr(0644,-,-) %config(noreplace) %{_sysconfdir}/ganesha/ganesha-ha.conf ++%ghost %dir %attr(0755,-,-) %{_localstatedir}/run/gluster/shared_storage/nfs-ganesha ++%ghost %dir %attr(0755,-,-) %{_localstatedir}/run/gluster/shared_storage/nfs-ganesha/exports ++%ghost %attr(0644,-,-) %config(noreplace) %{_localstatedir}/run/gluster/shared_storage/nfs-ganesha/ganesha.conf ++%ghost %attr(0644,-,-) %config(noreplace) %{_localstatedir}/run/gluster/shared_storage/nfs-ganesha/ganesha-ha.conf + %endif + + %if ( 0%{?_build_server} ) +-- +1.8.3.1 + diff --git a/0174-hooks-add-a-script-to-stat-the-subdirs-in-add-brick.patch b/0174-hooks-add-a-script-to-stat-the-subdirs-in-add-brick.patch new file mode 100644 index 0000000..b2fb6c4 --- /dev/null +++ b/0174-hooks-add-a-script-to-stat-the-subdirs-in-add-brick.patch @@ -0,0 +1,192 @@ +From d88cae2d02f0c106b4330128715921c459dd77fc Mon Sep 17 00:00:00 2001 +From: Amar Tumballi +Date: Fri, 3 Nov 2017 11:49:42 +0530 +Subject: [PATCH 174/180] hooks: add a script to stat the subdirs in add-brick + +The subdirectories are expected to be present for a subdir +mount to be successful. If not, the client_handshake() +itself fails to succeed. When a volume is about to get +mounted first time, this is easier to handle, as if the +directory is not present in one brick, then its mostly +not present in any other brick. In case of add-brick, +the directory is not present in new brick, and there is +no chance of healing it from the subdirectory mount, as +in those clients, the subdir itself will be 'root' ('/') +of the filesystem. Hence we need a volume mount to heal +the directory before connections can succeed. + +This patch does take care of that by healing the directories +which are expected to be mounted as subdirectories from the +volume level mount point. 
+ +>Change-Id: I2c2ac7b7567fe209aaa720006d09b68584d0dd14 +>BUG: 1549915 +>Signed-off-by: Amar Tumballi +upstream patch: https://review.gluster.org/#/c/18645/ + +BUG: 1508999 +Change-Id: I2c2ac7b7567fe209aaa720006d09b68584d0dd14 +Signed-off-by: Sunil Kumar Acharya +Reviewed-on: https://code.engineering.redhat.com/gerrit/131896 +Tested-by: RHGS Build Bot +Reviewed-by: Atin Mukherjee +--- + extras/hook-scripts/add-brick/post/Makefile.am | 4 +- + .../add-brick/post/S13create-subdir-mounts.sh | 86 ++++++++++++++++++++++ + glusterfs.spec.in | 3 +- + tests/features/subdir-mount.t | 16 +--- + 4 files changed, 94 insertions(+), 15 deletions(-) + create mode 100755 extras/hook-scripts/add-brick/post/S13create-subdir-mounts.sh + +diff --git a/extras/hook-scripts/add-brick/post/Makefile.am b/extras/hook-scripts/add-brick/post/Makefile.am +index 5ca5a66..8eb82a1 100644 +--- a/extras/hook-scripts/add-brick/post/Makefile.am ++++ b/extras/hook-scripts/add-brick/post/Makefile.am +@@ -1,4 +1,4 @@ +-EXTRA_DIST = disabled-quota-root-xattr-heal.sh ++EXTRA_DIST = disabled-quota-root-xattr-heal.sh S13create-subdir-mounts.sh + + hookdir = $(GLUSTERD_WORKDIR)/hooks/1/add-brick/post/ +-hook_SCRIPTS = disabled-quota-root-xattr-heal.sh ++hook_SCRIPTS = disabled-quota-root-xattr-heal.sh S13create-subdir-mounts.sh +diff --git a/extras/hook-scripts/add-brick/post/S13create-subdir-mounts.sh b/extras/hook-scripts/add-brick/post/S13create-subdir-mounts.sh +new file mode 100755 +index 0000000..95e624e +--- /dev/null ++++ b/extras/hook-scripts/add-brick/post/S13create-subdir-mounts.sh +@@ -0,0 +1,86 @@ ++#!/bin/bash ++ ++##--------------------------------------------------------------------------- ++## This script runs the self-heal of the directories which are expected to ++## be present as they are mounted as subdirectory mounts. ++##--------------------------------------------------------------------------- ++ ++MOUNT_DIR=`mktemp -d -t ${0##*/}.XXXXXX`; ++OPTSPEC="volname:,go-workdir" ++PROGNAME="add-brick-create-subdir" ++VOL_NAME=test ++GLUSTERD_WORKDIR="/var/lib/glusterd" ++ ++cleanup_mountpoint () ++{ ++ umount -f $MOUNT_DIR; ++ if [ 0 -ne $? ] ++ then ++ return $? ++ fi ++ ++ rmdir $MOUNT_DIR; ++ if [ 0 -ne $? ] ++ then ++ return $? ++ fi ++} ++ ++##------------------------------------------ ++## Parse the arguments ++##------------------------------------------ ++ARGS=$(getopt -l $OPTSPEC -name $PROGNAME $@) ++eval set -- "$ARGS" ++ ++while true; ++do ++ case $1 in ++ --volname) ++ shift ++ VOL_NAME=$1 ++ ;; ++ --gd-workdir) ++ shift ++ GLUSTERD_WORKDIR=$1 ++ ;; ++ --version) ++ shift ++ ;; ++ --volume-op) ++ shift ++ ;; ++ *) ++ shift ++ break ++ ;; ++ esac ++ shift ++done ++ ++## See if we have any subdirs to be healed before going further ++subdirs=$(grep 'auth.allow' ${GLUSTERD_WORKDIR}/vols/${VOL_NAME}/info | cut -f2 -d'=' | tr ',' '\n' | cut -f1 -d'('); ++ ++if [ -z ${subdirs} ]; then ++ rmdir $MOUNT_DIR; ++ exit 0; ++fi ++ ++##---------------------------------------- ++## Mount the volume in temp directory. ++## ----------------------------------- ++glusterfs -s localhost --volfile-id=$VOL_NAME --client-pid=-50 $MOUNT_DIR; ++if [ 0 -ne $? ] ++then ++ exit $?; ++fi ++ ++## ----------------------------------- ++# Do the 'stat' on all the directory for now. Ideal fix is to look at subdir ++# list from 'auth.allow' option and only stat them. 
++for subdir in ${subdirs} ++do ++ stat ${MOUNT_DIR}/${subdir} > /dev/null; ++done ++ ++## Clean up and exit ++cleanup_mountpoint; +diff --git a/glusterfs.spec.in b/glusterfs.spec.in +index ca36e65..34a3aba 100644 +--- a/glusterfs.spec.in ++++ b/glusterfs.spec.in +@@ -1519,8 +1519,9 @@ exit 0 + %dir %attr(0755,-,-) %{_sharedstatedir}/glusterd/hooks/1/add-brick + %dir %attr(0755,-,-) %{_sharedstatedir}/glusterd/hooks/1/add-brick/post + %attr(0755,-,-) %{_sharedstatedir}/glusterd/hooks/1/add-brick/post/disabled-quota-root-xattr-heal.sh +- %attr(0755,-,-) %{_sharedstatedir}/glusterd/hooks/1/add-brick/pre/S28Quota-enable-root-xattr-heal.sh ++ %attr(0755,-,-) %{_sharedstatedir}/glusterd/hooks/1/add-brick/post/S13create-subdir-mounts.sh + %dir %attr(0755,-,-) %{_sharedstatedir}/glusterd/hooks/1/add-brick/pre ++ %attr(0755,-,-) %{_sharedstatedir}/glusterd/hooks/1/add-brick/pre/S28Quota-enable-root-xattr-heal.sh + %dir %attr(0755,-,-) %{_sharedstatedir}/glusterd/hooks/1/create + %dir %attr(0755,-,-) %{_sharedstatedir}/glusterd/hooks/1/create/post + %attr(0755,-,-) %{_sharedstatedir}/glusterd/hooks/1/create/post/S10selinux-label-brick.sh +diff --git a/tests/features/subdir-mount.t b/tests/features/subdir-mount.t +index 1742f86..8401946 100644 +--- a/tests/features/subdir-mount.t ++++ b/tests/features/subdir-mount.t +@@ -98,22 +98,14 @@ TEST test "$mount_inode" == "1" + + TEST umount $M2 + +-# because the subdir is not yet 'healed', below should fail. ++# Now the exported subdirs should be automatically healed due to ++# hook scripts. Check if the mount is successful. + TEST $GFS --subdir-mount /subdir2 -s $H0 --volfile-id $V0 $M2 + mount_inode=$(stat --format "%i" "$M2") +-TEST test "$mount_inode" != "1" +- +-# Allow the heal to complete +-TEST stat $M0/subdir1/subdir1.1/subdir1.2/subdir1.2_file; +-TEST stat $M0/subdir2/ +- +-# Now the mount should succeed +-TEST $GFS --subdir-mount /subdir2 -s $H0 --volfile-id $V0 $M1 +-TEST stat $M1 ++TEST test "$mount_inode" == "1" + +-# umount $M1 / $M2 + TEST umount $M0 +-TEST umount $M1 ++TEST umount $M2 + + + TEST $CLI volume stop $V0; +-- +1.8.3.1 + diff --git a/0175-rpc-make-actor-search-parallel.patch b/0175-rpc-make-actor-search-parallel.patch new file mode 100644 index 0000000..7441396 --- /dev/null +++ b/0175-rpc-make-actor-search-parallel.patch @@ -0,0 +1,282 @@ +From 72dc3a3eff84c5e17bbc3bfddec9daf50338464f Mon Sep 17 00:00:00 2001 +From: Milind Changire +Date: Tue, 6 Mar 2018 23:08:08 +0530 +Subject: [PATCH 175/180] rpc: make actor search parallel + +Problem: +On a service request, the actor is searched using an exclusive mutex +lock which is not really necessary since most of the time the actor +list is going to be searched and not modified. + +Solution: +Use a read-write lock instead of a mutex lock. Only modify operations +on a service need to be done under a write-lock which grants exclusive +access to the code. 
+ +NOTE: +This patch has been specifically been taken in to use the pthread +rwlock primitives that rpcsvc_ownthread_reconf() is using in commit +8503ed9b94777d47352f19ebfa844e151352b87f +(rpcsvc: scale rpcsvc_request_handler threads) + +mainline: +> BUG: 1509644 +> Reviewed-on: https://review.gluster.org/18543 +> Reviewed-by: Jeff Darcy +> Signed-off-by: Milind Changire + +Change-Id: I318026c13bb6e0385dc24018a976229182fc7f79 +BUG: 1549497 +Signed-off-by: Milind Changire +Reviewed-on: https://code.engineering.redhat.com/gerrit/131909 +Tested-by: RHGS Build Bot +Reviewed-by: Atin Mukherjee +--- + rpc/rpc-lib/src/rpcsvc-common.h | 2 +- + rpc/rpc-lib/src/rpcsvc.c | 54 ++++++++++++++++++++--------------------- + 2 files changed, 28 insertions(+), 28 deletions(-) + +diff --git a/rpc/rpc-lib/src/rpcsvc-common.h b/rpc/rpc-lib/src/rpcsvc-common.h +index dd95803..ab715d3 100644 +--- a/rpc/rpc-lib/src/rpcsvc-common.h ++++ b/rpc/rpc-lib/src/rpcsvc-common.h +@@ -42,7 +42,7 @@ typedef struct rpcsvc_state { + * other options. + */ + +- pthread_mutex_t rpclock; ++ pthread_rwlock_t rpclock; + + unsigned int memfactor; + +diff --git a/rpc/rpc-lib/src/rpcsvc.c b/rpc/rpc-lib/src/rpcsvc.c +index 31b5eb5..fbd1071 100644 +--- a/rpc/rpc-lib/src/rpcsvc.c ++++ b/rpc/rpc-lib/src/rpcsvc.c +@@ -88,11 +88,11 @@ rpcsvc_listener_destroy (rpcsvc_listener_t *listener) + goto listener_free; + } + +- pthread_mutex_lock (&svc->rpclock); ++ pthread_rwlock_wrlock (&svc->rpclock); + { + list_del_init (&listener->list); + } +- pthread_mutex_unlock (&svc->rpclock); ++ pthread_rwlock_unlock (&svc->rpclock); + + listener_free: + GF_FREE (listener); +@@ -110,7 +110,7 @@ rpcsvc_get_program_vector_sizer (rpcsvc_t *svc, uint32_t prognum, + if (!svc) + return NULL; + +- pthread_mutex_lock (&svc->rpclock); ++ pthread_rwlock_rdlock (&svc->rpclock); + { + /* Find the matching RPC program from registered list */ + list_for_each_entry (program, &svc->programs, program) { +@@ -121,7 +121,7 @@ rpcsvc_get_program_vector_sizer (rpcsvc_t *svc, uint32_t prognum, + } + } + } +- pthread_mutex_unlock (&svc->rpclock); ++ pthread_rwlock_unlock (&svc->rpclock); + + if (found) { + /* Make sure the requested procnum is supported by RPC prog */ +@@ -237,7 +237,7 @@ rpcsvc_program_actor (rpcsvc_request_t *req) + + svc = req->svc; + peername = req->trans->peerinfo.identifier; +- pthread_mutex_lock (&svc->rpclock); ++ pthread_rwlock_rdlock (&svc->rpclock); + { + list_for_each_entry (program, &svc->programs, program) { + if (program->prognum == req->prognum) { +@@ -251,7 +251,7 @@ rpcsvc_program_actor (rpcsvc_request_t *req) + } + } + } +- pthread_mutex_unlock (&svc->rpclock); ++ pthread_rwlock_unlock (&svc->rpclock); + + if (!found) { + if (err != PROG_MISMATCH) { +@@ -735,7 +735,7 @@ rpcsvc_handle_disconnect (rpcsvc_t *svc, rpc_transport_t *trans) + event = (trans->listener == NULL) ? 
RPCSVC_EVENT_LISTENER_DEAD + : RPCSVC_EVENT_DISCONNECT; + +- pthread_mutex_lock (&svc->rpclock); ++ pthread_rwlock_rdlock (&svc->rpclock); + { + if (!svc->notify_count) + goto unlock; +@@ -755,7 +755,7 @@ rpcsvc_handle_disconnect (rpcsvc_t *svc, rpc_transport_t *trans) + wrapper_count = i; + } + unlock: +- pthread_mutex_unlock (&svc->rpclock); ++ pthread_rwlock_unlock (&svc->rpclock); + + if (wrappers) { + for (i = 0; i < wrapper_count; i++) { +@@ -1495,7 +1495,7 @@ rpcsvc_get_listener (rpcsvc_t *svc, uint16_t port, rpc_transport_t *trans) + goto out; + } + +- pthread_mutex_lock (&svc->rpclock); ++ pthread_rwlock_rdlock (&svc->rpclock); + { + list_for_each_entry (listener, &svc->listeners, list) { + if (trans != NULL) { +@@ -1521,7 +1521,7 @@ rpcsvc_get_listener (rpcsvc_t *svc, uint16_t port, rpc_transport_t *trans) + } + } + } +- pthread_mutex_unlock (&svc->rpclock); ++ pthread_rwlock_unlock (&svc->rpclock); + + if (!found) { + listener = NULL; +@@ -1566,7 +1566,7 @@ rpcsvc_program_unregister (rpcsvc_t *svc, rpcsvc_program_t *program) + " program failed"); + goto out; + } +- pthread_mutex_lock (&svc->rpclock); ++ pthread_rwlock_rdlock (&svc->rpclock); + { + list_for_each_entry (prog, &svc->programs, program) { + if ((prog->prognum == program->prognum) +@@ -1575,7 +1575,7 @@ rpcsvc_program_unregister (rpcsvc_t *svc, rpcsvc_program_t *program) + } + } + } +- pthread_mutex_unlock (&svc->rpclock); ++ pthread_rwlock_unlock (&svc->rpclock); + + if (prog == NULL) { + ret = -1; +@@ -1592,11 +1592,11 @@ rpcsvc_program_unregister (rpcsvc_t *svc, rpcsvc_program_t *program) + goto out; + } + +- pthread_mutex_lock (&svc->rpclock); ++ pthread_rwlock_wrlock (&svc->rpclock); + { + list_del_init (&prog->program); + } +- pthread_mutex_unlock (&svc->rpclock); ++ pthread_rwlock_unlock (&svc->rpclock); + + ret = 0; + out: +@@ -1655,11 +1655,11 @@ rpcsvc_listener_alloc (rpcsvc_t *svc, rpc_transport_t *trans) + + INIT_LIST_HEAD (&listener->list); + +- pthread_mutex_lock (&svc->rpclock); ++ pthread_rwlock_wrlock (&svc->rpclock); + { + list_add_tail (&listener->list, &svc->listeners); + } +- pthread_mutex_unlock (&svc->rpclock); ++ pthread_rwlock_unlock (&svc->rpclock); + out: + return listener; + } +@@ -1813,7 +1813,7 @@ rpcsvc_unregister_notify (rpcsvc_t *svc, rpcsvc_notify_t notify, void *mydata) + goto out; + } + +- pthread_mutex_lock (&svc->rpclock); ++ pthread_rwlock_wrlock (&svc->rpclock); + { + list_for_each_entry_safe (wrapper, tmp, &svc->notify, list) { + if ((wrapper->notify == notify) +@@ -1824,7 +1824,7 @@ rpcsvc_unregister_notify (rpcsvc_t *svc, rpcsvc_notify_t notify, void *mydata) + } + } + } +- pthread_mutex_unlock (&svc->rpclock); ++ pthread_rwlock_unlock (&svc->rpclock); + + out: + return ret; +@@ -1844,12 +1844,12 @@ rpcsvc_register_notify (rpcsvc_t *svc, rpcsvc_notify_t notify, void *mydata) + wrapper->data = mydata; + wrapper->notify = notify; + +- pthread_mutex_lock (&svc->rpclock); ++ pthread_rwlock_wrlock (&svc->rpclock); + { + list_add_tail (&wrapper->list, &svc->notify); + svc->notify_count++; + } +- pthread_mutex_unlock (&svc->rpclock); ++ pthread_rwlock_unlock (&svc->rpclock); + + ret = 0; + out: +@@ -1987,7 +1987,7 @@ rpcsvc_program_register (rpcsvc_t *svc, rpcsvc_program_t *program) + goto out; + } + +- pthread_mutex_lock (&svc->rpclock); ++ pthread_rwlock_rdlock (&svc->rpclock); + { + list_for_each_entry (newprog, &svc->programs, program) { + if ((newprog->prognum == program->prognum) +@@ -1997,7 +1997,7 @@ rpcsvc_program_register (rpcsvc_t *svc, rpcsvc_program_t *program) + } + } 
+ } +- pthread_mutex_unlock (&svc->rpclock); ++ pthread_rwlock_unlock (&svc->rpclock); + + if (already_registered) { + ret = 0; +@@ -2031,11 +2031,11 @@ rpcsvc_program_register (rpcsvc_t *svc, rpcsvc_program_t *program) + } + } + +- pthread_mutex_lock (&svc->rpclock); ++ pthread_rwlock_wrlock (&svc->rpclock); + { + list_add_tail (&newprog->program, &svc->programs); + } +- pthread_mutex_unlock (&svc->rpclock); ++ pthread_rwlock_unlock (&svc->rpclock); + + ret = 0; + gf_log (GF_RPCSVC, GF_LOG_DEBUG, "New program registered: %s, Num: %d," +@@ -2077,7 +2077,7 @@ build_prog_details (rpcsvc_request_t *req, gf_dump_rsp *rsp) + if (!req || !req->trans || !req->svc) + goto out; + +- pthread_mutex_lock (&req->svc->rpclock); ++ pthread_rwlock_rdlock (&req->svc->rpclock); + { + list_for_each_entry (program, &req->svc->programs, program) { + prog = GF_CALLOC (1, sizeof (*prog), 0); +@@ -2098,7 +2098,7 @@ build_prog_details (rpcsvc_request_t *req, gf_dump_rsp *rsp) + ret = 0; + } + unlock: +- pthread_mutex_unlock (&req->svc->rpclock); ++ pthread_rwlock_unlock (&req->svc->rpclock); + out: + return ret; + } +@@ -2456,7 +2456,7 @@ rpcsvc_init (xlator_t *xl, glusterfs_ctx_t *ctx, dict_t *options, + if (!svc) + return NULL; + +- pthread_mutex_init (&svc->rpclock, NULL); ++ pthread_rwlock_init (&svc->rpclock, NULL); + INIT_LIST_HEAD (&svc->authschemes); + INIT_LIST_HEAD (&svc->notify); + INIT_LIST_HEAD (&svc->listeners); +-- +1.8.3.1 + diff --git a/0176-glusterd-volume-get-fixes-for-client-io-threads-quor.patch b/0176-glusterd-volume-get-fixes-for-client-io-threads-quor.patch new file mode 100644 index 0000000..9dc907a --- /dev/null +++ b/0176-glusterd-volume-get-fixes-for-client-io-threads-quor.patch @@ -0,0 +1,210 @@ +From 787e1e865ba4b9680c95ae78a11a4d84714cc9a3 Mon Sep 17 00:00:00 2001 +From: Ravishankar N +Date: Wed, 7 Mar 2018 11:03:50 +0530 +Subject: [PATCH 176/180] glusterd: volume get fixes for client-io-threads & + quorum-type + +Backport of: https://review.gluster.org/#/c/19567/ + +1. If a replica volume created on glusterfs-3.8 was upgraded to +glusterfs-3.12, `gluster vol get volname client-io-threads` displayed +'on' even though it wasn't and the xlator wasn't loaded on +the client-graph. This was due to removing certain checks in +glusterd_get_default_val_for_volopt as a part of commit +47604fad4c2a3951077e41e0c007ceb979bb2c24. Fix it. + +2. Also, as a part of op-version bump-up, client-io-threads was being +loaded on the clients during volfile regeneration. Prevent it. + +3. AFR assumes quorum-type to be auto in newly created replic 3 (odd +replica in general) volumes but `gluster vol get quorum-type` displays +'none'. Fix it. 
+ +Change-Id: I4a6099e50f2f10f2bd76f304b1e3bc0795e07a9d +BUG: 1543068 +Signed-off-by: Ravishankar N +Reviewed-on: https://code.engineering.redhat.com/gerrit/131964 +Tested-by: RHGS Build Bot +Reviewed-by: Atin Mukherjee +--- + xlators/mgmt/glusterd/src/glusterd-handler.c | 6 +++--- + xlators/mgmt/glusterd/src/glusterd-op-sm.c | 13 +++++++++++++ + xlators/mgmt/glusterd/src/glusterd-utils.c | 27 ++++++++++++++++++++++++++- + xlators/mgmt/glusterd/src/glusterd-utils.h | 3 ++- + xlators/mgmt/glusterd/src/glusterd-volgen.c | 8 +++++++- + 5 files changed, 51 insertions(+), 6 deletions(-) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c +index 8fc665d..cf280a7 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-handler.c ++++ b/xlators/mgmt/glusterd/src/glusterd-handler.c +@@ -4913,7 +4913,7 @@ glusterd_get_volume_opts (rpcsvc_request_t *req, dict_t *dict) + (dict, + _gf_false, + key, orig_key, +- volinfo->dict, ++ volinfo, + &rsp.op_errstr); + if (ret && !rsp.op_errstr) { + snprintf (err_str, +@@ -4939,7 +4939,7 @@ glusterd_get_volume_opts (rpcsvc_request_t *req, dict_t *dict) + } else { + /* Handle the "all" volume option request */ + ret = glusterd_get_default_val_for_volopt (dict, _gf_true, NULL, +- NULL, volinfo->dict, ++ NULL, volinfo, + &rsp.op_errstr); + if (ret && !rsp.op_errstr) { + snprintf (err_str, sizeof(err_str), +@@ -5530,7 +5530,7 @@ glusterd_get_state (rpcsvc_request_t *req, dict_t *dict) + vol_all_opts = dict_new (); + + ret = glusterd_get_default_val_for_volopt (vol_all_opts, +- _gf_true, NULL, NULL, volinfo->dict, ++ _gf_true, NULL, NULL, volinfo, + &rsp.op_errstr); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, +diff --git a/xlators/mgmt/glusterd/src/glusterd-op-sm.c b/xlators/mgmt/glusterd/src/glusterd-op-sm.c +index 0cf76dd..0a21e02 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-op-sm.c ++++ b/xlators/mgmt/glusterd/src/glusterd-op-sm.c +@@ -2542,6 +2542,15 @@ out: + return ret; + } + ++/* This is a hack to prevent client-io-threads from being loaded in the graph ++ * when the cluster-op-version is bumped up from 3.8.x to 3.13.x. The key is ++ * deleted subsequently in glusterd_create_volfiles(). 
*/ ++static int ++glusterd_dict_set_skip_cliot_key (glusterd_volinfo_t *volinfo) ++{ ++ return dict_set_int32 (volinfo->dict, "skip-CLIOT", 1); ++} ++ + static int + glusterd_op_set_all_volume_options (xlator_t *this, dict_t *dict, + char **op_errstr) +@@ -2633,6 +2642,10 @@ glusterd_op_set_all_volume_options (xlator_t *this, dict_t *dict, + (volinfo, &start_nfs_svc); + if (ret) + goto out; ++ ++ if (glusterd_dict_set_skip_cliot_key (volinfo)) ++ goto out; ++ + if (!volinfo->is_snap_volume) { + svc = &(volinfo->snapd.svc); + ret = svc->manager (svc, volinfo, +diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c +index 9a67cfd..49605cc 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-utils.c ++++ b/xlators/mgmt/glusterd/src/glusterd-utils.c +@@ -12842,10 +12842,30 @@ out: + return ret; + } + ++char * ++glusterd_get_option_value (glusterd_volinfo_t *volinfo, char *key) ++{ ++ char *value = NULL; ++ ++ if (!glusterd_is_volume_replicate(volinfo)) ++ goto ret; ++ ++ if (!strcmp (key, "performance.client-io-threads")) { ++ value = "off"; ++ } else if (!strcmp (key, "cluster.quorum-type")) { ++ if (volinfo->replica_count%2) { ++ value = "auto"; ++ } ++ } ++ret: ++ return value; ++} ++ + int + glusterd_get_default_val_for_volopt (dict_t *ctx, gf_boolean_t all_opts, + char *input_key, char *orig_key, +- dict_t *vol_dict, char **op_errstr) ++ glusterd_volinfo_t *volinfo, ++ char **op_errstr) + { + struct volopt_map_entry *vme = NULL; + int ret = -1; +@@ -12856,6 +12876,7 @@ glusterd_get_default_val_for_volopt (dict_t *ctx, gf_boolean_t all_opts, + char dict_key[50] = {0,}; + gf_boolean_t key_found = _gf_false; + glusterd_conf_t *priv = NULL; ++ dict_t *vol_dict = NULL; + + this = THIS; + GF_ASSERT (this); +@@ -12863,6 +12884,7 @@ glusterd_get_default_val_for_volopt (dict_t *ctx, gf_boolean_t all_opts, + priv = this->private; + GF_VALIDATE_OR_GOTO (this->name, priv, out); + ++ vol_dict = volinfo->dict; + GF_VALIDATE_OR_GOTO (this->name, vol_dict, out); + + /* Check whether key is passed for a single option */ +@@ -12883,6 +12905,9 @@ glusterd_get_default_val_for_volopt (dict_t *ctx, gf_boolean_t all_opts, + ret = dict_get_str (priv->opts, vme->key, &def_val); + if (!def_val) { + ret = dict_get_str (vol_dict, vme->key, &def_val); ++ if (ret == -ENOENT) ++ def_val = glusterd_get_option_value (volinfo, ++ vme->key); + if (!def_val) { + if (vme->value) { + def_val = vme->value; +diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.h b/xlators/mgmt/glusterd/src/glusterd-utils.h +index 6c525e5..e69a779 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-utils.h ++++ b/xlators/mgmt/glusterd/src/glusterd-utils.h +@@ -730,7 +730,8 @@ glusterd_get_global_options_for_all_vols (rpcsvc_request_t *req, dict_t *dict, + int + glusterd_get_default_val_for_volopt (dict_t *dict, gf_boolean_t all_opts, + char *key, char *orig_key, +- dict_t *vol_dict, char **err_str); ++ glusterd_volinfo_t *volinfo, ++ char **err_str); + + int + glusterd_check_client_op_version_support (char *volname, uint32_t op_version, +diff --git a/xlators/mgmt/glusterd/src/glusterd-volgen.c b/xlators/mgmt/glusterd/src/glusterd-volgen.c +index 8ff76d6..0e287b6 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-volgen.c ++++ b/xlators/mgmt/glusterd/src/glusterd-volgen.c +@@ -263,7 +263,6 @@ first_of (volgen_graph_t *graph) + * + *************************/ + +- + static int + volopt_selector (int lvl, char **patt, void *param, + int (*optcbk)(char *word, void *param)) +@@ -514,6 +513,11 @@ 
volgen_graph_set_options_generic (volgen_graph_t *graph, dict_t *dict, + odt.data_t_fake = _gf_false; + + data = dict_get (dict, vme->key); ++ if (!strcmp (vme->key, "performance.client-io-threads") && ++ dict_get_str_boolean (dict, "skip-CLIOT", ++ _gf_false) == _gf_true) { ++ continue; ++ } + + if (data) + process_option (vme->key, data, &odt); +@@ -6393,6 +6397,8 @@ glusterd_create_volfiles (glusterd_volinfo_t *volinfo) + GD_MSG_VOLFILE_CREATE_FAIL, + "Could not generate client volfiles"); + ++ dict_del (volinfo->dict, "skip-CLIOT"); ++ + out: + return ret; + } +-- +1.8.3.1 + diff --git a/0177-hooks-fix-workdir-in-S13create-subdir-mounts.sh.patch b/0177-hooks-fix-workdir-in-S13create-subdir-mounts.sh.patch new file mode 100644 index 0000000..f88208d --- /dev/null +++ b/0177-hooks-fix-workdir-in-S13create-subdir-mounts.sh.patch @@ -0,0 +1,35 @@ +From 96a8b43272b935341c4c431c31fb8ae101fc4acd Mon Sep 17 00:00:00 2001 +From: Atin Mukherjee +Date: Tue, 6 Mar 2018 21:59:44 +0530 +Subject: [PATCH 177/180] hooks: fix workdir in S13create-subdir-mounts.sh + +>Change-Id: Id3eff498091ad9fa4651e93b66903426e76776d6 +>BUG: 1549915 +>Signed-off-by: Atin Mukherjee +Upstream patch: https://review.gluster.org/#/c/19682/ + +BUG: 1508999 +Change-Id: Id3eff498091ad9fa4651e93b66903426e76776d6 +Signed-off-by: Sunil Kumar Acharya +Reviewed-on: https://code.engineering.redhat.com/gerrit/131994 +Tested-by: RHGS Build Bot +--- + extras/hook-scripts/add-brick/post/S13create-subdir-mounts.sh | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/extras/hook-scripts/add-brick/post/S13create-subdir-mounts.sh b/extras/hook-scripts/add-brick/post/S13create-subdir-mounts.sh +index 95e624e..1a6923e 100755 +--- a/extras/hook-scripts/add-brick/post/S13create-subdir-mounts.sh ++++ b/extras/hook-scripts/add-brick/post/S13create-subdir-mounts.sh +@@ -6,7 +6,7 @@ + ##--------------------------------------------------------------------------- + + MOUNT_DIR=`mktemp -d -t ${0##*/}.XXXXXX`; +-OPTSPEC="volname:,go-workdir" ++OPTSPEC="volname:,version:,gd-workdir:,volume-op:" + PROGNAME="add-brick-create-subdir" + VOL_NAME=test + GLUSTERD_WORKDIR="/var/lib/glusterd" +-- +1.8.3.1 + diff --git a/0178-cluster-ec-Do-lock-conflict-check-correctly-for-wait.patch b/0178-cluster-ec-Do-lock-conflict-check-correctly-for-wait.patch new file mode 100644 index 0000000..f3e510e --- /dev/null +++ b/0178-cluster-ec-Do-lock-conflict-check-correctly-for-wait.patch @@ -0,0 +1,90 @@ +From 684f7a9f2f6d79ac45fdf2cf994c5fca139e51cd Mon Sep 17 00:00:00 2001 +From: Pranith Kumar K +Date: Wed, 31 Jan 2018 22:10:46 +0530 +Subject: [PATCH 178/180] cluster/ec: Do lock conflict check correctly for + wait-list + +Problem: +ec_link_has_lock_conflict() is traversing over only owner_list +but the function is also getting called with wait_list. + +Fix: +Modify ec_link_has_lock_conflict() to traverse lists correctly. +Updated the callers to reflect the changes. 
+ + >BUG: 1540896 + >Change-Id: Ibd7ea10f4498e7c2761f9a6faac6d5cb7d750c91 + >Signed-off-by: Pranith Kumar K + +Master Patch: https://review.gluster.org/19415 +Release-3.13 Patch: https://review.gluster.org/19428 + +BUG: 1540908 +Change-Id: I859aab86d591c22d635bf7a2ef17b77b605d32d8 +Signed-off-by: Pranith Kumar K +Reviewed-on: https://code.engineering.redhat.com/gerrit/129249 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/cluster/ec/src/ec-common.c | 23 +++++++++++++++-------- + 1 file changed, 15 insertions(+), 8 deletions(-) + +diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c +index 051fff6..bd2ae50 100644 +--- a/xlators/cluster/ec/src/ec-common.c ++++ b/xlators/cluster/ec/src/ec-common.c +@@ -1643,18 +1643,27 @@ ec_lock_update_fd(ec_lock_t *lock, ec_fop_data_t *fop) + } + + static gf_boolean_t +-ec_link_has_lock_conflict (ec_lock_link_t *link, struct list_head *owners) ++ec_link_has_lock_conflict (ec_lock_link_t *link, gf_boolean_t waitlist_check) + { +- ec_lock_link_t *owner_link = NULL; ++ ec_lock_link_t *trav_link = NULL; + ec_t *ec = link->fop->xl->private; + + if (!ec->parallel_writes) + return _gf_true; + +- list_for_each_entry (owner_link, owners, owner_list) { +- if (ec_lock_conflict (owner_link, link)) ++ list_for_each_entry (trav_link, &link->lock->owners, owner_list) { ++ if (ec_lock_conflict (trav_link, link)) + return _gf_true; + } ++ ++ if (!waitlist_check) ++ return _gf_false; ++ ++ list_for_each_entry (trav_link, &link->lock->waiting, wait_list) { ++ if (ec_lock_conflict (trav_link, link)) ++ return _gf_true; ++ } ++ + return _gf_false; + } + +@@ -1676,7 +1685,7 @@ ec_lock_wake_shared(ec_lock_t *lock, struct list_head *list) + + /* If the fop is not shareable, only this fop can be assigned as owner. + * Other fops will need to wait until this one finishes. */ +- if (ec_link_has_lock_conflict (link, &lock->owners)) { ++ if (ec_link_has_lock_conflict (link, _gf_false)) { + conflict = _gf_true; + } + +@@ -1923,9 +1932,7 @@ ec_lock_assign_owner(ec_lock_link_t *link) + * owners, or waiters(to prevent starvation). + * Otherwise we need to wait. + */ +- if (!lock->acquired || +- ec_link_has_lock_conflict (link, &lock->owners) || +- ec_link_has_lock_conflict (link, &lock->waiting)) { ++ if (!lock->acquired || ec_link_has_lock_conflict (link, _gf_true)) { + ec_trace("LOCK_QUEUE_WAIT", fop, "lock=%p", lock); + + list_add_tail(&link->wait_list, &lock->waiting); +-- +1.8.3.1 + diff --git a/0179-packaging-adding-missed-part-from-5eed664-while-back.patch b/0179-packaging-adding-missed-part-from-5eed664-while-back.patch new file mode 100644 index 0000000..5b13681 --- /dev/null +++ b/0179-packaging-adding-missed-part-from-5eed664-while-back.patch @@ -0,0 +1,38 @@ +From d62958f6a3e3a4ecd61f130ad399d56580392c90 Mon Sep 17 00:00:00 2001 +From: "Kaleb S. KEITHLEY" +Date: Wed, 7 Mar 2018 06:24:35 -0500 +Subject: [PATCH 179/180] packaging : adding missed part from 5eed664 while + backporting to downstream + +accidental inverse logic during review of previous patch + +Label : DOWNSTREAM ONLY + +BUG: 1472445 +Change-Id: Ie53d0dcb80c687fba6e7f9ddca591b621049cc22 +Signed-off-by: Kaleb S. 
KEITHLEY +Reviewed-on: https://code.engineering.redhat.com/gerrit/132045 +Tested-by: RHGS Build Bot +Reviewed-by: Milind Changire +Reviewed-by: Atin Mukherjee +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + glusterfs.spec.in | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/glusterfs.spec.in b/glusterfs.spec.in +index 34a3aba..ee51ae8 100644 +--- a/glusterfs.spec.in ++++ b/glusterfs.spec.in +@@ -826,7 +826,7 @@ install -D -p -m 0644 extras/glusterfs-logrotate \ + %{buildroot}%{_sysconfdir}/logrotate.d/glusterfs + + # ganesha ghosts +-%if ( ! 0%{?_build_server} ) ++%if ( 0%{?_build_server} ) + mkdir -p %{buildroot}%{_sysconfdir}/ganesha + touch %{buildroot}%{_sysconfdir}/ganesha/ganesha-ha.conf + mkdir -p %{buildroot}%{_localstatedir}/run/gluster/shared_storage/nfs-ganesha/exports +-- +1.8.3.1 + diff --git a/0180-packaging-adding-missed-part-from-5eed664-while-back.patch b/0180-packaging-adding-missed-part-from-5eed664-while-back.patch new file mode 100644 index 0000000..87410fd --- /dev/null +++ b/0180-packaging-adding-missed-part-from-5eed664-while-back.patch @@ -0,0 +1,39 @@ +From 3933466e627e84621537eed382abe77b2db6105d Mon Sep 17 00:00:00 2001 +From: "Kaleb S. KEITHLEY" +Date: Wed, 7 Mar 2018 07:55:22 -0500 +Subject: [PATCH 180/180] packaging : adding missed part from 5eed664 while + backporting to downstream + +more accidental inverse logic during review of previous patch + +excluding /etc/ganesha/ganesha-ha.conf is spurious since it is +not installed for client builds + +Label : DOWNSTREAM ONLY + +BUG: 1472445 +Change-Id: Idac2ca1eecdba5d3cba08ab2595a5c3f969c6ee3 +Signed-off-by: Kaleb S. KEITHLEY +Reviewed-on: https://code.engineering.redhat.com/gerrit/132059 +Tested-by: RHGS Build Bot +Reviewed-by: Milind Changire +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + glusterfs.spec.in | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/glusterfs.spec.in b/glusterfs.spec.in +index ee51ae8..4b5238a 100644 +--- a/glusterfs.spec.in ++++ b/glusterfs.spec.in +@@ -1093,7 +1093,6 @@ exit 0 + %exclude %{_libexecdir}/ganesha/* + %exclude %{_prefix}/lib/ocf/resource.d/heartbeat/* + %exclude %{_sysconfdir}/ganesha/ganesha-ha.conf.sample +-%exclude %{_sysconfdir}/ganesha/ganesha-ha.conf + + # exclude incrementalapi + %exclude %{_libexecdir}/glusterfs/* +-- +1.8.3.1 + diff --git a/glusterfs.spec b/glusterfs.spec index 45305c2..8ad0f8f 100644 --- a/glusterfs.spec +++ b/glusterfs.spec @@ -192,7 +192,7 @@ Release: 0.1%{?prereltag:.%{prereltag}}%{?dist} %else Name: glusterfs Version: 3.12.2 -Release: 4%{?dist} +Release: 5%{?dist} %endif License: GPLv2 or LGPLv3+ Group: System Environment/Base @@ -413,6 +413,38 @@ Patch0145: 0145-glusterd-optimize-glusterd-import-volumes-code-path.patch Patch0146: 0146-cluster-dht-Cleanup-on-fallocate-failure.patch Patch0147: 0147-glusterd-import-volumes-in-separate-synctask.patch Patch0148: 0148-glusterd-tier-is_tier_enabled-inserted-causing-check.patch +Patch0149: 0149-cluster-ec-EC-DISCARD-doesn-t-punch-hole-properly.patch +Patch0150: 0150-dht-Fill-first_up_subvol-before-use-in-dht_opendir.patch +Patch0151: 0151-geo-rep-Improve-geo-rep-pre-validation-logs.patch +Patch0152: 0152-glusterfind-Speed-up-gfid-lookup-100x-by-using-an-SQ.patch +Patch0153: 0153-afr-add-quorum-checks-in-post-op.patch +Patch0154: 0154-afr-capture-the-correct-errno-in-post-op-quorum-chec.patch +Patch0155: 0155-afr-don-t-treat-all-cases-all-bricks-being-blamed-as.patch +Patch0156: 0156-performance-write-behind-fix-bug-while-handling-shor.patch +Patch0157: 
0157-cluster-afr-remove-unnecessary-child_up-initializati.patch +Patch0158: 0158-cluster-ec-create-eager-lock-option-for-non-regular-.patch +Patch0159: 0159-extras-hooks-Fix-S10selinux-label-brick.sh-hook-scri.patch +Patch0160: 0160-common-ha-enable-and-disable-selinux-ganesha_use_fus.patch +Patch0161: 0161-cluster-dht-Fixed-a-typo.patch +Patch0162: 0162-cluster-dht-Handle-single-dht-child-in-dht_lookup.patch +Patch0163: 0163-glusterd-compare-uuid-instead-of-hostname-while-find.patch +Patch0164: 0164-geo-rep-Remove-lazy-umount-and-use-mount-namespaces.patch +Patch0165: 0165-cluster-dht-Ignore-ENODATA-from-getxattr-for-posix-a.patch +Patch0166: 0166-rpcsvc-scale-rpcsvc_request_handler-threads.patch +Patch0167: 0167-glusterd-ganesha-change-voltype-for-ganesha.enable-i.patch +Patch0168: 0168-features-shard-Pass-the-correct-block-num-to-store-i.patch +Patch0169: 0169-features-shard-Leverage-block_num-info-in-inode-ctx-.patch +Patch0170: 0170-features-shard-Fix-shard-inode-refcount-when-it-s-pa.patch +Patch0171: 0171-features-shard-Upon-FSYNC-from-upper-layers-wind-fsy.patch +Patch0172: 0172-glusterd-add-profile_enabled-flag-in-get-state.patch +Patch0173: 0173-packaging-adding-missed-part-from-5eed664-while-back.patch +Patch0174: 0174-hooks-add-a-script-to-stat-the-subdirs-in-add-brick.patch +Patch0175: 0175-rpc-make-actor-search-parallel.patch +Patch0176: 0176-glusterd-volume-get-fixes-for-client-io-threads-quor.patch +Patch0177: 0177-hooks-fix-workdir-in-S13create-subdir-mounts.sh.patch +Patch0178: 0178-cluster-ec-Do-lock-conflict-check-correctly-for-wait.patch +Patch0179: 0179-packaging-adding-missed-part-from-5eed664-while-back.patch +Patch0180: 0180-packaging-adding-missed-part-from-5eed664-while-back.patch %description GlusterFS is a distributed file-system capable of scaling to several @@ -559,11 +591,17 @@ Requires: pcs, dbus %if ( 0%{?rhel} && 0%{?rhel} == 6 ) Requires: cman, pacemaker, corosync %endif -%if ( 0%{?fedora} && 0%{?fedora} > 25 ) +%if ( ( 0%{?fedora} && 0%{?fedora} > 25 ) || ( 0%{?rhel} && 0%{?rhel} > 6 ) ) +%if ( 0%{?rhel} ) Requires: selinux-policy >= 3.13.1-160 +Requires(post): policycoreutils-python +Requires(postun): policycoreutils-python +%else Requires(post): policycoreutils-python-utils Requires(postun): policycoreutils-python-utils %endif +%endif + %if ( 0%{?fedora} ) || ( 0%{?rhel} && 0%{?rhel} > 5 ) # we need portblock resource-agent in 3.9.5 and later. 
Requires: resource-agents >= 3.9.5 @@ -597,6 +635,7 @@ BuildRequires: python-ctypes %endif Requires: python2-gluster = %{version}-%{release} Requires: rsync +Requires: util-linux %description geo-replication GlusterFS is a distributed file-system capable of scaling to several @@ -1018,8 +1057,13 @@ install -D -p -m 0644 extras/glusterfs-logrotate \ %{buildroot}%{_sysconfdir}/logrotate.d/glusterfs # ganesha ghosts +%if ( 0%{?_build_server} ) mkdir -p %{buildroot}%{_sysconfdir}/ganesha touch %{buildroot}%{_sysconfdir}/ganesha/ganesha-ha.conf +mkdir -p %{buildroot}%{_localstatedir}/run/gluster/shared_storage/nfs-ganesha/exports +touch %{buildroot}%{_localstatedir}/run/gluster/shared_storage/nfs-ganesha/ganesha.conf +touch %{buildroot}%{_localstatedir}/run/gluster/shared_storage/nfs-ganesha/ganesha-ha.conf +%endif %if ( 0%{!?_without_georeplication:1} ) mkdir -p %{buildroot}%{_sharedstatedir}/glusterd/geo-replication @@ -1084,7 +1128,7 @@ exit 0 %endif %if ( 0%{?_build_server} ) -%if ( 0%{?fedora} && 0%{?fedora} > 25 ) +%if ( 0%{?fedora} && 0%{?fedora} > 25 || ( 0%{?rhel} && 0%{?rhel} > 6 ) ) %post ganesha semanage boolean -m ganesha_use_fusefs --on exit 0 @@ -1214,7 +1258,7 @@ fi /sbin/ldconfig %if ( 0%{?_build_server} ) -%if ( 0%{?fedora} && 0%{?fedora} > 25 ) +%if ( 0%{?fedora} && 0%{?fedora} > 25 || ( 0%{?rhel} && 0%{?rhel} > 6 ) ) %postun ganesha semanage boolean -m ganesha_use_fusefs --off exit 0 @@ -1237,7 +1281,7 @@ exit 0 ## All %%trigger should be placed here and keep them sorted ## %if ( 0%{?_build_server} ) -%if ( 0%{?fedora} && 0%{?fedora} > 25 ) +%if ( 0%{?fedora} && 0%{?fedora} > 25 || ( 0%{?rhel} && 0%{?rhel} > 6 ) ) %trigger ganesha -- selinux-policy-targeted semanage boolean -m ganesha_use_fusefs --on exit 0 @@ -1248,7 +1292,7 @@ exit 0 ## All %%triggerun should be placed here and keep them sorted ## %if ( 0%{?_build_server} ) -%if ( 0%{?fedora} && 0%{?fedora} > 25 ) +%if ( 0%{?fedora} && 0%{?fedora} > 25 || ( 0%{?rhel} && 0%{?rhel} > 6 ) ) %triggerun ganesha -- selinux-policy-targeted semanage boolean -m ganesha_use_fusefs --off exit 0 @@ -1280,7 +1324,6 @@ exit 0 %exclude %{_libexecdir}/ganesha/* %exclude %{_prefix}/lib/ocf/resource.d/heartbeat/* %exclude %{_sysconfdir}/ganesha/ganesha-ha.conf.sample -%exclude %{_sysconfdir}/ganesha/ganesha-ha.conf # exclude incrementalapi %exclude %{_libexecdir}/glusterfs/* @@ -1534,7 +1577,11 @@ exit 0 %{_prefix}/lib/ocf/resource.d/heartbeat/* %{_sharedstatedir}/glusterd/hooks/1/start/post/S31ganesha-start.sh %{_sysconfdir}/ganesha/ganesha-ha.conf.sample -%ghost %config(noreplace) %{_sysconfdir}/ganesha/ganesha-ha.conf +%ghost %attr(0644,-,-) %config(noreplace) %{_sysconfdir}/ganesha/ganesha-ha.conf +%ghost %dir %attr(0755,-,-) %{_localstatedir}/run/gluster/shared_storage/nfs-ganesha +%ghost %dir %attr(0755,-,-) %{_localstatedir}/run/gluster/shared_storage/nfs-ganesha/exports +%ghost %attr(0644,-,-) %config(noreplace) %{_localstatedir}/run/gluster/shared_storage/nfs-ganesha/ganesha.conf +%ghost %attr(0644,-,-) %config(noreplace) %{_localstatedir}/run/gluster/shared_storage/nfs-ganesha/ganesha-ha.conf %endif %if ( 0%{?_build_server} ) @@ -1702,8 +1749,9 @@ exit 0 %dir %attr(0755,-,-) %{_sharedstatedir}/glusterd/hooks/1/add-brick %dir %attr(0755,-,-) %{_sharedstatedir}/glusterd/hooks/1/add-brick/post %attr(0755,-,-) %{_sharedstatedir}/glusterd/hooks/1/add-brick/post/disabled-quota-root-xattr-heal.sh - %attr(0755,-,-) %{_sharedstatedir}/glusterd/hooks/1/add-brick/pre/S28Quota-enable-root-xattr-heal.sh + %attr(0755,-,-) 
%{_sharedstatedir}/glusterd/hooks/1/add-brick/post/S13create-subdir-mounts.sh %dir %attr(0755,-,-) %{_sharedstatedir}/glusterd/hooks/1/add-brick/pre + %attr(0755,-,-) %{_sharedstatedir}/glusterd/hooks/1/add-brick/pre/S28Quota-enable-root-xattr-heal.sh %dir %attr(0755,-,-) %{_sharedstatedir}/glusterd/hooks/1/create %dir %attr(0755,-,-) %{_sharedstatedir}/glusterd/hooks/1/create/post %attr(0755,-,-) %{_sharedstatedir}/glusterd/hooks/1/create/post/S10selinux-label-brick.sh @@ -2340,6 +2388,12 @@ fi %endif %changelog +* Wed Mar 07 2018 Milind Changire - 3.12.2-5 +- fixes bugs bz#1378371 bz#1384983 bz#1472445 bz#1493085 bz#1508999 + bz#1516638 bz#1518260 bz#1529072 bz#1530519 bz#1537357 bz#1540908 bz#1541122 + bz#1541932 bz#1543068 bz#1544382 bz#1544852 bz#1545570 bz#1546075 bz#1546945 + bz#1546960 bz#1547012 bz#1549497 + * Mon Feb 12 2018 Milind Changire - 3.12.2-4 - fixes bugs bz#1446125 bz#1467536 bz#1530146 bz#1540600 bz#1540664 bz#1540961 bz#1541830 bz#1543296