diff --git a/0408-Update-rfc.sh-to-rhgs-3.4.2.patch b/0408-Update-rfc.sh-to-rhgs-3.4.2.patch new file mode 100644 index 0000000..eb82c7e --- /dev/null +++ b/0408-Update-rfc.sh-to-rhgs-3.4.2.patch @@ -0,0 +1,27 @@ +From 1c73105778ac1a7193af747fe9a741690fea010c Mon Sep 17 00:00:00 2001 +From: Milind Changire +Date: Fri, 2 Nov 2018 20:56:59 +0530 +Subject: [PATCH 408/444] Update rfc.sh to rhgs-3.4.2 + +Change-Id: Ibe0a72f24c36248f0f637417247e9e359fa23075 +Signed-off-by: Milind Changire +--- + rfc.sh | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/rfc.sh b/rfc.sh +index 8c4b5ac..5b09aad 100755 +--- a/rfc.sh ++++ b/rfc.sh +@@ -17,7 +17,7 @@ done + shift $((OPTIND-1)) + + +-branch="rhgs-3.4.1"; ++branch="rhgs-3.4.2"; + + set_hooks_commit_msg() + { +-- +1.8.3.1 + diff --git a/0409-Update-database-profile-group.patch b/0409-Update-database-profile-group.patch new file mode 100644 index 0000000..075a252 --- /dev/null +++ b/0409-Update-database-profile-group.patch @@ -0,0 +1,50 @@ +From f56ad2fc0ba3f3b78dc854b0c09c5c8f9bb9db77 Mon Sep 17 00:00:00 2001 +From: Xavi Hernandez +Date: Fri, 21 Sep 2018 21:51:46 +0200 +Subject: [PATCH 409/444] Update database profile group + +Some performance testing has revealed that pgbench performs 3x better +when these options are set: + +performance.client-io-threads=on +performance.open-behind=on +performance.read-after-open=yes +server.event-threads=4 +client.event-threads=4 + +> Upstream patch: https://review.gluster.org/c/glusterfs/+/21247 +> Change-Id: I36ce389f893a8af13aac5f8285104d749b73d098 +> fixes: bz#1631886 +> Signed-off-by: Xavi Hernandez + +Change-Id: I36ce389f893a8af13aac5f8285104d749b73d098 +BUG: 1644120 +Signed-off-by: Xavi Hernandez +Reviewed-on: https://code.engineering.redhat.com/gerrit/154881 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + extras/group-db-workload | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/extras/group-db-workload b/extras/group-db-workload +index c9caf21..9334d6f 100644 +--- a/extras/group-db-workload ++++ b/extras/group-db-workload +@@ -1,4 +1,4 @@ +-performance.open-behind=off ++performance.open-behind=on + performance.write-behind=off + performance.stat-prefetch=off + performance.quick-read=off +@@ -6,3 +6,7 @@ performance.strict-o-direct=on + performance.read-ahead=off + performance.io-cache=off + performance.readdir-ahead=off ++performance.client-io-threads=on ++server.event-threads=4 ++client.event-threads=4 ++performance.read-after-open=yes +-- +1.8.3.1 + diff --git a/0410-cli-fix-glusterd-memory-leak-cause-by-gluster-v-stat.patch b/0410-cli-fix-glusterd-memory-leak-cause-by-gluster-v-stat.patch new file mode 100644 index 0000000..a27f0f9 --- /dev/null +++ b/0410-cli-fix-glusterd-memory-leak-cause-by-gluster-v-stat.patch @@ -0,0 +1,42 @@ +From 4bfbc59a0cbfb28325c16e81480decab003fe6d1 Mon Sep 17 00:00:00 2001 +From: shujun10086 +Date: Tue, 2 Oct 2018 08:37:17 +0000 +Subject: [PATCH 410/444] cli: fix glusterd memory leak cause by "gluster v + status volume_name" + +If use this command every some seconds for example 15s to check gluster brick +status, the glusterd will use about 1G memory in a year. free the value of rsp +in gf_cli_status_cbk. glusterd allocate the value of rsp and send it to cli, but +cli do not free the value, that cause glusterd memory leak. 
+ +> fixes: bz#1635480 +> Change-Id: I3f19cd0d4b791ae1b35f9664b3a668b1579f1178 +> Signed-off-by: shujun10086 + +upstream patch: https://review.gluster.org/#/c/21316/ + +Change-Id: I3f19cd0d4b791ae1b35f9664b3a668b1579f1178 +BUG: 1635100 +Signed-off-by: Sanju Rakonde +Reviewed-on: https://code.engineering.redhat.com/gerrit/154882 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + cli/src/cli-rpc-ops.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/cli/src/cli-rpc-ops.c b/cli/src/cli-rpc-ops.c +index 54b61ee65..10f772c 100644 +--- a/cli/src/cli-rpc-ops.c ++++ b/cli/src/cli-rpc-ops.c +@@ -8515,6 +8515,7 @@ cont: + ret = rsp.op_ret; + + out: ++ FREE(rsp.dict.dict_val); + if (dict) + dict_unref (dict); + GF_FREE (status.brick); +-- +1.8.3.1 + diff --git a/0411-glusterd-ensure-volinfo-caps-is-set-to-correct-value.patch b/0411-glusterd-ensure-volinfo-caps-is-set-to-correct-value.patch new file mode 100644 index 0000000..5563776 --- /dev/null +++ b/0411-glusterd-ensure-volinfo-caps-is-set-to-correct-value.patch @@ -0,0 +1,86 @@ +From da38c139d41c839244cd5acc0464ddf06fa51c78 Mon Sep 17 00:00:00 2001 +From: Sanju Rakonde +Date: Wed, 3 Oct 2018 23:58:37 +0530 +Subject: [PATCH 411/444] glusterd: ensure volinfo->caps is set to correct + value + +With the commit febf5ed4848, during the volume create op, +we are setting volinfo->caps to 0, only if any of the bricks +belong to the same node and brickinfo->vg[0] is null. +Previously, we used to set volinfo->caps to 0, when +either brick doesn't belong to the same node or brickinfo->vg[0] +is null. + +With this patch, we set volinfo->caps to 0, when either brick +doesn't belong to the same node or brickinfo->vg[0] is null. +(as we do earlier without commit febf5ed4848). + +> fixes: bz#1635820 +> Change-Id: I00a97415786b775fb088ac45566ad52b402f1a49 +> Signed-off-by: Sanju Rakonde + +upstream patch: https://review.gluster.org/#/c/glusterfs/+/21336/ + +Change-Id: I00a97415786b775fb088ac45566ad52b402f1a49 +BUG: 1635136 +Signed-off-by: Sanju Rakonde +Reviewed-on: https://code.engineering.redhat.com/gerrit/154909 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + .../bug-1636957-peer-reject-on-glusterd-reboot.t | 29 ++++++++++++++++++++++ + xlators/mgmt/glusterd/src/glusterd-volume-ops.c | 2 ++ + 2 files changed, 31 insertions(+) + create mode 100644 tests/bugs/glusterd/bug-1636957-peer-reject-on-glusterd-reboot.t + +diff --git a/tests/bugs/glusterd/bug-1636957-peer-reject-on-glusterd-reboot.t b/tests/bugs/glusterd/bug-1636957-peer-reject-on-glusterd-reboot.t +new file mode 100644 +index 0000000..b462b38 +--- /dev/null ++++ b/tests/bugs/glusterd/bug-1636957-peer-reject-on-glusterd-reboot.t +@@ -0,0 +1,29 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../cluster.rc ++. 
$(dirname $0)/../../volume.rc ++ ++function peer_count { ++eval \$CLI_$1 peer status | grep 'Peer in Cluster (Connected)' | wc -l ++} ++ ++cleanup ++ ++TEST launch_cluster 2 ++ ++TEST $CLI_1 peer probe $H2; ++EXPECT_WITHIN $PROBE_TIMEOUT 1 peer_count 1 ++EXPECT_WITHIN $PROBE_TIMEOUT 1 peer_count 2 ++ ++TEST $CLI_1 volume create $V0 $H1:$B1/$V0 $H2:$B2/$V0 ++ ++# rebooting a node which doesn't host bricks for any one volume ++# peer should not go into rejected state ++TEST kill_glusterd 2 ++TEST start_glusterd 2 ++ ++EXPECT_WITHIN $PROBE_TIMEOUT 1 peer_count 1 ++EXPECT_WITHIN $PROBE_TIMEOUT 1 peer_count 2 ++ ++cleanup +diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c +index 36d9bff..87b7acc 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c ++++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c +@@ -2485,6 +2485,8 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr) + caps = 0; + } + #endif ++ } else { ++ caps = 0; + } + + cds_list_add_tail (&brickinfo->brick_list, &volinfo->bricks); +-- +1.8.3.1 + diff --git a/0412-glusterd-set-fsid-while-performing-replace-brick.patch b/0412-glusterd-set-fsid-while-performing-replace-brick.patch new file mode 100644 index 0000000..2535ab0 --- /dev/null +++ b/0412-glusterd-set-fsid-while-performing-replace-brick.patch @@ -0,0 +1,126 @@ +From 1e1495a8d5356e6a4f724c211cdd17c5e3f399b5 Mon Sep 17 00:00:00 2001 +From: Sanju Rakonde +Date: Tue, 30 Oct 2018 16:36:50 +0530 +Subject: [PATCH 412/444] glusterd: set fsid while performing replace brick + +While performing the replace-brick operation, we should set +fsid value to the new brick. + +> fixes: bz#1637196 +> Change-Id: I9e9a4962fc0c2f5dff43e4ac11767814a0c0beaf +> Signed-off-by: Sanju Rakonde + +upstream patch: https://review.gluster.org/#/c/glusterfs/+/21513/ + +Change-Id: I9e9a4962fc0c2f5dff43e4ac11767814a0c0beaf +BUG: 1644279 +Signed-off-by: Sanju Rakonde +Reviewed-on: https://code.engineering.redhat.com/gerrit/154907 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + .../df-results-post-replace-brick-operations.t | 58 ++++++++++++++++++++++ + xlators/mgmt/glusterd/src/glusterd-replace-brick.c | 15 ++++++ + 2 files changed, 73 insertions(+) + create mode 100644 tests/bugs/glusterd/df-results-post-replace-brick-operations.t + +diff --git a/tests/bugs/glusterd/df-results-post-replace-brick-operations.t b/tests/bugs/glusterd/df-results-post-replace-brick-operations.t +new file mode 100644 +index 0000000..443911c +--- /dev/null ++++ b/tests/bugs/glusterd/df-results-post-replace-brick-operations.t +@@ -0,0 +1,58 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++ ++cleanup ++TEST glusterd ++ ++#Create brick partitions ++TEST truncate -s 100M $B0/brick1 ++TEST truncate -s 100M $B0/brick2 ++TEST truncate -s 100M $B0/brick3 ++TEST truncate -s 100M $B0/brick4 ++TEST truncate -s 100M $B0/brick5 ++ ++LO1=`SETUP_LOOP $B0/brick1` ++TEST [ $? -eq 0 ] ++TEST MKFS_LOOP $LO1 ++ ++LO2=`SETUP_LOOP $B0/brick2` ++TEST [ $? -eq 0 ] ++TEST MKFS_LOOP $LO2 ++ ++LO3=`SETUP_LOOP $B0/brick3` ++TEST [ $? -eq 0 ] ++TEST MKFS_LOOP $LO3 ++ ++LO4=`SETUP_LOOP $B0/brick4` ++TEST [ $? -eq 0 ] ++TEST MKFS_LOOP $LO4 ++ ++LO5=`SETUP_LOOP $B0/brick5` ++TEST [ $? 
-eq 0 ] ++TEST MKFS_LOOP $LO5 ++ ++TEST mkdir -p $B0/${V0}1 $B0/${V0}2 $B0/${V0}3 $B0/${V0}4 $B0/${V0}5 ++TEST MOUNT_LOOP $LO1 $B0/${V0}1 ++TEST MOUNT_LOOP $LO2 $B0/${V0}2 ++TEST MOUNT_LOOP $LO3 $B0/${V0}3 ++TEST MOUNT_LOOP $LO4 $B0/${V0}4 ++TEST MOUNT_LOOP $LO5 $B0/${V0}5 ++ ++# create a subdirectory in mount point and use it for volume creation ++TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}1/brick1 $H0:$B0/${V0}2/brick1 $H0:$B0/${V0}3/brick1 ++TEST $CLI volume start $V0 ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "3" online_brick_count ++ ++# mount the volume and check the size at mount point ++TEST $GFS --volfile-server=$H0 --volfile-id=$V0 $M0 ++total_space=$(df -P $M0 | tail -1 | awk '{ print $2}') ++ ++# perform replace brick operations ++TEST $CLI volume replace-brick $V0 $H0:$B0/${V0}1/brick1 $H0:$B0/${V0}4/brick1 commit force ++TEST $CLI volume replace-brick $V0 $H0:$B0/${V0}2/brick1 $H0:$B0/${V0}5/brick1 commit force ++ ++# check for the size at mount point, it should be same as previous ++total_space_new=$(df -P $M0 | tail -1 | awk '{ print $2}') ++TEST [ $total_space -eq $total_space_new ] +diff --git a/xlators/mgmt/glusterd/src/glusterd-replace-brick.c b/xlators/mgmt/glusterd/src/glusterd-replace-brick.c +index a037323..5fc3669 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-replace-brick.c ++++ b/xlators/mgmt/glusterd/src/glusterd-replace-brick.c +@@ -362,6 +362,7 @@ glusterd_op_perform_replace_brick (glusterd_volinfo_t *volinfo, + int32_t ret = -1; + xlator_t *this = NULL; + glusterd_conf_t *conf = NULL; ++ struct statvfs brickstat = {0,}; + + this = THIS; + GF_ASSERT (this); +@@ -379,6 +380,20 @@ glusterd_op_perform_replace_brick (glusterd_volinfo_t *volinfo, + ret = glusterd_resolve_brick (new_brickinfo); + if (ret) + goto out; ++ if (!gf_uuid_compare(new_brickinfo->uuid, MY_UUID)) { ++ ret = sys_statvfs(new_brickinfo->path, &brickstat); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, errno, GD_MSG_STATVFS_FAILED, ++ "Failed to fetch disk utilization " ++ "from the brick (%s:%s). Please check the health of " ++ "the brick. Error code was %s", ++ new_brickinfo->hostname, new_brickinfo->path, ++ strerror(errno)); ++ ++ goto out; ++ } ++ new_brickinfo->statfs_fsid = brickstat.f_fsid; ++ } + + ret = glusterd_volume_brickinfo_get_by_brick (old_brick, + volinfo, &old_brickinfo, +-- +1.8.3.1 + diff --git a/0413-glusterfind-add-logs-to-identify-parsing-phases.patch b/0413-glusterfind-add-logs-to-identify-parsing-phases.patch new file mode 100644 index 0000000..9e34554 --- /dev/null +++ b/0413-glusterfind-add-logs-to-identify-parsing-phases.patch @@ -0,0 +1,66 @@ +From 23dda42f6fa9fe0e6def1b6b0cef8dfdd9a5dcb3 Mon Sep 17 00:00:00 2001 +From: Milind Changire +Date: Mon, 5 Nov 2018 15:02:36 +0530 +Subject: [PATCH 413/444] glusterfind: add logs to identify parsing phases + +Add logs to idenitfy start and finish of changelog parsing phases. 
+ +mainline: +> fixes: bz#1632236 +> Reviewed-on: https://review.gluster.org/c/glusterfs/+/21262 +> Change-Id: Id250231f2af7829f887401d30ac98875ae1ae793 +> Signed-off-by: Milind Changire + +Change-Id: Id250231f2af7829f887401d30ac98875ae1ae793 +BUG: 1631166 +Signed-off-by: Milind Changire +Reviewed-on: https://code.engineering.redhat.com/gerrit/154905 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tools/glusterfind/src/changelog.py | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/tools/glusterfind/src/changelog.py b/tools/glusterfind/src/changelog.py +index 2376af2..8354cc9 100644 +--- a/tools/glusterfind/src/changelog.py ++++ b/tools/glusterfind/src/changelog.py +@@ -273,6 +273,7 @@ def get_changes(brick, hash_dir, log_file, start, end, args): + fail("%s: %s Historical Changelogs not available: %s" % + (args.node, brick, e), logger=logger) + ++ logger.info("[1/4] Starting changelog parsing ...") + try: + # scan followed by getchanges till scan returns zero. + # history_scan() is blocking call, till it gets the number +@@ -301,18 +302,27 @@ def get_changes(brick, hash_dir, log_file, start, end, args): + fail("%s Error during Changelog Crawl: %s" % (brick, e), + logger=logger) + ++ logger.info("[1/4] Finished changelog parsing.") ++ + # Convert all pgfid available from Changelogs ++ logger.info("[2/4] Starting 'pgfid to path' conversions ...") + pgfid_to_path(brick, changelog_data) + changelog_data.commit() ++ logger.info("[2/4] Finished 'pgfid to path' conversions.") + + # Convert all GFIDs for which no other additional details available ++ logger.info("[3/4] Starting 'gfid to path using pgfid' conversions ...") + gfid_to_path_using_pgfid(brick, changelog_data, args) + changelog_data.commit() ++ logger.info("[3/4] Finished 'gfid to path using pgfid' conversions.") + + # If some GFIDs fail to get converted from previous step, + # convert using find ++ logger.info("[4/4] Starting 'gfid to path using batchfind' " ++ "conversions ...") + gfid_to_path_using_batchfind(brick, changelog_data) + changelog_data.commit() ++ logger.info("[4/4] Finished 'gfid to path using batchfind' conversions.") + + return actual_end + +-- +1.8.3.1 + diff --git a/0414-logrotate-utilize-the-new-maxsize-option.patch b/0414-logrotate-utilize-the-new-maxsize-option.patch new file mode 100644 index 0000000..7809452 --- /dev/null +++ b/0414-logrotate-utilize-the-new-maxsize-option.patch @@ -0,0 +1,121 @@ +From 405a367205c72318fc48d014a201eab3b7031010 Mon Sep 17 00:00:00 2001 +From: Amar Tumballi +Date: Mon, 5 Nov 2018 10:27:10 +0530 +Subject: [PATCH 414/444] logrotate: utilize the new 'maxsize' option + +Since logrotate 3.8.x version, a new option 'maxsize' is supported, +which helps in rotating the logs before the specified time if the +size exceeds maxsize limit. This should help in reducing the +overflow of gluster logs. 
+ +Upstream: +> URL: https://review.gluster.org/21187 + +BUG: 1599808 +Change-Id: Ic662ada8b73798146736ff81963053d8981745b8 +Signed-off-by: Amar Tumballi +Reviewed-on: https://code.engineering.redhat.com/gerrit/154846 +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +Tested-by: RHGS Build Bot +--- + .testignore | 2 ++ + extras/glusterfs-georep-logrotate | 24 +++++++++++++++++++++--- + extras/glusterfs-logrotate | 14 ++++++++++++-- + 3 files changed, 35 insertions(+), 5 deletions(-) + +diff --git a/.testignore b/.testignore +index 4a72bc4..6e5df3a 100644 +--- a/.testignore ++++ b/.testignore +@@ -32,6 +32,8 @@ extras/cliutils/README.md + extras/command-completion/README + extras/create_new_xlator/README.md + extras/glusterfs.vim ++extras/glusterfs-logrotate ++extras/glusterfs-georep-logrotate + extras/group-gluster-block + extras/group-db-workload + extras/group-metadata-cache +diff --git a/extras/glusterfs-georep-logrotate b/extras/glusterfs-georep-logrotate +index 6fdb8c6..3e7ecf3 100644 +--- a/extras/glusterfs-georep-logrotate ++++ b/extras/glusterfs-georep-logrotate +@@ -1,6 +1,12 @@ + /var/log/glusterfs/geo-replication/*/*.log { + sharedscripts +- rotate 52 ++ weekly ++ maxsize 10M ++ minsize 100k ++ ++ # 6 months of logs are good enough ++ rotate 26 ++ + missingok + compress + delaycompress +@@ -15,7 +21,13 @@ + + /var/log/glusterfs/geo-replication-slaves/*.log { + sharedscripts +- rotate 52 ++ weekly ++ maxsize 10M ++ minsize 100k ++ ++ # 6 months of logs are good enough ++ rotate 26 ++ + missingok + compress + delaycompress +@@ -30,7 +42,13 @@ + + /var/log/glusterfs/geo-replication-slaves/*/*.log { + sharedscripts +- rotate 52 ++ weekly ++ maxsize 10M ++ minsize 100k ++ ++ # 6 months of logs are good enough ++ rotate 26 ++ + missingok + compress + delaycompress +diff --git a/extras/glusterfs-logrotate b/extras/glusterfs-logrotate +index 575c0ee..75f700e 100644 +--- a/extras/glusterfs-logrotate ++++ b/extras/glusterfs-logrotate +@@ -2,7 +2,12 @@ + /var/log/glusterfs/*.log { + sharedscripts + weekly +- rotate 52 ++ maxsize 10M ++ minsize 100k ++ ++# 6 months of logs are good enough ++ rotate 26 ++ + missingok + compress + delaycompress +@@ -17,7 +22,12 @@ + /var/log/glusterfs/bricks/*.log { + sharedscripts + weekly +- rotate 52 ++ maxsize 10M ++ minsize 100k ++ ++# 6 months of logs are good enough ++ rotate 26 ++ + missingok + compress + delaycompress +-- +1.8.3.1 + diff --git a/0415-statedump-fix-clang-null-dereference-error.patch b/0415-statedump-fix-clang-null-dereference-error.patch new file mode 100644 index 0000000..08ed7bb --- /dev/null +++ b/0415-statedump-fix-clang-null-dereference-error.patch @@ -0,0 +1,39 @@ +From a469cad3a6b7f340c6ac6fad7c2186299d675d70 Mon Sep 17 00:00:00 2001 +From: Amar Tumballi +Date: Mon, 5 Nov 2018 10:22:44 +0530 +Subject: [PATCH 415/444] statedump: fix clang null dereference error + +ctx->active can be null, and is checked elsewhere in the +same function. In another case, where 'ctx->active' gets +dereferenced, it needs to be validated before the loop +is hit. 
+ +Upstream: +> URL: https://review.gluster.org/21493 + +BUG: 1643035 +Change-Id: I799d92c8089ddbfd9171da4e7e1d77ac91133aba +Signed-off-by: Amar Tumballi +Reviewed-on: https://code.engineering.redhat.com/gerrit/154845 +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +Tested-by: RHGS Build Bot +--- + libglusterfs/src/statedump.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/libglusterfs/src/statedump.c b/libglusterfs/src/statedump.c +index a123adb..a4635f3 100644 +--- a/libglusterfs/src/statedump.c ++++ b/libglusterfs/src/statedump.c +@@ -812,7 +812,7 @@ gf_proc_dump_info (int signum, glusterfs_ctx_t *ctx) + if (!ctx) + goto out; + +- if (ctx) { ++ if (ctx && ctx->active) { + top = ctx->active->first; + for (trav_p = &top->children; *trav_p; + trav_p = &(*trav_p)->next) { +-- +1.8.3.1 + diff --git a/0416-glusterd-ignore-RPC-events-when-glusterd-is-shutting.patch b/0416-glusterd-ignore-RPC-events-when-glusterd-is-shutting.patch new file mode 100644 index 0000000..d45b92f --- /dev/null +++ b/0416-glusterd-ignore-RPC-events-when-glusterd-is-shutting.patch @@ -0,0 +1,63 @@ +From 04e697b79edd55680a319e6fdb5983a1e5686db9 Mon Sep 17 00:00:00 2001 +From: Atin Mukherjee +Date: Wed, 3 Oct 2018 16:34:54 +0530 +Subject: [PATCH 416/444] glusterd: ignore RPC events when glusterd is shutting + down + +When glusterd receives a SIGTERM while it receives RPC +connect/disconnect/destroy events, the thread might lead to a crash +while accessing rcu_read_lock () as the clean up thread might have +already freed up the resources. This is more observable when glusterd +comes up with upgrade mode = on during upgrade process. + +The solution is to ignore these events if glusterd is already in the +middle of cleanup_and_exit (). + +> upstream patch : https://review.gluster.org/#/c/glusterfs/+/21330/ + +>Fixes: bz#1635593 +>Change-Id: I12831d31c2f689d4deb038b83b9421bd5cce26d9 +>Signed-off-by: Atin Mukherjee + +Change-Id: I12831d31c2f689d4deb038b83b9421bd5cce26d9 +BUG: 1635071 +Signed-off-by: Atin Mukherjee +Reviewed-on: https://code.engineering.redhat.com/gerrit/154848 +Reviewed-by: Sanju Rakonde +Tested-by: Sanju Rakonde +Tested-by: RHGS Build Bot +--- + xlators/mgmt/glusterd/src/glusterd-handler.c | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-handler.c b/xlators/mgmt/glusterd/src/glusterd-handler.c +index 861ff17..bf37e70 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-handler.c ++++ b/xlators/mgmt/glusterd/src/glusterd-handler.c +@@ -6340,6 +6340,7 @@ __glusterd_peer_rpc_notify (struct rpc_clnt *rpc, void *mydata, + glusterd_peerctx_t *peerctx = NULL; + gf_boolean_t quorum_action = _gf_false; + glusterd_volinfo_t *volinfo = NULL; ++ glusterfs_ctx_t *ctx = NULL; + uuid_t uuid; + + peerctx = mydata; +@@ -6355,7 +6356,14 @@ __glusterd_peer_rpc_notify (struct rpc_clnt *rpc, void *mydata, + GF_FREE (peerctx); + return 0; + } +- ++ ctx = this->ctx; ++ GF_VALIDATE_OR_GOTO (this->name, ctx, out); ++ if (ctx->cleanup_started) { ++ gf_log (this->name, GF_LOG_INFO, "glusterd already received a " ++ "SIGTERM, dropping the event %d for peer %s", event, ++ peerctx->peername); ++ return 0; ++ } + rcu_read_lock (); + + peerinfo = glusterd_peerinfo_find_by_generation (peerctx->peerinfo_gen); +-- +1.8.3.1 + diff --git a/0417-cli-Add-warning-message-while-converting-to-replica-.patch b/0417-cli-Add-warning-message-while-converting-to-replica-.patch new file mode 100644 index 0000000..58f9366 --- /dev/null +++ 
b/0417-cli-Add-warning-message-while-converting-to-replica-.patch @@ -0,0 +1,190 @@ +From 1a24a7942fe9ecccaf29ae9bc125cd9b08fc8906 Mon Sep 17 00:00:00 2001 +From: karthik-us +Date: Mon, 5 Nov 2018 17:57:55 +0530 +Subject: [PATCH 417/444] cli: Add warning message while converting to replica + 2 configuration + +Backport of: https://review.gluster.org/#/c/glusterfs/+/21136/ + +Currently while creating replica 2 volume we display a warning message +of ending up in split-brain. But while converting an existing volume +from other configuration to replica 2 by add-brick or remove-brick +operations we do not show any such messages. +With this fix in add-brick and remove-brick cases also we will display +the same warning message and prompt for confirmation if the configuration +changes to replica 2. + +Change-Id: Id7b1a40e80fca3e1043b802fa5f7c3b656ef2228 +BUG: 1579758 +Signed-off-by: karthik-us +Reviewed-on: https://code.engineering.redhat.com/gerrit/154947 +Tested-by: RHGS Build Bot +Reviewed-by: Ravishankar Narayanankutty +Reviewed-by: Atin Mukherjee +--- + cli/src/cli-cmd-parser.c | 48 +++++++++++++++++++++++++++++++++++++++++++----- + cli/src/cli-cmd-volume.c | 11 +++++++---- + cli/src/cli.h | 10 +++++----- + 3 files changed, 55 insertions(+), 14 deletions(-) + +diff --git a/cli/src/cli-cmd-parser.c b/cli/src/cli-cmd-parser.c +index 7917d66..3745fb4 100644 +--- a/cli/src/cli-cmd-parser.c ++++ b/cli/src/cli-cmd-parser.c +@@ -1774,8 +1774,8 @@ out: + } + + int32_t +-cli_cmd_volume_add_brick_parse (const char **words, int wordcount, +- dict_t **options, int *ret_type) ++cli_cmd_volume_add_brick_parse (struct cli_state *state, const char **words, ++ int wordcount, dict_t **options, int *ret_type) + { + dict_t *dict = NULL; + char *volname = NULL; +@@ -1790,6 +1790,8 @@ cli_cmd_volume_add_brick_parse (const char **words, int wordcount, + int index; + gf_boolean_t is_force = _gf_false; + int wc = wordcount; ++ gf_answer_t answer = GF_ANSWER_NO; ++ const char *question = NULL; + + GF_ASSERT (words); + GF_ASSERT (options); +@@ -1854,6 +1856,23 @@ cli_cmd_volume_add_brick_parse (const char **words, int wordcount, + goto out; + index = 7; + } ++ ++ if (count == 2) { ++ if (strcmp (words[wordcount - 1], "force")) { ++ question = "Replica 2 volumes are prone to " ++ "split-brain. 
Use Arbiter or " ++ "Replica 3 to avoid this.\n" ++ "Do you still want to continue?\n"; ++ answer = cli_cmd_get_confirmation (state, ++ question); ++ if (GF_ANSWER_NO == answer) { ++ gf_log ("cli", GF_LOG_ERROR, "Add brick" ++ " cancelled, exiting"); ++ ret = -1; ++ goto out; ++ } ++ } ++ } + } else if ((strcmp (w, "stripe")) == 0) { + type = GF_CLUSTER_TYPE_STRIPE; + count = strtol (words[4], NULL, 0); +@@ -2061,9 +2080,9 @@ out: + } + + int32_t +-cli_cmd_volume_remove_brick_parse (const char **words, int wordcount, +- dict_t **options, int *question, +- int *brick_count) ++cli_cmd_volume_remove_brick_parse (struct cli_state *state, const char **words, ++ int wordcount, dict_t **options, ++ int *question, int *brick_count) + { + dict_t *dict = NULL; + char *volname = NULL; +@@ -2081,6 +2100,8 @@ cli_cmd_volume_remove_brick_parse (const char **words, int wordcount, + char *w = NULL; + int32_t command = GF_OP_CMD_NONE; + long count = 0; ++ gf_answer_t answer = GF_ANSWER_NO; ++ const char *ques = NULL; + + GF_ASSERT (words); + GF_ASSERT (options); +@@ -2115,6 +2136,23 @@ cli_cmd_volume_remove_brick_parse (const char **words, int wordcount, + goto out; + } + ++ if (count == 2) { ++ if (strcmp (words[wordcount - 1], "force")) { ++ ques = "Replica 2 volumes are prone to " ++ "split-brain. Use Arbiter or Replica 3 " ++ "to avoid this.\n" ++ "Do you still want to continue?\n"; ++ answer = cli_cmd_get_confirmation (state, ++ ques); ++ if (GF_ANSWER_NO == answer) { ++ gf_log ("cli", GF_LOG_ERROR, "Remove " ++ "brick cancelled, exiting"); ++ ret = -1; ++ goto out; ++ } ++ } ++ } ++ + ret = dict_set_int32 (dict, "replica-count", count); + if (ret) + goto out; +diff --git a/cli/src/cli-cmd-volume.c b/cli/src/cli-cmd-volume.c +index a1f0840..32efa73 100644 +--- a/cli/src/cli-cmd-volume.c ++++ b/cli/src/cli-cmd-volume.c +@@ -1021,7 +1021,8 @@ cli_cmd_volume_add_brick_cbk (struct cli_state *state, + if (!frame) + goto out; + +- ret = cli_cmd_volume_add_brick_parse (words, wordcount, &options, 0); ++ ret = cli_cmd_volume_add_brick_parse (state, words, wordcount, &options, ++ 0); + if (ret) { + cli_usage_out (word->pattern); + parse_error = 1; +@@ -1151,7 +1152,8 @@ do_cli_cmd_volume_attach_tier (struct cli_state *state, + if (!frame) + goto out; + +- ret = cli_cmd_volume_add_brick_parse (words, wordcount, &options, &type); ++ ret = cli_cmd_volume_add_brick_parse (state, words, wordcount, &options, ++ &type); + if (ret) { + cli_usage_out (word->pattern); + parse_error = 1; +@@ -2032,8 +2034,9 @@ cli_cmd_volume_remove_brick_cbk (struct cli_state *state, + if (!frame) + goto out; + +- ret = cli_cmd_volume_remove_brick_parse (words, wordcount, &options, +- &need_question, &brick_count); ++ ret = cli_cmd_volume_remove_brick_parse (state, words, wordcount, ++ &options, &need_question, ++ &brick_count); + if (ret) { + cli_usage_out (word->pattern); + parse_error = 1; +diff --git a/cli/src/cli.h b/cli/src/cli.h +index c9bf93d..109dcd4 100644 +--- a/cli/src/cli.h ++++ b/cli/src/cli.h +@@ -264,8 +264,8 @@ cli_cmd_get_state_parse (struct cli_state *state, const char **words, + int wordcount, dict_t **options, char **op_errstr); + + int32_t +-cli_cmd_volume_add_brick_parse (const char **words, int wordcount, +- dict_t **options, int *type); ++cli_cmd_volume_add_brick_parse (struct cli_state *state, const char **words, ++ int wordcount, dict_t **options, int *type); + + int32_t + cli_cmd_volume_detach_tier_parse (const char **words, int wordcount, +@@ -280,9 +280,9 @@ cli_cmd_volume_old_tier_parse (const char **words, 
int wordcount, + dict_t **options); + + int32_t +-cli_cmd_volume_remove_brick_parse (const char **words, int wordcount, +- dict_t **options, int *question, +- int *brick_count); ++cli_cmd_volume_remove_brick_parse (struct cli_state *state, const char **words, ++ int wordcount, dict_t **options, ++ int *question, int *brick_count); + + int32_t + cli_cmd_volume_replace_brick_parse (const char **words, int wordcount, +-- +1.8.3.1 + diff --git a/0418-cli-correct-rebalance-status-elapsed-check.patch b/0418-cli-correct-rebalance-status-elapsed-check.patch new file mode 100644 index 0000000..9aa3154 --- /dev/null +++ b/0418-cli-correct-rebalance-status-elapsed-check.patch @@ -0,0 +1,58 @@ +From 414d6d378b7d63b172859f619bd3ffb72bd3f434 Mon Sep 17 00:00:00 2001 +From: N Balachandran +Date: Tue, 8 Aug 2017 23:11:10 +0530 +Subject: [PATCH 418/444] cli: correct rebalance status elapsed check + +Check that elapsed time has crossed 10 mins for at least +one rebalance process before displaying the estimates. + +Upstream patch: https://review.gluster.org/#/c/glusterfs/+/18000/ + +> Change-Id: Ib357a6f0d0125a178e94ede1e31514fdc6ce3593 +> BUG: 1479528 +> Signed-off-by: N Balachandran + +Change-Id: Ic4606acad991b9369c6b674691e0ec15621c6932 +BUG: 1479446 +Signed-off-by: N Balachandran +Reviewed-on: https://code.engineering.redhat.com/gerrit/154929 +Tested-by: RHGS Build Bot +Reviewed-by: Atin Mukherjee +--- + cli/src/cli-rpc-ops.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/cli/src/cli-rpc-ops.c b/cli/src/cli-rpc-ops.c +index 10f772c..5623950 100644 +--- a/cli/src/cli-rpc-ops.c ++++ b/cli/src/cli-rpc-ops.c +@@ -1616,6 +1616,7 @@ gf_cli_print_rebalance_status (dict_t *dict, enum gf_task_types task_type, + gf_boolean_t down = _gf_false; + gf_boolean_t fix_layout = _gf_false; + uint64_t max_time = 0; ++ uint64_t max_elapsed = 0; + uint64_t time_left = 0; + gf_boolean_t show_estimates = _gf_false; + +@@ -1758,6 +1759,9 @@ gf_cli_print_rebalance_status (dict_t *dict, enum gf_task_types task_type, + gf_log ("cli", GF_LOG_TRACE, + "failed to get time left"); + ++ if (elapsed > max_elapsed) ++ max_elapsed = elapsed; ++ + if (time_left > max_time) + max_time = time_left; + +@@ -1818,7 +1822,7 @@ gf_cli_print_rebalance_status (dict_t *dict, enum gf_task_types task_type, + if (!show_estimates) { + goto out; + } +- if (elapsed <= REBAL_ESTIMATE_START_TIME) { ++ if (max_elapsed <= REBAL_ESTIMATE_START_TIME) { + cli_out ("The estimated time for rebalance to complete " + "will be unavailable for the first 10 " + "minutes."); +-- +1.8.3.1 + diff --git a/0419-glusterfs-During-reconfigure-set-log-level-per-xlato.patch b/0419-glusterfs-During-reconfigure-set-log-level-per-xlato.patch new file mode 100644 index 0000000..757266c --- /dev/null +++ b/0419-glusterfs-During-reconfigure-set-log-level-per-xlato.patch @@ -0,0 +1,84 @@ +From 56fb13d05cb4465c14cc231bab1296a48c33c57d Mon Sep 17 00:00:00 2001 +From: Mohit Agrawal +Date: Tue, 6 Nov 2018 09:06:34 +0530 +Subject: [PATCH 419/444] glusterfs: During reconfigure set log-level per + xlator level + +Problem: In brick_mux environment, while a user has enabled brick-log-level + for anyone volume, it automatically enables for other volumes + also those are attached with same brick. + +Solution: A log-level option is automatically enabled for other volumes + because log-level saved in glusterfsd_ctx and ctx is common for + volumes those are attached with same brick. 
To resolve it + set log level for all children xlator's at the time of the graph + reconfigure at io-stat xlator. + +> Change-Id: Id9a6efa05d286e0bea2d47f49292d084e7bb2fcf +> fixes: bz#1640495 +> (Reviwed on upstream link https://review.gluster.org/#/c/glusterfs/+/20488/) +> (Cherry pick from commit c34e4161f3cb6539ec83a9020f3d27eb4759a975) + +Change-Id: I1dd57c52997f16e8a05f982c6c05bb4f758e8bd3 +BUG: 1598407 +Signed-off-by: Mohit Agrawal +Reviewed-on: https://code.engineering.redhat.com/gerrit/155021 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/debug/io-stats/src/io-stats.c | 32 +++++++++++++++++++++++++++++++- + 1 file changed, 31 insertions(+), 1 deletion(-) + +diff --git a/xlators/debug/io-stats/src/io-stats.c b/xlators/debug/io-stats/src/io-stats.c +index 0f71334..aade097 100644 +--- a/xlators/debug/io-stats/src/io-stats.c ++++ b/xlators/debug/io-stats/src/io-stats.c +@@ -3812,6 +3812,35 @@ ios_set_log_format_code (struct ios_conf *conf) + conf->dump_format = IOS_DUMP_TYPE_SAMPLES; + } + ++void ++xlator_set_loglevel(xlator_t *this, int log_level) ++{ ++ glusterfs_ctx_t *ctx = NULL; ++ glusterfs_graph_t *active = NULL; ++ xlator_t *top = NULL; ++ xlator_t *trav = this; ++ ++ ctx = this->ctx; ++ GF_ASSERT(ctx); ++ active = ctx->active; ++ top = active->first; ++ ++ if (strcmp(top->type, "protocol/server") || (log_level == -1)) ++ return; ++ ++ /* Set log-level for server xlator */ ++ top->loglevel = log_level; ++ ++ /* Set log-level for parent xlator */ ++ if (this->parents) ++ this->parents->xlator->loglevel = log_level; ++ ++ while (trav) { ++ trav->loglevel = log_level; ++ trav = trav->next; ++ } ++} ++ + int + reconfigure (xlator_t *this, dict_t *options) + { +@@ -3867,7 +3896,8 @@ reconfigure (xlator_t *this, dict_t *options) + GF_OPTION_RECONF ("log-level", log_str, options, str, out); + if (log_str) { + log_level = glusterd_check_log_level (log_str); +- gf_log_set_loglevel (log_level); ++ /* Set loglevel for all children and server xlators */ ++ xlator_set_loglevel(this, log_level); + } + + GF_OPTION_RECONF ("logger", logger_str, options, str, out); +-- +1.8.3.1 + diff --git a/0420-Modify-log-message-DH-ciphers-are-disabled-from-ERRO.patch b/0420-Modify-log-message-DH-ciphers-are-disabled-from-ERRO.patch new file mode 100644 index 0000000..71e539c --- /dev/null +++ b/0420-Modify-log-message-DH-ciphers-are-disabled-from-ERRO.patch @@ -0,0 +1,40 @@ +From 9a6ad46e3d7ae9ac683ef790c12937fee8f1143c Mon Sep 17 00:00:00 2001 +From: Mohit Agrawal +Date: Tue, 6 Nov 2018 09:31:50 +0530 +Subject: [PATCH 420/444] Modify log message 'DH ciphers are disabled' from + ERROR to INFO + +Per the latest comment in bz#1398237 this message is confusing for users +because it suggests an error where none exists. 
+ +> Fixes: bz#1626319 +> Change-Id: I2f05999da157b11e225bf3d95edb597e964f9923 +> Signed-off-by: Omar Kohl +> (Reviewed on upstream link https://review.gluster.org/#/c/glusterfs/+/21108/) + +Change-Id: I154cdd6e33e17d426bcba10fe17fceceba047b16 +BUG: 1632563 +Signed-off-by: Mohit Agrawal +Reviewed-on: https://code.engineering.redhat.com/gerrit/155023 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + rpc/rpc-transport/socket/src/socket.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/rpc/rpc-transport/socket/src/socket.c b/rpc/rpc-transport/socket/src/socket.c +index 243d49c..8a08177 100644 +--- a/rpc/rpc-transport/socket/src/socket.c ++++ b/rpc/rpc-transport/socket/src/socket.c +@@ -4378,7 +4378,7 @@ socket_init (rpc_transport_t *this) + #endif + + if ((bio = BIO_new_file(dh_param, "r")) == NULL) { +- gf_log(this->name,GF_LOG_ERROR, ++ gf_log(this->name, GF_LOG_INFO, + "failed to open %s, " + "DH ciphers are disabled", dh_param); + } +-- +1.8.3.1 + diff --git a/0421-rpc-handle-EAGAIN-when-SSL_ERROR_SYSCALL-is-returned.patch b/0421-rpc-handle-EAGAIN-when-SSL_ERROR_SYSCALL-is-returned.patch new file mode 100644 index 0000000..623fd14 --- /dev/null +++ b/0421-rpc-handle-EAGAIN-when-SSL_ERROR_SYSCALL-is-returned.patch @@ -0,0 +1,66 @@ +From ce2c9ea016ffa20bf291264a012cc14102040900 Mon Sep 17 00:00:00 2001 +From: Milind Changire +Date: Mon, 10 Sep 2018 13:48:18 +0530 +Subject: [PATCH 421/444] rpc: handle EAGAIN when SSL_ERROR_SYSCALL is returned + +Problem: +A return value of ENODATA was forcibly returned in the case where +SSL_get_error(r) returned SSL_ERROR_SYSCALL. Sometimes SSL_ERROR_SYSCALL +is a transient error which is identified by setting errno to EAGAIN. +EAGAIN is not a fatal error and indicates that the syscall needs to be +retried. + +Solution: +Bubble up the errno in case SSL_get_error(r) returns SSL_ERROR_SYSCALL +and let the upper layers handle it appropriately. + +mainline: +> Reviewed-on: https://review.gluster.org/c/glusterfs/+/20993 +> fixes: bz#1622405 +> Change-Id: I76eff278378930ee79abbf9fa267a7e77356eed6 +> BUG: 1622405 + +Change-Id: I76eff278378930ee79abbf9fa267a7e77356eed6 +BUG: 1622308 +Signed-off-by: Milind Changire +Reviewed-on: https://code.engineering.redhat.com/gerrit/154868 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + rpc/rpc-transport/socket/src/socket.c | 11 +++++++++-- + 1 file changed, 9 insertions(+), 2 deletions(-) + +diff --git a/rpc/rpc-transport/socket/src/socket.c b/rpc/rpc-transport/socket/src/socket.c +index 8a08177..34a937f 100644 +--- a/rpc/rpc-transport/socket/src/socket.c ++++ b/rpc/rpc-transport/socket/src/socket.c +@@ -209,6 +209,7 @@ ssl_do (rpc_transport_t *this, void *buf, size_t len, SSL_trinary_func *func) + int r = (-1); + struct pollfd pfd = {-1,}; + socket_private_t *priv = NULL; ++ int myerrno = -1; + + GF_VALIDATE_OR_GOTO(this->name,this->private,out); + priv = this->private; +@@ -276,10 +277,16 @@ ssl_do (rpc_transport_t *this, void *buf, size_t len, SSL_trinary_func *func) + } + break; + case SSL_ERROR_SYSCALL: ++ myerrno = errno; + /* This is what we get when remote disconnects. 
*/ + gf_log(this->name,GF_LOG_DEBUG, +- "syscall error (probably remote disconnect)"); +- errno = ENODATA; ++ "syscall error (probably remote disconnect)" ++ " errno:%d(%s)", errno, strerror(errno)); ++ /* sometimes, errno is set to EAGAIN in this case ++ * so let the upper layers do what they need to do ++ * with it ++ */ ++ errno = myerrno; + goto out; + default: + errno = EIO; +-- +1.8.3.1 + diff --git a/0422-glusterd-raise-default-transport.listen-backlog.patch b/0422-glusterd-raise-default-transport.listen-backlog.patch new file mode 100644 index 0000000..dd95bc3 --- /dev/null +++ b/0422-glusterd-raise-default-transport.listen-backlog.patch @@ -0,0 +1,46 @@ +From ccac7336bb6fa667b4f9b51426440d898ff3d184 Mon Sep 17 00:00:00 2001 +From: Milind Changire +Date: Mon, 5 Nov 2018 19:38:08 +0530 +Subject: [PATCH 422/444] glusterd: raise default transport.listen-backlog + +Problem: +data center setups with large number of bricks with replication +causes a flood of connections from bricks and self-heal daemons +to glusterd causing connections to be dropped due to insufficient +listener socket backlog queue length + +Solution: +raise default value of transport.listen-backlog to 1024 + +mainline: +> Reviewed-on: https://review.gluster.org/c/glusterfs/+/21482 +> Change-Id: I879e4161a88f1e30875046dff232499a8e2e6c51 +> fixes: bz#1642850 +> Signed-off-by: Milind Changire + +Change-Id: I879e4161a88f1e30875046dff232499a8e2e6c51 +BUG: 1642854 +Signed-off-by: Milind Changire +Reviewed-on: https://code.engineering.redhat.com/gerrit/154959 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + libglusterfs/src/glusterfs.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h +index d71a9c1..5e641fd 100644 +--- a/libglusterfs/src/glusterfs.h ++++ b/libglusterfs/src/glusterfs.h +@@ -73,7 +73,7 @@ + #endif + + #define GLUSTERD_MAX_SNAP_NAME 255 +-#define GLUSTERFS_SOCKET_LISTEN_BACKLOG 10 ++#define GLUSTERFS_SOCKET_LISTEN_BACKLOG 1024 + #define ZR_MOUNTPOINT_OPT "mountpoint" + #define ZR_ATTR_TIMEOUT_OPT "attribute-timeout" + #define ZR_ENTRY_TIMEOUT_OPT "entry-timeout" +-- +1.8.3.1 + diff --git a/0423-glusterd-acquire-lock-to-update-volinfo-structure.patch b/0423-glusterd-acquire-lock-to-update-volinfo-structure.patch new file mode 100644 index 0000000..8b23e30 --- /dev/null +++ b/0423-glusterd-acquire-lock-to-update-volinfo-structure.patch @@ -0,0 +1,150 @@ +From 216ac7a1bd22db08cc02d7b8688a3338e78c71cd Mon Sep 17 00:00:00 2001 +From: Sanju Rakonde +Date: Tue, 11 Sep 2018 14:19:42 +0530 +Subject: [PATCH 423/444] glusterd: acquire lock to update volinfo structure + +Problem: With commit cb0339f92, we are using a separate syntask +for restart_bricks. There can be a situation where two threads +are accessing the same volinfo structure at the same time and +updating volinfo structure. This can lead volinfo to have +inconsistent values and assertion failures because of unexpected +values. + +Solution: While updating the volinfo structure, acquire a +store_volinfo_lock, and release the lock only when the thread +completed its critical section part. 
+ +> Fixes: bz#1627610 +> Signed-off-by: Sanju Rakonde +> Change-Id: I545e4e2368e3285d8f7aa28081ff4448abb72f5d + +upstream patch: https://review.gluster.org/#/c/glusterfs/+/21150/ + +Change-Id: I545e4e2368e3285d8f7aa28081ff4448abb72f5d +BUG: 1631418 +Signed-off-by: Sanju Rakonde +Reviewed-on: https://code.engineering.redhat.com/gerrit/154885 +Tested-by: RHGS Build Bot +Reviewed-by: Atin Mukherjee +--- + xlators/mgmt/glusterd/src/glusterd-store.c | 67 +++++++++++++------------ + xlators/mgmt/glusterd/src/glusterd-volume-ops.c | 2 + + xlators/mgmt/glusterd/src/glusterd.h | 3 ++ + 3 files changed, 40 insertions(+), 32 deletions(-) + +diff --git a/xlators/mgmt/glusterd/src/glusterd-store.c b/xlators/mgmt/glusterd/src/glusterd-store.c +index 015f6c2..37542e7 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-store.c ++++ b/xlators/mgmt/glusterd/src/glusterd-store.c +@@ -1796,46 +1796,49 @@ glusterd_store_volinfo (glusterd_volinfo_t *volinfo, glusterd_volinfo_ver_ac_t a + + GF_ASSERT (volinfo); + +- glusterd_perform_volinfo_version_action (volinfo, ac); +- ret = glusterd_store_create_volume_dir (volinfo); +- if (ret) +- goto out; +- +- ret = glusterd_store_create_volume_run_dir (volinfo); +- if (ret) +- goto out; ++ pthread_mutex_lock(&volinfo->store_volinfo_lock); ++ { ++ glusterd_perform_volinfo_version_action(volinfo, ac); ++ ret = glusterd_store_create_volume_dir(volinfo); ++ if (ret) ++ goto unlock; + +- ret = glusterd_store_create_vol_shandle_on_absence (volinfo); +- if (ret) +- goto out; ++ ret = glusterd_store_create_volume_run_dir(volinfo); ++ if (ret) ++ goto unlock; + +- ret = glusterd_store_create_nodestate_sh_on_absence (volinfo); +- if (ret) +- goto out; ++ ret = glusterd_store_create_vol_shandle_on_absence(volinfo); ++ if (ret) ++ goto unlock; + +- ret = glusterd_store_perform_volume_store (volinfo); +- if (ret) +- goto out; ++ ret = glusterd_store_create_nodestate_sh_on_absence(volinfo); ++ if (ret) ++ goto unlock; + +- ret = glusterd_store_volume_atomic_update (volinfo); +- if (ret) { +- glusterd_perform_volinfo_version_action (volinfo, +- GLUSTERD_VOLINFO_VER_AC_DECREMENT); +- goto out; +- } ++ ret = glusterd_store_perform_volume_store(volinfo); ++ if (ret) ++ goto unlock; + +- ret = glusterd_store_perform_node_state_store (volinfo); +- if (ret) +- goto out; ++ ret = glusterd_store_volume_atomic_update(volinfo); ++ if (ret) { ++ glusterd_perform_volinfo_version_action(volinfo, ++ GLUSTERD_VOLINFO_VER_AC_DECREMENT); ++ goto unlock; ++ } + +- /* checksum should be computed at the end */ +- ret = glusterd_compute_cksum (volinfo, _gf_false); +- if (ret) +- goto out; ++ ret = glusterd_store_perform_node_state_store(volinfo); ++ if (ret) ++ goto unlock; + +-out: ++ /* checksum should be computed at the end */ ++ ret = glusterd_compute_cksum(volinfo, _gf_false); ++ if (ret) ++ goto unlock; ++ } ++unlock: ++ pthread_mutex_unlock(&volinfo->store_volinfo_lock); + if (ret) +- glusterd_store_volume_cleanup_tmp (volinfo); ++ glusterd_store_volume_cleanup_tmp(volinfo); + + gf_msg_debug (THIS->name, 0, "Returning %d", ret); + +diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c +index 87b7acc..b91a516 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-volume-ops.c ++++ b/xlators/mgmt/glusterd/src/glusterd-volume-ops.c +@@ -2198,6 +2198,8 @@ glusterd_op_create_volume (dict_t *dict, char **op_errstr) + goto out; + } + ++ pthread_mutex_init(&volinfo->store_volinfo_lock, NULL); ++ + ret = dict_get_str (dict, "volname", &volname); + + if (ret) 
{ +diff --git a/xlators/mgmt/glusterd/src/glusterd.h b/xlators/mgmt/glusterd/src/glusterd.h +index 8c70d48..edd41aa 100644 +--- a/xlators/mgmt/glusterd/src/glusterd.h ++++ b/xlators/mgmt/glusterd/src/glusterd.h +@@ -478,6 +478,9 @@ struct glusterd_volinfo_ { + gf_boolean_t stage_deleted; /* volume has passed staging + * for delete operation + */ ++ pthread_mutex_t store_volinfo_lock; /* acquire lock for ++ * updating the volinfo ++ */ + }; + + typedef enum gd_snap_status_ { +-- +1.8.3.1 + diff --git a/0424-cluster-afr-Delegate-metadata-heal-with-pending-xatt.patch b/0424-cluster-afr-Delegate-metadata-heal-with-pending-xatt.patch new file mode 100644 index 0000000..2584cb4 --- /dev/null +++ b/0424-cluster-afr-Delegate-metadata-heal-with-pending-xatt.patch @@ -0,0 +1,272 @@ +From 68b0db385ce968547349b187222b9a9401faee12 Mon Sep 17 00:00:00 2001 +From: Pranith Kumar K +Date: Mon, 27 Aug 2018 11:46:33 +0530 +Subject: [PATCH 424/444] cluster/afr: Delegate metadata heal with pending + xattrs to SHD + +Problem: +When metadata-self-heal is triggered on the mount, it blocks +lookup until metadata-self-heal completes. But that can lead +to hangs when lot of clients are accessing a directory which +needs metadata heal and all of them trigger heals waiting +for other clients to complete heal. + +Fix: +Only when the heal is needed but the pending xattrs are not set, +trigger metadata heal that could block lookup. This is the only +case where different clients may give different metadata to the +clients without heals, which should be avoided. + +BUG: 1619357 +Upstream Patch: https://review.gluster.org/c/glusterfs/+/21086 +Change-Id: I6089e9fda0770a83fb287941b229c882711f4e66 +Signed-off-by: Pranith Kumar K +Reviewed-on: https://code.engineering.redhat.com/gerrit/155028 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/basic/afr/client-side-heal.t | 28 ++++++++++------ + tests/bugs/glusterfs/bug-906646.t | 10 ++++-- + xlators/cluster/afr/src/afr-common.c | 44 ++++++++++++++++++++++++++ + xlators/cluster/afr/src/afr-self-heal-common.c | 38 ---------------------- + xlators/cluster/afr/src/afr.h | 3 ++ + 5 files changed, 72 insertions(+), 51 deletions(-) + +diff --git a/tests/basic/afr/client-side-heal.t b/tests/basic/afr/client-side-heal.t +index eba7dc2..1e93361 100755 +--- a/tests/basic/afr/client-side-heal.t ++++ b/tests/basic/afr/client-side-heal.t +@@ -17,6 +17,7 @@ TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0; + echo "some data" > $M0/datafile + EXPECT 0 echo $? + TEST touch $M0/mdatafile ++TEST touch $M0/mdatafile-backend-direct-modify + TEST mkdir $M0/dir + + #Kill a brick and perform I/O to have pending heals. +@@ -29,6 +30,7 @@ EXPECT 0 echo $? + + #pending metadata heal + TEST chmod +x $M0/mdatafile ++TEST chmod +x $B0/${V0}0/mdatafile-backend-direct-modify + + #pending entry heal. Also causes pending metadata/data heals on file{1..5} + TEST touch $M0/dir/file{1..5} +@@ -40,9 +42,12 @@ TEST $CLI volume start $V0 force + EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 0 + + #Medatada heal via explicit lookup must not happen +-TEST ls $M0/mdatafile ++TEST getfattr -d -m. -e hex $M0/mdatafile ++TEST ls $M0/mdatafile-backend-direct-modify + +-#Inode refresh must not trigger data and entry heals. ++TEST [[ "$(stat -c %A $B0/${V0}0/mdatafile-backend-direct-modify)" != "$(stat -c %A $B0/${V0}1/mdatafile-backend-direct-modify)" ]] ++ ++#Inode refresh must not trigger data metadata and entry heals. 
+ #To trigger inode refresh for sure, the volume is unmounted and mounted each time. + #Check that data heal does not happen. + EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 +@@ -52,7 +57,6 @@ TEST cat $M0/datafile + EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 + TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0; + TEST ls $M0/dir +- + #No heal must have happened + EXPECT 8 get_pending_heal_count $V0 + +@@ -61,21 +65,25 @@ TEST $CLI volume set $V0 cluster.data-self-heal on + TEST $CLI volume set $V0 cluster.metadata-self-heal on + TEST $CLI volume set $V0 cluster.entry-self-heal on + +-#Metadata heal is triggered by lookup without need for inode refresh. +-TEST ls $M0/mdatafile +-EXPECT 7 get_pending_heal_count $V0 +- +-#Inode refresh must trigger data and entry heals. ++#Inode refresh must trigger data metadata and entry heals. + #To trigger inode refresh for sure, the volume is unmounted and mounted each time. + EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 + TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0; ++TEST ls $M0/mdatafile-backend-direct-modify ++ ++TEST [[ "$(stat -c %A $B0/${V0}0/mdatafile-backend-direct-modify)" == "$(stat -c %A $B0/${V0}1/mdatafile-backend-direct-modify)" ]] ++ ++ ++TEST getfattr -d -m. -e hex $M0/mdatafile ++EXPECT_WITHIN $HEAL_TIMEOUT 7 get_pending_heal_count $V0 ++ + TEST cat $M0/datafile + EXPECT_WITHIN $HEAL_TIMEOUT 6 get_pending_heal_count $V0 + + EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 + TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0; + TEST ls $M0/dir +-EXPECT 5 get_pending_heal_count $V0 ++EXPECT_WITHIN $HEAL_TIMEOUT 5 get_pending_heal_count $V0 + + TEST cat $M0/dir/file1 + TEST cat $M0/dir/file2 +@@ -83,5 +91,5 @@ TEST cat $M0/dir/file3 + TEST cat $M0/dir/file4 + TEST cat $M0/dir/file5 + +-EXPECT 0 get_pending_heal_count $V0 ++EXPECT_WITHIN $HEAL_TIMEOUT 0 get_pending_heal_count $V0 + cleanup; +diff --git a/tests/bugs/glusterfs/bug-906646.t b/tests/bugs/glusterfs/bug-906646.t +index 45c85d9..37b8fe5 100644 +--- a/tests/bugs/glusterfs/bug-906646.t ++++ b/tests/bugs/glusterfs/bug-906646.t +@@ -13,7 +13,6 @@ TEST pidof glusterd + TEST $CLI volume create $V0 replica $REPLICA $H0:$B0/${V0}-00 $H0:$B0/${V0}-01 $H0:$B0/${V0}-10 $H0:$B0/${V0}-11 + TEST $CLI volume start $V0 + +-TEST $CLI volume set $V0 cluster.self-heal-daemon off + TEST $CLI volume set $V0 cluster.background-self-heal-count 0 + + ## Mount FUSE with caching disabled +@@ -82,10 +81,15 @@ EXPECT 1 xattr_query_check ${backend_paths_array[1]} "trusted.name" + # restart the brick process + TEST $CLI volume start $V0 force + +-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 `expr $brick_id - 1` ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 3 + +-cat $pth >/dev/null ++TEST $CLI volume heal $V0 + ++EXPECT_WITHIN $HEAL_TIMEOUT "0" get_pending_heal_count $V0 + # check backends - xattr should not be present anywhere + EXPECT 1 xattr_query_check ${backend_paths_array[0]} "trusted.name" + EXPECT 1 xattr_query_check ${backend_paths_array[1]} "trusted.name" +diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c +index e8107c9..e74fdec 100644 +--- a/xlators/cluster/afr/src/afr-common.c ++++ b/xlators/cluster/afr/src/afr-common.c 
+@@ -2571,6 +2571,42 @@ out: + return 0; + } + ++gf_boolean_t ++afr_is_pending_set (xlator_t *this, dict_t *xdata, int type) ++{ ++ int idx = -1; ++ afr_private_t *priv = NULL; ++ void *pending_raw = NULL; ++ int *pending_int = NULL; ++ int i = 0; ++ ++ priv = this->private; ++ idx = afr_index_for_transaction_type (type); ++ ++ if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw) == 0) { ++ if (pending_raw) { ++ pending_int = pending_raw; ++ ++ if (ntoh32 (pending_int[idx])) ++ return _gf_true; ++ } ++ } ++ ++ for (i = 0; i < priv->child_count; i++) { ++ if (dict_get_ptr (xdata, priv->pending_key[i], ++ &pending_raw)) ++ continue; ++ if (!pending_raw) ++ continue; ++ pending_int = pending_raw; ++ ++ if (ntoh32 (pending_int[idx])) ++ return _gf_true; ++ } ++ ++ return _gf_false; ++} ++ + static gf_boolean_t + afr_can_start_metadata_self_heal(call_frame_t *frame, xlator_t *this) + { +@@ -2597,6 +2633,14 @@ afr_can_start_metadata_self_heal(call_frame_t *frame, xlator_t *this) + continue; + } + ++ if (afr_is_pending_set (this, replies[i].xdata, ++ AFR_METADATA_TRANSACTION)) { ++ /* Let shd do the heal so that lookup is not blocked ++ * on getting metadata lock/doing the heal */ ++ start = _gf_false; ++ break; ++ } ++ + if (gf_uuid_compare (stbuf.ia_gfid, replies[i].poststat.ia_gfid)) { + start = _gf_false; + break; +diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c +index d04f11d..c6ee75b 100644 +--- a/xlators/cluster/afr/src/afr-self-heal-common.c ++++ b/xlators/cluster/afr/src/afr-self-heal-common.c +@@ -2182,44 +2182,6 @@ afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, + return 0; + } + +- +-gf_boolean_t +-afr_is_pending_set (xlator_t *this, dict_t *xdata, int type) +-{ +- int idx = -1; +- afr_private_t *priv = NULL; +- void *pending_raw = NULL; +- int *pending_int = NULL; +- int i = 0; +- +- priv = this->private; +- idx = afr_index_for_transaction_type (type); +- +- if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw) == 0) { +- if (pending_raw) { +- pending_int = pending_raw; +- +- if (ntoh32 (pending_int[idx])) +- return _gf_true; +- } +- } +- +- for (i = 0; i < priv->child_count; i++) { +- if (dict_get_ptr (xdata, priv->pending_key[i], +- &pending_raw)) +- continue; +- if (!pending_raw) +- continue; +- pending_int = pending_raw; +- +- if (ntoh32 (pending_int[idx])) +- return _gf_true; +- } +- +- return _gf_false; +-} +- +- + gf_boolean_t + afr_is_data_set (xlator_t *this, dict_t *xdata) + { +diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h +index 2e6d995..af9dbc8 100644 +--- a/xlators/cluster/afr/src/afr.h ++++ b/xlators/cluster/afr/src/afr.h +@@ -1225,4 +1225,7 @@ afr_set_inode_local (xlator_t *this, afr_local_t *local, inode_t *inode); + + gf_boolean_t + afr_is_symmetric_error (call_frame_t *frame, xlator_t *this); ++ ++gf_boolean_t ++afr_is_pending_set (xlator_t *this, dict_t *xdata, int type); + #endif /* __AFR_H__ */ +-- +1.8.3.1 + diff --git a/0425-cluster-afr-Delegate-name-heal-when-possible.patch b/0425-cluster-afr-Delegate-name-heal-when-possible.patch new file mode 100644 index 0000000..193538e --- /dev/null +++ b/0425-cluster-afr-Delegate-name-heal-when-possible.patch @@ -0,0 +1,352 @@ +From 8a3c0fb64c8798ecf5a3635fe0922e3cfd476817 Mon Sep 17 00:00:00 2001 +From: Pranith Kumar K +Date: Mon, 27 Aug 2018 12:40:16 +0530 +Subject: [PATCH 425/444] cluster/afr: Delegate name-heal when possible + +Problem: +When name-self-heal is triggered on the mount, it blocks +lookup until 
name-self-heal completes. But that can lead +to hangs when lot of clients are accessing a directory which +needs name heal and all of them trigger heals waiting +for other clients to complete heal. + +Fix: +When a name-heal is needed but quorum number of names have the +file and pending xattrs exist on the parent, then better to +delegate the heal to SHD which will be completed as part of +entry-heal of the parent directory. We could also do the same +for quorum-number of names not present but we don't have +any known use-case where this is a frequent occurrence so +not changing that part at the moment. When there is a gfid +mismatch or missing gfid it is important to complete the heal +so that next rename doesn't assume everything is fine and +perform a rename etc + +BUG: 1619357 +Upstream Patch: https://review.gluster.org/c/glusterfs/+/21087 +Change-Id: I8b002c85dffc6eb6f2833e742684a233daefeb2c +Signed-off-by: Pranith Kumar K +Reviewed-on: https://code.engineering.redhat.com/gerrit/155029 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/afr.rc | 8 ++ + tests/basic/afr/name-self-heal.t | 112 +++++++++++++++++++++++++++ + xlators/cluster/afr/src/afr-common.c | 100 ++++++++++++++++++------ + xlators/cluster/afr/src/afr-self-heal-name.c | 12 ++- + 4 files changed, 205 insertions(+), 27 deletions(-) + create mode 100644 tests/basic/afr/name-self-heal.t + +diff --git a/tests/afr.rc b/tests/afr.rc +index 1fd0310..a1e8a44 100644 +--- a/tests/afr.rc ++++ b/tests/afr.rc +@@ -89,3 +89,11 @@ function count_index_entries() + { + ls $1/.glusterfs/indices/xattrop | wc -l + } ++ ++function get_quorum_type() ++{ ++ local m="$1" ++ local v="$2" ++ local repl_id="$3" ++ cat $m/.meta/graphs/active/$v-replicate-$repl_id/private|grep quorum-type|awk '{print $3}' ++} +diff --git a/tests/basic/afr/name-self-heal.t b/tests/basic/afr/name-self-heal.t +new file mode 100644 +index 0000000..50fc2ec +--- /dev/null ++++ b/tests/basic/afr/name-self-heal.t +@@ -0,0 +1,112 @@ ++#!/bin/bash ++#Self-heal tests ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++. $(dirname $0)/../../afr.rc ++cleanup; ++ ++#Check that when quorum is not enabled name-heal happens correctly ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 replica 2 $H0:$B0/brick{0,1} ++TEST $CLI volume set $V0 performance.stat-prefetch off ++TEST $CLI volume start $V0 ++TEST $CLI volume heal $V0 disable ++TEST $CLI volume set $V0 cluster.entry-self-heal off ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0; ++ ++TEST touch $M0/a ++TEST touch $M0/c ++TEST kill_brick $V0 $H0 $B0/brick0 ++TEST touch $M0/b ++TEST rm -f $M0/a ++TEST rm -f $M0/c ++TEST touch $M0/c #gfid mismatch case ++c_gfid=$(gf_get_gfid_xattr $B0/brick1/c) ++TEST $CLI volume start $V0 force ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0 ++TEST ! stat $M0/a ++TEST ! stat $B0/brick0/a ++TEST ! 
stat $B0/brick1/a ++ ++TEST stat $M0/b ++TEST stat $B0/brick0/b ++TEST stat $B0/brick1/b ++TEST [[ "$(gf_get_gfid_xattr $B0/brick0/b)" == "$(gf_get_gfid_xattr $B0/brick1/b)" ]] ++ ++TEST stat $M0/c ++TEST stat $B0/brick0/c ++TEST stat $B0/brick1/c ++TEST [[ "$(gf_get_gfid_xattr $B0/brick0/c)" == "$c_gfid" ]] ++ ++cleanup; ++ ++#Check that when quorum is enabled name-heal happens as expected ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 replica 3 $H0:$B0/brick{0,1,2} ++TEST $CLI volume set $V0 performance.stat-prefetch off ++TEST $CLI volume start $V0 ++TEST $CLI volume heal $V0 disable ++TEST $CLI volume set $V0 cluster.entry-self-heal off ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0; ++ ++TEST touch $M0/a ++TEST touch $M0/c ++TEST kill_brick $V0 $H0 $B0/brick0 ++TEST touch $M0/b ++TEST rm -f $M0/a ++TEST rm -f $M0/c ++TEST touch $M0/c #gfid mismatch case ++c_gfid=$(gf_get_gfid_xattr $B0/brick1/c) ++TEST $CLI volume start $V0 force ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0 ++TEST ! stat $M0/a ++TEST ! stat $B0/brick0/a ++TEST ! stat $B0/brick1/a ++TEST ! stat $B0/brick2/a ++ ++TEST stat $M0/b ++TEST ! stat $B0/brick0/b #Name heal shouldn't be triggered ++TEST stat $B0/brick1/b ++TEST stat $B0/brick2/b ++ ++TEST stat $M0/c ++TEST stat $B0/brick0/c ++TEST stat $B0/brick1/c ++TEST stat $B0/brick2/c ++TEST [[ "$(gf_get_gfid_xattr $B0/brick0/c)" == "$c_gfid" ]] ++ ++TEST $CLI volume set $V0 cluster.quorum-type none ++EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "none" get_quorum_type $M0 $V0 0 ++TEST stat $M0/b ++TEST stat $B0/brick0/b #Name heal should be triggered ++TEST stat $B0/brick1/b ++TEST stat $B0/brick2/b ++TEST [[ "$(gf_get_gfid_xattr $B0/brick0/b)" == "$(gf_get_gfid_xattr $B0/brick1/b)" ]] ++TEST $CLI volume set $V0 cluster.quorum-type auto ++EXPECT_WITHIN $CONFIG_UPDATE_TIMEOUT "auto" get_quorum_type $M0 $V0 0 ++ ++EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 ++ ++#Missing parent xattrs cases ++TEST $CLI volume heal $V0 enable ++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "Y" glustershd_up_status ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2 ++TEST $CLI volume heal $V0 ++EXPECT_WITHIN $HEAL_TIMEOUT "0" get_pending_heal_count $V0 ++ ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0; ++TEST $CLI volume heal $V0 disable ++#In cases where a good parent doesn't have pending xattrs and a file, ++#name-heal will be triggered ++TEST gf_rm_file_and_gfid_link $B0/brick1 c ++TEST stat $M0/c ++TEST stat $B0/brick0/c ++TEST stat $B0/brick1/c ++TEST stat $B0/brick2/c ++TEST [[ "$(gf_get_gfid_xattr $B0/brick0/c)" == "$c_gfid" ]] ++cleanup +diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c +index e74fdec..ce2b17a 100644 +--- a/xlators/cluster/afr/src/afr-common.c ++++ b/xlators/cluster/afr/src/afr-common.c +@@ -2302,8 +2302,6 @@ afr_lookup_done (call_frame_t *frame, xlator_t *this) + */ + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid || replies[i].op_ret == -1) { +- if (priv->child_up[i]) +- can_interpret = _gf_false; + continue; + } + +@@ -2742,21 +2740,52 @@ afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this) + afr_private_t *priv = NULL; + call_frame_t *heal = NULL; + int i = 0, first = -1; +- gf_boolean_t need_heal = _gf_false; ++ gf_boolean_t name_state_mismatch = _gf_false; + struct afr_reply *replies = NULL; + int 
ret = 0; ++ unsigned char *par_readables = NULL; ++ unsigned char *success = NULL; ++ int32_t op_errno = 0; ++ uuid_t gfid = {0}; + + local = frame->local; + replies = local->replies; + priv = this->private; ++ par_readables = alloca0(priv->child_count); ++ success = alloca0(priv->child_count); ++ ++ ret = afr_inode_read_subvol_get (local->loc.parent, this, par_readables, ++ NULL, NULL); ++ if (ret < 0 || AFR_COUNT (par_readables, priv->child_count) == 0) { ++ /* In this case set par_readables to all 1 so that name_heal ++ * need checks at the end of this function will flag missing ++ * entry when name state mismatches*/ ++ memset (par_readables, 1, priv->child_count); ++ } + + for (i = 0; i < priv->child_count; i++) { + if (!replies[i].valid) + continue; + ++ if (replies[i].op_ret == 0) { ++ if (uuid_is_null (gfid)) { ++ gf_uuid_copy (gfid, ++ replies[i].poststat.ia_gfid); ++ } ++ success[i] = 1; ++ } else { ++ if ((replies[i].op_errno != ENOTCONN) && ++ (replies[i].op_errno != ENOENT) && ++ (replies[i].op_errno != ESTALE)) { ++ op_errno = replies[i].op_errno; ++ } ++ } ++ ++ /*gfid is missing, needs heal*/ + if ((replies[i].op_ret == -1) && +- (replies[i].op_errno == ENODATA)) +- need_heal = _gf_true; ++ (replies[i].op_errno == ENODATA)) { ++ goto name_heal; ++ } + + if (first == -1) { + first = i; +@@ -2764,30 +2793,53 @@ afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this) + } + + if (replies[i].op_ret != replies[first].op_ret) { +- need_heal = _gf_true; +- break; ++ name_state_mismatch = _gf_true; + } + +- if (gf_uuid_compare (replies[i].poststat.ia_gfid, +- replies[first].poststat.ia_gfid)) { +- need_heal = _gf_true; +- break; +- } ++ if (replies[i].op_ret == 0) { ++ /* Rename after this lookup may succeed if we don't do ++ * a name-heal and the destination may not have pending xattrs ++ * to indicate which name is good and which is bad so always do ++ * this heal*/ ++ if (gf_uuid_compare (replies[i].poststat.ia_gfid, ++ gfid)) { ++ goto name_heal; ++ } ++ } + } + +- if (need_heal) { +- heal = afr_frame_create (this, NULL); +- if (!heal) +- goto metadata_heal; +- +- ret = synctask_new (this->ctx->env, afr_lookup_selfheal_wrap, +- afr_refresh_selfheal_done, heal, frame); +- if (ret) { +- AFR_STACK_DESTROY (heal); +- goto metadata_heal; ++ if (name_state_mismatch) { ++ if (!priv->quorum_count) ++ goto name_heal; ++ if (!afr_has_quorum (success, this)) ++ goto name_heal; ++ if (op_errno) ++ goto name_heal; ++ for (i = 0; i < priv->child_count; i++) { ++ if (!replies[i].valid) ++ continue; ++ if (par_readables[i] && replies[i].op_ret < 0 && ++ replies[i].op_errno != ENOTCONN) { ++ goto name_heal; ++ } + } +- return ret; +- } ++ } ++ ++ goto metadata_heal; ++ ++name_heal: ++ heal = afr_frame_create (this, NULL); ++ if (!heal) ++ goto metadata_heal; ++ ++ ret = synctask_new (this->ctx->env, afr_lookup_selfheal_wrap, ++ afr_refresh_selfheal_done, heal, frame); ++ if (ret) { ++ AFR_STACK_DESTROY (heal); ++ goto metadata_heal; ++ } ++ return ret; ++ + metadata_heal: + ret = afr_lookup_metadata_heal_check (frame, this); + +diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c +index bcd0e60..0a5be29 100644 +--- a/xlators/cluster/afr/src/afr-self-heal-name.c ++++ b/xlators/cluster/afr/src/afr-self-heal-name.c +@@ -634,20 +634,26 @@ afr_selfheal_name_unlocked_inspect (call_frame_t *frame, xlator_t *this, + continue; + + if ((replies[i].op_ret == -1) && +- (replies[i].op_errno == ENODATA)) ++ (replies[i].op_errno == ENODATA)) { + 
*need_heal = _gf_true; ++ break; ++ } + + if (first_idx == -1) { + first_idx = i; + continue; + } + +- if (replies[i].op_ret != replies[first_idx].op_ret) ++ if (replies[i].op_ret != replies[first_idx].op_ret) { + *need_heal = _gf_true; ++ break; ++ } + + if (gf_uuid_compare (replies[i].poststat.ia_gfid, +- replies[first_idx].poststat.ia_gfid)) ++ replies[first_idx].poststat.ia_gfid)) { + *need_heal = _gf_true; ++ break; ++ } + } + + if (inode) +-- +1.8.3.1 + diff --git a/0426-features-shard-Make-operations-on-internal-directori.patch b/0426-features-shard-Make-operations-on-internal-directori.patch new file mode 100644 index 0000000..aa8e5a9 --- /dev/null +++ b/0426-features-shard-Make-operations-on-internal-directori.patch @@ -0,0 +1,582 @@ +From 9be984ac2b71423b72ab3b1fa45b4d77a263ce1e Mon Sep 17 00:00:00 2001 +From: Krutika Dhananjay +Date: Wed, 28 Mar 2018 12:09:27 +0530 +Subject: [PATCH 426/444] features/shard: Make operations on internal + directories generic + +> Upstream: https://review.gluster.org/19892 +> BUG: 1568521 +> Change-Id: Iea7ad2102220c6d415909f8caef84167ce2d6818 + +Change-Id: Iea7ad2102220c6d415909f8caef84167ce2d6818 +BUG: 1520882 +Signed-off-by: Krutika Dhananjay +Reviewed-on: https://code.engineering.redhat.com/gerrit/154860 +Tested-by: RHGS Build Bot +Reviewed-by: Xavi Hernandez +--- + xlators/features/shard/src/shard.c | 295 +++++++++++++++++++++++++------------ + xlators/features/shard/src/shard.h | 4 + + 2 files changed, 206 insertions(+), 93 deletions(-) + +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index f5fb181..5ff04df 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -546,30 +546,55 @@ shard_call_count_return (call_frame_t *frame) + return call_count; + } + ++static char * ++shard_internal_dir_string (shard_internal_dir_type_t type) ++{ ++ char *str = NULL; ++ ++ switch (type) { ++ case SHARD_INTERNAL_DIR_DOT_SHARD: ++ str = ".shard"; ++ break; ++ default: ++ break; ++ } ++ return str; ++} ++ + static int +-shard_init_dot_shard_loc (xlator_t *this, shard_local_t *local) ++shard_init_internal_dir_loc (xlator_t *this, shard_local_t *local, ++ shard_internal_dir_type_t type) + { +- int ret = -1; +- loc_t *dot_shard_loc = NULL; ++ int ret = -1; ++ char *bname = NULL; ++ loc_t *internal_dir_loc = NULL; + + if (!local) + return -1; + +- dot_shard_loc = &local->dot_shard_loc; +- dot_shard_loc->inode = inode_new (this->itable); +- dot_shard_loc->parent = inode_ref (this->itable->root); +- ret = inode_path (dot_shard_loc->parent, GF_SHARD_DIR, +- (char **)&dot_shard_loc->path); +- if (ret < 0 || !(dot_shard_loc->inode)) { ++ switch (type) { ++ case SHARD_INTERNAL_DIR_DOT_SHARD: ++ internal_dir_loc = &local->dot_shard_loc; ++ bname = GF_SHARD_DIR; ++ break; ++ default: ++ break; ++ } ++ ++ internal_dir_loc->inode = inode_new (this->itable); ++ internal_dir_loc->parent = inode_ref (this->itable->root); ++ ret = inode_path (internal_dir_loc->parent, bname, ++ (char **)&internal_dir_loc->path); ++ if (ret < 0 || !(internal_dir_loc->inode)) { + gf_msg (this->name, GF_LOG_ERROR, 0, + SHARD_MSG_INODE_PATH_FAILED, +- "Inode path failed on %s", GF_SHARD_DIR); ++ "Inode path failed on %s", bname); + goto out; + } + +- dot_shard_loc->name = strrchr (dot_shard_loc->path, '/'); +- if (dot_shard_loc->name) +- dot_shard_loc->name++; ++ internal_dir_loc->name = strrchr (internal_dir_loc->path, '/'); ++ if (internal_dir_loc->name) ++ internal_dir_loc->name++; + + ret = 0; + out: +@@ -1029,28 
+1054,42 @@ out: + } + + static inode_t * +-shard_link_dot_shard_inode (shard_local_t *local, inode_t *inode, +- struct iatt *buf) ++shard_link_internal_dir_inode (shard_local_t *local, inode_t *inode, ++ struct iatt *buf, shard_internal_dir_type_t type) + { + inode_t *linked_inode = NULL; + shard_priv_t *priv = NULL; ++ char *bname = NULL; ++ inode_t **priv_inode = NULL; + + priv = THIS->private; + +- linked_inode = inode_link (inode, inode->table->root, ".shard", buf); ++ switch (type) { ++ case SHARD_INTERNAL_DIR_DOT_SHARD: ++ bname = ".shard"; ++ priv_inode = &priv->dot_shard_inode; ++ break; ++ default: ++ break; ++ } ++ ++ linked_inode = inode_link (inode, inode->table->root, bname, buf); + inode_lookup (linked_inode); +- priv->dot_shard_inode = linked_inode; ++ *priv_inode = linked_inode; + return linked_inode; + } + + + int +-shard_refresh_dot_shard_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, inode_t *inode, +- struct iatt *buf, dict_t *xdata, +- struct iatt *postparent) ++shard_refresh_internal_dir_cbk (call_frame_t *frame, void *cookie, ++ xlator_t *this, int32_t op_ret, ++ int32_t op_errno, inode_t *inode, ++ struct iatt *buf, dict_t *xdata, ++ struct iatt *postparent) + { +- shard_local_t *local = NULL; ++ shard_local_t *local = NULL; ++ inode_t *linked_inode = NULL; ++ shard_internal_dir_type_t type = (shard_internal_dir_type_t) cookie; + + local = frame->local; + +@@ -1061,27 +1100,37 @@ shard_refresh_dot_shard_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + } + + /* To-Do: Fix refcount increment per call to +- * shard_link_dot_shard_inode(). ++ * shard_link_internal_dir_inode(). + */ +- shard_link_dot_shard_inode (local, inode, buf); +- shard_inode_ctx_set_refreshed_flag (inode, this); ++ linked_inode = shard_link_internal_dir_inode (local, inode, buf, type); ++ shard_inode_ctx_set_refreshed_flag (linked_inode, this); + out: + shard_common_resolve_shards (frame, this, local->post_res_handler); + return 0; + } + + int +-shard_refresh_dot_shard (call_frame_t *frame, xlator_t *this) ++shard_refresh_internal_dir (call_frame_t *frame, xlator_t *this, ++ shard_internal_dir_type_t type) + { + loc_t loc = {0,}; + inode_t *inode = NULL; + shard_priv_t *priv = NULL; + shard_local_t *local = NULL; ++ uuid_t gfid = {0,}; + + local = frame->local; + priv = this->private; + +- inode = inode_find (this->itable, priv->dot_shard_gfid); ++ switch (type) { ++ case SHARD_INTERNAL_DIR_DOT_SHARD: ++ gf_uuid_copy (gfid, priv->dot_shard_gfid); ++ break; ++ default: ++ break; ++ } ++ ++ inode = inode_find (this->itable, gfid); + + if (!shard_inode_ctx_needs_lookup (inode, this)) { + local->op_ret = 0; +@@ -1092,10 +1141,11 @@ shard_refresh_dot_shard (call_frame_t *frame, xlator_t *this) + * call to inode_find() + */ + loc.inode = inode; +- gf_uuid_copy (loc.gfid, priv->dot_shard_gfid); ++ gf_uuid_copy (loc.gfid, gfid); + +- STACK_WIND (frame, shard_refresh_dot_shard_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->lookup, &loc, NULL); ++ STACK_WIND_COOKIE (frame, shard_refresh_internal_dir_cbk, ++ (void *)(long) type, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->lookup, &loc, NULL); + loc_wipe (&loc); + + return 0; +@@ -1106,13 +1156,14 @@ out: + } + + int +-shard_lookup_dot_shard_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, inode_t *inode, +- struct iatt *buf, dict_t *xdata, +- struct iatt *postparent) ++shard_lookup_internal_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, ++ 
int32_t op_ret, int32_t op_errno, inode_t *inode, ++ struct iatt *buf, dict_t *xdata, ++ struct iatt *postparent) + { +- inode_t *link_inode = NULL; +- shard_local_t *local = NULL; ++ inode_t *link_inode = NULL; ++ shard_local_t *local = NULL; ++ shard_internal_dir_type_t type = (shard_internal_dir_type_t) cookie; + + local = frame->local; + +@@ -1124,17 +1175,17 @@ shard_lookup_dot_shard_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + + if (!IA_ISDIR (buf->ia_type)) { + gf_msg (this->name, GF_LOG_CRITICAL, 0, +- SHARD_MSG_DOT_SHARD_NODIR, "/.shard already exists and " +- "is not a directory. Please remove /.shard from all " +- "bricks and try again"); ++ SHARD_MSG_DOT_SHARD_NODIR, "%s already exists and " ++ "is not a directory. Please remove it from all bricks " ++ "and try again", shard_internal_dir_string (type)); + local->op_ret = -1; + local->op_errno = EIO; + goto unwind; + } + +- link_inode = shard_link_dot_shard_inode (local, inode, buf); ++ link_inode = shard_link_internal_dir_inode (local, inode, buf, type); + if (link_inode != inode) { +- shard_refresh_dot_shard (frame, this); ++ shard_refresh_internal_dir (frame, this, type); + } else { + shard_inode_ctx_set_refreshed_flag (link_inode, this); + shard_common_resolve_shards (frame, this, +@@ -1148,18 +1199,26 @@ unwind: + } + + int +-shard_lookup_dot_shard (call_frame_t *frame, xlator_t *this, +- shard_post_resolve_fop_handler_t post_res_handler) ++shard_lookup_internal_dir (call_frame_t *frame, xlator_t *this, ++ shard_post_resolve_fop_handler_t post_res_handler, ++ shard_internal_dir_type_t type) + { + int ret = -1; + dict_t *xattr_req = NULL; + shard_priv_t *priv = NULL; + shard_local_t *local = NULL; ++ uuid_t *gfid = NULL; ++ loc_t *loc = NULL; ++ gf_boolean_t free_gfid = _gf_true; + + local = frame->local; + priv = this->private; + local->post_res_handler = post_res_handler; + ++ gfid = GF_CALLOC (1, sizeof(uuid_t), gf_common_mt_uuid_t); ++ if (!gfid) ++ goto err; ++ + xattr_req = dict_new (); + if (!xattr_req) { + local->op_ret = -1; +@@ -1167,26 +1226,38 @@ shard_lookup_dot_shard (call_frame_t *frame, xlator_t *this, + goto err; + } + +- ret = dict_set_static_bin (xattr_req, "gfid-req", priv->dot_shard_gfid, +- 16); ++ switch (type) { ++ case SHARD_INTERNAL_DIR_DOT_SHARD: ++ gf_uuid_copy (*gfid, priv->dot_shard_gfid); ++ loc = &local->dot_shard_loc; ++ break; ++ default: ++ break; ++ } ++ ++ ret = dict_set_bin (xattr_req, "gfid-req", *gfid, 16); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_SET_FAILED, +- "Failed to set gfid of /.shard into dict"); ++ "Failed to set gfid of %s into dict", ++ shard_internal_dir_string (type)); + local->op_ret = -1; + local->op_errno = ENOMEM; + goto err; ++ } else { ++ free_gfid = _gf_false; + } + +- STACK_WIND (frame, shard_lookup_dot_shard_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->lookup, &local->dot_shard_loc, +- xattr_req); +- ++ STACK_WIND_COOKIE (frame, shard_lookup_internal_dir_cbk, ++ (void *) (long) type, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->lookup, loc, xattr_req); + dict_unref (xattr_req); + return 0; + + err: + if (xattr_req) + dict_unref (xattr_req); ++ if (free_gfid) ++ GF_FREE (gfid); + post_res_handler (frame, this); + return 0; + } +@@ -2203,14 +2274,17 @@ shard_truncate_begin (call_frame_t *frame, xlator_t *this) + local->dot_shard_loc.inode = inode_find (this->itable, + priv->dot_shard_gfid); + if (!local->dot_shard_loc.inode) { +- ret = shard_init_dot_shard_loc (this, local); ++ ret = shard_init_internal_dir_loc (this, 
local, ++ SHARD_INTERNAL_DIR_DOT_SHARD); + if (ret) + goto err; +- shard_lookup_dot_shard (frame, this, +- shard_post_resolve_truncate_handler); ++ shard_lookup_internal_dir (frame, this, ++ shard_post_resolve_truncate_handler, ++ SHARD_INTERNAL_DIR_DOT_SHARD); + } else { + local->post_res_handler = shard_post_resolve_truncate_handler; +- shard_refresh_dot_shard (frame, this); ++ shard_refresh_internal_dir (frame, this, ++ SHARD_INTERNAL_DIR_DOT_SHARD); + } + return 0; + +@@ -2682,14 +2756,17 @@ shard_unlink_base_file_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + local->dot_shard_loc.inode = inode_find (this->itable, + priv->dot_shard_gfid); + if (!local->dot_shard_loc.inode) { +- ret = shard_init_dot_shard_loc (this, local); ++ ret = shard_init_internal_dir_loc (this, local, ++ SHARD_INTERNAL_DIR_DOT_SHARD); + if (ret) + goto unwind; +- shard_lookup_dot_shard (frame, this, +- shard_post_resolve_unlink_handler); ++ shard_lookup_internal_dir (frame, this, ++ shard_post_resolve_unlink_handler, ++ SHARD_INTERNAL_DIR_DOT_SHARD); + } else { + local->post_res_handler = shard_post_resolve_unlink_handler; +- shard_refresh_dot_shard (frame, this); ++ shard_refresh_internal_dir (frame, this, ++ SHARD_INTERNAL_DIR_DOT_SHARD); + } + + return 0; +@@ -3048,14 +3125,17 @@ shard_rename_unlink_dst_shards_do (call_frame_t *frame, xlator_t *this) + local->dot_shard_loc.inode = inode_find (this->itable, + priv->dot_shard_gfid); + if (!local->dot_shard_loc.inode) { +- ret = shard_init_dot_shard_loc (this, local); ++ ret = shard_init_internal_dir_loc (this, local, ++ SHARD_INTERNAL_DIR_DOT_SHARD); + if (ret) + goto out; +- shard_lookup_dot_shard (frame, this, +- shard_post_resolve_unlink_handler); ++ shard_lookup_internal_dir (frame, this, ++ shard_post_resolve_unlink_handler, ++ SHARD_INTERNAL_DIR_DOT_SHARD); + } else { + local->post_res_handler = shard_post_resolve_unlink_handler; +- shard_refresh_dot_shard (frame, this); ++ shard_refresh_internal_dir (frame, this, ++ SHARD_INTERNAL_DIR_DOT_SHARD); + } + + return 0; +@@ -3811,14 +3891,17 @@ shard_post_lookup_readv_handler (call_frame_t *frame, xlator_t *this) + local->dot_shard_loc.inode = inode_find (this->itable, + priv->dot_shard_gfid); + if (!local->dot_shard_loc.inode) { +- ret = shard_init_dot_shard_loc (this, local); ++ ret = shard_init_internal_dir_loc (this, local, ++ SHARD_INTERNAL_DIR_DOT_SHARD); + if (ret) + goto err; +- shard_lookup_dot_shard (frame, this, +- shard_post_resolve_readv_handler); ++ shard_lookup_internal_dir (frame, this, ++ shard_post_resolve_readv_handler, ++ SHARD_INTERNAL_DIR_DOT_SHARD); + } else { + local->post_res_handler = shard_post_resolve_readv_handler; +- shard_refresh_dot_shard (frame, this); ++ shard_refresh_internal_dir (frame, this, ++ SHARD_INTERNAL_DIR_DOT_SHARD); + } + return 0; + +@@ -4249,8 +4332,9 @@ shard_common_inode_write_post_mknod_handler (call_frame_t *frame, + } + + int +-shard_mkdir_dot_shard (call_frame_t *frame, xlator_t *this, +- shard_post_resolve_fop_handler_t handler); ++shard_mkdir_internal_dir (call_frame_t *frame, xlator_t *this, ++ shard_post_resolve_fop_handler_t handler, ++ shard_internal_dir_type_t type); + int + shard_common_inode_write_post_resolve_handler (call_frame_t *frame, + xlator_t *this) +@@ -4323,26 +4407,28 @@ shard_common_inode_write_post_lookup_handler (call_frame_t *frame, + + if (!local->dot_shard_loc.inode) { + /*change handler*/ +- shard_mkdir_dot_shard (frame, this, +- shard_common_inode_write_post_resolve_handler); ++ shard_mkdir_internal_dir (frame, this, ++ 
shard_common_inode_write_post_resolve_handler, ++ SHARD_INTERNAL_DIR_DOT_SHARD); + } else { + /*change handler*/ + local->post_res_handler = + shard_common_inode_write_post_resolve_handler; +- shard_refresh_dot_shard (frame, this); ++ shard_refresh_internal_dir (frame, this, ++ SHARD_INTERNAL_DIR_DOT_SHARD); + } + return 0; + } + + int +-shard_mkdir_dot_shard_cbk (call_frame_t *frame, void *cookie, +- xlator_t *this, int32_t op_ret, +- int32_t op_errno, inode_t *inode, +- struct iatt *buf, struct iatt *preparent, +- struct iatt *postparent, dict_t *xdata) ++shard_mkdir_internal_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, inode_t *inode, ++ struct iatt *buf, struct iatt *preparent, ++ struct iatt *postparent, dict_t *xdata) + { +- inode_t *link_inode = NULL; +- shard_local_t *local = NULL; ++ inode_t *link_inode = NULL; ++ shard_local_t *local = NULL; ++ shard_internal_dir_type_t type = (shard_internal_dir_type_t) cookie; + + local = frame->local; + +@@ -4354,17 +4440,19 @@ shard_mkdir_dot_shard_cbk (call_frame_t *frame, void *cookie, + local->op_errno = op_errno; + goto unwind; + } else { +- gf_msg_debug (this->name, 0, "mkdir on /.shard failed " +- "with EEXIST. Attempting lookup now"); +- shard_lookup_dot_shard (frame, this, +- local->post_res_handler); ++ gf_msg_debug (this->name, 0, "mkdir on %s failed " ++ "with EEXIST. Attempting lookup now", ++ shard_internal_dir_string (type)); ++ shard_lookup_internal_dir (frame, this, ++ local->post_res_handler, ++ type); + return 0; + } + } + +- link_inode = shard_link_dot_shard_inode (local, inode, buf); ++ link_inode = shard_link_internal_dir_inode (local, inode, buf, type); + if (link_inode != inode) { +- shard_refresh_dot_shard (frame, this); ++ shard_refresh_internal_dir (frame, this, type); + } else { + shard_inode_ctx_set_refreshed_flag (link_inode, this); + shard_common_resolve_shards (frame, this, +@@ -4377,40 +4465,59 @@ unwind: + } + + int +-shard_mkdir_dot_shard (call_frame_t *frame, xlator_t *this, +- shard_post_resolve_fop_handler_t handler) ++shard_mkdir_internal_dir (call_frame_t *frame, xlator_t *this, ++ shard_post_resolve_fop_handler_t handler, ++ shard_internal_dir_type_t type) + { + int ret = -1; + shard_local_t *local = NULL; + shard_priv_t *priv = NULL; + dict_t *xattr_req = NULL; ++ uuid_t *gfid = NULL; ++ loc_t *loc = NULL; ++ gf_boolean_t free_gfid = _gf_true; + + local = frame->local; + priv = this->private; + + local->post_res_handler = handler; ++ gfid = GF_CALLOC (1, sizeof(uuid_t), gf_common_mt_uuid_t); ++ if (!gfid) ++ goto err; ++ ++ switch (type) { ++ case SHARD_INTERNAL_DIR_DOT_SHARD: ++ gf_uuid_copy (*gfid, priv->dot_shard_gfid); ++ loc = &local->dot_shard_loc; ++ break; ++ default: ++ break; ++ } + + xattr_req = dict_new (); + if (!xattr_req) + goto err; + +- ret = shard_init_dot_shard_loc (this, local); ++ ret = shard_init_internal_dir_loc (this, local, type); + if (ret) + goto err; + +- ret = dict_set_static_bin (xattr_req, "gfid-req", priv->dot_shard_gfid, +- 16); ++ ret = dict_set_bin (xattr_req, "gfid-req", *gfid, 16); + if (ret) { + gf_msg (this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_SET_FAILED, +- "Failed to set gfid-req for /.shard"); ++ "Failed to set gfid-req for %s", ++ shard_internal_dir_string (type)); + goto err; ++ } else { ++ free_gfid = _gf_false; + } + + SHARD_SET_ROOT_FS_ID (frame, local); + +- STACK_WIND (frame, shard_mkdir_dot_shard_cbk, +- FIRST_CHILD(this), FIRST_CHILD(this)->fops->mkdir, +- &local->dot_shard_loc, 0755, 0, xattr_req); 
++ STACK_WIND_COOKIE (frame, shard_mkdir_internal_dir_cbk, ++ (void *)(long) type, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->mkdir, loc, 0755, 0, ++ xattr_req); + dict_unref (xattr_req); + return 0; + +@@ -4419,6 +4526,8 @@ err: + dict_unref (xattr_req); + local->op_ret = -1; + local->op_errno = ENOMEM; ++ if (free_gfid) ++ GF_FREE (gfid); + handler (frame, this); + return 0; + } +diff --git a/xlators/features/shard/src/shard.h b/xlators/features/shard/src/shard.h +index 75d39a1..a1adb6a 100644 +--- a/xlators/features/shard/src/shard.h ++++ b/xlators/features/shard/src/shard.h +@@ -278,4 +278,8 @@ typedef struct shard_inode_ctx { + inode_t *base_inode; + } shard_inode_ctx_t; + ++typedef enum { ++ SHARD_INTERNAL_DIR_DOT_SHARD = 1, ++} shard_internal_dir_type_t; ++ + #endif /* __SHARD_H__ */ +-- +1.8.3.1 + diff --git a/0427-features-shard-Add-option-to-barrier-parallel-lookup.patch b/0427-features-shard-Add-option-to-barrier-parallel-lookup.patch new file mode 100644 index 0000000..282045d --- /dev/null +++ b/0427-features-shard-Add-option-to-barrier-parallel-lookup.patch @@ -0,0 +1,291 @@ +From 64b238d3a5caf7bdb32bca25946f84e0afe9bc7a Mon Sep 17 00:00:00 2001 +From: Krutika Dhananjay +Date: Tue, 17 Apr 2018 22:14:20 +0530 +Subject: [PATCH 427/444] features/shard: Add option to barrier parallel lookup + and unlink of shards + +> Upstream: https://review.gluster.org/19915 +> BUG: 1568521 +> Change-Id: Ib0f90a5f62abdfa89cda7bef9f3ff99f349ec332 + +Also move the common parallel unlink callback for GF_FOP_TRUNCATE and +GF_FOP_FTRUNCATE into a separate function. + +Change-Id: Ib0f90a5f62abdfa89cda7bef9f3ff99f349ec332 +BUG: 1520882 +Signed-off-by: Krutika Dhananjay +Reviewed-on: https://code.engineering.redhat.com/gerrit/154861 +Tested-by: RHGS Build Bot +Reviewed-by: Xavi Hernandez +Tested-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/features/shard/src/shard.c | 113 ++++++++++++++++++++++++++++--------- + xlators/features/shard/src/shard.h | 4 ++ + 2 files changed, 89 insertions(+), 28 deletions(-) + +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index 5ff04df..268ba20 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -475,6 +475,7 @@ shard_local_wipe (shard_local_t *local) + + count = local->num_blocks; + ++ syncbarrier_destroy (&local->barrier); + loc_wipe (&local->loc); + loc_wipe (&local->dot_shard_loc); + loc_wipe (&local->loc2); +@@ -861,6 +862,7 @@ shard_common_resolve_shards (call_frame_t *frame, xlator_t *this, + + priv = this->private; + local = frame->local; ++ local->call_count = 0; + shard_idx_iter = local->first_block; + res_inode = local->resolver_base_inode; + +@@ -1780,6 +1782,37 @@ shard_unlink_shards_do_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata); + ++void ++shard_unlink_block_inode (shard_local_t *local, int shard_block_num); ++ ++int ++shard_truncate_htol_cbk (call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, ++ struct iatt *preparent, struct iatt *postparent, ++ dict_t *xdata) ++{ ++ int call_count = 0; ++ int shard_block_num = (long) cookie; ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ if (op_ret < 0) { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ goto done; ++ } ++ ++ shard_unlink_block_inode (local, shard_block_num); ++done: ++ call_count = shard_call_count_return (frame); ++ if (call_count == 0) { ++ SHARD_UNSET_ROOT_FS_ID 
(frame, local); ++ shard_truncate_last_shard (frame, this, local->inode_list[0]); ++ } ++ return 0; ++} ++ + int + shard_truncate_htol (call_frame_t *frame, xlator_t *this, inode_t *inode) + { +@@ -1839,10 +1872,9 @@ shard_truncate_htol (call_frame_t *frame, xlator_t *this, inode_t *inode) + continue; + } + if (wind_failed) { +- shard_unlink_shards_do_cbk (frame, +- (void *)(long) cur_block, +- this, -1, ENOMEM, NULL, +- NULL, NULL); ++ shard_truncate_htol_cbk (frame, (void *)(long) cur_block, ++ this, -1, ENOMEM, NULL, NULL, ++ NULL); + goto next; + } + +@@ -1860,10 +1892,9 @@ shard_truncate_htol (call_frame_t *frame, xlator_t *this, inode_t *inode) + local->op_errno = ENOMEM; + loc_wipe (&loc); + wind_failed = _gf_true; +- shard_unlink_shards_do_cbk (frame, +- (void *)(long) cur_block, +- this, -1, ENOMEM, NULL, +- NULL, NULL); ++ shard_truncate_htol_cbk (frame, (void *)(long) cur_block, ++ this, -1, ENOMEM, NULL, NULL, ++ NULL); + goto next; + } + loc.name = strrchr (loc.path, '/'); +@@ -1871,7 +1902,7 @@ shard_truncate_htol (call_frame_t *frame, xlator_t *this, inode_t *inode) + loc.name++; + loc.inode = inode_ref (local->inode_list[i]); + +- STACK_WIND_COOKIE (frame, shard_unlink_shards_do_cbk, ++ STACK_WIND_COOKIE (frame, shard_truncate_htol_cbk, + (void *) (long) cur_block, FIRST_CHILD(this), + FIRST_CHILD (this)->fops->unlink, &loc, + 0, NULL); +@@ -2022,13 +2053,18 @@ shard_common_lookup_shards_cbk (call_frame_t *frame, void *cookie, + + done: + call_count = shard_call_count_return (frame); +- if (call_count == 0) { +- if (!local->first_lookup_done) +- local->first_lookup_done = _gf_true; +- if (local->op_ret < 0) +- goto unwind; +- else +- local->pls_fop_handler (frame, this); ++ if (local->lookup_shards_barriered) { ++ syncbarrier_wake (&local->barrier); ++ return 0; ++ } else { ++ if (call_count == 0) { ++ if (!local->first_lookup_done) ++ local->first_lookup_done = _gf_true; ++ if (local->op_ret < 0) ++ goto unwind; ++ else ++ local->pls_fop_handler (frame, this); ++ } + } + return 0; + +@@ -2074,6 +2110,7 @@ shard_common_lookup_shards (call_frame_t *frame, xlator_t *this, inode_t *inode, + { + int i = 0; + int ret = 0; ++ int count = 0; + int call_count = 0; + int32_t shard_idx_iter = 0; + int last_block = 0; +@@ -2087,10 +2124,12 @@ shard_common_lookup_shards (call_frame_t *frame, xlator_t *this, inode_t *inode, + + priv = this->private; + local = frame->local; +- call_count = local->call_count; ++ count = call_count = local->call_count; + shard_idx_iter = local->first_block; + last_block = local->last_block; + local->pls_fop_handler = handler; ++ if (local->lookup_shards_barriered) ++ local->barrier.waitfor = local->call_count; + + while (shard_idx_iter <= last_block) { + if (local->inode_list[i]) { +@@ -2162,7 +2201,8 @@ next: + if (!--call_count) + break; + } +- ++ if (local->lookup_shards_barriered) ++ syncbarrier_wait (&local->barrier, count); + return 0; + } + +@@ -2400,6 +2440,9 @@ shard_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + + frame->local = local; + ++ ret = syncbarrier_init (&local->barrier); ++ if (ret) ++ goto err; + loc_copy (&local->loc, loc); + local->offset = offset; + local->block_size = block_size; +@@ -2450,6 +2493,9 @@ shard_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + goto err; + + frame->local = local; ++ ret = syncbarrier_init (&local->barrier); ++ if (ret) ++ goto err; + local->fd = fd_ref (fd); + local->offset = offset; + local->block_size = block_size; +@@ -2881,18 +2927,19 @@ 
shard_unlink_shards_do_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + + done: + call_count = shard_call_count_return (frame); +- if (call_count == 0) { +- SHARD_UNSET_ROOT_FS_ID (frame, local); ++ if (local->unlink_shards_barriered) { ++ syncbarrier_wake (&local->barrier); ++ } else { + +- if (local->fop == GF_FOP_UNLINK) +- shard_unlink_cbk (frame, this); +- else if (local->fop == GF_FOP_RENAME) +- shard_rename_cbk (frame, this); +- else +- shard_truncate_last_shard (frame, this, +- local->inode_list[0]); +- } ++ if (call_count == 0) { ++ SHARD_UNSET_ROOT_FS_ID (frame, local); + ++ if (local->fop == GF_FOP_UNLINK) ++ shard_unlink_cbk (frame, this); ++ else if (local->fop == GF_FOP_RENAME) ++ shard_rename_cbk (frame, this); ++ } ++ } + return 0; + } + +@@ -2952,6 +2999,8 @@ shard_unlink_shards_do (call_frame_t *frame, xlator_t *this, inode_t *inode) + local->call_count = call_count = count; + cur_block = 1; + SHARD_SET_ROOT_FS_ID (frame, local); ++ if (local->unlink_shards_barriered) ++ local->barrier.waitfor = count; + + /* Ignore the base file and start iterating from the first block shard. + */ +@@ -3006,6 +3055,8 @@ next: + if (!--call_count) + break; + } ++ if (local->unlink_shards_barriered) ++ syncbarrier_wait (&local->barrier, count); + + return 0; + } +@@ -3947,6 +3998,9 @@ shard_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + + frame->local = local; + ++ ret = syncbarrier_init (&local->barrier); ++ if (ret) ++ goto err; + local->fd = fd_ref (fd); + local->block_size = block_size; + local->offset = offset; +@@ -5414,6 +5468,9 @@ shard_common_inode_write_begin (call_frame_t *frame, xlator_t *this, + + frame->local = local; + ++ ret = syncbarrier_init (&local->barrier); ++ if (ret) ++ goto out; + local->xattr_req = (xdata) ? 
dict_ref (xdata) : dict_new (); + if (!local->xattr_req) + goto out; +diff --git a/xlators/features/shard/src/shard.h b/xlators/features/shard/src/shard.h +index a1adb6a..225caa0 100644 +--- a/xlators/features/shard/src/shard.h ++++ b/xlators/features/shard/src/shard.h +@@ -15,6 +15,7 @@ + #include "xlator.h" + #include "compat-errno.h" + #include "shard-messages.h" ++#include "syncop.h" + + #define GF_SHARD_DIR ".shard" + #define SHARD_MIN_BLOCK_SIZE (4 * GF_UNIT_MB) +@@ -257,6 +258,9 @@ typedef struct shard_local { + } lock; + inode_t *resolver_base_inode; + gf_boolean_t first_lookup_done; ++ syncbarrier_t barrier; ++ gf_boolean_t lookup_shards_barriered; ++ gf_boolean_t unlink_shards_barriered; + } shard_local_t; + + typedef struct shard_inode_ctx { +-- +1.8.3.1 + diff --git a/0428-libglusterfs-syncop-Handle-barrier_-init-destroy-in-.patch b/0428-libglusterfs-syncop-Handle-barrier_-init-destroy-in-.patch new file mode 100644 index 0000000..a6136c0 --- /dev/null +++ b/0428-libglusterfs-syncop-Handle-barrier_-init-destroy-in-.patch @@ -0,0 +1,99 @@ +From c285acf172d42271d87eb069045ea70bce97b0b1 Mon Sep 17 00:00:00 2001 +From: Pranith Kumar K +Date: Mon, 23 Apr 2018 21:04:58 +0530 +Subject: [PATCH 428/444] libglusterfs/syncop: Handle barrier_{init/destroy} in + error cases + +> Upstream: https://review.gluster.org/19927 +> BUG: 1568521 +> Change-Id: I53e60cfcaa7f8edfa5eca47307fa99f10ee64505 + +Change-Id: I53e60cfcaa7f8edfa5eca47307fa99f10ee64505 +BUG: 1520882 +Signed-off-by: Pranith Kumar K +Reviewed-on: https://code.engineering.redhat.com/gerrit/154862 +Tested-by: Krutika Dhananjay +Tested-by: RHGS Build Bot +Reviewed-by: Xavi Hernandez +Tested-by: Sunil Kumar Heggodu Gopala Acharya +--- + libglusterfs/src/syncop.c | 30 ++++++++++++++++++++++++++---- + libglusterfs/src/syncop.h | 1 + + 2 files changed, 27 insertions(+), 4 deletions(-) + +diff --git a/libglusterfs/src/syncop.c b/libglusterfs/src/syncop.c +index ac40a1d..81d73b2 100644 +--- a/libglusterfs/src/syncop.c ++++ b/libglusterfs/src/syncop.c +@@ -1087,30 +1087,52 @@ synclock_unlock (synclock_t *lock) + int + syncbarrier_init (struct syncbarrier *barrier) + { ++ int ret = 0; + if (!barrier) { + errno = EINVAL; + return -1; + } + +- pthread_cond_init (&barrier->cond, 0); ++ ret = pthread_cond_init (&barrier->cond, 0); ++ if (ret) { ++ errno = ret; ++ return -1; ++ } + barrier->count = 0; + barrier->waitfor = 0; + INIT_LIST_HEAD (&barrier->waitq); + +- return pthread_mutex_init (&barrier->guard, 0); ++ ret = pthread_mutex_init (&barrier->guard, 0); ++ if (ret) { ++ (void)pthread_cond_destroy (&barrier->cond); ++ errno = ret; ++ return -1; ++ } ++ barrier->initialized = _gf_true; ++ return 0; + } + + + int + syncbarrier_destroy (struct syncbarrier *barrier) + { ++ int ret = 0; ++ int ret1 = 0; + if (!barrier) { + errno = EINVAL; + return -1; + } + +- pthread_cond_destroy (&barrier->cond); +- return pthread_mutex_destroy (&barrier->guard); ++ if (barrier->initialized) { ++ ret = pthread_cond_destroy (&barrier->cond); ++ ret1 = pthread_mutex_destroy (&barrier->guard); ++ barrier->initialized = _gf_false; ++ } ++ if (ret || ret1) { ++ errno = ret?ret:ret1; ++ return -1; ++ } ++ return 0; + } + + +diff --git a/libglusterfs/src/syncop.h b/libglusterfs/src/syncop.h +index 5b5ad4e..9ab5ee8 100644 +--- a/libglusterfs/src/syncop.h ++++ b/libglusterfs/src/syncop.h +@@ -134,6 +134,7 @@ typedef struct synclock synclock_t; + + + struct syncbarrier { ++ gf_boolean_t initialized; /*Set on successful initialization*/ + pthread_mutex_t guard; /* guard 
the remaining members, pair @cond */ + pthread_cond_t cond; /* waiting non-synctasks */ + struct list_head waitq; /* waiting synctasks */ +-- +1.8.3.1 + diff --git a/0429-features-shard-Introducing-.shard-.remove_me-for-ato.patch b/0429-features-shard-Introducing-.shard-.remove_me-for-ato.patch new file mode 100644 index 0000000..94cd4a6 --- /dev/null +++ b/0429-features-shard-Introducing-.shard-.remove_me-for-ato.patch @@ -0,0 +1,2749 @@ +From b92aedc0b10d3c7b6150b8f18c950bf95494bc5f Mon Sep 17 00:00:00 2001 +From: Krutika Dhananjay +Date: Thu, 29 Mar 2018 17:21:32 +0530 +Subject: [PATCH 429/444] features/shard: Introducing ".shard/.remove_me" for + atomic shard deletion (part 1) + +> Upstream: https://review.gluster.org/19929 +> BUG: 1568521 +> Change-Id: Ia1d238b721a3e99f951a73abbe199e4245f51a3a + +PROBLEM: +Shards are deleted synchronously when a sharded file is unlinked or +when a sharded file participating as the dst in a rename() is going to +be replaced. The problem with this approach is it makes the operation +really slow, sometimes causing the application to time out, especially +with large files. + +SOLUTION: +To make this operation atomic, we introduce a ".remove_me" directory. +Now renames and unlinks will simply involve two steps: +1. creating an empty file under .remove_me named after the gfid of the file +participating in unlink/rename +2. carrying out the actual rename/unlink +A synctask is created (more on that in part 2) to scan this directory +after every unlink/rename operation (or upon a volume mount) and clean +up all shards associated with it. All of this happens in the background. +The task takes care to delete the shards associated with the gfid in +.remove_me only if this gfid doesn't exist in backend, ensuring that the +file was successfully renamed/unlinked and its shards can be discarded now +safely. 
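+
+A minimal sketch of the resulting on-disk state (not part of the upstream
+change; it assumes a 4MB shard-block-size volume mounted at $M0 with one
+brick at $B0/${V0}0, the same layout the tests below use, and the test
+framework's get_gfid_string helper):
+
+    dd if=/dev/zero of=$M0/file bs=1M count=9    # base file plus two shards
+    gfid=$(get_gfid_string $M0/file)
+    unlink $M0/file                              # single call from the client
+    stat $B0/${V0}0/.shard/.remove_me/$gfid      # step 1: gfid-named marker exists
+    stat $B0/${V0}0/.shard/$gfid.1               # shards stay behind until the
+                                                 # background task (part 2) reclaims them
+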
+ +Change-Id: Ia1d238b721a3e99f951a73abbe199e4245f51a3a +BUG: 1520882 +Signed-off-by: Krutika Dhananjay +Reviewed-on: https://code.engineering.redhat.com/gerrit/154863 +Tested-by: RHGS Build Bot +Reviewed-by: Xavi Hernandez +--- + libglusterfs/src/common-utils.h | 1 + + tests/bugs/shard/bug-1245547.t | 4 +- + tests/bugs/shard/bug-1568521-EEXIST.t | 79 ++ + tests/bugs/shard/bug-shard-discard.t | 16 +- + tests/bugs/shard/shard-inode-refcount-test.t | 2 +- + tests/bugs/shard/unlinks-and-renames.t | 118 ++- + xlators/features/shard/src/shard-mem-types.h | 1 + + xlators/features/shard/src/shard-messages.h | 9 +- + xlators/features/shard/src/shard.c | 1384 ++++++++++++++++++-------- + xlators/features/shard/src/shard.h | 103 +- + 10 files changed, 1250 insertions(+), 467 deletions(-) + create mode 100644 tests/bugs/shard/bug-1568521-EEXIST.t + +diff --git a/libglusterfs/src/common-utils.h b/libglusterfs/src/common-utils.h +index e64dea3..c804ed5 100644 +--- a/libglusterfs/src/common-utils.h ++++ b/libglusterfs/src/common-utils.h +@@ -121,6 +121,7 @@ void trap (void); + /* Shard */ + #define GF_XATTR_SHARD_FILE_SIZE "trusted.glusterfs.shard.file-size" + #define SHARD_ROOT_GFID "be318638-e8a0-4c6d-977d-7a937aa84806" ++#define DOT_SHARD_REMOVE_ME_GFID "77dd5a45-dbf5-4592-b31b-b440382302e9" + + /* Lease: buffer length for stringified lease id + * Format: 4hexnum-4hexnum-4hexnum-4hexnum-4hexnum-4hexnum-4hexnum-4hexnum +diff --git a/tests/bugs/shard/bug-1245547.t b/tests/bugs/shard/bug-1245547.t +index c19b2a6..3c46785 100644 +--- a/tests/bugs/shard/bug-1245547.t ++++ b/tests/bugs/shard/bug-1245547.t +@@ -25,11 +25,11 @@ TEST touch $M0/bar + TEST truncate -s 10G $M0/bar + #Unlink on such a file should succeed. + TEST unlink $M0/bar +-# ++ + #Create a file 'baz' with holes. + TEST touch $M0/baz + TEST truncate -s 10G $M0/baz + #Rename with a sharded existing dest that has holes must succeed. + TEST mv -f $M0/foo $M0/baz + +-cleanup; ++cleanup +diff --git a/tests/bugs/shard/bug-1568521-EEXIST.t b/tests/bugs/shard/bug-1568521-EEXIST.t +new file mode 100644 +index 0000000..e4c3d41 +--- /dev/null ++++ b/tests/bugs/shard/bug-1568521-EEXIST.t +@@ -0,0 +1,79 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++ ++cleanup ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1} ++TEST $CLI volume set $V0 features.shard on ++TEST $CLI volume set $V0 features.shard-block-size 4MB ++TEST $CLI volume start $V0 ++TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0 ++ ++TEST mkdir $M0/dir ++# Unlink a temporary file to trigger creation of .remove_me ++TEST touch $M0/tmp ++TEST unlink $M0/tmp ++ ++TEST stat $B0/${V0}0/.shard/.remove_me ++TEST stat $B0/${V0}1/.shard/.remove_me ++ ++TEST dd if=/dev/zero of=$M0/dir/file bs=1024 count=9216 ++gfid_file=$(get_gfid_string $M0/dir/file) ++ ++# Create marker file from the backend to simulate ENODATA. 
++touch $B0/${V0}0/.shard/.remove_me/$gfid_file ++touch $B0/${V0}1/.shard/.remove_me/$gfid_file ++ ++# Set block and file size to incorrect values of 64MB and 5MB to simulate "stale xattrs" case ++# and confirm that the correct values are set when the actual unlink takes place ++ ++TEST setfattr -n trusted.glusterfs.shard.block-size -v 0x0000000004000000 $B0/${V0}0/.shard/.remove_me/$gfid_file ++TEST setfattr -n trusted.glusterfs.shard.block-size -v 0x0000000004000000 $B0/${V0}1/.shard/.remove_me/$gfid_file ++ ++TEST setfattr -n trusted.glusterfs.shard.file-size -v 0x0000000000500000000000000000000000000000000000000000000000000000 $B0/${V0}0/.shard/.remove_me/$gfid_file ++TEST setfattr -n trusted.glusterfs.shard.file-size -v 0x0000000000500000000000000000000000000000000000000000000000000000 $B0/${V0}1/.shard/.remove_me/$gfid_file ++ ++# Sleep for 2 seconds to prevent posix_gfid_heal() from believing marker file is "fresh" and failing lookup with ENOENT ++sleep 2 ++ ++TEST unlink $M0/dir/file ++EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_file ++EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_file ++EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_file ++EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_file ++ ++############################## ++### Repeat test for rename ### ++############################## ++ ++TEST touch $M0/src ++TEST dd if=/dev/zero of=$M0/dir/dst bs=1024 count=9216 ++gfid_dst=$(get_gfid_string $M0/dir/dst) ++ ++# Create marker file from the backend to simulate ENODATA. 
++touch $B0/${V0}0/.shard/.remove_me/$gfid_dst ++touch $B0/${V0}1/.shard/.remove_me/$gfid_dst ++ ++# Set block and file size to incorrect values of 64MB and 5MB to simulate "stale xattrs" case ++# and confirm that the correct values are set when the actual unlink takes place ++ ++TEST setfattr -n trusted.glusterfs.shard.block-size -v 0x0000000004000000 $B0/${V0}0/.shard/.remove_me/$gfid_dst ++TEST setfattr -n trusted.glusterfs.shard.block-size -v 0x0000000004000000 $B0/${V0}1/.shard/.remove_me/$gfid_dst ++ ++TEST setfattr -n trusted.glusterfs.shard.file-size -v 0x0000000000500000000000000000000000000000000000000000000000000000 $B0/${V0}0/.shard/.remove_me/$gfid_dst ++TEST setfattr -n trusted.glusterfs.shard.file-size -v 0x0000000000500000000000000000000000000000000000000000000000000000 $B0/${V0}1/.shard/.remove_me/$gfid_dst ++ ++# Sleep for 2 seconds to prevent posix_gfid_heal() from believing marker file is "fresh" and failing lookup with ENOENT ++sleep 2 ++ ++TEST mv -f $M0/src $M0/dir/dst ++EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_dst ++EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_dst ++EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_dst ++EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_dst ++ ++cleanup +diff --git a/tests/bugs/shard/bug-shard-discard.t b/tests/bugs/shard/bug-shard-discard.t +index 72d8586..884d9e7 100644 +--- a/tests/bugs/shard/bug-shard-discard.t ++++ b/tests/bugs/shard/bug-shard-discard.t +@@ -42,14 +42,14 @@ EXPECT_NOT "1" file_all_zeroes `find $B0 -name $gfid_foo.1` + + # Now unlink the file. And ensure that all shards associated with the file are cleaned up + TEST unlink $M0/foo +-TEST ! stat $B0/${V0}0/.shard/$gfid_foo.1 +-TEST ! stat $B0/${V0}1/.shard/$gfid_foo.1 +-TEST ! stat $B0/${V0}2/.shard/$gfid_foo.1 +-TEST ! stat $B0/${V0}3/.shard/$gfid_foo.1 +-TEST ! stat $B0/${V0}0/.shard/$gfid_foo.2 +-TEST ! stat $B0/${V0}1/.shard/$gfid_foo.2 +-TEST ! stat $B0/${V0}2/.shard/$gfid_foo.2 +-TEST ! stat $B0/${V0}3/.shard/$gfid_foo.2 ++#TEST ! stat $B0/${V0}0/.shard/$gfid_foo.1 ++#TEST ! stat $B0/${V0}1/.shard/$gfid_foo.1 ++#TEST ! stat $B0/${V0}2/.shard/$gfid_foo.1 ++#TEST ! stat $B0/${V0}3/.shard/$gfid_foo.1 ++#TEST ! stat $B0/${V0}0/.shard/$gfid_foo.2 ++#TEST ! stat $B0/${V0}1/.shard/$gfid_foo.2 ++#TEST ! stat $B0/${V0}2/.shard/$gfid_foo.2 ++#TEST ! stat $B0/${V0}3/.shard/$gfid_foo.2 + TEST ! 
stat $M0/foo + + #clean up everything +diff --git a/tests/bugs/shard/shard-inode-refcount-test.t b/tests/bugs/shard/shard-inode-refcount-test.t +index 03e0cc9..c92dc07 100644 +--- a/tests/bugs/shard/shard-inode-refcount-test.t ++++ b/tests/bugs/shard/shard-inode-refcount-test.t +@@ -18,7 +18,7 @@ TEST dd if=/dev/zero conv=fsync of=$M0/one-plus-five-shards bs=1M count=23 + + ACTIVE_INODES_BEFORE=$(get_mount_active_size_value $V0) + TEST rm -f $M0/one-plus-five-shards +-EXPECT `expr $ACTIVE_INODES_BEFORE - 5` get_mount_active_size_value $V0 ++#EXPECT `expr $ACTIVE_INODES_BEFORE - 4` get_mount_active_size_value $V0 + + EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 + TEST $CLI volume stop $V0 +diff --git a/tests/bugs/shard/unlinks-and-renames.t b/tests/bugs/shard/unlinks-and-renames.t +index a8f188b..997c397 100644 +--- a/tests/bugs/shard/unlinks-and-renames.t ++++ b/tests/bugs/shard/unlinks-and-renames.t +@@ -32,7 +32,17 @@ TEST truncate -s 5M $M0/dir/foo + TEST ! stat $B0/${V0}0/.shard + TEST ! stat $B0/${V0}1/.shard + # Test to ensure that unlink doesn't fail due to absence of /.shard ++gfid_foo=$(get_gfid_string $M0/dir/foo) + TEST unlink $M0/dir/foo ++TEST stat $B0/${V0}0/.shard/.remove_me ++TEST stat $B0/${V0}1/.shard/.remove_me ++TEST stat $B0/${V0}0/.shard/.remove_me/$gfid_foo ++TEST stat $B0/${V0}1/.shard/.remove_me/$gfid_foo ++ ++EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_foo ++EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_foo ++EXPECT "0000000000500000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_foo ++EXPECT "0000000000500000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_foo + + ################################################## + ##### Unlink of a sharded file without holes ##### +@@ -46,14 +56,20 @@ TEST stat $B0/${V0}1/.shard/$gfid_new.1 + TEST stat $B0/${V0}0/.shard/$gfid_new.2 + TEST stat $B0/${V0}1/.shard/$gfid_new.2 + TEST unlink $M0/dir/new +-TEST ! stat $B0/${V0}0/.shard/$gfid_new.1 +-TEST ! stat $B0/${V0}1/.shard/$gfid_new.1 +-TEST ! stat $B0/${V0}0/.shard/$gfid_new.2 +-TEST ! stat $B0/${V0}1/.shard/$gfid_new.2 ++#TEST ! stat $B0/${V0}0/.shard/$gfid_new.1 ++#TEST ! stat $B0/${V0}1/.shard/$gfid_new.1 ++#TEST ! stat $B0/${V0}0/.shard/$gfid_new.2 ++#TEST ! stat $B0/${V0}1/.shard/$gfid_new.2 + TEST ! stat $M0/dir/new + TEST ! stat $B0/${V0}0/dir/new + TEST ! stat $B0/${V0}1/dir/new ++TEST stat $B0/${V0}0/.shard/.remove_me/$gfid_new ++TEST stat $B0/${V0}1/.shard/.remove_me/$gfid_new + ++EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_new ++EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_new ++EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_new ++EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_new + ####################################### + ##### Unlink with /.shard present ##### + ####################################### +@@ -67,18 +83,32 @@ TEST unlink $M0/dir/foo + TEST ! stat $B0/${V0}0/dir/foo + TEST ! stat $B0/${V0}1/dir/foo + TEST ! 
stat $M0/dir/foo ++TEST stat $B0/${V0}0/.shard/.remove_me/$gfid_foo ++TEST stat $B0/${V0}1/.shard/.remove_me/$gfid_foo ++ ++EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_foo ++EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_foo ++EXPECT "0000000000500000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_foo ++EXPECT "0000000000500000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_foo + + ############################################################# + ##### Unlink of a file with only one block (the zeroth) ##### + ############################################################# + TEST touch $M0/dir/foo ++gfid_foo=$(get_gfid_string $M0/dir/foo) + TEST dd if=/dev/zero of=$M0/dir/foo bs=1024 count=1024 +-# Test to ensure that unlink of a sparse file works fine. ++# Test to ensure that unlink of a file with only base shard works fine. + TEST unlink $M0/dir/foo + TEST ! stat $B0/${V0}0/dir/foo + TEST ! stat $B0/${V0}1/dir/foo + TEST ! stat $M0/dir/foo ++TEST stat $B0/${V0}0/.shard/.remove_me/$gfid_foo ++TEST stat $B0/${V0}1/.shard/.remove_me/$gfid_foo + ++EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_foo ++EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_foo ++EXPECT "0000000000100000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_foo ++EXPECT "0000000000100000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_foo + #################################################### + ##### Unlink of a sharded file with hard-links ##### + #################################################### +@@ -94,6 +124,8 @@ TEST stat $B0/${V0}1/.shard/$gfid_original.2 + TEST ln $M0/dir/original $M0/link + # Now delete the original file. + TEST unlink $M0/dir/original ++TEST ! stat $B0/${V0}0/.shard/.remove_me/$gfid_original ++TEST ! stat $B0/${V0}1/.shard/.remove_me/$gfid_original + # Ensure the shards are still intact. + TEST stat $B0/${V0}0/.shard/$gfid_original.1 + TEST stat $B0/${V0}1/.shard/$gfid_original.1 +@@ -105,15 +137,22 @@ TEST stat $B0/${V0}0/link + TEST stat $B0/${V0}1/link + # Now delete the last link. + TEST unlink $M0/link ++TEST stat $B0/${V0}0/.shard/.remove_me/$gfid_original ++TEST stat $B0/${V0}1/.shard/.remove_me/$gfid_original + # Ensure that the shards are all cleaned up. +-TEST ! stat $B0/${V0}0/.shard/$gfid_original.1 +-TEST ! stat $B0/${V0}1/.shard/$gfid_original.1 +-TEST ! stat $B0/${V0}0/.shard/$gfid_original.2 +-TEST ! stat $B0/${V0}1/.shard/$gfid_original.2 +-TEST ! stat $M0/link ++#TEST ! stat $B0/${V0}0/.shard/$gfid_original.1 ++#TEST ! stat $B0/${V0}1/.shard/$gfid_original.1 ++#TEST ! stat $B0/${V0}0/.shard/$gfid_original.2 ++#TEST ! stat $B0/${V0}1/.shard/$gfid_original.2 ++#TEST ! stat $M0/link + TEST ! stat $B0/${V0}0/link + TEST ! 
stat $B0/${V0}1/link + ++EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_original ++EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_original ++EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_original ++EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_original ++ + EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 + TEST $CLI volume stop $V0 + TEST $CLI volume delete $V0 +@@ -140,6 +179,7 @@ TEST touch $M0/dir/dst + ##### Rename with /.shard absent ##### + ###################################### + TEST truncate -s 5M $M0/dir/dst ++gfid_dst=$(get_gfid_string $M0/dir/dst) + TEST ! stat $B0/${V0}0/.shard + TEST ! stat $B0/${V0}1/.shard + # Test to ensure that rename doesn't fail due to absence of /.shard +@@ -150,6 +190,13 @@ TEST ! stat $B0/${V0}0/dir/src + TEST ! stat $B0/${V0}1/dir/src + TEST stat $B0/${V0}0/dir/dst + TEST stat $B0/${V0}1/dir/dst ++TEST stat $B0/${V0}0/.shard/.remove_me/$gfid_dst ++TEST stat $B0/${V0}1/.shard/.remove_me/$gfid_dst ++ ++EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_dst ++EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_dst ++EXPECT "0000000000500000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_dst ++EXPECT "0000000000500000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_dst + + ################################################## + ##### Rename to a sharded file without holes ##### +@@ -165,16 +212,23 @@ TEST stat $B0/${V0}1/.shard/$gfid_dst.1 + TEST stat $B0/${V0}0/.shard/$gfid_dst.2 + TEST stat $B0/${V0}1/.shard/$gfid_dst.2 + TEST mv -f $M0/dir/src $M0/dir/dst +-TEST ! stat $B0/${V0}0/.shard/$gfid_dst.1 +-TEST ! stat $B0/${V0}1/.shard/$gfid_dst.1 +-TEST ! stat $B0/${V0}0/.shard/$gfid_dst.2 +-TEST ! stat $B0/${V0}1/.shard/$gfid_dst.2 ++#TEST ! stat $B0/${V0}0/.shard/$gfid_dst.1 ++#TEST ! stat $B0/${V0}1/.shard/$gfid_dst.1 ++#TEST ! stat $B0/${V0}0/.shard/$gfid_dst.2 ++#TEST ! stat $B0/${V0}1/.shard/$gfid_dst.2 + TEST ! stat $M0/dir/src + TEST stat $M0/dir/dst + TEST ! stat $B0/${V0}0/dir/src + TEST ! 
stat $B0/${V0}1/dir/src + TEST stat $B0/${V0}0/dir/dst + TEST stat $B0/${V0}1/dir/dst ++TEST stat $B0/${V0}0/.shard/.remove_me/$gfid_dst ++TEST stat $B0/${V0}1/.shard/.remove_me/$gfid_dst ++ ++EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_dst ++EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_dst ++EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_dst ++EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_dst + + ################################################### + ##### Rename of dst file with /.shard present ##### +@@ -182,7 +236,8 @@ TEST stat $B0/${V0}1/dir/dst + TEST unlink $M0/dir/dst + TEST touch $M0/dir/src + TEST truncate -s 5M $M0/dir/dst +-# Test to ensure that unlink of a sparse file works fine. ++gfid_dst=$(get_gfid_string $M0/dir/dst) ++# Test to ensure that rename into a sparse file works fine. + TEST mv -f $M0/dir/src $M0/dir/dst + TEST ! stat $M0/dir/src + TEST stat $M0/dir/dst +@@ -190,6 +245,13 @@ TEST ! stat $B0/${V0}0/dir/src + TEST ! stat $B0/${V0}1/dir/src + TEST stat $B0/${V0}0/dir/dst + TEST stat $B0/${V0}1/dir/dst ++TEST stat $B0/${V0}0/.shard/.remove_me/$gfid_dst ++TEST stat $B0/${V0}1/.shard/.remove_me/$gfid_dst ++ ++EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_dst ++EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_dst ++EXPECT "0000000000500000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_dst ++EXPECT "0000000000500000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_dst + + ############################################################### + ##### Rename of dst file with only one block (the zeroth) ##### +@@ -197,7 +259,8 @@ TEST stat $B0/${V0}1/dir/dst + TEST unlink $M0/dir/dst + TEST touch $M0/dir/src + TEST dd if=/dev/zero of=$M0/dir/dst bs=1024 count=1024 +-# Test to ensure that unlink of a sparse file works fine. ++gfid_dst=$(get_gfid_string $M0/dir/dst) ++# Test to ensure that rename into a file with only base shard works fine. + TEST mv -f $M0/dir/src $M0/dir/dst + TEST ! stat $M0/dir/src + TEST stat $M0/dir/dst +@@ -205,6 +268,13 @@ TEST ! stat $B0/${V0}0/dir/src + TEST ! 
stat $B0/${V0}1/dir/src + TEST stat $B0/${V0}0/dir/dst + TEST stat $B0/${V0}1/dir/dst ++TEST stat $B0/${V0}0/.shard/.remove_me/$gfid_dst ++TEST stat $B0/${V0}1/.shard/.remove_me/$gfid_dst ++ ++EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_dst ++EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_dst ++EXPECT "0000000000100000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_dst ++EXPECT "0000000000100000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_dst + + ######################################################## + ##### Rename to a dst sharded file with hard-links ##### +@@ -231,18 +301,26 @@ TEST stat $B0/${V0}1/.shard/$gfid_dst.2 + TEST ! stat $M0/dir/src + TEST ! stat $B0/${V0}0/dir/src + TEST ! stat $B0/${V0}1/dir/src ++TEST ! stat $B0/${V0}0/.shard/.remove_me/$gfid_dst ++TEST ! stat $B0/${V0}1/.shard/.remove_me/$gfid_dst + # Now rename another file to the last link. + TEST touch $M0/dir/src2 + TEST mv -f $M0/dir/src2 $M0/link + # Ensure that the shards are all cleaned up. +-TEST ! stat $B0/${V0}0/.shard/$gfid_dst.1 +-TEST ! stat $B0/${V0}1/.shard/$gfid_dst.1 +-TEST ! stat $B0/${V0}0/.shard/$gfid_dst.2 +-TEST ! stat $B0/${V0}1/.shard/$gfid_dst.2 ++#TEST ! stat $B0/${V0}0/.shard/$gfid_dst.1 ++#TEST ! stat $B0/${V0}1/.shard/$gfid_dst.1 ++#TEST ! stat $B0/${V0}0/.shard/$gfid_dst.2 ++#TEST ! stat $B0/${V0}1/.shard/$gfid_dst.2 + TEST ! stat $M0/dir/src2 + TEST ! stat $B0/${V0}0/dir/src2 + TEST ! stat $B0/${V0}1/dir/src2 ++TEST stat $B0/${V0}0/.shard/.remove_me/$gfid_dst ++TEST stat $B0/${V0}1/.shard/.remove_me/$gfid_dst + ++EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_dst ++EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_dst ++EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_dst ++EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_dst + # Rename with non-existent dst and a sharded src + TEST touch $M0/dir/src + TEST dd if=/dev/zero of=$M0/dir/src bs=1024 count=9216 +diff --git a/xlators/features/shard/src/shard-mem-types.h b/xlators/features/shard/src/shard-mem-types.h +index 77f0cee..fea66aa 100644 +--- a/xlators/features/shard/src/shard-mem-types.h ++++ b/xlators/features/shard/src/shard-mem-types.h +@@ -18,6 +18,7 @@ enum gf_shard_mem_types_ { + gf_shard_mt_inode_ctx_t, + gf_shard_mt_iovec, + gf_shard_mt_int64_t, ++ gf_shard_mt_uint64_t, + gf_shard_mt_end + }; + #endif +diff --git a/xlators/features/shard/src/shard-messages.h b/xlators/features/shard/src/shard-messages.h +index 8e61630..0267f8a 100644 +--- a/xlators/features/shard/src/shard-messages.h ++++ b/xlators/features/shard/src/shard-messages.h +@@ -40,7 +40,7 @@ + */ + + #define GLFS_COMP_BASE_SHARD GLFS_MSGID_COMP_SHARD +-#define GLFS_NUM_MESSAGES 19 ++#define GLFS_NUM_MESSAGES 20 + #define GLFS_MSGID_END (GLFS_COMP_BASE_SHARD + GLFS_NUM_MESSAGES + 1) + + #define glfs_msg_start_x GLFS_COMP_BASE_SHARD, "Invalid: Start of messages" +@@ -187,5 +187,12 @@ + */ + #define SHARD_MSG_MEMALLOC_FAILED 
(GLFS_COMP_BASE_SHARD + 19) + ++/*! ++ * @messageid 133020 ++ * @diagnosis ++ * @recommendedaction ++*/ ++#define SHARD_MSG_FOP_FAILED (GLFS_COMP_BASE_SHARD + 20) ++ + #define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages" + #endif /* !_SHARD_MESSAGES_H_ */ +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index 268ba20..492341c 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -117,9 +117,6 @@ __shard_inode_ctx_set (inode_t *inode, xlator_t *this, struct iatt *stbuf, + if (valid & SHARD_MASK_BLOCK_SIZE) + ctx->block_size = block_size; + +- if (!stbuf) +- return 0; +- + if (valid & SHARD_MASK_PROT) + ctx->stat.ia_prot = stbuf->ia_prot; + +@@ -179,7 +176,35 @@ shard_inode_ctx_set (inode_t *inode, xlator_t *this, struct iatt *stbuf, + } + + int +-__shard_inode_ctx_set_refreshed_flag (inode_t *inode, xlator_t *this) ++__shard_inode_ctx_set_refresh_flag (inode_t *inode, xlator_t *this) ++{ ++ int ret = -1; ++ shard_inode_ctx_t *ctx = NULL; ++ ++ ret = __shard_inode_ctx_get (inode, this, &ctx); ++ if (ret) ++ return ret; ++ ++ ctx->refresh = _gf_true; ++ ++ return 0; ++} ++int ++shard_inode_ctx_set_refresh_flag (inode_t *inode, xlator_t *this) ++{ ++ int ret = -1; ++ ++ LOCK (&inode->lock); ++ { ++ ret = __shard_inode_ctx_set_refresh_flag (inode, this); ++ } ++ UNLOCK (&inode->lock); ++ ++ return ret; ++} ++ ++int ++__shard_inode_ctx_mark_dir_refreshed (inode_t *inode, xlator_t *this) + { + int ret = -1; + shard_inode_ctx_t *ctx = NULL; +@@ -193,13 +218,13 @@ __shard_inode_ctx_set_refreshed_flag (inode_t *inode, xlator_t *this) + } + + int +-shard_inode_ctx_set_refreshed_flag (inode_t *inode, xlator_t *this) ++shard_inode_ctx_mark_dir_refreshed (inode_t *inode, xlator_t *this) + { + int ret = -1; + + LOCK (&inode->lock); + { +- ret = __shard_inode_ctx_set_refreshed_flag (inode, this); ++ ret = __shard_inode_ctx_mark_dir_refreshed (inode, this); + } + UNLOCK (&inode->lock); + +@@ -478,9 +503,15 @@ shard_local_wipe (shard_local_t *local) + syncbarrier_destroy (&local->barrier); + loc_wipe (&local->loc); + loc_wipe (&local->dot_shard_loc); ++ loc_wipe (&local->dot_shard_rm_loc); + loc_wipe (&local->loc2); + loc_wipe (&local->tmp_loc); ++ loc_wipe (&local->int_inodelk.loc); ++ loc_wipe (&local->int_entrylk.loc); ++ loc_wipe (&local->newloc); + ++ if (local->int_entrylk.basename) ++ GF_FREE (local->int_entrylk.basename); + if (local->fd) + fd_unref (local->fd); + +@@ -504,6 +535,10 @@ shard_local_wipe (shard_local_t *local) + iobref_unref (local->iobref); + if (local->list_inited) + gf_dirent_free (&local->entries_head); ++ if (local->inodelk_frame) ++ SHARD_STACK_DESTROY (local->inodelk_frame); ++ if (local->entrylk_frame) ++ SHARD_STACK_DESTROY (local->entrylk_frame); + } + + int +@@ -554,7 +589,10 @@ shard_internal_dir_string (shard_internal_dir_type_t type) + + switch (type) { + case SHARD_INTERNAL_DIR_DOT_SHARD: +- str = ".shard"; ++ str = GF_SHARD_DIR; ++ break; ++ case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: ++ str = GF_SHARD_REMOVE_ME_DIR; + break; + default: + break; +@@ -566,10 +604,13 @@ static int + shard_init_internal_dir_loc (xlator_t *this, shard_local_t *local, + shard_internal_dir_type_t type) + { +- int ret = -1; +- char *bname = NULL; +- loc_t *internal_dir_loc = NULL; ++ int ret = -1; ++ char *bname = NULL; ++ inode_t *parent = NULL; ++ loc_t *internal_dir_loc = NULL; ++ shard_priv_t *priv = NULL; + ++ priv = this->private; + if (!local) + return -1; + +@@ -577,13 +618,19 @@ 
shard_init_internal_dir_loc (xlator_t *this, shard_local_t *local, + case SHARD_INTERNAL_DIR_DOT_SHARD: + internal_dir_loc = &local->dot_shard_loc; + bname = GF_SHARD_DIR; ++ parent = inode_ref (this->itable->root); ++ break; ++ case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: ++ internal_dir_loc = &local->dot_shard_rm_loc; ++ bname = GF_SHARD_REMOVE_ME_DIR; ++ parent = inode_ref (priv->dot_shard_inode); + break; + default: + break; + } + + internal_dir_loc->inode = inode_new (this->itable); +- internal_dir_loc->parent = inode_ref (this->itable->root); ++ internal_dir_loc->parent = parent; + ret = inode_path (internal_dir_loc->parent, bname, + (char **)&internal_dir_loc->path); + if (ret < 0 || !(internal_dir_loc->inode)) { +@@ -706,11 +753,48 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this, + } + + int +-shard_common_inode_write_failure_unwind (glusterfs_fop_t fop, +- call_frame_t *frame, int32_t op_ret, +- int32_t op_errno) ++shard_common_failure_unwind (glusterfs_fop_t fop, call_frame_t *frame, ++ int32_t op_ret, int32_t op_errno) + { + switch (fop) { ++ case GF_FOP_LOOKUP: ++ SHARD_STACK_UNWIND (lookup, frame, op_ret, op_errno, NULL, NULL, ++ NULL, NULL); ++ break; ++ case GF_FOP_STAT: ++ SHARD_STACK_UNWIND (stat, frame, op_ret, op_errno, NULL, NULL); ++ break; ++ case GF_FOP_FSTAT: ++ SHARD_STACK_UNWIND (fstat, frame, op_ret, op_errno, NULL, NULL); ++ break; ++ case GF_FOP_TRUNCATE: ++ SHARD_STACK_UNWIND (truncate, frame, op_ret, op_errno, NULL, ++ NULL, NULL); ++ break; ++ case GF_FOP_FTRUNCATE: ++ SHARD_STACK_UNWIND (ftruncate, frame, op_ret, op_errno, NULL, ++ NULL, NULL); ++ break; ++ case GF_FOP_MKNOD: ++ SHARD_STACK_UNWIND (mknod, frame, op_ret, op_errno, NULL, ++ NULL, NULL, NULL, NULL); ++ break; ++ case GF_FOP_LINK: ++ SHARD_STACK_UNWIND (link, frame, op_ret, op_errno, NULL, ++ NULL, NULL, NULL, NULL); ++ break; ++ case GF_FOP_CREATE: ++ SHARD_STACK_UNWIND (create, frame, op_ret, op_errno, NULL, ++ NULL, NULL, NULL, NULL, NULL); ++ break; ++ case GF_FOP_UNLINK: ++ SHARD_STACK_UNWIND (unlink, frame, op_ret, op_errno, NULL, NULL, ++ NULL); ++ break; ++ case GF_FOP_RENAME: ++ SHARD_STACK_UNWIND (rename, frame, op_ret, op_errno, NULL, NULL, ++ NULL, NULL, NULL, NULL); ++ break; + case GF_FOP_WRITE: + SHARD_STACK_UNWIND (writev, frame, op_ret, op_errno, + NULL, NULL, NULL); +@@ -727,6 +811,45 @@ shard_common_inode_write_failure_unwind (glusterfs_fop_t fop, + SHARD_STACK_UNWIND (discard, frame, op_ret, op_errno, + NULL, NULL, NULL); + break; ++ case GF_FOP_READ: ++ SHARD_STACK_UNWIND (readv, frame, op_ret, op_errno, NULL, -1, ++ NULL, NULL, NULL); ++ break; ++ case GF_FOP_FSYNC: ++ SHARD_STACK_UNWIND (fsync, frame, op_ret, op_errno, NULL, NULL, ++ NULL); ++ break; ++ case GF_FOP_REMOVEXATTR: ++ SHARD_STACK_UNWIND (removexattr, frame, op_ret, op_errno, NULL); ++ break; ++ case GF_FOP_FREMOVEXATTR: ++ SHARD_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, NULL); ++ break; ++ case GF_FOP_FGETXATTR: ++ SHARD_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, NULL, ++ NULL); ++ break; ++ case GF_FOP_GETXATTR: ++ SHARD_STACK_UNWIND (getxattr, frame, op_ret, op_errno, NULL, ++ NULL); ++ break; ++ case GF_FOP_FSETXATTR: ++ SHARD_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, NULL); ++ break; ++ case GF_FOP_SETXATTR: ++ SHARD_STACK_UNWIND (setxattr, frame, op_ret, op_errno, NULL); ++ break; ++ case GF_FOP_SETATTR: ++ SHARD_STACK_UNWIND (setattr, frame, op_ret, op_errno, NULL, ++ NULL, NULL); ++ break; ++ case GF_FOP_FSETATTR: ++ SHARD_STACK_UNWIND (fsetattr, 
frame, op_ret, op_errno, NULL, ++ NULL, NULL); ++ break; ++ case GF_FOP_SEEK: ++ SHARD_STACK_UNWIND (seek, frame, op_ret, op_errno, 0, NULL); ++ break; + default: + gf_msg (THIS->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, + "Invalid fop id = %d", fop); +@@ -866,7 +989,7 @@ shard_common_resolve_shards (call_frame_t *frame, xlator_t *this, + shard_idx_iter = local->first_block; + res_inode = local->resolver_base_inode; + +- if (local->op_ret < 0) ++ if ((local->op_ret < 0) || (local->resolve_not)) + goto out; + + while (shard_idx_iter <= local->last_block) { +@@ -1063,19 +1186,26 @@ shard_link_internal_dir_inode (shard_local_t *local, inode_t *inode, + shard_priv_t *priv = NULL; + char *bname = NULL; + inode_t **priv_inode = NULL; ++ inode_t *parent = NULL; + + priv = THIS->private; + + switch (type) { + case SHARD_INTERNAL_DIR_DOT_SHARD: +- bname = ".shard"; ++ bname = GF_SHARD_DIR; + priv_inode = &priv->dot_shard_inode; ++ parent = inode->table->root; ++ break; ++ case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: ++ bname = GF_SHARD_REMOVE_ME_DIR; ++ priv_inode = &priv->dot_shard_rm_inode; ++ parent = priv->dot_shard_inode; + break; + default: + break; + } + +- linked_inode = inode_link (inode, inode->table->root, bname, buf); ++ linked_inode = inode_link (inode, parent, bname, buf); + inode_lookup (linked_inode); + *priv_inode = linked_inode; + return linked_inode; +@@ -1105,7 +1235,7 @@ shard_refresh_internal_dir_cbk (call_frame_t *frame, void *cookie, + * shard_link_internal_dir_inode(). + */ + linked_inode = shard_link_internal_dir_inode (local, inode, buf, type); +- shard_inode_ctx_set_refreshed_flag (linked_inode, this); ++ shard_inode_ctx_mark_dir_refreshed (linked_inode, this); + out: + shard_common_resolve_shards (frame, this, local->post_res_handler); + return 0; +@@ -1128,6 +1258,9 @@ shard_refresh_internal_dir (call_frame_t *frame, xlator_t *this, + case SHARD_INTERNAL_DIR_DOT_SHARD: + gf_uuid_copy (gfid, priv->dot_shard_gfid); + break; ++ case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: ++ gf_uuid_copy (gfid, priv->dot_shard_rm_gfid); ++ break; + default: + break; + } +@@ -1189,7 +1322,7 @@ shard_lookup_internal_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this + if (link_inode != inode) { + shard_refresh_internal_dir (frame, this, type); + } else { +- shard_inode_ctx_set_refreshed_flag (link_inode, this); ++ shard_inode_ctx_mark_dir_refreshed (link_inode, this); + shard_common_resolve_shards (frame, this, + local->post_res_handler); + } +@@ -1233,6 +1366,10 @@ shard_lookup_internal_dir (call_frame_t *frame, xlator_t *this, + gf_uuid_copy (*gfid, priv->dot_shard_gfid); + loc = &local->dot_shard_loc; + break; ++ case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: ++ gf_uuid_copy (*gfid, priv->dot_shard_rm_gfid); ++ loc = &local->dot_shard_rm_loc; ++ break; + default: + break; + } +@@ -1383,13 +1520,9 @@ shard_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, + + STACK_WIND (frame, shard_lookup_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->lookup, loc, local->xattr_req); +- + return 0; +- +- + err: +- SHARD_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, +- NULL, NULL); ++ shard_common_failure_unwind (GF_FOP_LOOKUP, frame, -1, op_errno); + return 0; + + } +@@ -1610,11 +1743,9 @@ shard_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) + + STACK_WIND (frame, shard_common_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc, local->xattr_req); +- + return 0; +- + err: +- SHARD_STACK_UNWIND (stat, frame, -1, ENOMEM, NULL, NULL); ++ 
shard_common_failure_unwind (GF_FOP_STAT, frame, -1, ENOMEM); + return 0; + } + +@@ -1668,9 +1799,8 @@ shard_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) + STACK_WIND (frame, shard_common_stat_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd, local->xattr_req); + return 0; +- + err: +- SHARD_STACK_UNWIND (fstat, frame, -1, ENOMEM, NULL, NULL); ++ shard_common_failure_unwind (GF_FOP_FSTAT, frame, -1, ENOMEM); + return 0; + } + +@@ -1728,14 +1858,9 @@ shard_truncate_last_shard_cbk (call_frame_t *frame, void *cookie, + shard_update_file_size (frame, this, NULL, &local->loc, + shard_post_update_size_truncate_handler); + return 0; +- + err: +- if (local->fop == GF_FOP_TRUNCATE) +- SHARD_STACK_UNWIND (truncate, frame, local->op_ret, +- local->op_errno, NULL, NULL, NULL); +- else +- SHARD_STACK_UNWIND (ftruncate, frame, local->op_ret, +- local->op_errno, NULL, NULL, NULL); ++ shard_common_failure_unwind (local->fop, frame, local->op_ret, ++ local->op_errno); + return 0; + } + +@@ -1946,12 +2071,8 @@ shard_post_lookup_shards_truncate_handler (call_frame_t *frame, xlator_t *this) + local = frame->local; + + if (local->op_ret < 0) { +- if (local->fop == GF_FOP_TRUNCATE) +- SHARD_STACK_UNWIND (truncate, frame, local->op_ret, +- local->op_errno, NULL, NULL, NULL); +- else +- SHARD_STACK_UNWIND (ftruncate, frame, local->op_ret, +- local->op_errno, NULL, NULL, NULL); ++ shard_common_failure_unwind (local->fop, frame, local->op_ret, ++ local->op_errno); + return 0; + } + +@@ -2232,16 +2353,9 @@ shard_post_resolve_truncate_handler (call_frame_t *frame, xlator_t *this) + shard_post_update_size_truncate_handler); + return 0; + } else { +- if (local->fop == GF_FOP_TRUNCATE) +- SHARD_STACK_UNWIND (truncate, frame, +- local->op_ret, +- local->op_errno, NULL, NULL, +- NULL); +- else +- SHARD_STACK_UNWIND (ftruncate, frame, +- local->op_ret, +- local->op_errno, NULL, NULL, +- NULL); ++ shard_common_failure_unwind (local->fop, frame, ++ local->op_ret, ++ local->op_errno); + return 0; + } + } +@@ -2329,14 +2443,8 @@ shard_truncate_begin (call_frame_t *frame, xlator_t *this) + return 0; + + err: +- if (local->fop == GF_FOP_TRUNCATE) +- SHARD_STACK_UNWIND (truncate, frame, -1, ENOMEM, NULL, NULL, +- NULL); +- else +- SHARD_STACK_UNWIND (ftruncate, frame, -1, ENOMEM, NULL, NULL, +- NULL); +- +- return 0; ++ shard_common_failure_unwind (local->fop, frame, -1, ENOMEM); ++ return 0; + } + + int +@@ -2348,13 +2456,8 @@ shard_post_lookup_truncate_handler (call_frame_t *frame, xlator_t *this) + local = frame->local; + + if (local->op_ret < 0) { +- if (local->fop == GF_FOP_TRUNCATE) +- SHARD_STACK_UNWIND (truncate, frame, local->op_ret, +- local->op_errno, NULL, NULL, NULL); +- else +- SHARD_STACK_UNWIND (ftruncate, frame, local->op_ret, +- local->op_errno, NULL, NULL, NULL); +- ++ shard_common_failure_unwind (local->fop, frame, local->op_ret, ++ local->op_errno); + return 0; + } + +@@ -2457,7 +2560,7 @@ shard_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, + return 0; + + err: +- SHARD_STACK_UNWIND (truncate, frame, -1, ENOMEM, NULL, NULL, NULL); ++ shard_common_failure_unwind (GF_FOP_TRUNCATE, frame, -1, ENOMEM); + return 0; + } + +@@ -2512,8 +2615,7 @@ shard_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + shard_post_lookup_truncate_handler); + return 0; + err: +- +- SHARD_STACK_UNWIND (ftruncate, frame, -1, ENOMEM, NULL, NULL, NULL); ++ shard_common_failure_unwind (GF_FOP_FTRUNCATE, frame, -1, ENOMEM); + return 0; + } + +@@ -2531,7 
+2633,7 @@ shard_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + if (op_ret == -1) + goto unwind; + +- ret = shard_inode_ctx_set (inode, this, buf, ntoh64 (local->block_size), ++ ret = shard_inode_ctx_set (inode, this, buf, local->block_size, + SHARD_ALL_MASK); + if (ret) + gf_msg (this->name, GF_LOG_WARNING, 0, +@@ -2549,25 +2651,27 @@ int + shard_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, + dev_t rdev, mode_t umask, dict_t *xdata) + { ++ shard_priv_t *priv = NULL; + shard_local_t *local = NULL; + ++ priv = this->private; + local = mem_get0 (this->local_pool); + if (!local) + goto err; + + frame->local = local; ++ local->block_size = priv->block_size; + if (!__is_gsyncd_on_shard_dir (frame, loc)) { +- SHARD_INODE_CREATE_INIT (this, local, xdata, loc, err); ++ SHARD_INODE_CREATE_INIT (this, local->block_size, xdata, loc, 0, ++ 0, err); + } + + STACK_WIND (frame, shard_mknod_cbk, FIRST_CHILD (this), + FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, + xdata); + return 0; +- + err: +- SHARD_STACK_UNWIND (mknod, frame, -1, ENOMEM, NULL, NULL, NULL, +- NULL, NULL); ++ shard_common_failure_unwind (GF_FOP_MKNOD, frame, -1, ENOMEM); + return 0; + + } +@@ -2594,8 +2698,7 @@ shard_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + preparent, postparent, xdata); + return 0; + err: +- SHARD_STACK_UNWIND (link, frame, op_ret, op_errno, inode, NULL, NULL, +- NULL, NULL); ++ shard_common_failure_unwind (GF_FOP_LINK, frame, op_ret, op_errno); + return 0; + } + +@@ -2660,10 +2763,8 @@ shard_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + shard_lookup_base_file (frame, this, &local->loc, + shard_post_lookup_link_handler); + return 0; +- + err: +- SHARD_STACK_UNWIND (link, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, +- NULL); ++ shard_common_failure_unwind (GF_FOP_LINK, frame, -1, ENOMEM); + return 0; + } + +@@ -2678,13 +2779,8 @@ shard_post_lookup_shards_unlink_handler (call_frame_t *frame, xlator_t *this) + local = frame->local; + + if ((local->op_ret < 0) && (local->op_errno != ENOENT)) { +- if (local->fop == GF_FOP_UNLINK) +- SHARD_STACK_UNWIND (unlink, frame, local->op_ret, +- local->op_errno, NULL, NULL, NULL); +- else +- SHARD_STACK_UNWIND (rename, frame, local->op_ret, +- local->op_errno, NULL, NULL, NULL, +- NULL, NULL, NULL); ++ shard_common_failure_unwind (local->fop, frame, local->op_ret, ++ local->op_errno); + return 0; + } + local->op_ret = 0; +@@ -2724,13 +2820,9 @@ shard_post_resolve_unlink_handler (call_frame_t *frame, xlator_t *this) + shard_rename_cbk (frame, this); + return 0; + } else { +- if (local->fop == GF_FOP_UNLINK) +- SHARD_STACK_UNWIND (unlink, frame, +- local->op_ret, +- local->op_errno, NULL, NULL, +- NULL); +- else +- shard_rename_cbk (frame, this); ++ shard_common_failure_unwind (local->fop, frame, ++ local->op_ret, ++ local->op_errno); + return 0; + } + } +@@ -2745,103 +2837,6 @@ shard_post_resolve_unlink_handler (call_frame_t *frame, xlator_t *this) + return 0; + } + +-int +-shard_unlink_base_file_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, +- struct iatt *preparent, struct iatt *postparent, +- dict_t *xdata) +-{ +- int ret = 0; +- uint32_t link_count = 0; +- shard_local_t *local = NULL; +- shard_priv_t *priv = NULL; +- +- local = frame->local; +- priv = this->private; +- +- if (op_ret < 0) { +- SHARD_STACK_UNWIND (unlink, frame, op_ret, op_errno, NULL, NULL, +- NULL); +- return 0; +- } +- +- /* Because link() does not create links for all 
but the +- * base shard, unlink() must delete these shards only when the +- * link count is 1. We can return safely now. +- */ +- if ((xdata) && (!dict_get_uint32 (xdata, GET_LINK_COUNT, &link_count)) +- && (link_count > 1)) +- goto unwind; +- +- local->first_block = get_lowest_block (0, local->block_size); +- local->last_block = get_highest_block (0, local->prebuf.ia_size, +- local->block_size); +- local->num_blocks = local->last_block - local->first_block + 1; +- local->resolver_base_inode = local->loc.inode; +- +- /* num_blocks = 1 implies that the file has not crossed its +- * shard block size. So unlink boils down to unlinking just the +- * base file. We can safely return now. +- */ +- if (local->num_blocks == 1) +- goto unwind; +- +- local->inode_list = GF_CALLOC (local->num_blocks, sizeof (inode_t *), +- gf_shard_mt_inode_list); +- if (!local->inode_list) +- goto unwind; +- +- /* Save the xdata and preparent and postparent iatts now. This will be +- * used at the time of unwinding the call to the parent xl. +- */ +- local->preoldparent = *preparent; +- local->postoldparent = *postparent; +- if (xdata) +- local->xattr_rsp = dict_ref (xdata); +- +- local->dot_shard_loc.inode = inode_find (this->itable, +- priv->dot_shard_gfid); +- if (!local->dot_shard_loc.inode) { +- ret = shard_init_internal_dir_loc (this, local, +- SHARD_INTERNAL_DIR_DOT_SHARD); +- if (ret) +- goto unwind; +- shard_lookup_internal_dir (frame, this, +- shard_post_resolve_unlink_handler, +- SHARD_INTERNAL_DIR_DOT_SHARD); +- } else { +- local->post_res_handler = shard_post_resolve_unlink_handler; +- shard_refresh_internal_dir (frame, this, +- SHARD_INTERNAL_DIR_DOT_SHARD); +- } +- +- return 0; +- +-unwind: +- SHARD_STACK_UNWIND (unlink, frame, op_ret, op_errno, preparent, +- postparent, xdata); +- return 0; +-} +- +-int +-shard_unlink_base_file (call_frame_t *frame, xlator_t *this) +-{ +- shard_local_t *local = NULL; +- +- local = frame->local; +- +- if (dict_set_uint32 (local->xattr_req, GET_LINK_COUNT, 0)) +- gf_msg (this->name, GF_LOG_WARNING, 0, +- SHARD_MSG_DICT_SET_FAILED, "Failed to set " +- GET_LINK_COUNT" in dict"); +- +- /* To-Do: Request open-fd count on base file */ +- STACK_WIND (frame, shard_unlink_base_file_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->unlink, &local->loc, local->xflag, +- local->xattr_req); +- return 0; +-} +- + void + shard_unlink_block_inode (shard_local_t *local, int shard_block_num) + { +@@ -3062,160 +3057,754 @@ next: + } + + int +-shard_post_lookup_unlink_handler (call_frame_t *frame, xlator_t *this) ++shard_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, dict_t *xdata) + { +- shard_local_t *local = NULL; ++ if (op_ret) ++ gf_msg (this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, ++ "Unlock failed. 
Please check brick logs for " ++ "more details"); ++ SHARD_STACK_DESTROY (frame); ++ return 0; ++} + +- local = frame->local; ++int ++shard_unlock_inodelk (call_frame_t *frame, xlator_t *this) ++{ ++ loc_t *loc = NULL; ++ call_frame_t *lk_frame = NULL; ++ shard_local_t *local = NULL; ++ shard_local_t *lk_local = NULL; ++ shard_inodelk_t *lock = NULL; + +- if (local->op_ret < 0) { +- SHARD_STACK_UNWIND (unlink, frame, local->op_ret, +- local->op_errno, NULL, NULL, NULL); +- return 0; +- } ++ local = frame->local; ++ lk_frame = local->inodelk_frame; ++ lk_local = lk_frame->local; ++ local->inodelk_frame = NULL; ++ loc = &local->int_inodelk.loc; ++ lock = &lk_local->int_inodelk; ++ lock->flock.l_type = F_UNLCK; + +- shard_unlink_base_file (frame, this); ++ STACK_WIND (lk_frame, shard_unlock_inodelk_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->inodelk, lock->domain, loc, ++ F_SETLK, &lock->flock, NULL); ++ local->int_inodelk.acquired_lock = _gf_false; + return 0; + } + + int +-shard_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, +- dict_t *xdata) ++shard_rename_src_cbk (call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, struct iatt *buf, ++ struct iatt *preoldparent, struct iatt *postoldparent, ++ struct iatt *prenewparent, struct iatt *postnewparent, ++ dict_t *xdata); ++int ++shard_rename_src_base_file (call_frame_t *frame, xlator_t *this) + { +- int ret = -1; +- uint64_t block_size = 0; +- shard_local_t *local = NULL; ++ int ret = 0; ++ loc_t *dst_loc = NULL; ++ loc_t tmp_loc = {0,}; ++ shard_local_t *local = frame->local; + +- ret = shard_inode_ctx_get_block_size (loc->inode, this, &block_size); +- if ((ret) && (!IA_ISLNK(loc->inode->ia_type))) { +- gf_msg (this->name, GF_LOG_ERROR, 0, +- SHARD_MSG_INODE_CTX_GET_FAILED, "Failed to get block " +- "size from inode ctx of %s", +- uuid_utoa (loc->inode->gfid)); +- goto err; +- } ++ if (local->dst_block_size) { ++ tmp_loc.parent = inode_ref (local->loc2.parent); ++ ret = inode_path (tmp_loc.parent, local->loc2.name, ++ (char **)&tmp_loc.path); ++ if (ret < 0) { ++ gf_msg (this->name, GF_LOG_ERROR, 0, ++ SHARD_MSG_INODE_PATH_FAILED, "Inode path failed" ++ " on pargfid=%s bname=%s", ++ uuid_utoa (tmp_loc.parent->gfid), ++ local->loc2.name); ++ local->op_ret = -1; ++ local->op_errno = ENOMEM; ++ goto err; ++ } + +- if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { +- STACK_WIND (frame, default_unlink_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata); +- return 0; ++ tmp_loc.name = strrchr (tmp_loc.path, '/'); ++ if (tmp_loc.name) ++ tmp_loc.name++; ++ dst_loc = &tmp_loc; ++ } else { ++ dst_loc = &local->loc2; + } + +- local = mem_get0 (this->local_pool); +- if (!local) +- goto err; +- +- frame->local = local; +- +- loc_copy (&local->loc, loc); +- local->xflag = xflag; +- local->xattr_req = (xdata) ? 
dict_ref (xdata) : dict_new (); +- local->block_size = block_size; +- local->resolver_base_inode = loc->inode; +- local->fop = GF_FOP_UNLINK; +- if (!this->itable) +- this->itable = (local->loc.inode)->table; +- +- shard_lookup_base_file (frame, this, &local->loc, +- shard_post_lookup_unlink_handler); ++ /* To-Do: Request open-fd count on dst base file */ ++ STACK_WIND (frame, shard_rename_src_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->rename, &local->loc, dst_loc, ++ local->xattr_req); ++ loc_wipe (&tmp_loc); + return 0; + err: +- SHARD_STACK_UNWIND (unlink, frame, -1, ENOMEM, NULL, NULL, NULL); ++ loc_wipe (&tmp_loc); ++ shard_common_failure_unwind (local->fop, frame, local->op_ret, ++ local->op_errno); + return 0; +- + } + + int +-shard_rename_cbk (call_frame_t *frame, xlator_t *this) ++shard_unlink_base_file (call_frame_t *frame, xlator_t *this); ++ ++int ++shard_set_size_attrs_on_marker_file_cbk (call_frame_t *frame, void *cookie, ++ xlator_t *this, int32_t op_ret, ++ int32_t op_errno, dict_t *dict, ++ dict_t *xdata) + { ++ shard_priv_t *priv = NULL; + shard_local_t *local = NULL; + ++ priv = this->private; + local = frame->local; ++ if (op_ret < 0) { ++ gf_msg (this->name, GF_LOG_ERROR, op_errno, ++ SHARD_MSG_FOP_FAILED, "Xattrop on marker file failed " ++ "while performing %s; entry gfid=%s", ++ gf_fop_string (local->fop), local->newloc.name); ++ goto err; ++ } + +- SHARD_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, +- &local->prebuf, &local->preoldparent, +- &local->postoldparent, &local->prenewparent, +- &local->postnewparent, local->xattr_rsp); ++ inode_unlink (local->newloc.inode, priv->dot_shard_rm_inode, ++ local->newloc.name); ++ ++ if (local->fop == GF_FOP_UNLINK) ++ shard_unlink_base_file (frame, this); ++ else if (local->fop == GF_FOP_RENAME) ++ shard_rename_src_base_file (frame, this); ++ return 0; ++err: ++ shard_common_failure_unwind (local->fop, frame, op_ret, op_errno); + return 0; + } + + int +-shard_rename_unlink_dst_shards_do (call_frame_t *frame, xlator_t *this) ++shard_set_size_attrs_on_marker_file (call_frame_t *frame, xlator_t *this) + { +- int ret = -1; +- uint32_t link_count = 0; +- shard_local_t *local = NULL; +- shard_priv_t *priv = NULL; ++ int op_errno = ENOMEM; ++ uint64_t bs = 0; ++ dict_t *xdata = NULL; ++ shard_local_t *local = NULL; + + local = frame->local; +- priv = this->private; +- +- local->first_block = get_lowest_block (0, local->dst_block_size); +- local->last_block = get_highest_block (0, local->postbuf.ia_size, +- local->dst_block_size); +- local->num_blocks = local->last_block - local->first_block + 1; +- local->resolver_base_inode = local->loc2.inode; ++ xdata = dict_new (); ++ if (!xdata) ++ goto err; + +- if ((local->xattr_rsp) && +- (!dict_get_uint32 (local->xattr_rsp, GET_LINK_COUNT, &link_count)) +- && (link_count > 1)) { +- shard_rename_cbk (frame, this); +- return 0; ++ if (local->fop == GF_FOP_UNLINK) ++ bs = local->block_size; ++ else if (local->fop == GF_FOP_RENAME) ++ bs = local->dst_block_size; ++ SHARD_INODE_CREATE_INIT (this, bs, xdata, &local->newloc, ++ local->prebuf.ia_size, 0, err); ++ STACK_WIND (frame, shard_set_size_attrs_on_marker_file_cbk, ++ FIRST_CHILD(this), FIRST_CHILD(this)->fops->xattrop, ++ &local->newloc, GF_XATTROP_GET_AND_SET, xdata, NULL); ++ dict_unref (xdata); ++ return 0; ++err: ++ if (xdata) ++ dict_unref (xdata); ++ shard_common_failure_unwind (local->fop, frame, -1, op_errno); ++ return 0; ++} ++ ++int ++shard_lookup_marker_file_cbk (call_frame_t *frame, void *cookie, 
xlator_t *this, ++ int32_t op_ret, int32_t op_errno, inode_t *inode, ++ struct iatt *buf, dict_t *xdata, ++ struct iatt *postparent) ++{ ++ inode_t *linked_inode = NULL; ++ shard_priv_t *priv = NULL; ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ priv = this->private; ++ ++ if (op_ret < 0) { ++ gf_msg (this->name, GF_LOG_ERROR, op_errno, ++ SHARD_MSG_FOP_FAILED, "Lookup on marker file failed " ++ "while performing %s; entry gfid=%s", ++ gf_fop_string (local->fop), local->newloc.name); ++ goto err; + } + +- if (local->num_blocks == 1) { +- shard_rename_cbk (frame, this); ++ linked_inode = inode_link (inode, priv->dot_shard_rm_inode, ++ local->newloc.name, buf); ++ inode_unref (local->newloc.inode); ++ local->newloc.inode = linked_inode; ++ shard_set_size_attrs_on_marker_file (frame, this); ++ return 0; ++err: ++ shard_common_failure_unwind (local->fop, frame, op_ret, op_errno); ++ return 0; ++} ++ ++int ++shard_lookup_marker_file (call_frame_t *frame, xlator_t *this) ++{ ++ int op_errno = ENOMEM; ++ dict_t *xattr_req = NULL; ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ xattr_req = shard_create_gfid_dict (local->xattr_req); ++ if (!xattr_req) ++ goto err; ++ ++ STACK_WIND (frame, shard_lookup_marker_file_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->lookup, &local->newloc, xattr_req); ++ dict_unref (xattr_req); ++ return 0; ++err: ++ if (xattr_req) ++ dict_unref (xattr_req); ++ shard_common_failure_unwind (local->fop, frame, -1, op_errno); ++ return 0; ++} ++ ++int ++shard_create_marker_file_under_remove_me_cbk (call_frame_t *frame, void *cookie, ++ xlator_t *this, int32_t op_ret, ++ int32_t op_errno, inode_t *inode, ++ struct iatt *buf, ++ struct iatt *preparent, ++ struct iatt *postparent, ++ dict_t *xdata) ++{ ++ inode_t *linked_inode = NULL; ++ shard_priv_t *priv = NULL; ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ priv = this->private; ++ ++ SHARD_UNSET_ROOT_FS_ID (frame, local); ++ if (op_ret < 0) { ++ if ((op_errno != EEXIST) && (op_errno != ENODATA)) { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ gf_msg (this->name, GF_LOG_ERROR, op_errno, ++ SHARD_MSG_FOP_FAILED, "Marker file creation " ++ "failed while performing %s; entry gfid=%s", ++ gf_fop_string (local->fop), local->newloc.name); ++ goto err; ++ } else { ++ shard_lookup_marker_file (frame, this); ++ return 0; ++ } ++ } ++ ++ linked_inode = inode_link (inode, priv->dot_shard_rm_inode, ++ local->newloc.name, buf); ++ inode_unref (local->newloc.inode); ++ local->newloc.inode = linked_inode; ++ ++ if (local->fop == GF_FOP_UNLINK) ++ shard_unlink_base_file (frame, this); ++ else if (local->fop == GF_FOP_RENAME) ++ shard_rename_src_base_file (frame, this); ++ return 0; ++err: ++ shard_common_failure_unwind (local->fop, frame, -1, local->op_errno); ++ return 0; ++} ++ ++int ++shard_create_marker_file_under_remove_me (call_frame_t *frame, xlator_t *this, ++ loc_t *loc) ++{ ++ int ret = 0; ++ int op_errno = ENOMEM; ++ uint64_t bs = 0; ++ char g1[64] = {0,}; ++ char g2[64] = {0,}; ++ dict_t *xattr_req = NULL; ++ shard_priv_t *priv = NULL; ++ shard_local_t *local = NULL; ++ ++ priv = this->private; ++ local = frame->local; ++ ++ SHARD_SET_ROOT_FS_ID (frame, local); ++ ++ xattr_req = shard_create_gfid_dict (local->xattr_req); ++ if (!xattr_req) ++ goto err; ++ ++ local->newloc.inode = inode_new (this->itable); ++ local->newloc.parent = inode_ref (priv->dot_shard_rm_inode); ++ ret = inode_path (local->newloc.parent, uuid_utoa (loc->inode->gfid), ++ (char 
**)&local->newloc.path); ++ if (ret < 0) { ++ gf_msg (this->name, GF_LOG_ERROR, 0, ++ SHARD_MSG_INODE_PATH_FAILED, "Inode path failed on " ++ "pargfid=%s bname=%s", ++ uuid_utoa_r (priv->dot_shard_rm_gfid, g1), ++ uuid_utoa_r (loc->inode->gfid, g2)); ++ goto err; ++ } ++ local->newloc.name = strrchr (local->newloc.path, '/'); ++ if (local->newloc.name) ++ local->newloc.name++; ++ ++ if (local->fop == GF_FOP_UNLINK) ++ bs = local->block_size; ++ else if (local->fop == GF_FOP_RENAME) ++ bs = local->dst_block_size; ++ ++ SHARD_INODE_CREATE_INIT (this, bs, xattr_req, &local->newloc, ++ local->prebuf.ia_size, 0, err); ++ ++ STACK_WIND (frame, shard_create_marker_file_under_remove_me_cbk, ++ FIRST_CHILD(this), FIRST_CHILD(this)->fops->mknod, ++ &local->newloc, 0, 0, 0644, xattr_req); ++ dict_unref (xattr_req); ++ return 0; ++ ++err: ++ if (xattr_req) ++ dict_unref (xattr_req); ++ shard_create_marker_file_under_remove_me_cbk (frame, 0, this, -1, ++ op_errno, NULL, NULL, ++ NULL, NULL, NULL); ++ return 0; ++} ++ ++int ++shard_unlock_entrylk (call_frame_t *frame, xlator_t *this); ++ ++int ++shard_unlink_base_file_cbk (call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, ++ struct iatt *preparent, struct iatt *postparent, ++ dict_t *xdata) ++{ ++ int ret = 0; ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ if (op_ret < 0) { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ } else { ++ local->preoldparent = *preparent; ++ local->postoldparent = *postparent; ++ if (xdata) ++ local->xattr_rsp = dict_ref (xdata); ++ } ++ if (local->entrylk_frame) { ++ ret = shard_unlock_entrylk (frame, this); ++ if (ret < 0) { ++ local->op_ret = -1; ++ local->op_errno = -ret; ++ } ++ } ++ ++ ret = shard_unlock_inodelk (frame, this); ++ if (ret < 0) { ++ local->op_ret = -1; ++ local->op_errno = -ret; ++ } ++ shard_unlink_cbk (frame, this); ++ return 0; ++} ++ ++int ++shard_unlink_base_file (call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = frame->local; ++ ++ /* To-Do: Request open-fd count on base file */ ++ STACK_WIND (frame, shard_unlink_base_file_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->unlink, &local->loc, local->xflag, ++ local->xattr_req); ++ return 0; ++} ++ ++int ++shard_unlock_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, dict_t *xdata) ++{ ++ if (op_ret) ++ gf_msg (this->name, GF_LOG_ERROR, op_errno, SHARD_MSG_FOP_FAILED, ++ "Unlock failed. 
Please check brick logs for " ++ "more details"); ++ SHARD_STACK_DESTROY (frame); ++ return 0; ++} ++ ++int ++shard_unlock_entrylk (call_frame_t *frame, xlator_t *this) ++{ ++ loc_t *loc = NULL; ++ call_frame_t *lk_frame = NULL; ++ shard_local_t *local = NULL; ++ shard_local_t *lk_local = NULL; ++ shard_entrylk_t *lock = NULL; ++ ++ local = frame->local; ++ lk_frame = local->entrylk_frame; ++ lk_local = lk_frame->local; ++ local->entrylk_frame = NULL; ++ lock = &lk_local->int_entrylk; ++ loc = &lock->loc; ++ ++ STACK_WIND (lk_frame, shard_unlock_entrylk_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->entrylk, this->name, loc, ++ lk_local->int_entrylk.basename, ENTRYLK_UNLOCK, ++ ENTRYLK_WRLCK, NULL); ++ local->int_entrylk.acquired_lock = _gf_false; ++ return 0; ++} ++ ++int ++shard_post_entrylk_fop_handler (call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ switch (local->fop) { ++ case GF_FOP_UNLINK: ++ case GF_FOP_RENAME: ++ shard_create_marker_file_under_remove_me (frame, this, ++ &local->int_inodelk.loc); ++ break; ++ default: ++ gf_msg (this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, ++ "post-entrylk handler not defined. This case should not" ++ " be hit"); ++ break; ++ } ++ return 0; ++} ++ ++int ++shard_acquire_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, dict_t *xdata) ++{ ++ call_frame_t *main_frame = NULL; ++ shard_local_t *local = NULL; ++ shard_local_t *main_local = NULL; ++ ++ local = frame->local; ++ main_frame = local->main_frame; ++ main_local = main_frame->local; ++ ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind (main_local->fop, main_frame, ++ op_ret, op_errno); + return 0; + } ++ main_local->int_entrylk.acquired_lock = _gf_true; ++ shard_post_entrylk_fop_handler (main_frame, this); ++ return 0; ++} + +- local->inode_list = GF_CALLOC (local->num_blocks, sizeof (inode_t *), +- gf_shard_mt_inode_list); +- if (!local->inode_list) +- goto out; ++int ++shard_acquire_entrylk (call_frame_t *frame, xlator_t *this, inode_t *inode, ++ uuid_t gfid) ++{ ++ char gfid_str[GF_UUID_BUF_SIZE] = {0,}; ++ shard_local_t *local = NULL; ++ shard_local_t *entrylk_local = NULL; ++ shard_entrylk_t *int_entrylk = NULL; ++ call_frame_t *entrylk_frame = NULL; + +- local->dot_shard_loc.inode = inode_find (this->itable, +- priv->dot_shard_gfid); +- if (!local->dot_shard_loc.inode) { +- ret = shard_init_internal_dir_loc (this, local, +- SHARD_INTERNAL_DIR_DOT_SHARD); +- if (ret) +- goto out; +- shard_lookup_internal_dir (frame, this, +- shard_post_resolve_unlink_handler, +- SHARD_INTERNAL_DIR_DOT_SHARD); ++ entrylk_frame = create_frame (this, this->ctx->pool); ++ if (!entrylk_frame) { ++ gf_msg (this->name, GF_LOG_WARNING, ENOMEM, ++ SHARD_MSG_MEMALLOC_FAILED, "Failed to create new frame " ++ "to lock marker file"); ++ goto err; ++ } ++ ++ entrylk_local = mem_get0 (this->local_pool); ++ if (!entrylk_local) { ++ STACK_DESTROY (entrylk_frame->root); ++ goto err; ++ } ++ ++ local = frame->local; ++ entrylk_frame->local = entrylk_local; ++ entrylk_local->main_frame = frame; ++ int_entrylk = &entrylk_local->int_entrylk; ++ ++ int_entrylk->loc.inode = inode_ref (inode); ++ set_lk_owner_from_ptr (&entrylk_frame->root->lk_owner, ++ entrylk_frame->root); ++ local->entrylk_frame = entrylk_frame; ++ gf_uuid_unparse (gfid, gfid_str); ++ int_entrylk->basename = gf_strdup (gfid_str); ++ ++ STACK_WIND (entrylk_frame, shard_acquire_entrylk_cbk, FIRST_CHILD(this), ++ 
FIRST_CHILD(this)->fops->entrylk, this->name, ++ &int_entrylk->loc, int_entrylk->basename, ENTRYLK_LOCK, ++ ENTRYLK_WRLCK, NULL); ++ return 0; ++err: ++ shard_common_failure_unwind (local->fop, frame, -1, ENOMEM); ++ return 0; ++} ++ ++int ++shard_post_lookup_base_shard_rm_handler (call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ shard_priv_t *priv = NULL; ++ ++ priv = this->private; ++ local = frame->local; ++ ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind (local->fop, frame, -1, ++ local->op_errno); ++ return 0; ++ } ++ ++ if (local->prebuf.ia_nlink > 1) { ++ gf_msg_debug (this->name, 0, "link count on %s > 1:%d, " ++ "performing rename()/unlink()", ++ local->int_inodelk.loc.path, local->prebuf.ia_nlink); ++ if (local->fop == GF_FOP_RENAME) ++ shard_rename_src_base_file (frame, this); ++ else if (local->fop == GF_FOP_UNLINK) ++ shard_unlink_base_file (frame, this); + } else { +- local->post_res_handler = shard_post_resolve_unlink_handler; +- shard_refresh_internal_dir (frame, this, +- SHARD_INTERNAL_DIR_DOT_SHARD); ++ gf_msg_debug (this->name, 0, "link count on %s = 1, creating " ++ "file under .remove_me", local->int_inodelk.loc.path); ++ shard_acquire_entrylk (frame, this, priv->dot_shard_rm_inode, ++ local->prebuf.ia_gfid); + } ++ return 0; ++} + ++int ++shard_post_inodelk_fop_handler (call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ switch (local->fop) { ++ case GF_FOP_UNLINK: ++ case GF_FOP_RENAME: ++ shard_lookup_base_file (frame, this, &local->int_inodelk.loc, ++ shard_post_lookup_base_shard_rm_handler); ++ break; ++ default: ++ gf_msg (this->name, GF_LOG_WARNING, 0, SHARD_MSG_INVALID_FOP, ++ "post-inodelk handler not defined. This case should not" ++ " be hit"); ++ break; ++ } + return 0; ++} + +-out: +- SHARD_STACK_UNWIND (rename, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, +- NULL, NULL); ++int ++shard_acquire_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, dict_t *xdata) ++{ ++ call_frame_t *main_frame = NULL; ++ shard_local_t *local = NULL; ++ shard_local_t *main_local = NULL; ++ ++ local = frame->local; ++ main_frame = local->main_frame; ++ main_local = main_frame->local; ++ ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind (main_local->fop, main_frame, ++ op_ret, op_errno); ++ return 0; ++ } ++ main_local->int_inodelk.acquired_lock = _gf_true; ++ shard_post_inodelk_fop_handler (main_frame, this); + return 0; + } + + int +-shard_post_rename_lookup_handler (call_frame_t *frame, xlator_t *this) ++shard_acquire_inodelk (call_frame_t *frame, xlator_t *this, loc_t *loc) ++{ ++ call_frame_t *lk_frame = NULL; ++ shard_local_t *local = NULL; ++ shard_local_t *lk_local = NULL; ++ shard_inodelk_t *int_inodelk = NULL; ++ ++ lk_frame = create_frame (this, this->ctx->pool); ++ if (!lk_frame) { ++ gf_msg (this->name, GF_LOG_WARNING, ENOMEM, ++ SHARD_MSG_MEMALLOC_FAILED, "Failed to create new frame " ++ "to lock base shard"); ++ goto err; ++ } ++ lk_local = mem_get0 (this->local_pool); ++ if (!lk_local) { ++ STACK_DESTROY (lk_frame->root); ++ goto err; ++ } ++ ++ local = frame->local; ++ lk_frame->local = lk_local; ++ lk_local->main_frame = frame; ++ int_inodelk = &lk_local->int_inodelk; ++ ++ int_inodelk->flock.l_len = 0; ++ int_inodelk->flock.l_start = 0; ++ int_inodelk->domain = this->name; ++ int_inodelk->flock.l_type = F_WRLCK; ++ loc_copy (&local->int_inodelk.loc, loc); ++ set_lk_owner_from_ptr (&lk_frame->root->lk_owner, 
lk_frame->root); ++ local->inodelk_frame = lk_frame; ++ ++ STACK_WIND (lk_frame, shard_acquire_inodelk_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->inodelk, int_inodelk->domain, ++ &local->int_inodelk.loc, F_SETLKW, &int_inodelk->flock, NULL); ++ return 0; ++err: ++ shard_common_failure_unwind (local->fop, frame, -1, ENOMEM); ++ return 0; ++} ++ ++int ++shard_post_mkdir_rm_handler (call_frame_t *frame, xlator_t *this) + { ++ loc_t *loc = NULL; + shard_local_t *local = NULL; + + local = frame->local; + + if (local->op_ret < 0) { +- SHARD_STACK_UNWIND (rename, frame, local->op_ret, +- local->op_errno, NULL, NULL, NULL, NULL, +- NULL, NULL); ++ shard_common_failure_unwind (local->fop, frame, -1, ++ local->op_errno); + return 0; + } ++ if (local->fop == GF_FOP_UNLINK) ++ loc = &local->loc; ++ else if (local->fop == GF_FOP_RENAME) ++ loc = &local->loc2; ++ shard_acquire_inodelk (frame, this, loc); ++ return 0; ++} + +- if (local->dst_block_size) +- shard_rename_unlink_dst_shards_do (frame, this); +- else +- shard_rename_cbk (frame, this); ++int ++shard_mkdir_internal_dir (call_frame_t *frame, xlator_t *this, ++ shard_post_resolve_fop_handler_t handler, ++ shard_internal_dir_type_t type); ++int ++shard_pre_mkdir_rm_handler (call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ ++ local = frame->local; + ++ if (local->op_ret < 0) { ++ shard_common_failure_unwind (local->fop, frame, -1, ++ local->op_errno); ++ return 0; ++ } ++ shard_mkdir_internal_dir (frame, this, shard_post_mkdir_rm_handler, ++ SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME); ++ return 0; ++} ++ ++void ++shard_begin_rm_resolution (call_frame_t *frame, xlator_t *this) ++{ ++ shard_priv_t *priv = NULL; ++ shard_local_t *local = NULL; ++ ++ priv = this->private; ++ local = frame->local; ++ ++ local->dot_shard_rm_loc.inode = inode_find (this->itable, ++ priv->dot_shard_rm_gfid); ++ if (!local->dot_shard_rm_loc.inode) { ++ local->dot_shard_loc.inode = inode_find (this->itable, ++ priv->dot_shard_gfid); ++ if (!local->dot_shard_loc.inode) { ++ shard_mkdir_internal_dir (frame, this, ++ shard_pre_mkdir_rm_handler, ++ SHARD_INTERNAL_DIR_DOT_SHARD); ++ } else { ++ local->post_res_handler = shard_pre_mkdir_rm_handler; ++ shard_refresh_internal_dir (frame, this, ++ SHARD_INTERNAL_DIR_DOT_SHARD); ++ } ++ } else { ++ local->post_res_handler = shard_post_mkdir_rm_handler; ++ shard_refresh_internal_dir (frame, this, ++ SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME); ++ } ++} ++ ++int ++shard_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, ++ dict_t *xdata) ++{ ++ int ret = -1; ++ uint64_t block_size = 0; ++ shard_local_t *local = NULL; ++ ++ ret = shard_inode_ctx_get_block_size (loc->inode, this, &block_size); ++ if ((ret) && (!IA_ISLNK(loc->inode->ia_type))) { ++ gf_msg (this->name, GF_LOG_ERROR, 0, ++ SHARD_MSG_INODE_CTX_GET_FAILED, "Failed to get block " ++ "size from inode ctx of %s", ++ uuid_utoa (loc->inode->gfid)); ++ goto err; ++ } ++ ++ if (!block_size || frame->root->pid == GF_CLIENT_PID_GSYNCD) { ++ STACK_WIND (frame, default_unlink_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata); ++ return 0; ++ } ++ ++ local = mem_get0 (this->local_pool); ++ if (!local) ++ goto err; ++ ++ frame->local = local; ++ ++ loc_copy (&local->loc, loc); ++ local->xflag = xflag; ++ local->xattr_req = (xdata) ? 
dict_ref (xdata) : dict_new (); ++ local->block_size = block_size; ++ local->resolver_base_inode = loc->inode; ++ local->fop = GF_FOP_UNLINK; ++ if (!this->itable) ++ this->itable = (local->loc.inode)->table; ++ ++ local->resolve_not = _gf_true; ++ shard_begin_rm_resolution (frame, this); ++ return 0; ++err: ++ shard_common_failure_unwind (GF_FOP_UNLINK, frame, -1, ENOMEM); ++ return 0; ++} ++ ++int ++shard_rename_cbk (call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ SHARD_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, ++ &local->prebuf, &local->preoldparent, ++ &local->postoldparent, &local->prenewparent, ++ &local->postnewparent, local->xattr_rsp); ++ return 0; ++} ++ ++int ++shard_post_rename_lookup_handler (call_frame_t *frame, xlator_t *this) ++{ ++ shard_rename_cbk (frame, this); + return 0; + } + +@@ -3226,6 +3815,7 @@ shard_rename_src_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + struct iatt *prenewparent, struct iatt *postnewparent, + dict_t *xdata) + { ++ int ret = 0; + shard_local_t *local = NULL; + + local = frame->local; +@@ -3235,6 +3825,11 @@ shard_rename_src_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + local->op_errno = op_errno; + goto err; + } ++ /* Set ctx->refresh to TRUE to force a lookup on disk when ++ * shard_lookup_base_file() is called next to refresh the hard link ++ * count in ctx ++ */ ++ shard_inode_ctx_set_refresh_flag (local->int_inodelk.loc.inode, this); + + local->prebuf = *buf; + local->preoldparent = *preoldparent; +@@ -3244,40 +3839,37 @@ shard_rename_src_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + if (xdata) + local->xattr_rsp = dict_ref (xdata); + +- /* Now the base file is looked up to gather the ia_size and ia_blocks.*/ ++ if (local->dst_block_size) { ++ if (local->entrylk_frame) { ++ ret = shard_unlock_entrylk (frame, this); ++ if (ret < 0) { ++ local->op_ret = -1; ++ local->op_errno = -ret; ++ } ++ } + ++ ret = shard_unlock_inodelk (frame, this); ++ if (ret < 0) { ++ local->op_ret = -1; ++ local->op_errno = -ret; ++ goto err; ++ } ++ } ++ ++ /* Now the base file of src, if sharded, is looked up to gather ia_size ++ * and ia_blocks.*/ + if (local->block_size) { + local->tmp_loc.inode = inode_new (this->itable); + gf_uuid_copy (local->tmp_loc.gfid, (local->loc.inode)->gfid); + shard_lookup_base_file (frame, this, &local->tmp_loc, + shard_post_rename_lookup_handler); + } else { +- shard_rename_unlink_dst_shards_do (frame, this); ++ shard_rename_cbk (frame, this); + } +- + return 0; + err: +- SHARD_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, NULL, +- NULL, NULL, NULL, NULL, NULL); +- return 0; +-} +- +-int +-shard_rename_src_base_file (call_frame_t *frame, xlator_t *this) +-{ +- shard_local_t *local = NULL; +- +- local = frame->local; +- +- if (dict_set_uint32 (local->xattr_req, GET_LINK_COUNT, 0)) +- gf_msg (this->name, GF_LOG_WARNING, 0, +- SHARD_MSG_DICT_SET_FAILED, "Failed to set " +- GET_LINK_COUNT" in dict"); +- +- /* To-Do: Request open-fd count on dst base file */ +- STACK_WIND (frame, shard_rename_src_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->rename, &local->loc, &local->loc2, +- local->xattr_req); ++ shard_common_failure_unwind (local->fop, frame, local->op_ret, ++ local->op_errno); + return 0; + } + +@@ -3289,9 +3881,8 @@ shard_post_lookup_dst_base_file_handler (call_frame_t *frame, xlator_t *this) + local = frame->local; + + if (local->op_ret < 0) { +- SHARD_STACK_UNWIND (rename, frame, local->op_ret, +- 
local->op_errno, NULL, NULL, NULL, NULL, +- NULL, NULL); ++ shard_common_failure_unwind (local->fop, frame, local->op_ret, ++ local->op_errno); + return 0; + } + +@@ -3332,6 +3923,7 @@ shard_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + if (newloc->inode) + ret = shard_inode_ctx_get_block_size (newloc->inode, this, + &dst_block_size); ++ + /* The following stack_wind covers the case where: + * a. the src file is not sharded and dst doesn't exist, OR + * b. the src and dst both exist but are not sharded. +@@ -3361,26 +3953,26 @@ shard_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + local->dst_block_size = dst_block_size; + if (!this->itable) + this->itable = (local->loc.inode)->table; ++ local->resolve_not = _gf_true; + +- if (local->dst_block_size) +- /* The if block covers the case where the dst file exists and is +- * sharded. So it is important to look up this inode, record its +- * size, before renaming src to dst, so as to NOT lose this +- * information. +- */ +- shard_lookup_base_file (frame, this, &local->loc2, +- shard_post_lookup_dst_base_file_handler); +- else +- /* The following block covers the case where the dst either +- * doesn't exist or is NOT sharded. In this case, shard xlator +- * would go ahead and rename src to dst. +- */ ++ /* The following if-block covers the case where the dst file exists ++ * and is sharded. ++ */ ++ if (local->dst_block_size) { ++ shard_begin_rm_resolution (frame, this); ++ } else { ++ /* The following block covers the case where the dst either doesn't ++ * exist or is NOT sharded but the src is sharded. In this case, shard ++ * xlator would go ahead and rename src to dst. Once done, it would also ++ * lookup the base shard of src to get the ia_size and ia_blocks xattr ++ * values. 
++ */ + shard_rename_src_base_file (frame, this); ++ } + return 0; + + err: +- SHARD_STACK_UNWIND (rename, frame, -1, ENOMEM, NULL, NULL, NULL, +- NULL, NULL, NULL); ++ shard_common_failure_unwind (GF_FOP_RENAME, frame, -1, ENOMEM); + return 0; + + } +@@ -3400,8 +3992,8 @@ shard_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + if (op_ret == -1) + goto unwind; + +- ret = shard_inode_ctx_set (inode, this, stbuf, +- ntoh64 (local->block_size), SHARD_ALL_MASK); ++ ret = shard_inode_ctx_set (inode, this, stbuf, local->block_size, ++ SHARD_ALL_MASK); + if (ret) + gf_msg (this->name, GF_LOG_WARNING, 0, + SHARD_MSG_INODE_CTX_SET_FAILED, "Failed to set inode " +@@ -3417,28 +4009,29 @@ int + shard_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) + { ++ shard_priv_t *priv = NULL; + shard_local_t *local = NULL; + ++ priv = this->private; + local = mem_get0 (this->local_pool); + if (!local) + goto err; + + frame->local = local; ++ local->block_size = priv->block_size; + + if (!__is_gsyncd_on_shard_dir (frame, loc)) { +- SHARD_INODE_CREATE_INIT (this, local, xdata, loc, err); ++ SHARD_INODE_CREATE_INIT (this, local->block_size, xdata, loc, 0, ++ 0, err); + } + + STACK_WIND (frame, shard_create_cbk, FIRST_CHILD (this), + FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, + fd, xdata); + return 0; +- + err: +- SHARD_STACK_UNWIND (create, frame, -1, ENOMEM, NULL, NULL, NULL, +- NULL, NULL, NULL); ++ shard_common_failure_unwind (GF_FOP_CREATE, frame, -1, ENOMEM); + return 0; +- + } + + int +@@ -3523,9 +4116,9 @@ out: + if (call_count == 0) { + SHARD_UNSET_ROOT_FS_ID (frame, local); + if (local->op_ret < 0) { +- SHARD_STACK_UNWIND (readv, frame, local->op_ret, +- local->op_errno, NULL, 0, NULL, +- NULL, NULL); ++ shard_common_failure_unwind (GF_FOP_READ, frame, ++ local->op_ret, ++ local->op_errno); + } else { + if (xdata) + local->xattr_rsp = dict_ref (xdata); +@@ -3792,8 +4385,8 @@ shard_post_lookup_shards_readv_handler (call_frame_t *frame, xlator_t *this) + local = frame->local; + + if (local->op_ret < 0) { +- SHARD_STACK_UNWIND (readv, frame, local->op_ret, +- local->op_errno, NULL, 0, NULL, NULL, NULL); ++ shard_common_failure_unwind (GF_FOP_READ, frame, local->op_ret, ++ local->op_errno); + return 0; + } + +@@ -3815,8 +4408,8 @@ shard_post_mknod_readv_handler (call_frame_t *frame, xlator_t *this) + local = frame->local; + + if (local->op_ret < 0) { +- SHARD_STACK_UNWIND (readv, frame, local->op_ret, +- local->op_errno, NULL, 0, NULL, NULL, NULL); ++ shard_common_failure_unwind (GF_FOP_READ, frame, local->op_ret, ++ local->op_errno); + return 0; + } + +@@ -3839,9 +4432,9 @@ shard_post_resolve_readv_handler (call_frame_t *frame, xlator_t *this) + + if (local->op_ret < 0) { + if (local->op_errno != ENOENT) { +- SHARD_STACK_UNWIND (readv, frame, local->op_ret, +- local->op_errno, NULL, 0, NULL, +- NULL, NULL); ++ shard_common_failure_unwind (GF_FOP_READ, frame, ++ local->op_ret, ++ local->op_errno); + return 0; + } else { + struct iovec vec = {0,}; +@@ -3878,8 +4471,8 @@ shard_post_lookup_readv_handler (call_frame_t *frame, xlator_t *this) + local = frame->local; + + if (local->op_ret < 0) { +- SHARD_STACK_UNWIND (readv, frame, local->op_ret, +- local->op_errno, NULL, 0, NULL, NULL, NULL); ++ shard_common_failure_unwind (GF_FOP_READ, frame, local->op_ret, ++ local->op_errno); + return 0; + } + +@@ -3955,10 +4548,8 @@ shard_post_lookup_readv_handler (call_frame_t *frame, xlator_t *this) + 
SHARD_INTERNAL_DIR_DOT_SHARD); + } + return 0; +- + err: +- SHARD_STACK_UNWIND (readv, frame, -1, ENOMEM, NULL, 0, NULL, NULL, +- NULL); ++ shard_common_failure_unwind (GF_FOP_READ, frame, -1, ENOMEM); + return 0; + } + +@@ -4018,8 +4609,7 @@ shard_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + shard_post_lookup_readv_handler); + return 0; + err: +- SHARD_STACK_UNWIND (readv, frame, -1, ENOMEM, NULL, 0, NULL, NULL, +- NULL); ++ shard_common_failure_unwind (GF_FOP_READ, frame, -1, ENOMEM); + return 0; + } + +@@ -4032,9 +4622,8 @@ shard_common_inode_write_post_update_size_handler (call_frame_t *frame, + local = frame->local; + + if (local->op_ret < 0) { +- shard_common_inode_write_failure_unwind (local->fop, frame, +- local->op_ret, +- local->op_errno); ++ shard_common_failure_unwind (local->fop, frame, local->op_ret, ++ local->op_errno); + } else { + shard_common_inode_write_success_unwind (local->fop, frame, + local->written_size); +@@ -4139,9 +4728,8 @@ shard_common_inode_write_do_cbk (call_frame_t *frame, void *cookie, + if (call_count == 0) { + SHARD_UNSET_ROOT_FS_ID (frame, local); + if (local->op_ret < 0) { +- shard_common_inode_write_failure_unwind (fop, frame, +- local->op_ret, +- local->op_errno); ++ shard_common_failure_unwind (fop, frame, local->op_ret, ++ local->op_errno); + } else { + shard_get_delta_size_from_inode_ctx (local, + local->fd->inode, +@@ -4343,9 +4931,8 @@ shard_common_inode_write_post_lookup_shards_handler (call_frame_t *frame, + local = frame->local; + + if (local->op_ret < 0) { +- shard_common_inode_write_failure_unwind (local->fop, frame, +- local->op_ret, +- local->op_errno); ++ shard_common_failure_unwind (local->fop, frame, local->op_ret, ++ local->op_errno); + return 0; + } + +@@ -4368,9 +4955,8 @@ shard_common_inode_write_post_mknod_handler (call_frame_t *frame, + local = frame->local; + + if (local->op_ret < 0) { +- shard_common_inode_write_failure_unwind (local->fop, frame, +- local->op_ret, +- local->op_errno); ++ shard_common_failure_unwind (local->fop, frame, local->op_ret, ++ local->op_errno); + return 0; + } + +@@ -4386,10 +4972,6 @@ shard_common_inode_write_post_mknod_handler (call_frame_t *frame, + } + + int +-shard_mkdir_internal_dir (call_frame_t *frame, xlator_t *this, +- shard_post_resolve_fop_handler_t handler, +- shard_internal_dir_type_t type); +-int + shard_common_inode_write_post_resolve_handler (call_frame_t *frame, + xlator_t *this) + { +@@ -4398,9 +4980,8 @@ shard_common_inode_write_post_resolve_handler (call_frame_t *frame, + local = frame->local; + + if (local->op_ret < 0) { +- shard_common_inode_write_failure_unwind (local->fop, frame, +- local->op_ret, +- local->op_errno); ++ shard_common_failure_unwind (local->fop, frame, local->op_ret, ++ local->op_errno); + return 0; + } + +@@ -4423,9 +5004,8 @@ shard_common_inode_write_post_lookup_handler (call_frame_t *frame, + shard_priv_t *priv = this->private; + + if (local->op_ret < 0) { +- shard_common_inode_write_failure_unwind (local->fop, frame, +- local->op_ret, +- local->op_errno); ++ shard_common_failure_unwind (local->fop, frame, local->op_ret, ++ local->op_errno); + return 0; + } + +@@ -4443,8 +5023,7 @@ shard_common_inode_write_post_lookup_handler (call_frame_t *frame, + local->inode_list = GF_CALLOC (local->num_blocks, sizeof (inode_t *), + gf_shard_mt_inode_list); + if (!local->inode_list) { +- shard_common_inode_write_failure_unwind (local->fop, frame, +- -1, ENOMEM); ++ shard_common_failure_unwind (local->fop, frame, -1, ENOMEM); + return 0; + } + +@@ 
-4508,7 +5087,7 @@ shard_mkdir_internal_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + if (link_inode != inode) { + shard_refresh_internal_dir (frame, this, type); + } else { +- shard_inode_ctx_set_refreshed_flag (link_inode, this); ++ shard_inode_ctx_mark_dir_refreshed (link_inode, this); + shard_common_resolve_shards (frame, this, + local->post_res_handler); + } +@@ -4544,6 +5123,10 @@ shard_mkdir_internal_dir (call_frame_t *frame, xlator_t *this, + gf_uuid_copy (*gfid, priv->dot_shard_gfid); + loc = &local->dot_shard_loc; + break; ++ case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: ++ gf_uuid_copy (*gfid, priv->dot_shard_rm_gfid); ++ loc = &local->dot_shard_rm_loc; ++ break; + default: + break; + } +@@ -4702,8 +5285,8 @@ out: + return 0; + + if (local->op_ret < 0) { +- SHARD_STACK_UNWIND (fsync, frame, local->op_ret, +- local->op_errno, NULL, NULL, NULL); ++ shard_common_failure_unwind (GF_FOP_FSYNC, frame, local->op_ret, ++ local->op_errno); + } else { + shard_get_timestamps_from_inode_ctx (local, base_inode, this); + SHARD_STACK_UNWIND (fsync, frame, local->op_ret, +@@ -4733,8 +5316,8 @@ shard_post_lookup_fsync_handler (call_frame_t *frame, xlator_t *this) + INIT_LIST_HEAD (©); + + if (local->op_ret < 0) { +- SHARD_STACK_UNWIND (fsync, frame, local->op_ret, +- local->op_errno, NULL, NULL, NULL); ++ shard_common_failure_unwind (GF_FOP_FSYNC, frame, local->op_ret, ++ local->op_errno); + return 0; + } + +@@ -4847,7 +5430,7 @@ shard_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync, + shard_post_lookup_fsync_handler); + return 0; + err: +- SHARD_STACK_UNWIND (fsync, frame, -1, ENOMEM, NULL, NULL, NULL); ++ shard_common_failure_unwind (GF_FOP_FSYNC, frame, -1, ENOMEM); + return 0; + } + +@@ -5069,9 +5652,8 @@ shard_removexattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + FIRST_CHILD(this)->fops->removexattr, loc, name, + xdata); + return 0; +- + out: +- SHARD_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL); ++ shard_common_failure_unwind (GF_FOP_REMOVEXATTR, frame, -1, op_errno); + return 0; + } + +@@ -5095,9 +5677,8 @@ shard_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + FIRST_CHILD(this)->fops->fremovexattr, fd, name, + xdata); + return 0; +- + out: +- SHARD_STACK_UNWIND (fremovexattr, frame, -1, op_errno, NULL); ++ shard_common_failure_unwind (GF_FOP_FREMOVEXATTR, frame, -1, op_errno); + return 0; + } + +@@ -5135,9 +5716,8 @@ shard_fgetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + STACK_WIND (frame, shard_fgetxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fgetxattr, fd, name, xdata); + return 0; +- + out: +- SHARD_STACK_UNWIND (fgetxattr, frame, -1, op_errno, NULL, NULL); ++ shard_common_failure_unwind (GF_FOP_FGETXATTR, frame, -1, op_errno); + return 0; + } + +@@ -5176,9 +5756,8 @@ shard_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + STACK_WIND (frame, shard_getxattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, loc, name, xdata); + return 0; +- + out: +- SHARD_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL, NULL); ++ shard_common_failure_unwind (GF_FOP_GETXATTR, frame, -1, op_errno); + return 0; + } + +@@ -5197,9 +5776,8 @@ shard_fsetxattr (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, + FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, + xdata); + return 0; +- + out: +- SHARD_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL); ++ shard_common_failure_unwind (GF_FOP_FSETXATTR, frame, -1, op_errno); + return 0; + } + +@@ -5218,9 +5796,8 @@ shard_setxattr 
(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + FIRST_CHILD(this)->fops->setxattr, loc, dict, flags, + xdata); + return 0; +- + out: +- SHARD_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL); ++ shard_common_failure_unwind (GF_FOP_SETXATTR, frame, -1, op_errno); + return 0; + } + +@@ -5335,11 +5912,9 @@ shard_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, + STACK_WIND (frame, shard_common_setattr_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, + local->xattr_req); +- + return 0; +- + err: +- SHARD_STACK_UNWIND (setattr, frame, -1, ENOMEM, NULL, NULL, NULL); ++ shard_common_failure_unwind (GF_FOP_SETATTR, frame, -1, ENOMEM); + return 0; + } + +@@ -5398,9 +5973,8 @@ shard_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, + FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, + local->xattr_req); + return 0; +- + err: +- SHARD_STACK_UNWIND (fsetattr, frame, -1, ENOMEM, NULL, NULL, NULL); ++ shard_common_failure_unwind (GF_FOP_FSETATTR, frame, -1, ENOMEM); + return 0; + } + +@@ -5502,7 +6076,7 @@ shard_common_inode_write_begin (call_frame_t *frame, xlator_t *this, + shard_common_inode_write_post_lookup_handler); + return 0; + out: +- shard_common_inode_write_failure_unwind (fop, frame, -1, ENOMEM); ++ shard_common_failure_unwind (fop, frame, -1, ENOMEM); + return 0; + } + +@@ -5527,9 +6101,8 @@ shard_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, + shard_common_inode_write_begin (frame, this, GF_FOP_FALLOCATE, fd, NULL, + 0, offset, keep_size, len, NULL, xdata); + return 0; +- + out: +- SHARD_STACK_UNWIND (fallocate, frame, -1, ENOTSUP, NULL, NULL, NULL); ++ shard_common_failure_unwind (GF_FOP_FALLOCATE, frame, -1, ENOTSUP); + return 0; + } + +@@ -5558,7 +6131,7 @@ shard_seek (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + /* TBD */ + gf_msg (this->name, GF_LOG_INFO, ENOTSUP, SHARD_MSG_FOP_NOT_SUPPORTED, + "seek called on %s.", uuid_utoa (fd->inode->gfid)); +- SHARD_STACK_UNWIND (seek, frame, -1, ENOTSUP, 0, NULL); ++ shard_common_failure_unwind (GF_FOP_SEEK, frame, -1, ENOTSUP); + return 0; + } + +@@ -5619,6 +6192,7 @@ init (xlator_t *this) + goto out; + } + gf_uuid_parse (SHARD_ROOT_GFID, priv->dot_shard_gfid); ++ gf_uuid_parse (DOT_SHARD_REMOVE_ME_GFID, priv->dot_shard_rm_gfid); + + this->private = priv; + LOCK_INIT (&priv->lock); +diff --git a/xlators/features/shard/src/shard.h b/xlators/features/shard/src/shard.h +index 225caa0..1783ff6 100644 +--- a/xlators/features/shard/src/shard.h ++++ b/xlators/features/shard/src/shard.h +@@ -18,6 +18,7 @@ + #include "syncop.h" + + #define GF_SHARD_DIR ".shard" ++#define GF_SHARD_REMOVE_ME_DIR ".remove_me" + #define SHARD_MIN_BLOCK_SIZE (4 * GF_UNIT_MB) + #define SHARD_MAX_BLOCK_SIZE (4 * GF_UNIT_TB) + #define SHARD_XATTR_PREFIX "trusted.glusterfs.shard." +@@ -55,6 +56,12 @@ + #define get_highest_block(off, len, shard_size) \ + (((((off)+(len)) == 0)?0:((off)+(len)-1)) / (shard_size)) + ++int ++shard_unlock_inodelk (call_frame_t *frame, xlator_t *this); ++ ++int ++shard_unlock_entrylk (call_frame_t *frame, xlator_t *this); ++ + #define SHARD_ENTRY_FOP_CHECK(loc, op_errno, label) do { \ + if ((loc->name && !strcmp (GF_SHARD_DIR, loc->name)) && \ + (((loc->parent) && \ +@@ -79,39 +86,57 @@ + } \ + } while (0) + +-#define SHARD_STACK_UNWIND(fop, frame, params ...) 
do { \ +- shard_local_t *__local = NULL; \ +- if (frame) { \ +- __local = frame->local; \ +- frame->local = NULL; \ +- } \ +- STACK_UNWIND_STRICT (fop, frame, params); \ +- if (__local) { \ +- shard_local_wipe (__local); \ +- mem_put (__local); \ +- } \ ++#define SHARD_STACK_UNWIND(fop, frame, params ...) do { \ ++ shard_local_t *__local = NULL; \ ++ if (frame) { \ ++ __local = frame->local; \ ++ if (__local && __local->int_inodelk.acquired_lock) \ ++ shard_unlock_inodelk (frame, frame->this); \ ++ if (__local && __local->int_entrylk.acquired_lock) \ ++ shard_unlock_entrylk (frame, frame->this); \ ++ frame->local = NULL; \ ++ } \ ++ STACK_UNWIND_STRICT (fop, frame, params); \ ++ if (__local) { \ ++ shard_local_wipe (__local); \ ++ mem_put (__local); \ ++ } \ + } while (0) + ++#define SHARD_STACK_DESTROY(frame) \ ++ do { \ ++ shard_local_t *__local = NULL; \ ++ __local = frame->local; \ ++ frame->local = NULL; \ ++ STACK_DESTROY (frame->root); \ ++ if (__local) { \ ++ shard_local_wipe (__local); \ ++ mem_put (__local); \ ++ } \ ++ } while (0); ++ + +-#define SHARD_INODE_CREATE_INIT(this, local, xattr_req, loc, label) do { \ ++#define SHARD_INODE_CREATE_INIT(this, block_size, xattr_req, loc, size, \ ++ block_count, label) do { \ + int __ret = -1; \ + int64_t *__size_attr = NULL; \ +- shard_priv_t *__priv = NULL; \ ++ uint64_t *__bs = 0; \ + \ +- __priv = this->private; \ +- \ +- local->block_size = hton64 (__priv->block_size); \ +- __ret = dict_set_static_bin (xattr_req, GF_XATTR_SHARD_BLOCK_SIZE, \ +- &local->block_size, \ +- sizeof (local->block_size)); \ ++ __bs = GF_CALLOC (1, sizeof (uint64_t), gf_shard_mt_uint64_t); \ ++ if (!__bs) \ ++ goto label; \ ++ *__bs = hton64 (block_size); \ ++ __ret = dict_set_bin (xattr_req, GF_XATTR_SHARD_BLOCK_SIZE, __bs, \ ++ sizeof (*__bs)); \ + if (__ret) { \ + gf_msg (this->name, GF_LOG_WARNING, 0, \ + SHARD_MSG_DICT_SET_FAILED, "Failed to set key: %s " \ +- "on path %s", GF_XATTR_SHARD_BLOCK_SIZE, loc->path); \ ++ "on path %s", GF_XATTR_SHARD_BLOCK_SIZE, (loc)->path);\ ++ GF_FREE (__bs); \ + goto label; \ + } \ + \ +- __ret = shard_set_size_attrs (0, 0, &__size_attr); \ ++ __ret = shard_set_size_attrs (size, block_count, &__size_attr); \ + if (__ret) \ + goto label; \ + \ +@@ -120,7 +145,7 @@ + if (__ret) { \ + gf_msg (this->name, GF_LOG_WARNING, 0, \ + SHARD_MSG_DICT_SET_FAILED, "Failed to set key: %s " \ +- "on path %s", GF_XATTR_SHARD_FILE_SIZE, loc->path); \ ++ "on path %s", GF_XATTR_SHARD_FILE_SIZE, (loc)->path); \ + GF_FREE (__size_attr); \ + goto label; \ + } \ +@@ -172,21 +197,34 @@ + } \ + } while (0) + ++/* rm = "remove me" */ + + typedef struct shard_priv { + uint64_t block_size; + uuid_t dot_shard_gfid; ++ uuid_t dot_shard_rm_gfid; + inode_t *dot_shard_inode; ++ inode_t *dot_shard_rm_inode; + gf_lock_t lock; + int inode_count; + struct list_head ilist_head; + } shard_priv_t; + + typedef struct { +- loc_t *loc; +- short type; ++ loc_t loc; + char *domain; +-} shard_lock_t; ++ struct gf_flock flock; ++ gf_boolean_t acquired_lock; ++} shard_inodelk_t; ++ ++typedef struct { ++ loc_t loc; ++ char *domain; ++ char *basename; ++ entrylk_cmd cmd; ++ entrylk_type type; ++ gf_boolean_t acquired_lock; ++} shard_entrylk_t; + + typedef int32_t (*shard_post_fop_handler_t) (call_frame_t *frame, + xlator_t *this); +@@ -200,6 +238,7 @@ typedef int32_t (*shard_post_mknod_fop_handler_t) (call_frame_t *frame, + + typedef int32_t (*shard_post_update_size_fop_handler_t) (call_frame_t *frame, + xlator_t *this); ++ + typedef struct shard_local { + int op_ret; + 
int op_errno; +@@ -227,6 +266,7 @@ typedef struct shard_local { + int delta_blocks; + loc_t loc; + loc_t dot_shard_loc; ++ loc_t dot_shard_rm_loc; + loc_t loc2; + loc_t tmp_loc; + fd_t *fd; +@@ -251,16 +291,18 @@ typedef struct shard_local { + shard_post_resolve_fop_handler_t post_res_handler; + shard_post_mknod_fop_handler_t post_mknod_handler; + shard_post_update_size_fop_handler_t post_update_size_handler; +- struct { +- int lock_count; +- fop_inodelk_cbk_t inodelk_cbk; +- shard_lock_t *shard_lock; +- } lock; ++ shard_inodelk_t int_inodelk; ++ shard_entrylk_t int_entrylk; + inode_t *resolver_base_inode; + gf_boolean_t first_lookup_done; + syncbarrier_t barrier; + gf_boolean_t lookup_shards_barriered; + gf_boolean_t unlink_shards_barriered; ++ gf_boolean_t resolve_not; ++ loc_t newloc; ++ call_frame_t *main_frame; ++ call_frame_t *inodelk_frame; ++ call_frame_t *entrylk_frame; + } shard_local_t; + + typedef struct shard_inode_ctx { +@@ -284,6 +326,7 @@ typedef struct shard_inode_ctx { + + typedef enum { + SHARD_INTERNAL_DIR_DOT_SHARD = 1, ++ SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME, + } shard_internal_dir_type_t; + + #endif /* __SHARD_H__ */ +-- +1.8.3.1 + diff --git a/0430-features-shard-Perform-shards-deletion-in-the-backgr.patch b/0430-features-shard-Perform-shards-deletion-in-the-backgr.patch new file mode 100644 index 0000000..cddeddf --- /dev/null +++ b/0430-features-shard-Perform-shards-deletion-in-the-backgr.patch @@ -0,0 +1,1790 @@ +From 93ef66173442aaf4aeaeb161c6d6108eda54014a Mon Sep 17 00:00:00 2001 +From: Krutika Dhananjay +Date: Thu, 12 Apr 2018 15:47:00 +0530 +Subject: [PATCH 430/444] features/shard: Perform shards deletion in the + background + +> Upstream: https://review.gluster.org/19970 +> BUG: 1568521 +> Change-Id: Ia83117230c9dd7d0d9cae05235644f8475e97bc3 + +A synctask is created that would scan the indices from +.shard/.remove_me, to delete the shards associated with the +gfid corresponding to the index bname and the rate of deletion +is controlled by the option features.shard-deletion-rate whose +default value is 100. +The task is launched on two accounts: +1. when shard receives its first-ever lookup on the volume +2. 
when a rename or unlink deleted an inode + +Change-Id: Ia83117230c9dd7d0d9cae05235644f8475e97bc3 +BUG: 1520882 +Signed-off-by: Krutika Dhananjay +Reviewed-on: https://code.engineering.redhat.com/gerrit/154864 +Tested-by: RHGS Build Bot +Reviewed-by: Xavi Hernandez +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + libglusterfs/src/globals.h | 1 + + tests/bugs/shard/bug-1568521-EEXIST.t | 30 +- + tests/bugs/shard/bug-1568521.t | 53 ++ + tests/bugs/shard/bug-shard-discard.t | 19 +- + tests/bugs/shard/shard-inode-refcount-test.t | 5 +- + tests/bugs/shard/unlinks-and-renames.t | 123 ++-- + xlators/features/shard/src/shard-messages.h | 18 +- + xlators/features/shard/src/shard.c | 816 +++++++++++++++++++----- + xlators/features/shard/src/shard.h | 19 +- + xlators/mgmt/glusterd/src/glusterd-volume-set.c | 5 + + 10 files changed, 829 insertions(+), 260 deletions(-) + create mode 100644 tests/bugs/shard/bug-1568521.t + +diff --git a/libglusterfs/src/globals.h b/libglusterfs/src/globals.h +index 8e218cb..699e73e 100644 +--- a/libglusterfs/src/globals.h ++++ b/libglusterfs/src/globals.h +@@ -109,6 +109,7 @@ + + #define GD_OP_VERSION_3_13_2 31302 /* Op-version for GlusterFS 3.13.2 */ + ++#define GD_OP_VERSION_4_2_0 40200 /* Op-version for GlusterFs 4.2.0 */ + + /* Downstream only change */ + #define GD_OP_VERSION_3_11_2 31102 /* Op-version for RHGS 3.3.1-async */ +diff --git a/tests/bugs/shard/bug-1568521-EEXIST.t b/tests/bugs/shard/bug-1568521-EEXIST.t +index e4c3d41..7de400d 100644 +--- a/tests/bugs/shard/bug-1568521-EEXIST.t ++++ b/tests/bugs/shard/bug-1568521-EEXIST.t +@@ -5,6 +5,12 @@ + + cleanup + ++function get_file_count { ++ ls $1* | wc -l ++} ++ ++FILE_COUNT_TIME=5 ++ + TEST glusterd + TEST pidof glusterd + TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1} +@@ -41,10 +47,14 @@ TEST setfattr -n trusted.glusterfs.shard.file-size -v 0x000000000050000000000000 + sleep 2 + + TEST unlink $M0/dir/file +-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_file +-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_file +-EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_file +-EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_file ++ ++TEST ! stat $B0/${V0}0/dir/file ++TEST ! 
stat $B0/${V0}1/dir/file ++ ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_file ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_file ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/$gfid_file ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/$gfid_file + + ############################## + ### Repeat test for rename ### +@@ -71,9 +81,13 @@ TEST setfattr -n trusted.glusterfs.shard.file-size -v 0x000000000050000000000000 + sleep 2 + + TEST mv -f $M0/src $M0/dir/dst +-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_dst +-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_dst +-EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_dst +-EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_dst ++ ++TEST ! stat $B0/${V0}0/src ++TEST ! stat $B0/${V0}1/src ++ ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_dst ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_dst ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/$gfid_dst ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/$gfid_dst + + cleanup +diff --git a/tests/bugs/shard/bug-1568521.t b/tests/bugs/shard/bug-1568521.t +new file mode 100644 +index 0000000..167fb63 +--- /dev/null ++++ b/tests/bugs/shard/bug-1568521.t +@@ -0,0 +1,53 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++ ++ ++function delete_files { ++ local mountpoint=$1; ++ local success=0; ++ local value=$2 ++ for i in {1..500}; do ++ unlink $mountpoint/file-$i 2>/dev/null 1>/dev/null ++ if [ $? -eq 0 ]; then ++ echo $2 >> $B0/output.txt ++ fi ++ done ++ echo $success ++} ++ ++cleanup ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1} ++TEST $CLI volume set $V0 features.shard on ++TEST $CLI volume set $V0 shard-block-size 4MB ++TEST $CLI volume start $V0 ++ ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0 ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M1 ++ ++for i in {1..500}; do ++ dd if=/dev/urandom of=$M0/file-$i bs=1M count=2 ++done ++ ++for i in {1..500}; do ++ stat $M1/file-$i > /dev/null ++done ++ ++delete_files $M0 0 & ++delete_files $M1 1 & ++wait ++ ++success1=$(grep 0 $B0/output.txt | wc -l); ++success2=$(grep 1 $B0/output.txt | wc -l); ++ ++echo "Success1 is $success1"; ++echo "Success2 is $success2"; ++ ++success_total=$((success1 + success2)); ++ ++EXPECT 500 echo $success_total ++ ++cleanup +diff --git a/tests/bugs/shard/bug-shard-discard.t b/tests/bugs/shard/bug-shard-discard.t +index 884d9e7..910ade1 100644 +--- a/tests/bugs/shard/bug-shard-discard.t ++++ b/tests/bugs/shard/bug-shard-discard.t +@@ -5,6 +5,12 @@ + + cleanup + ++FILE_COUNT_TIME=5 ++ ++function get_shard_count { ++ ls $1/$2.* | wc -l ++} ++ + TEST glusterd + TEST pidof glusterd + TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0..3} +@@ -42,14 +48,11 @@ EXPECT_NOT "1" file_all_zeroes `find $B0 -name $gfid_foo.1` + + # Now unlink the file. And ensure that all shards associated with the file are cleaned up + TEST unlink $M0/foo +-#TEST ! stat $B0/${V0}0/.shard/$gfid_foo.1 +-#TEST ! 
stat $B0/${V0}1/.shard/$gfid_foo.1 +-#TEST ! stat $B0/${V0}2/.shard/$gfid_foo.1 +-#TEST ! stat $B0/${V0}3/.shard/$gfid_foo.1 +-#TEST ! stat $B0/${V0}0/.shard/$gfid_foo.2 +-#TEST ! stat $B0/${V0}1/.shard/$gfid_foo.2 +-#TEST ! stat $B0/${V0}2/.shard/$gfid_foo.2 +-#TEST ! stat $B0/${V0}3/.shard/$gfid_foo.2 ++ ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_shard_count $B0/${V0}0/.shard $gfid_foo ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_shard_count $B0/${V0}1/.shard $gfid_foo ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_shard_count $B0/${V0}2/.shard $gfid_foo ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_shard_count $B0/${V0}3/.shard $gfid_foo + TEST ! stat $M0/foo + + #clean up everything +diff --git a/tests/bugs/shard/shard-inode-refcount-test.t b/tests/bugs/shard/shard-inode-refcount-test.t +index c92dc07..087c8ba 100644 +--- a/tests/bugs/shard/shard-inode-refcount-test.t ++++ b/tests/bugs/shard/shard-inode-refcount-test.t +@@ -5,6 +5,8 @@ + + cleanup + ++SHARD_COUNT_TIME=5 ++ + TEST glusterd + TEST pidof glusterd + TEST $CLI volume create $V0 $H0:$B0/${V0}0 +@@ -18,7 +20,8 @@ TEST dd if=/dev/zero conv=fsync of=$M0/one-plus-five-shards bs=1M count=23 + + ACTIVE_INODES_BEFORE=$(get_mount_active_size_value $V0) + TEST rm -f $M0/one-plus-five-shards +-#EXPECT `expr $ACTIVE_INODES_BEFORE - 4` get_mount_active_size_value $V0 ++# Expect 5 inodes less. But one inode more than before because .remove_me would be created. ++EXPECT_WITHIN $SHARD_COUNT_TIME `expr $ACTIVE_INODES_BEFORE - 5 + 1` get_mount_active_size_value $V0 + + EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 + TEST $CLI volume stop $V0 +diff --git a/tests/bugs/shard/unlinks-and-renames.t b/tests/bugs/shard/unlinks-and-renames.t +index 997c397..6e5164f 100644 +--- a/tests/bugs/shard/unlinks-and-renames.t ++++ b/tests/bugs/shard/unlinks-and-renames.t +@@ -9,6 +9,12 @@ cleanup + # and rename fops in sharding and make sure they work fine. + # + ++FILE_COUNT_TIME=5 ++ ++function get_file_count { ++ ls $1* | wc -l ++} ++ + ################################################# + ################### UNLINK ###################### + ################################################# +@@ -36,13 +42,8 @@ gfid_foo=$(get_gfid_string $M0/dir/foo) + TEST unlink $M0/dir/foo + TEST stat $B0/${V0}0/.shard/.remove_me + TEST stat $B0/${V0}1/.shard/.remove_me +-TEST stat $B0/${V0}0/.shard/.remove_me/$gfid_foo +-TEST stat $B0/${V0}1/.shard/.remove_me/$gfid_foo +- +-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_foo +-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_foo +-EXPECT "0000000000500000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_foo +-EXPECT "0000000000500000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_foo ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_foo ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_foo + + ################################################## + ##### Unlink of a sharded file without holes ##### +@@ -56,20 +57,14 @@ TEST stat $B0/${V0}1/.shard/$gfid_new.1 + TEST stat $B0/${V0}0/.shard/$gfid_new.2 + TEST stat $B0/${V0}1/.shard/$gfid_new.2 + TEST unlink $M0/dir/new +-#TEST ! stat $B0/${V0}0/.shard/$gfid_new.1 +-#TEST ! stat $B0/${V0}1/.shard/$gfid_new.1 +-#TEST ! 
stat $B0/${V0}0/.shard/$gfid_new.2 +-#TEST ! stat $B0/${V0}1/.shard/$gfid_new.2 ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/$gfid_new ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/$gfid_new + TEST ! stat $M0/dir/new + TEST ! stat $B0/${V0}0/dir/new + TEST ! stat $B0/${V0}1/dir/new +-TEST stat $B0/${V0}0/.shard/.remove_me/$gfid_new +-TEST stat $B0/${V0}1/.shard/.remove_me/$gfid_new ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_new ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_new + +-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_new +-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_new +-EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_new +-EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_new + ####################################### + ##### Unlink with /.shard present ##### + ####################################### +@@ -83,13 +78,8 @@ TEST unlink $M0/dir/foo + TEST ! stat $B0/${V0}0/dir/foo + TEST ! stat $B0/${V0}1/dir/foo + TEST ! stat $M0/dir/foo +-TEST stat $B0/${V0}0/.shard/.remove_me/$gfid_foo +-TEST stat $B0/${V0}1/.shard/.remove_me/$gfid_foo +- +-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_foo +-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_foo +-EXPECT "0000000000500000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_foo +-EXPECT "0000000000500000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_foo ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_foo ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_foo + + ############################################################# + ##### Unlink of a file with only one block (the zeroth) ##### +@@ -102,13 +92,9 @@ TEST unlink $M0/dir/foo + TEST ! stat $B0/${V0}0/dir/foo + TEST ! stat $B0/${V0}1/dir/foo + TEST ! 
stat $M0/dir/foo +-TEST stat $B0/${V0}0/.shard/.remove_me/$gfid_foo +-TEST stat $B0/${V0}1/.shard/.remove_me/$gfid_foo ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_foo ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_foo + +-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_foo +-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_foo +-EXPECT "0000000000100000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_foo +-EXPECT "0000000000100000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_foo + #################################################### + ##### Unlink of a sharded file with hard-links ##### + #################################################### +@@ -137,22 +123,15 @@ TEST stat $B0/${V0}0/link + TEST stat $B0/${V0}1/link + # Now delete the last link. + TEST unlink $M0/link +-TEST stat $B0/${V0}0/.shard/.remove_me/$gfid_original +-TEST stat $B0/${V0}1/.shard/.remove_me/$gfid_original ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_original ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_original + # Ensure that the shards are all cleaned up. +-#TEST ! stat $B0/${V0}0/.shard/$gfid_original.1 +-#TEST ! stat $B0/${V0}1/.shard/$gfid_original.1 +-#TEST ! stat $B0/${V0}0/.shard/$gfid_original.2 +-#TEST ! stat $B0/${V0}1/.shard/$gfid_original.2 +-#TEST ! stat $M0/link ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/$gfid_original ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/$gfid_original ++TEST ! stat $M0/link + TEST ! stat $B0/${V0}0/link + TEST ! stat $B0/${V0}1/link + +-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_original +-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_original +-EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_original +-EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_original +- + EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 + TEST $CLI volume stop $V0 + TEST $CLI volume delete $V0 +@@ -190,13 +169,8 @@ TEST ! stat $B0/${V0}0/dir/src + TEST ! 
stat $B0/${V0}1/dir/src + TEST stat $B0/${V0}0/dir/dst + TEST stat $B0/${V0}1/dir/dst +-TEST stat $B0/${V0}0/.shard/.remove_me/$gfid_dst +-TEST stat $B0/${V0}1/.shard/.remove_me/$gfid_dst +- +-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_dst +-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_dst +-EXPECT "0000000000500000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_dst +-EXPECT "0000000000500000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_dst ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_dst ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_dst + + ################################################## + ##### Rename to a sharded file without holes ##### +@@ -212,23 +186,16 @@ TEST stat $B0/${V0}1/.shard/$gfid_dst.1 + TEST stat $B0/${V0}0/.shard/$gfid_dst.2 + TEST stat $B0/${V0}1/.shard/$gfid_dst.2 + TEST mv -f $M0/dir/src $M0/dir/dst +-#TEST ! stat $B0/${V0}0/.shard/$gfid_dst.1 +-#TEST ! stat $B0/${V0}1/.shard/$gfid_dst.1 +-#TEST ! stat $B0/${V0}0/.shard/$gfid_dst.2 +-#TEST ! stat $B0/${V0}1/.shard/$gfid_dst.2 ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/$gfid_dst ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/$gfid_dst + TEST ! stat $M0/dir/src + TEST stat $M0/dir/dst + TEST ! stat $B0/${V0}0/dir/src + TEST ! stat $B0/${V0}1/dir/src + TEST stat $B0/${V0}0/dir/dst + TEST stat $B0/${V0}1/dir/dst +-TEST stat $B0/${V0}0/.shard/.remove_me/$gfid_dst +-TEST stat $B0/${V0}1/.shard/.remove_me/$gfid_dst +- +-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_dst +-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_dst +-EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_dst +-EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_dst ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_dst ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_dst + + ################################################### + ##### Rename of dst file with /.shard present ##### +@@ -245,13 +212,8 @@ TEST ! stat $B0/${V0}0/dir/src + TEST ! 
stat $B0/${V0}1/dir/src + TEST stat $B0/${V0}0/dir/dst + TEST stat $B0/${V0}1/dir/dst +-TEST stat $B0/${V0}0/.shard/.remove_me/$gfid_dst +-TEST stat $B0/${V0}1/.shard/.remove_me/$gfid_dst +- +-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_dst +-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_dst +-EXPECT "0000000000500000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_dst +-EXPECT "0000000000500000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_dst ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_dst ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_dst + + ############################################################### + ##### Rename of dst file with only one block (the zeroth) ##### +@@ -268,13 +230,8 @@ TEST ! stat $B0/${V0}0/dir/src + TEST ! stat $B0/${V0}1/dir/src + TEST stat $B0/${V0}0/dir/dst + TEST stat $B0/${V0}1/dir/dst +-TEST stat $B0/${V0}0/.shard/.remove_me/$gfid_dst +-TEST stat $B0/${V0}1/.shard/.remove_me/$gfid_dst +- +-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_dst +-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_dst +-EXPECT "0000000000100000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_dst +-EXPECT "0000000000100000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_dst ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_dst ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_dst + + ######################################################## + ##### Rename to a dst sharded file with hard-links ##### +@@ -307,20 +264,18 @@ TEST ! stat $B0/${V0}1/.shard/.remove_me/$gfid_dst + TEST touch $M0/dir/src2 + TEST mv -f $M0/dir/src2 $M0/link + # Ensure that the shards are all cleaned up. +-#TEST ! stat $B0/${V0}0/.shard/$gfid_dst.1 +-#TEST ! stat $B0/${V0}1/.shard/$gfid_dst.1 +-#TEST ! stat $B0/${V0}0/.shard/$gfid_dst.2 +-#TEST ! stat $B0/${V0}1/.shard/$gfid_dst.2 ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/$gfid_dst ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/$gfid_dst ++TEST ! stat $B0/${V0}0/.shard/$gfid_dst.1 ++TEST ! stat $B0/${V0}1/.shard/$gfid_dst.1 ++TEST ! stat $B0/${V0}0/.shard/$gfid_dst.2 ++TEST ! stat $B0/${V0}1/.shard/$gfid_dst.2 + TEST ! stat $M0/dir/src2 + TEST ! stat $B0/${V0}0/dir/src2 + TEST ! 
stat $B0/${V0}1/dir/src2 +-TEST stat $B0/${V0}0/.shard/.remove_me/$gfid_dst +-TEST stat $B0/${V0}1/.shard/.remove_me/$gfid_dst ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_dst ++EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_dst + +-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}0/.shard/.remove_me/$gfid_dst +-EXPECT "0000000000400000" get_hex_xattr trusted.glusterfs.shard.block-size $B0/${V0}1/.shard/.remove_me/$gfid_dst +-EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}0/.shard/.remove_me/$gfid_dst +-EXPECT "0000000000900000000000000000000000000000000000000000000000000000" get_hex_xattr trusted.glusterfs.shard.file-size $B0/${V0}1/.shard/.remove_me/$gfid_dst + # Rename with non-existent dst and a sharded src + TEST touch $M0/dir/src + TEST dd if=/dev/zero of=$M0/dir/src bs=1024 count=9216 +diff --git a/xlators/features/shard/src/shard-messages.h b/xlators/features/shard/src/shard-messages.h +index 0267f8a..bc04e5e 100644 +--- a/xlators/features/shard/src/shard-messages.h ++++ b/xlators/features/shard/src/shard-messages.h +@@ -40,7 +40,7 @@ + */ + + #define GLFS_COMP_BASE_SHARD GLFS_MSGID_COMP_SHARD +-#define GLFS_NUM_MESSAGES 20 ++#define GLFS_NUM_MESSAGES 22 + #define GLFS_MSGID_END (GLFS_COMP_BASE_SHARD + GLFS_NUM_MESSAGES + 1) + + #define glfs_msg_start_x GLFS_COMP_BASE_SHARD, "Invalid: Start of messages" +@@ -58,7 +58,7 @@ + * @diagnosis + * @recommendedaction + */ +-#define SHARD_MSG_DICT_SET_FAILED (GLFS_COMP_BASE_SHARD + 2) ++#define SHARD_MSG_DICT_OP_FAILED (GLFS_COMP_BASE_SHARD + 2) + + + /*! +@@ -194,5 +194,19 @@ + */ + #define SHARD_MSG_FOP_FAILED (GLFS_COMP_BASE_SHARD + 20) + ++/*! ++ * @messageid 133021 ++ * @diagnosis ++ * @recommendedaction ++*/ ++#define SHARD_MSG_SHARDS_DELETION_FAILED (GLFS_COMP_BASE_SHARD + 21) ++ ++/*! ++ * @messageid 133022 ++ * @diagnosis ++ * @recommendedaction ++*/ ++#define SHARD_MSG_SHARDS_DELETION_COMPLETED (GLFS_COMP_BASE_SHARD + 22) ++ + #define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages" + #endif /* !_SHARD_MESSAGES_H_ */ +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index 492341c..2faf711 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -677,7 +677,8 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this, + * keep it alive by holding a ref on it. + */ + inode_ref (linked_inode); +- gf_uuid_copy (ctx->base_gfid, base_inode->gfid); ++ if (base_inode) ++ gf_uuid_copy (ctx->base_gfid, base_inode->gfid); + ctx->block_num = block_num; + list_add_tail (&ctx->ilist, &priv->ilist_head); + priv->inode_count++; +@@ -738,7 +739,8 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this, + * keep it alive by holding a ref on it. 
+ */ + inode_ref (linked_inode); +- gf_uuid_copy (ctx->base_gfid, base_inode->gfid); ++ if (base_inode) ++ gf_uuid_copy (ctx->base_gfid, base_inode->gfid); + ctx->block_num = block_num; + ctx->base_inode = base_inode; + list_add_tail (&ctx->ilist, &priv->ilist_head); +@@ -977,6 +979,7 @@ shard_common_resolve_shards (call_frame_t *frame, xlator_t *this, + int i = -1; + uint32_t shard_idx_iter = 0; + char path[PATH_MAX] = {0,}; ++ uuid_t gfid = {0,}; + inode_t *inode = NULL; + inode_t *res_inode = NULL; + inode_t *fsync_inode = NULL; +@@ -988,6 +991,10 @@ shard_common_resolve_shards (call_frame_t *frame, xlator_t *this, + local->call_count = 0; + shard_idx_iter = local->first_block; + res_inode = local->resolver_base_inode; ++ if (res_inode) ++ gf_uuid_copy (gfid, res_inode->gfid); ++ else ++ gf_uuid_copy (gfid, local->base_gfid); + + if ((local->op_ret < 0) || (local->resolve_not)) + goto out; +@@ -1000,7 +1007,7 @@ shard_common_resolve_shards (call_frame_t *frame, xlator_t *this, + continue; + } + +- shard_make_block_abspath (shard_idx_iter, res_inode->gfid, path, ++ shard_make_block_abspath (shard_idx_iter, gfid, path, + sizeof(path)); + + inode = NULL; +@@ -1147,7 +1154,7 @@ shard_update_file_size (call_frame_t *frame, xlator_t *this, fd_t *fd, + ret = dict_set_bin (xattr_req, GF_XATTR_SHARD_FILE_SIZE, size_attr, + 8 * 4); + if (ret) { +- gf_msg (this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_SET_FAILED, ++ gf_msg (this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to set key %s into dict. gfid=%s", + GF_XATTR_SHARD_FILE_SIZE, uuid_utoa (inode->gfid)); + GF_FREE (size_attr); +@@ -1376,7 +1383,7 @@ shard_lookup_internal_dir (call_frame_t *frame, xlator_t *this, + + ret = dict_set_bin (xattr_req, "gfid-req", *gfid, 16); + if (ret) { +- gf_msg (this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_SET_FAILED, ++ gf_msg (this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to set gfid of %s into dict", + shard_internal_dir_string (type)); + local->op_ret = -1; +@@ -1431,10 +1438,49 @@ shard_inode_ctx_update (inode_t *inode, xlator_t *this, dict_t *xdata, + } + + int ++shard_delete_shards (void *opaque); ++ ++int ++shard_delete_shards_cbk (int ret, call_frame_t *frame, void *data); ++ ++int ++shard_start_background_deletion (xlator_t *this) ++{ ++ int ret = 0; ++ call_frame_t *cleanup_frame = NULL; ++ ++ cleanup_frame = create_frame (this, this->ctx->pool); ++ if (!cleanup_frame) { ++ gf_msg (this->name, GF_LOG_WARNING, ENOMEM, ++ SHARD_MSG_MEMALLOC_FAILED, "Failed to create " ++ "new frame to delete shards"); ++ return -ENOMEM; ++ } ++ ++ ret = synctask_new (this->ctx->env, shard_delete_shards, ++ shard_delete_shards_cbk, cleanup_frame, ++ cleanup_frame); ++ if (ret < 0) { ++ gf_msg (this->name, GF_LOG_WARNING, errno, ++ SHARD_MSG_SHARDS_DELETION_FAILED, ++ "failed to create task to do background " ++ "cleanup of shards"); ++ STACK_DESTROY (cleanup_frame->root); ++ } ++ return ret; ++} ++ ++int + shard_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *postparent) + { ++ int ret = 0; ++ shard_priv_t *priv = NULL; ++ gf_boolean_t i_start_cleanup = _gf_false; ++ ++ priv = this->private; ++ + if (op_ret < 0) + goto unwind; + +@@ -1460,6 +1506,25 @@ shard_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + + (void) shard_inode_ctx_update (inode, this, xdata, buf); + ++ LOCK (&priv->lock); ++ { ++ if (priv->first_lookup == SHARD_FIRST_LOOKUP_PENDING) { ++ 
priv->first_lookup = SHARD_FIRST_LOOKUP_IN_PROGRESS; ++ i_start_cleanup = _gf_true; ++ } ++ } ++ UNLOCK (&priv->lock); ++ ++ if (i_start_cleanup) { ++ ret = shard_start_background_deletion (this); ++ if (ret) { ++ LOCK (&priv->lock); ++ { ++ priv->first_lookup = SHARD_FIRST_LOOKUP_PENDING; ++ } ++ UNLOCK (&priv->lock); ++ } ++ } + unwind: + SHARD_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, buf, + xdata, postparent); +@@ -1475,6 +1540,7 @@ shard_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, + uint64_t block_size = 0; + shard_local_t *local = NULL; + ++ this->itable = loc->inode->table; + if (frame->root->pid != GF_CLIENT_PID_GSYNCD) { + SHARD_ENTRY_FOP_CHECK (loc, op_errno, err); + } +@@ -1496,7 +1562,7 @@ shard_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, + GF_XATTR_SHARD_BLOCK_SIZE, 0); + if (ret) { + gf_msg (this->name, GF_LOG_WARNING, 0, +- SHARD_MSG_DICT_SET_FAILED, "Failed to set dict" ++ SHARD_MSG_DICT_OP_FAILED, "Failed to set dict" + " value: key:%s for path %s", + GF_XATTR_SHARD_BLOCK_SIZE, loc->path); + goto err; +@@ -1508,7 +1574,7 @@ shard_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, + GF_XATTR_SHARD_FILE_SIZE, 8 * 4); + if (ret) { + gf_msg (this->name, GF_LOG_WARNING, 0, +- SHARD_MSG_DICT_SET_FAILED, ++ SHARD_MSG_DICT_OP_FAILED, + "Failed to set dict value: key:%s for path %s.", + GF_XATTR_SHARD_FILE_SIZE, loc->path); + goto err; +@@ -1901,12 +1967,6 @@ shard_truncate_last_shard (call_frame_t *frame, xlator_t *this, inode_t *inode) + return 0; + } + +-int +-shard_unlink_shards_do_cbk (call_frame_t *frame, void *cookie, xlator_t *this, +- int32_t op_ret, int32_t op_errno, +- struct iatt *preparent, struct iatt *postparent, +- dict_t *xdata); +- + void + shard_unlink_block_inode (shard_local_t *local, int shard_block_num); + +@@ -1941,17 +2001,17 @@ done: + int + shard_truncate_htol (call_frame_t *frame, xlator_t *this, inode_t *inode) + { +- int i = 1; +- int ret = -1; +- int call_count = 0; +- uint32_t cur_block = 0; +- uint32_t last_block = 0; +- char path[PATH_MAX] = {0,}; +- char *bname = NULL; +- loc_t loc = {0,}; +- gf_boolean_t wind_failed = _gf_false; +- shard_local_t *local = NULL; +- shard_priv_t *priv = NULL; ++ int i = 1; ++ int ret = -1; ++ int call_count = 0; ++ uint32_t cur_block = 0; ++ uint32_t last_block = 0; ++ char path[PATH_MAX] = {0,}; ++ char *bname = NULL; ++ loc_t loc = {0,}; ++ gf_boolean_t wind_failed = _gf_false; ++ shard_local_t *local = NULL; ++ shard_priv_t *priv = NULL; + + local = frame->local; + priv = this->private; +@@ -2086,6 +2146,7 @@ shard_link_block_inode (shard_local_t *local, int block_num, inode_t *inode, + { + int list_index = 0; + char block_bname[256] = {0,}; ++ uuid_t gfid = {0,}; + inode_t *linked_inode = NULL; + xlator_t *this = NULL; + inode_t *fsync_inode = NULL; +@@ -2093,9 +2154,12 @@ shard_link_block_inode (shard_local_t *local, int block_num, inode_t *inode, + + this = THIS; + priv = this->private; ++ if (local->loc.inode) ++ gf_uuid_copy (gfid, local->loc.inode->gfid); ++ else ++ gf_uuid_copy (gfid, local->base_gfid); + +- shard_make_block_bname (block_num, (local->loc.inode)->gfid, +- block_bname, sizeof (block_bname)); ++ shard_make_block_bname (block_num, gfid, block_bname, sizeof (block_bname)); + + shard_inode_ctx_set (inode, this, buf, 0, SHARD_LOOKUP_MASK); + linked_inode = inode_link (inode, priv->dot_shard_inode, block_bname, +@@ -2125,9 +2189,14 @@ shard_common_lookup_shards_cbk (call_frame_t *frame, void *cookie, + { + int call_count = 0; + int shard_block_num = 
(long) cookie; ++ uuid_t gfid = {0,}; + shard_local_t *local = NULL; + + local = frame->local; ++ if (local->resolver_base_inode) ++ gf_uuid_copy (gfid, local->resolver_base_inode->gfid); ++ else ++ gf_uuid_copy (gfid, local->base_gfid); + + if (op_ret < 0) { + /* Ignore absence of shards in the backend in truncate fop. */ +@@ -2162,9 +2231,7 @@ shard_common_lookup_shards_cbk (call_frame_t *frame, void *cookie, + gf_msg (this->name, GF_LOG_ERROR, op_errno, + SHARD_MSG_LOOKUP_SHARD_FAILED, "Lookup on shard %d " + "failed. Base file gfid = %s", shard_block_num, +- (local->fop == GF_FOP_RENAME) ? +- uuid_utoa (local->loc2.inode->gfid) +- : uuid_utoa (local->loc.inode->gfid)); ++ uuid_utoa (gfid)); + local->op_ret = op_ret; + local->op_errno = op_errno; + goto done; +@@ -2173,25 +2240,18 @@ shard_common_lookup_shards_cbk (call_frame_t *frame, void *cookie, + shard_link_block_inode (local, shard_block_num, inode, buf); + + done: +- call_count = shard_call_count_return (frame); + if (local->lookup_shards_barriered) { + syncbarrier_wake (&local->barrier); + return 0; + } else { ++ call_count = shard_call_count_return (frame); + if (call_count == 0) { + if (!local->first_lookup_done) + local->first_lookup_done = _gf_true; +- if (local->op_ret < 0) +- goto unwind; +- else +- local->pls_fop_handler (frame, this); ++ local->pls_fop_handler (frame, this); + } + } + return 0; +- +-unwind: +- local->pls_fop_handler (frame, this); +- return 0; + } + + dict_t* +@@ -2237,6 +2297,7 @@ shard_common_lookup_shards (call_frame_t *frame, xlator_t *this, inode_t *inode, + int last_block = 0; + char path[PATH_MAX] = {0,}; + char *bname = NULL; ++ uuid_t gfid = {0,}; + loc_t loc = {0,}; + shard_local_t *local = NULL; + shard_priv_t *priv = NULL; +@@ -2252,6 +2313,11 @@ shard_common_lookup_shards (call_frame_t *frame, xlator_t *this, inode_t *inode, + if (local->lookup_shards_barriered) + local->barrier.waitfor = local->call_count; + ++ if (inode) ++ gf_uuid_copy (gfid, inode->gfid); ++ else ++ gf_uuid_copy (gfid, local->base_gfid); ++ + while (shard_idx_iter <= last_block) { + if (local->inode_list[i]) { + i++; +@@ -2267,7 +2333,7 @@ shard_common_lookup_shards (call_frame_t *frame, xlator_t *this, inode_t *inode, + goto next; + } + +- shard_make_block_abspath (shard_idx_iter, inode->gfid, path, ++ shard_make_block_abspath (shard_idx_iter, gfid, path, + sizeof(path)); + + bname = strrchr (path, '/') + 1; +@@ -2279,7 +2345,7 @@ shard_common_lookup_shards (call_frame_t *frame, xlator_t *this, inode_t *inode, + gf_msg (this->name, GF_LOG_ERROR, 0, + SHARD_MSG_INODE_PATH_FAILED, "Inode path failed" + " on %s, base file gfid = %s", bname, +- uuid_utoa (inode->gfid)); ++ uuid_utoa (gfid)); + local->op_ret = -1; + local->op_errno = ENOMEM; + loc_wipe (&loc); +@@ -2322,8 +2388,10 @@ next: + if (!--call_count) + break; + } +- if (local->lookup_shards_barriered) ++ if (local->lookup_shards_barriered) { + syncbarrier_wait (&local->barrier, count); ++ local->pls_fop_handler (frame, this); ++ } + return 0; + } + +@@ -2779,8 +2847,9 @@ shard_post_lookup_shards_unlink_handler (call_frame_t *frame, xlator_t *this) + local = frame->local; + + if ((local->op_ret < 0) && (local->op_errno != ENOENT)) { +- shard_common_failure_unwind (local->fop, frame, local->op_ret, +- local->op_errno); ++ gf_msg (this->name, GF_LOG_ERROR, local->op_errno, ++ SHARD_MSG_FOP_FAILED, "failed to delete shards of %s", ++ uuid_utoa (local->resolver_base_inode->gfid)); + return 0; + } + local->op_ret = 0; +@@ -2791,41 +2860,12 @@ 
shard_post_lookup_shards_unlink_handler (call_frame_t *frame, xlator_t *this) + } + + int +-shard_rename_cbk (call_frame_t *frame, xlator_t *this); +- +-int32_t +-shard_unlink_cbk (call_frame_t *frame, xlator_t *this); +- +-int + shard_post_resolve_unlink_handler (call_frame_t *frame, xlator_t *this) + { + shard_local_t *local = NULL; + + local = frame->local; +- +- if (local->op_ret < 0) { +- if (local->op_errno == ENOENT) { +- /* If lookup on /.shard fails with ENOENT, it probably +- * means that the file is being unlinked before it +- * could grow beyond its first block. In this case, +- * unlink boils down to unlinking the base file and +- * unwinding the call. +- */ +- local->op_ret = 0; +- local->first_block = local->last_block = 0; +- local->num_blocks = 1; +- if (local->fop == GF_FOP_UNLINK) +- shard_unlink_cbk (frame, this); +- else +- shard_rename_cbk (frame, this); +- return 0; +- } else { +- shard_common_failure_unwind (local->fop, frame, +- local->op_ret, +- local->op_errno); +- return 0; +- } +- } ++ local->lookup_shards_barriered = _gf_true; + + if (!local->call_count) + shard_unlink_shards_do (frame, this, +@@ -2841,6 +2881,7 @@ void + shard_unlink_block_inode (shard_local_t *local, int shard_block_num) + { + char block_bname[256] = {0,}; ++ uuid_t gfid = {0,}; + inode_t *inode = NULL; + inode_t *base_inode = NULL; + xlator_t *this = NULL; +@@ -2854,12 +2895,17 @@ shard_unlink_block_inode (shard_local_t *local, int shard_block_num) + + inode = local->inode_list[shard_block_num - local->first_block]; + base_inode = local->resolver_base_inode; ++ if (base_inode) ++ gf_uuid_copy (gfid, base_inode->gfid); ++ else ++ gf_uuid_copy (gfid, local->base_gfid); + +- shard_make_block_bname (shard_block_num, (local->loc.inode)->gfid, ++ shard_make_block_bname (shard_block_num, gfid, + block_bname, sizeof (block_bname)); + + LOCK(&priv->lock); +- LOCK(&base_inode->lock); ++ if (base_inode) ++ LOCK(&base_inode->lock); + LOCK(&inode->lock); + { + __shard_inode_ctx_get (inode, this, &ctx); +@@ -2870,14 +2916,18 @@ shard_unlink_block_inode (shard_local_t *local, int shard_block_num) + unlink_unref_forget = _gf_true; + } + if (ctx->fsync_needed) { +- inode_unref (base_inode); ++ if (base_inode) ++ inode_unref (base_inode); + list_del_init (&ctx->to_fsync_list); +- __shard_inode_ctx_get (base_inode, this, &base_ictx); +- base_ictx->fsync_count--; ++ if (base_inode) { ++ __shard_inode_ctx_get (base_inode, this, &base_ictx); ++ base_ictx->fsync_count--; ++ } + } + } + UNLOCK(&inode->lock); +- UNLOCK(&base_inode->lock); ++ if (base_inode) ++ UNLOCK(&base_inode->lock); + if (unlink_unref_forget) { + inode_unlink (inode, priv->dot_shard_inode, block_bname); + inode_unref (inode); +@@ -2887,7 +2937,18 @@ shard_unlink_block_inode (shard_local_t *local, int shard_block_num) + } + + int +-shard_rename_cbk (call_frame_t *frame, xlator_t *this); ++shard_rename_cbk (call_frame_t *frame, xlator_t *this) ++{ ++ shard_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ SHARD_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, ++ &local->prebuf, &local->preoldparent, ++ &local->postoldparent, &local->prenewparent, ++ &local->postnewparent, local->xattr_rsp); ++ return 0; ++} + + int32_t + shard_unlink_cbk (call_frame_t *frame, xlator_t *this) +@@ -2906,7 +2967,6 @@ shard_unlink_shards_do_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) + { +- int call_count = 0; + int shard_block_num = (long) cookie; + shard_local_t 
*local = NULL; + +@@ -2919,22 +2979,8 @@ shard_unlink_shards_do_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + } + + shard_unlink_block_inode (local, shard_block_num); +- + done: +- call_count = shard_call_count_return (frame); +- if (local->unlink_shards_barriered) { +- syncbarrier_wake (&local->barrier); +- } else { +- +- if (call_count == 0) { +- SHARD_UNSET_ROOT_FS_ID (frame, local); +- +- if (local->fop == GF_FOP_UNLINK) +- shard_unlink_cbk (frame, this); +- else if (local->fop == GF_FOP_RENAME) +- shard_rename_cbk (frame, this); +- } +- } ++ syncbarrier_wake (&local->barrier); + return 0; + } + +@@ -2944,11 +2990,11 @@ shard_unlink_shards_do (call_frame_t *frame, xlator_t *this, inode_t *inode) + int i = 0; + int ret = -1; + int count = 0; +- int call_count = 0; +- uint32_t last_block = 0; + uint32_t cur_block = 0; ++ uint32_t cur_block_idx = 0;/*this is idx into inode_list[] array */ + char *bname = NULL; + char path[PATH_MAX] = {0,}; ++ uuid_t gfid = {0,}; + loc_t loc = {0,}; + gf_boolean_t wind_failed = _gf_false; + shard_local_t *local = NULL; +@@ -2957,16 +3003,12 @@ shard_unlink_shards_do (call_frame_t *frame, xlator_t *this, inode_t *inode) + priv = this->private; + local = frame->local; + +- /* local->num_blocks includes the base file block. This function only +- * deletes the shards under /.shard. So subtract num_blocks by 1. +- */ +- local->call_count = call_count = local->num_blocks - 1; +- last_block = local->last_block; ++ if (inode) ++ gf_uuid_copy (gfid, inode->gfid); ++ else ++ gf_uuid_copy (gfid, local->base_gfid); + +- /* Ignore the inode associated with the base file and start counting +- * from 1. +- */ +- for (i = 1; i < local->num_blocks; i++) { ++ for (i = 0; i < local->num_blocks; i++) { + if (!local->inode_list[i]) + continue; + count++; +@@ -2975,35 +3017,21 @@ shard_unlink_shards_do (call_frame_t *frame, xlator_t *this, inode_t *inode) + if (!count) { + /* callcount = 0 implies that all of the shards that need to be + * unlinked are non-existent (in other words the file is full of +- * holes). So shard xlator can simply return the fop to its +- * parent now. ++ * holes). + */ + gf_msg_debug (this->name, 0, "All shards that need to be " + "unlinked are non-existent: %s", +- uuid_utoa (inode->gfid)); +- local->num_blocks = 1; +- if (local->fop == GF_FOP_UNLINK) { +- shard_unlink_cbk (frame, this); +- } else if (local->fop == GF_FOP_RENAME) { +- gf_msg_debug (this->name, 0, "Resuming rename()"); +- shard_rename_cbk (frame, this); +- } ++ uuid_utoa (gfid)); + return 0; + } + +- local->call_count = call_count = count; +- cur_block = 1; + SHARD_SET_ROOT_FS_ID (frame, local); +- if (local->unlink_shards_barriered) +- local->barrier.waitfor = count; ++ local->barrier.waitfor = count; ++ cur_block = cur_block_idx + local->first_block; + +- /* Ignore the base file and start iterating from the first block shard. 
+- */ +- while (cur_block <= last_block) { +- if (!local->inode_list[cur_block]) { +- cur_block++; +- continue; +- } ++ while (cur_block_idx < local->num_blocks) { ++ if (!local->inode_list[cur_block_idx]) ++ goto next; + + if (wind_failed) { + shard_unlink_shards_do_cbk (frame, +@@ -3013,8 +3041,7 @@ shard_unlink_shards_do (call_frame_t *frame, xlator_t *this, inode_t *inode) + goto next; + } + +- shard_make_block_abspath (cur_block, inode->gfid, path, +- sizeof (path)); ++ shard_make_block_abspath (cur_block, gfid, path, sizeof (path)); + bname = strrchr (path, '/') + 1; + loc.parent = inode_ref (priv->dot_shard_inode); + ret = inode_path (loc.parent, bname, (char **) &(loc.path)); +@@ -3022,7 +3049,7 @@ shard_unlink_shards_do (call_frame_t *frame, xlator_t *this, inode_t *inode) + gf_msg (this->name, GF_LOG_ERROR, 0, + SHARD_MSG_INODE_PATH_FAILED, "Inode path failed" + " on %s, base file gfid = %s", bname, +- uuid_utoa (inode->gfid)); ++ uuid_utoa (gfid)); + local->op_ret = -1; + local->op_errno = ENOMEM; + loc_wipe (&loc); +@@ -3037,26 +3064,505 @@ shard_unlink_shards_do (call_frame_t *frame, xlator_t *this, inode_t *inode) + loc.name = strrchr (loc.path, '/'); + if (loc.name) + loc.name++; +- loc.inode = inode_ref (local->inode_list[cur_block]); ++ loc.inode = inode_ref (local->inode_list[cur_block_idx]); + + STACK_WIND_COOKIE (frame, shard_unlink_shards_do_cbk, + (void *) (long) cur_block, FIRST_CHILD(this), + FIRST_CHILD (this)->fops->unlink, &loc, + local->xflag, local->xattr_req); + loc_wipe (&loc); +- + next: + cur_block++; +- if (!--call_count) +- break; ++ cur_block_idx++; + } +- if (local->unlink_shards_barriered) +- syncbarrier_wait (&local->barrier, count); ++ syncbarrier_wait (&local->barrier, count); ++ SHARD_UNSET_ROOT_FS_ID (frame, local); ++ return 0; ++} ++ ++int ++shard_regulated_shards_deletion (call_frame_t *cleanup_frame, xlator_t *this, ++ int now, int first_block, gf_dirent_t *entry) ++{ ++ int i = 0; ++ int ret = 0; ++ shard_local_t *local = NULL; ++ uuid_t gfid = {0,}; ++ ++ local = cleanup_frame->local; ++ ++ local->inode_list = GF_CALLOC (now, sizeof (inode_t *), ++ gf_shard_mt_inode_list); ++ if (!local->inode_list) ++ return -ENOMEM; ++ ++ local->first_block = first_block; ++ local->last_block = first_block + now - 1; ++ local->num_blocks = now; ++ gf_uuid_parse (entry->d_name, gfid); ++ gf_uuid_copy (local->base_gfid, gfid); ++ local->resolver_base_inode = inode_find (this->itable, gfid); ++ local->call_count = 0; ++ syncbarrier_init (&local->barrier); ++ ++ shard_common_resolve_shards (cleanup_frame, this, ++ shard_post_resolve_unlink_handler); ++ ++ for (i = 0; i < local->num_blocks; i++) { ++ if (local->inode_list[i]) ++ inode_unref (local->inode_list[i]); ++ } ++ GF_FREE (local->inode_list); ++ local->inode_list = NULL; ++ if (local->op_ret) ++ ret = -local->op_errno; ++ syncbarrier_destroy (&local->barrier); ++ inode_unref (local->resolver_base_inode); ++ local->resolver_base_inode = NULL; ++ STACK_RESET (cleanup_frame->root); ++ return ret; ++} ++ ++ ++int ++__shard_delete_shards_of_entry (call_frame_t *cleanup_frame, xlator_t *this, ++ gf_dirent_t *entry, inode_t *inode) ++{ ++ int ret = 0; ++ int shard_count = 0; ++ int first_block = 0; ++ int now = 0; ++ uint64_t size = 0; ++ uint64_t block_size = 0; ++ uint64_t size_array[4] = {0,}; ++ void *bsize = NULL; ++ void *size_attr = NULL; ++ dict_t *xattr_rsp = NULL; ++ loc_t loc = {0,}; ++ shard_local_t *local = NULL; ++ shard_priv_t *priv = NULL; + ++ priv = this->private; ++ local = 
cleanup_frame->local; ++ ret = dict_reset (local->xattr_req); ++ if (ret) { ++ gf_msg (this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to reset dict"); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ ret = dict_set_uint64 (local->xattr_req, GF_XATTR_SHARD_BLOCK_SIZE, 0); ++ if (ret) { ++ gf_msg (this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to set dict value: key:%s", ++ GF_XATTR_SHARD_BLOCK_SIZE); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ ret = dict_set_uint64 (local->xattr_req, GF_XATTR_SHARD_FILE_SIZE, ++ 8 * 4); ++ if (ret) { ++ gf_msg (this->name, GF_LOG_WARNING, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to set dict value: key:%s", ++ GF_XATTR_SHARD_FILE_SIZE); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ loc.inode = inode_ref (inode); ++ loc.parent = inode_ref (priv->dot_shard_rm_inode); ++ ret = inode_path (loc.parent, entry->d_name, (char **)&(loc.path)); ++ if (ret < 0) { ++ gf_msg (this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, ++ "Inode path failed on %s", entry->d_name); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ loc.name = strrchr (loc.path, '/'); ++ if (loc.name) ++ loc.name++; ++ ret = syncop_lookup (FIRST_CHILD(this), &loc, NULL, NULL, ++ local->xattr_req, &xattr_rsp); ++ if (ret) ++ goto err; ++ ++ ret = dict_get_ptr (xattr_rsp, GF_XATTR_SHARD_BLOCK_SIZE, &bsize); ++ if (ret) { ++ gf_msg (this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to get dict value: key:%s", ++ GF_XATTR_SHARD_BLOCK_SIZE); ++ goto err; ++ } ++ block_size = ntoh64 (*((uint64_t *)bsize)); ++ ++ ret = dict_get_ptr (xattr_rsp, GF_XATTR_SHARD_FILE_SIZE, &size_attr); ++ if (ret) { ++ gf_msg (this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, ++ "Failed to get dict value: key:%s", ++ GF_XATTR_SHARD_FILE_SIZE); ++ goto err; ++ } ++ ++ memcpy (size_array, size_attr, sizeof (size_array)); ++ size = ntoh64 (size_array[0]); ++ ++ shard_count = (size / block_size) - 1; ++ if (shard_count < 0) { ++ gf_msg_debug (this->name, 0, "Size of %s hasn't grown beyond " ++ "its shard-block-size. Nothing to delete. " ++ "Returning", entry->d_name); ++ /* File size < shard-block-size, so nothing to delete */ ++ ret = 0; ++ goto delete_marker; ++ } ++ if ((size % block_size) > 0) ++ shard_count++; ++ ++ if (shard_count == 0) { ++ gf_msg_debug (this->name, 0, "Size of %s is exactly equal to " ++ "its shard-block-size. Nothing to delete. " ++ "Returning", entry->d_name); ++ ret = 0; ++ goto delete_marker; ++ } ++ gf_msg_debug (this->name, 0, "base file = %s, " ++ "shard-block-size=%"PRIu64", file-size=%"PRIu64", " ++ "shard_count=%d", entry->d_name, block_size, size, ++ shard_count); ++ ++ /* Perform a gfid-based lookup to see if gfid corresponding to marker ++ * file's base name exists. ++ */ ++ loc_wipe (&loc); ++ loc.inode = inode_new (this->itable); ++ if (!loc.inode) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ gf_uuid_parse (entry->d_name, loc.gfid); ++ ret = syncop_lookup (FIRST_CHILD(this), &loc, NULL, NULL, NULL, NULL); ++ if (!ret) { ++ gf_msg_debug (this->name, 0, "Base shard corresponding to gfid " ++ "%s is present. Skipping shard deletion. 
" ++ "Returning", entry->d_name); ++ ret = 0; ++ goto delete_marker; ++ } ++ ++ first_block = 1; ++ ++ while (shard_count) { ++ if (shard_count < local->deletion_rate) { ++ now = shard_count; ++ shard_count = 0; ++ } else { ++ now = local->deletion_rate; ++ shard_count -= local->deletion_rate; ++ } ++ ++ gf_msg_debug (this->name, 0, "deleting %d shards starting from " ++ "block %d of gfid %s", now, first_block, ++ entry->d_name); ++ ret = shard_regulated_shards_deletion (cleanup_frame, this, ++ now, first_block, ++ entry); ++ if (ret) ++ goto err; ++ first_block += now; ++ } ++ ++delete_marker: ++ loc_wipe (&loc); ++ loc.inode = inode_ref (inode); ++ loc.parent = inode_ref (priv->dot_shard_rm_inode); ++ ret = inode_path (loc.parent, entry->d_name, (char **)&(loc.path)); ++ if (ret < 0) { ++ gf_msg (this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, ++ "Inode path failed on %s", entry->d_name); ++ ret = -ENOMEM; ++ goto err; ++ } ++ loc.name = strrchr (loc.path, '/'); ++ if (loc.name) ++ loc.name++; ++ ret = syncop_unlink (FIRST_CHILD(this), &loc, NULL, NULL); ++ if (ret) ++ gf_msg (this->name, GF_LOG_ERROR, 0, ++ SHARD_MSG_SHARDS_DELETION_FAILED, "Failed to delete %s " ++ "from /%s", entry->d_name, GF_SHARD_REMOVE_ME_DIR); ++err: ++ if (xattr_rsp) ++ dict_unref (xattr_rsp); ++ loc_wipe (&loc); ++ return ret; ++} ++ ++int ++shard_delete_shards_of_entry (call_frame_t *cleanup_frame, xlator_t *this, ++ gf_dirent_t *entry, inode_t *inode) ++{ ++ int ret = -1; ++ loc_t loc = {0,}; ++ shard_priv_t *priv = NULL; ++ ++ priv = this->private; ++ loc.inode = inode_ref (priv->dot_shard_rm_inode); ++ ++ ret = syncop_entrylk (FIRST_CHILD(this), this->name, &loc, ++ entry->d_name, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL, ++ NULL); ++ if (ret) ++ goto out; ++ { ++ ret = __shard_delete_shards_of_entry (cleanup_frame, this, ++ entry, inode); ++ } ++ syncop_entrylk (FIRST_CHILD(this), this->name, &loc, entry->d_name, ++ ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL, NULL); ++out: ++ loc_wipe (&loc); ++ return ret; ++} ++ ++int ++shard_delete_shards_cbk (int ret, call_frame_t *frame, void *data) ++{ ++ xlator_t *this = NULL; ++ shard_priv_t *priv = NULL; ++ ++ this = frame->this; ++ priv = this->private; ++ ++ if (ret < 0) { ++ gf_msg (this->name, GF_LOG_WARNING, -ret, ++ SHARD_MSG_SHARDS_DELETION_FAILED, ++ "Background deletion of shards failed"); ++ priv->first_lookup = SHARD_FIRST_LOOKUP_PENDING; ++ } else { ++ priv->first_lookup = SHARD_FIRST_LOOKUP_DONE; ++ } ++ SHARD_STACK_DESTROY (frame); + return 0; + } + + int ++shard_resolve_internal_dir (xlator_t *this, shard_local_t *local, ++ shard_internal_dir_type_t type) ++{ ++ int ret = 0; ++ char *bname = NULL; ++ loc_t *loc = NULL; ++ shard_priv_t *priv = NULL; ++ uuid_t gfid = {0,}; ++ struct iatt stbuf = {0,}; ++ ++ priv = this->private; ++ ++ switch (type) { ++ case SHARD_INTERNAL_DIR_DOT_SHARD: ++ loc = &local->dot_shard_loc; ++ gf_uuid_copy (gfid, priv->dot_shard_gfid); ++ bname = GF_SHARD_DIR; ++ break; ++ case SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME: ++ loc = &local->dot_shard_rm_loc; ++ gf_uuid_copy (gfid, priv->dot_shard_rm_gfid); ++ bname = GF_SHARD_REMOVE_ME_DIR; ++ break; ++ default: ++ break; ++ } ++ ++ loc->inode = inode_find (this->itable, gfid); ++ if (!loc->inode) { ++ ret = shard_init_internal_dir_loc (this, local, type); ++ if (ret) ++ goto err; ++ ret = dict_reset (local->xattr_req); ++ if (ret) { ++ gf_msg (this->name, GF_LOG_WARNING, 0, ++ SHARD_MSG_DICT_OP_FAILED, "Failed to reset " ++ "dict"); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ret = 
dict_set_static_bin (local->xattr_req, "gfid-req", gfid, ++ 16); ++ ret = syncop_lookup (FIRST_CHILD(this), loc, &stbuf, NULL, ++ local->xattr_req, NULL); ++ if (ret < 0) { ++ if (ret != -ENOENT) ++ gf_msg (this->name, GF_LOG_ERROR, -ret, ++ SHARD_MSG_SHARDS_DELETION_FAILED, ++ "Lookup on %s failed, exiting", bname); ++ goto err; ++ } else { ++ shard_link_internal_dir_inode (local, ++ loc->inode, &stbuf, ++ type); ++ } ++ } ++ ret = 0; ++err: ++ return ret; ++} ++ ++int ++shard_lookup_marker_entry (xlator_t *this, shard_local_t *local, ++ gf_dirent_t *entry) ++{ ++ int ret = 0; ++ loc_t loc = {0,}; ++ ++ loc.inode = inode_new (this->itable); ++ if (!loc.inode) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ loc.parent = inode_ref (local->fd->inode); ++ ++ ret = inode_path (loc.parent, entry->d_name, (char **)&(loc.path)); ++ if (ret < 0) { ++ gf_msg (this->name, GF_LOG_ERROR, 0, SHARD_MSG_INODE_PATH_FAILED, ++ "Inode path failed on %s", entry->d_name); ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ loc.name = strrchr (loc.path, '/'); ++ if (loc.name) ++ loc.name++; ++ ++ ret = syncop_lookup (FIRST_CHILD(this), &loc, NULL, NULL, NULL, NULL); ++ if (ret < 0) { ++ goto err; ++ } ++ entry->inode = inode_ref (loc.inode); ++ ret = 0; ++err: ++ loc_wipe (&loc); ++ return ret; ++} ++ ++int ++shard_delete_shards (void *opaque) ++{ ++ int ret = 0; ++ off_t offset = 0; ++ loc_t loc = {0,}; ++ inode_t *link_inode = NULL; ++ xlator_t *this = NULL; ++ shard_priv_t *priv = NULL; ++ shard_local_t *local = NULL; ++ gf_dirent_t entries; ++ gf_dirent_t *entry = NULL; ++ call_frame_t *cleanup_frame = NULL; ++ ++ this = THIS; ++ priv = this->private; ++ INIT_LIST_HEAD (&entries.list); ++ ++ cleanup_frame = opaque; ++ ++ local = mem_get0 (this->local_pool); ++ if (!local) { ++ gf_msg (this->name, GF_LOG_WARNING, ENOMEM, ++ SHARD_MSG_MEMALLOC_FAILED, "Failed to create local to " ++ "delete shards"); ++ ret = -ENOMEM; ++ goto err; ++ } ++ cleanup_frame->local = local; ++ ++ local->xattr_req = dict_new (); ++ if (!local->xattr_req) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ local->deletion_rate = priv->deletion_rate; ++ ++ ret = shard_resolve_internal_dir (this, local, ++ SHARD_INTERNAL_DIR_DOT_SHARD); ++ if (ret == -ENOENT) { ++ gf_msg_debug (this->name, 0, ".shard absent. Nothing to" ++ " delete. Exiting"); ++ ret = 0; ++ goto err; ++ } else if (ret < 0) { ++ goto err; ++ } ++ ++ ret = shard_resolve_internal_dir (this, local, ++ SHARD_INTERNAL_DIR_DOT_SHARD_REMOVE_ME); ++ if (ret == -ENOENT) { ++ gf_msg_debug (this->name, 0, ".remove_me absent. " ++ "Nothing to delete. 
Exiting"); ++ ret = 0; ++ goto err; ++ } else if (ret < 0) { ++ goto err; ++ } ++ ++ local->fd = fd_anonymous (local->dot_shard_rm_loc.inode); ++ if (!local->fd) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ while ((ret = syncop_readdirp (FIRST_CHILD(this), local->fd, 131072, ++ offset, &entries, local->xattr_req, ++ NULL))) { ++ if (ret > 0) ++ ret = 0; ++ list_for_each_entry (entry, &entries.list, list) { ++ offset = entry->d_off; ++ ++ if (!strcmp (entry->d_name, ".") || ++ !strcmp (entry->d_name, "..")) ++ continue; ++ ++ if (!entry->inode) { ++ ret = shard_lookup_marker_entry (this, local, ++ entry); ++ if (ret < 0) ++ continue; ++ } ++ link_inode = inode_link (entry->inode, local->fd->inode, ++ entry->d_name, &entry->d_stat); ++ ++ gf_msg_debug (this->name, 0, "Initiating deletion of " ++ "shards of gfid %s", entry->d_name); ++ ret = shard_delete_shards_of_entry (cleanup_frame, this, ++ entry, link_inode); ++ inode_unlink (link_inode, local->fd->inode, ++ entry->d_name); ++ inode_unref (link_inode); ++ if (ret) { ++ gf_msg (this->name, GF_LOG_ERROR, -ret, ++ SHARD_MSG_SHARDS_DELETION_FAILED, ++ "Failed to clean up shards of gfid %s", ++ entry->d_name); ++ continue; ++ } ++ gf_msg (this->name, GF_LOG_INFO, 0, ++ SHARD_MSG_SHARDS_DELETION_COMPLETED, "Deleted " ++ "shards of gfid=%s from backend", ++ entry->d_name); ++ } ++ gf_dirent_free (&entries); ++ if (ret) ++ break; ++ } ++ ret = 0; ++err: ++ loc_wipe (&loc); ++ return ret; ++} ++ ++int + shard_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xdata) + { +@@ -3394,7 +3900,10 @@ shard_unlink_base_file_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + local->postoldparent = *postparent; + if (xdata) + local->xattr_rsp = dict_ref (xdata); ++ if (local->cleanup_required) ++ shard_start_background_deletion (this); + } ++ + if (local->entrylk_frame) { + ret = shard_unlock_entrylk (frame, this); + if (ret < 0) { +@@ -3408,6 +3917,7 @@ shard_unlink_base_file_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + local->op_ret = -1; + local->op_errno = -ret; + } ++ + shard_unlink_cbk (frame, this); + return 0; + } +@@ -3576,6 +4086,7 @@ shard_post_lookup_base_shard_rm_handler (call_frame_t *frame, xlator_t *this) + } else { + gf_msg_debug (this->name, 0, "link count on %s = 1, creating " + "file under .remove_me", local->int_inodelk.loc.path); ++ local->cleanup_required = _gf_true; + shard_acquire_entrylk (frame, this, priv->dot_shard_rm_inode, + local->prebuf.ia_gfid); + } +@@ -3788,20 +4299,6 @@ err: + } + + int +-shard_rename_cbk (call_frame_t *frame, xlator_t *this) +-{ +- shard_local_t *local = NULL; +- +- local = frame->local; +- +- SHARD_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, +- &local->prebuf, &local->preoldparent, +- &local->postoldparent, &local->prenewparent, +- &local->postnewparent, local->xattr_rsp); +- return 0; +-} +- +-int + shard_post_rename_lookup_handler (call_frame_t *frame, xlator_t *this) + { + shard_rename_cbk (frame, this); +@@ -3854,6 +4351,8 @@ shard_rename_src_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + local->op_errno = -ret; + goto err; + } ++ if (local->cleanup_required) ++ shard_start_background_deletion (this); + } + + /* Now the base file of src, if sharded, is looked up to gather ia_size +@@ -4822,7 +5321,7 @@ shard_common_inode_write_do (call_frame_t *frame, xlator_t *this) + + if (dict_set_uint32 (local->xattr_req, + GLUSTERFS_WRITE_UPDATE_ATOMIC, 4)) { +- gf_msg (this->name, GF_LOG_ERROR, 0, 
SHARD_MSG_DICT_SET_FAILED, ++ gf_msg (this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to set "GLUSTERFS_WRITE_UPDATE_ATOMIC" into " + "dict: %s", uuid_utoa (fd->inode->gfid)); + local->op_ret = -1; +@@ -5141,7 +5640,7 @@ shard_mkdir_internal_dir (call_frame_t *frame, xlator_t *this, + + ret = dict_set_bin (xattr_req, "gfid-req", *gfid, 16); + if (ret) { +- gf_msg (this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_SET_FAILED, ++ gf_msg (this->name, GF_LOG_ERROR, 0, SHARD_MSG_DICT_OP_FAILED, + "Failed to set gfid-req for %s", + shard_internal_dir_string (type)); + goto err; +@@ -6186,6 +6685,8 @@ init (xlator_t *this) + + GF_OPTION_INIT ("shard-block-size", priv->block_size, size_uint64, out); + ++ GF_OPTION_INIT ("shard-deletion-rate", priv->deletion_rate, uint32, out); ++ + this->local_pool = mem_pool_new (shard_local_t, 128); + if (!this->local_pool) { + ret = -1; +@@ -6241,6 +6742,8 @@ reconfigure (xlator_t *this, dict_t *options) + GF_OPTION_RECONF ("shard-block-size", priv->block_size, options, size, + out); + ++ GF_OPTION_RECONF ("shard-deletion-rate", priv->deletion_rate, options, ++ uint32, out); + ret = 0; + + out: +@@ -6364,5 +6867,12 @@ struct volume_options options[] = { + .description = "The size unit used to break a file into multiple " + "chunks", + }, ++ { .key = {"shard-deletion-rate"}, ++ .type = GF_OPTION_TYPE_INT, ++ .default_value = "100", ++ .min = 100, ++ .max = INT_MAX, ++ .description = "The number of shards to send deletes on at a time", ++ }, + { .key = {NULL} }, + }; +diff --git a/xlators/features/shard/src/shard.h b/xlators/features/shard/src/shard.h +index 1783ff6..5de098a 100644 +--- a/xlators/features/shard/src/shard.h ++++ b/xlators/features/shard/src/shard.h +@@ -130,9 +130,9 @@ shard_unlock_entrylk (call_frame_t *frame, xlator_t *this); + sizeof (*__bs)); \ + if (__ret) { \ + gf_msg (this->name, GF_LOG_WARNING, 0, \ +- SHARD_MSG_DICT_SET_FAILED, "Failed to set key: %s " \ ++ SHARD_MSG_DICT_OP_FAILED, "Failed to set key: %s " \ + "on path %s", GF_XATTR_SHARD_BLOCK_SIZE, (loc)->path);\ +- GF_FREE (__bs); \ ++ GF_FREE (__bs); \ + goto label; \ + } \ + \ +@@ -144,7 +144,7 @@ shard_unlock_entrylk (call_frame_t *frame, xlator_t *this); + __size_attr, 8 * 4); \ + if (__ret) { \ + gf_msg (this->name, GF_LOG_WARNING, 0, \ +- SHARD_MSG_DICT_SET_FAILED, "Failed to set key: %s " \ ++ SHARD_MSG_DICT_OP_FAILED, "Failed to set key: %s " \ + "on path %s", GF_XATTR_SHARD_FILE_SIZE, (loc)->path); \ + GF_FREE (__size_attr); \ + goto label; \ +@@ -160,7 +160,7 @@ shard_unlock_entrylk (call_frame_t *frame, xlator_t *this); + local->op_ret = -1; \ + local->op_errno = ENOMEM; \ + gf_msg (this->name, GF_LOG_WARNING, 0, \ +- SHARD_MSG_DICT_SET_FAILED, "Failed to set dict value:"\ ++ SHARD_MSG_DICT_OP_FAILED, "Failed to set dict value:"\ + " key:%s for %s.", GF_XATTR_SHARD_FILE_SIZE, \ + uuid_utoa (gfid)); \ + goto label; \ +@@ -197,6 +197,12 @@ shard_unlock_entrylk (call_frame_t *frame, xlator_t *this); + } \ + } while (0) + ++typedef enum { ++ SHARD_FIRST_LOOKUP_PENDING = 0, ++ SHARD_FIRST_LOOKUP_IN_PROGRESS, ++ SHARD_FIRST_LOOKUP_DONE, ++} shard_first_lookup_state_t; ++ + /* rm = "remove me" */ + + typedef struct shard_priv { +@@ -208,6 +214,8 @@ typedef struct shard_priv { + gf_lock_t lock; + int inode_count; + struct list_head ilist_head; ++ uint32_t deletion_rate; ++ shard_first_lookup_state_t first_lookup; + } shard_priv_t; + + typedef struct { +@@ -303,6 +311,9 @@ typedef struct shard_local { + call_frame_t *main_frame; + call_frame_t *inodelk_frame; + 
call_frame_t *entrylk_frame; ++ uint32_t deletion_rate; ++ gf_boolean_t cleanup_required; ++ uuid_t base_gfid; + } shard_local_t; + + typedef struct shard_inode_ctx { +diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c +index 5a697cf..4357562 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c ++++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c +@@ -3298,6 +3298,11 @@ struct volopt_map_entry glusterd_volopt_map[] = { + .op_version = GD_OP_VERSION_3_7_0, + .flags = OPT_FLAG_CLIENT_OPT + }, ++ { .key = "features.shard-deletion-rate", ++ .voltype = "features/shard", ++ .op_version = GD_OP_VERSION_4_2_0, ++ .flags = OPT_FLAG_CLIENT_OPT ++ }, + { .key = "features.scrub-throttle", + .voltype = "features/bit-rot", + .value = "lazy", +-- +1.8.3.1 + diff --git a/0431-glusterd-Reset-op-version-for-features.shard-deletio.patch b/0431-glusterd-Reset-op-version-for-features.shard-deletio.patch new file mode 100644 index 0000000..73eaf67 --- /dev/null +++ b/0431-glusterd-Reset-op-version-for-features.shard-deletio.patch @@ -0,0 +1,58 @@ +From c06048a218b4a2e56f72b05b4f9f5842eec611e4 Mon Sep 17 00:00:00 2001 +From: Krutika Dhananjay +Date: Wed, 12 Sep 2018 21:41:35 +0530 +Subject: [PATCH 431/444] glusterd: Reset op-version for + "features.shard-deletion-rate" + +The op-version for the "features.shard-deletion-rate" option was set to +4.2.0 in the upstream patch and backported at +e75be952569eb69325d5f505f7ab94aace31be52. +This commit reverts the op-version for this option to 3.13.3. + +Label: DOWNSTREAM ONLY + +Change-Id: Ie3d12f3119ad7a4b40d81bd8bd6ed591658e8371 +BUG: 1520882 +Signed-off-by: Krutika Dhananjay +Reviewed-on: https://code.engineering.redhat.com/gerrit/154865 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + libglusterfs/src/globals.h | 3 ++- + xlators/mgmt/glusterd/src/glusterd-volume-set.c | 2 +- + 2 files changed, 3 insertions(+), 2 deletions(-) + +diff --git a/libglusterfs/src/globals.h b/libglusterfs/src/globals.h +index 699e73e..97c4fad 100644 +--- a/libglusterfs/src/globals.h ++++ b/libglusterfs/src/globals.h +@@ -109,11 +109,12 @@ + + #define GD_OP_VERSION_3_13_2 31302 /* Op-version for GlusterFS 3.13.2 */ + +-#define GD_OP_VERSION_4_2_0 40200 /* Op-version for GlusterFs 4.2.0 */ ++#define GD_OP_VERSION_3_13_3 31303 /* Op-version for GlusterFS 3.13.3 */ + + /* Downstream only change */ + #define GD_OP_VERSION_3_11_2 31102 /* Op-version for RHGS 3.3.1-async */ + #define GD_OP_VERSION_3_13_3 31303 /* Op-version for RHGS-3.4-Batch Update-1*/ ++#define GD_OP_VERSION_3_13_4 31304 /* Op-version for RHGS-3.4-Batch Update-2*/ + + #include "xlator.h" + +diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c +index 4357562..a825f52 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c ++++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c +@@ -3300,7 +3300,7 @@ struct volopt_map_entry glusterd_volopt_map[] = { + }, + { .key = "features.shard-deletion-rate", + .voltype = "features/shard", +- .op_version = GD_OP_VERSION_4_2_0, ++ .op_version = GD_OP_VERSION_3_13_4, + .flags = OPT_FLAG_CLIENT_OPT + }, + { .key = "features.scrub-throttle", +-- +1.8.3.1 + diff --git a/0432-features-shard-Fix-crash-and-test-case-in-RENAME-fop.patch b/0432-features-shard-Fix-crash-and-test-case-in-RENAME-fop.patch new file mode 100644 index 0000000..82a43f5 --- /dev/null +++ b/0432-features-shard-Fix-crash-and-test-case-in-RENAME-fop.patch @@ -0,0 
+1,250 @@ +From 212e89f8b257463ace8093dfc72253f515adb234 Mon Sep 17 00:00:00 2001 +From: Krutika Dhananjay +Date: Thu, 2 Aug 2018 21:48:34 +0530 +Subject: [PATCH 432/444] features/shard: Fix crash and test case in RENAME fop + +> Upstream: https://review.gluster.org/20623 +> BUG: 1611692 +> Change-Id: Iaf85a5ee3dff8b01a76e11972f10f2bb9dcbd407 + +Setting the refresh flag in inode ctx in shard_rename_src_cbk() +is applicable only when the dst file exists and is sharded and +has a hard link > 1 at the time of rename. + +But this piece of code is exercised even when dst doesn't exist. +In this case, the mount crashes because local->int_inodelk.loc.inode +is NULL. + +Change-Id: Iaf85a5ee3dff8b01a76e11972f10f2bb9dcbd407 +BUG: 1520882 +Signed-off-by: Krutika Dhananjay +Reviewed-on: https://code.engineering.redhat.com/gerrit/154866 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + tests/bugs/shard/unlinks-and-renames.t | 96 ++++++++++++++++++++-------------- + xlators/features/shard/src/shard.c | 7 ++- + 2 files changed, 61 insertions(+), 42 deletions(-) + +diff --git a/tests/bugs/shard/unlinks-and-renames.t b/tests/bugs/shard/unlinks-and-renames.t +index 6e5164f..990ca69 100644 +--- a/tests/bugs/shard/unlinks-and-renames.t ++++ b/tests/bugs/shard/unlinks-and-renames.t +@@ -31,9 +31,10 @@ TEST mkdir $M0/dir + TEST touch $M0/dir/foo + TEST touch $M0/dir/new + +-###################################### +-##### Unlink with /.shard absent ##### +-###################################### ++########################################## ++##### 01. Unlink with /.shard absent ##### ++########################################## ++ + TEST truncate -s 5M $M0/dir/foo + TEST ! stat $B0/${V0}0/.shard + TEST ! stat $B0/${V0}1/.shard +@@ -45,9 +46,10 @@ TEST stat $B0/${V0}1/.shard/.remove_me + EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_foo + EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_foo + +-################################################## +-##### Unlink of a sharded file without holes ##### +-################################################## ++###################################################### ++##### 02. Unlink of a sharded file without holes ##### ++###################################################### ++ + # Create a 9M sharded file + TEST dd if=/dev/zero of=$M0/dir/new bs=1024 count=9216 + gfid_new=$(get_gfid_string $M0/dir/new) +@@ -65,9 +67,10 @@ TEST ! stat $B0/${V0}1/dir/new + EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_new + EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_new + +-####################################### +-##### Unlink with /.shard present ##### +-####################################### ++########################################### ++##### 03. Unlink with /.shard present ##### ++########################################### ++ + TEST truncate -s 5M $M0/dir/foo + gfid_foo=$(get_gfid_string $M0/dir/foo) + # Ensure its shards are absent. +@@ -81,9 +84,10 @@ TEST ! 
stat $M0/dir/foo + EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_foo + EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_foo + +-############################################################# +-##### Unlink of a file with only one block (the zeroth) ##### +-############################################################# ++################################################################# ++##### 04. Unlink of a file with only one block (the zeroth) ##### ++################################################################# ++ + TEST touch $M0/dir/foo + gfid_foo=$(get_gfid_string $M0/dir/foo) + TEST dd if=/dev/zero of=$M0/dir/foo bs=1024 count=1024 +@@ -95,9 +99,10 @@ TEST ! stat $M0/dir/foo + EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_foo + EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_foo + +-#################################################### +-##### Unlink of a sharded file with hard-links ##### +-#################################################### ++######################################################## ++##### 05. Unlink of a sharded file with hard-links ##### ++######################################################## ++ + # Create a 9M sharded file + TEST dd if=/dev/zero of=$M0/dir/original bs=1024 count=9216 + gfid_original=$(get_gfid_string $M0/dir/original) +@@ -154,9 +159,10 @@ TEST mkdir $M0/dir + TEST touch $M0/dir/src + TEST touch $M0/dir/dst + +-###################################### +-##### Rename with /.shard absent ##### +-###################################### ++########################################## ++##### 06. Rename with /.shard absent ##### ++########################################## ++ + TEST truncate -s 5M $M0/dir/dst + gfid_dst=$(get_gfid_string $M0/dir/dst) + TEST ! stat $B0/${V0}0/.shard +@@ -172,9 +178,10 @@ TEST stat $B0/${V0}1/dir/dst + EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_dst + EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_dst + +-################################################## +-##### Rename to a sharded file without holes ##### +-################################################## ++###################################################### ++##### 07. Rename to a sharded file without holes ##### ++###################################################### ++ + TEST unlink $M0/dir/dst + TEST touch $M0/dir/src + # Create a 9M sharded file +@@ -197,9 +204,10 @@ TEST stat $B0/${V0}1/dir/dst + EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_dst + EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_dst + +-################################################### +-##### Rename of dst file with /.shard present ##### +-################################################### ++####################################################### ++##### 08. 
Rename of dst file with /.shard present ##### ++####################################################### ++ + TEST unlink $M0/dir/dst + TEST touch $M0/dir/src + TEST truncate -s 5M $M0/dir/dst +@@ -215,9 +223,10 @@ TEST stat $B0/${V0}1/dir/dst + EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_dst + EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_dst + +-############################################################### +-##### Rename of dst file with only one block (the zeroth) ##### +-############################################################### ++################################################################### ++##### 09. Rename of dst file with only one block (the zeroth) ##### ++################################################################### ++ + TEST unlink $M0/dir/dst + TEST touch $M0/dir/src + TEST dd if=/dev/zero of=$M0/dir/dst bs=1024 count=1024 +@@ -233,9 +242,10 @@ TEST stat $B0/${V0}1/dir/dst + EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_dst + EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_dst + +-######################################################## +-##### Rename to a dst sharded file with hard-links ##### +-######################################################## ++############################################################ ++##### 10. Rename to a dst sharded file with hard-links ##### ++############################################################ ++ + TEST unlink $M0/dir/dst + TEST touch $M0/dir/src + # Create a 9M sharded file +@@ -276,7 +286,10 @@ TEST ! stat $B0/${V0}1/dir/src2 + EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_dst + EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_dst + +-# Rename with non-existent dst and a sharded src ++############################################################## ++##### 11. Rename with non-existent dst and a sharded src ##### ++##############################################################l ++ + TEST touch $M0/dir/src + TEST dd if=/dev/zero of=$M0/dir/src bs=1024 count=9216 + gfid_src=$(get_gfid_string $M0/dir/src) +@@ -286,7 +299,7 @@ TEST stat $B0/${V0}1/.shard/$gfid_src.1 + TEST stat $B0/${V0}0/.shard/$gfid_src.2 + TEST stat $B0/${V0}1/.shard/$gfid_src.2 + # Now rename src to the dst. +-TEST mv $M0/dir/src $M0/dir/dst ++TEST mv $M0/dir/src $M0/dir/dst2 + + TEST stat $B0/${V0}0/.shard/$gfid_src.1 + TEST stat $B0/${V0}1/.shard/$gfid_src.1 +@@ -295,23 +308,26 @@ TEST stat $B0/${V0}1/.shard/$gfid_src.2 + TEST ! stat $M0/dir/src + TEST ! stat $B0/${V0}0/dir/src + TEST ! stat $B0/${V0}1/dir/src +-TEST stat $M0/dir/dst +-TEST stat $B0/${V0}0/dir/dst +-TEST stat $B0/${V0}1/dir/dst ++TEST stat $M0/dir/dst2 ++TEST stat $B0/${V0}0/dir/dst2 ++TEST stat $B0/${V0}1/dir/dst2 ++ ++############################################################################# ++##### 12. Rename with non-existent dst and a sharded src with no shards ##### ++############################################################################# + +-# Rename with non-existent dst and a sharded src with no shards + TEST touch $M0/dir/src + TEST dd if=/dev/zero of=$M0/dir/src bs=1024 count=1024 + gfid_src=$(get_gfid_string $M0/dir/src) + TEST ! stat $B0/${V0}0/.shard/$gfid_src.1 + TEST ! stat $B0/${V0}1/.shard/$gfid_src.1 + # Now rename src to the dst. +-TEST mv $M0/dir/src $M0/dir/dst ++TEST mv $M0/dir/src $M0/dir/dst1 + TEST ! stat $M0/dir/src + TEST ! 
stat $B0/${V0}0/dir/src + TEST ! stat $B0/${V0}1/dir/src +-TEST stat $M0/dir/dst +-TEST stat $B0/${V0}0/dir/dst +-TEST stat $B0/${V0}1/dir/dst ++TEST stat $M0/dir/dst1 ++TEST stat $B0/${V0}0/dir/dst1 ++TEST stat $B0/${V0}1/dir/dst1 + + cleanup +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index 2faf711..6066a54 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -4324,9 +4324,12 @@ shard_rename_src_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + } + /* Set ctx->refresh to TRUE to force a lookup on disk when + * shard_lookup_base_file() is called next to refresh the hard link +- * count in ctx ++ * count in ctx. Note that this is applicable only to the case where ++ * the rename dst is already existent and sharded. + */ +- shard_inode_ctx_set_refresh_flag (local->int_inodelk.loc.inode, this); ++ if ((local->dst_block_size) && (!local->cleanup_required)) ++ shard_inode_ctx_set_refresh_flag (local->int_inodelk.loc.inode, ++ this); + + local->prebuf = *buf; + local->preoldparent = *preoldparent; +-- +1.8.3.1 + diff --git a/0433-mgmt-glusterd-use-proper-path-to-the-volfile.patch b/0433-mgmt-glusterd-use-proper-path-to-the-volfile.patch new file mode 100644 index 0000000..9c1d8a2 --- /dev/null +++ b/0433-mgmt-glusterd-use-proper-path-to-the-volfile.patch @@ -0,0 +1,153 @@ +From fd9e0103cd5c3f2962e063dbc3083c451b7e592b Mon Sep 17 00:00:00 2001 +From: Raghavendra Bhat +Date: Thu, 4 Oct 2018 14:27:45 -0400 +Subject: [PATCH 433/444] mgmt/glusterd: use proper path to the volfile + + > Upstream: https://review.gluster.org/#/c/glusterfs/+/21314/ + > BUG: 1635050 + > Change-Id: I28b2dfa5d9b379fe943db92c2fdfea879a6a594e + +NOTE: This patch is actually directly applied from the patch that + was sent to the release-4.1 branch. The master branch patch + will have merge conflicts due to the clang format changes done + there. This is the patch which this commit is a backport of. + + upstream(4.1): https://review.gluster.org/#/c/glusterfs/+/21348/ + Chane-ID: I28b2dfa5d9b379fe943db92c2fdfea879a6a594e + +Till now, glusterd was generating the volfile path for the snapshot +volume's bricks like this. + +/snaps// + +But in reality, the path to the brick volfile for a snapshot volume is + +/snaps/// + +The above workaround was used to distinguish between a mount command used +to mount the snapshot volume, and a brick of the snapshot volume, so that +based on what is actually happening, glusterd can return the proper volfile +(client volfile for the former and the brick volfile for the latter). But, +this was causing problems for snapshot restore when brick multiplexing is +enabled. Because, with brick multiplexing, it tries to find the volfile +and sends GETSPEC rpc call to glusterd using the 2nd style of path i.e. + +/snaps/// + +So, when the snapshot brick (which is multiplexed) sends a GETSPEC rpc +request to glusterd for obtaining the brick volume file, glusterd was +returning the client volume file of the snapshot volume instead of the +brick volume file. 
+ +Change-Id: I28b2dfa5d9b379fe943db92c2fdfea879a6a594e +BUG: 1636291 +Signed-off-by: Raghavendra Bhat +Reviewed-on: https://code.engineering.redhat.com/gerrit/155129 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + .../snapview-server/src/snapview-server-helpers.c | 5 +++-- + xlators/mgmt/glusterd/src/glusterd-handshake.c | 20 ++++++++++++++++++-- + xlators/mgmt/glusterd/src/glusterd-utils.c | 9 +++++---- + 3 files changed, 26 insertions(+), 8 deletions(-) + +diff --git a/xlators/features/snapview-server/src/snapview-server-helpers.c b/xlators/features/snapview-server/src/snapview-server-helpers.c +index 2ad74ef..4c2edc6 100644 +--- a/xlators/features/snapview-server/src/snapview-server-helpers.c ++++ b/xlators/features/snapview-server/src/snapview-server-helpers.c +@@ -481,8 +481,9 @@ __svs_initialise_snapshot_volume (xlator_t *this, const char *name, + goto out; + } + +- snprintf (volname, sizeof (volname), "/snaps/%s/%s", +- dirent->name, dirent->snap_volname); ++ snprintf (volname, sizeof (volname), "/snaps/%s/%s/%s", ++ dirent->name, dirent->snap_volname, ++ dirent->snap_volname); + + + fs = glfs_new (volname); +diff --git a/xlators/mgmt/glusterd/src/glusterd-handshake.c b/xlators/mgmt/glusterd/src/glusterd-handshake.c +index d5594d0..b2a9b20 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-handshake.c ++++ b/xlators/mgmt/glusterd/src/glusterd-handshake.c +@@ -52,6 +52,7 @@ get_snap_volname_and_volinfo (const char *volpath, char **volname, + char *vol = NULL; + glusterd_snap_t *snap = NULL; + xlator_t *this = NULL; ++ char *volfile_token = NULL; + + this = THIS; + GF_ASSERT (this); +@@ -101,12 +102,27 @@ get_snap_volname_and_volinfo (const char *volpath, char **volname, + */ + ret = glusterd_volinfo_find (volname_token, volinfo); + if (ret) { +- *volname = gf_strdup (volname_token); ++ gf_msg (this->name, GF_LOG_WARNING, 0, GD_MSG_VOLINFO_GET_FAIL, ++ "failed to get the volinfo for the volume %s", ++ volname_token); ++ ++ /* Get the actual volfile name */ ++ volfile_token = strtok_r (NULL, "/", &save_ptr); ++ *volname = gf_strdup (volfile_token); + if (NULL == *volname) { + ret = -1; + goto out; + } + ++ /* ++ * Ideally, this should succeed as volname_token now ++ * contains the name of the snap volume (i.e. name of ++ * the volume that represents the snapshot). ++ * But, if for some reason, volinfo for the snap volume ++ * is not found, then try to get from the name of the ++ * volfile. Name of the volfile is like this. 
++ * ...vol ++ */ + ret = glusterd_snap_volinfo_find (volname_token, snap, + volinfo); + if (ret) { +@@ -115,7 +131,7 @@ get_snap_volname_and_volinfo (const char *volpath, char **volname, + if (!vol) { + gf_msg (this->name, GF_LOG_ERROR, EINVAL, + GD_MSG_INVALID_ENTRY, "Invalid " +- "volname (%s)", volname_token); ++ "volname (%s)", volfile_token); + goto out; + } + +diff --git a/xlators/mgmt/glusterd/src/glusterd-utils.c b/xlators/mgmt/glusterd/src/glusterd-utils.c +index 04fae63..7179a68 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-utils.c ++++ b/xlators/mgmt/glusterd/src/glusterd-utils.c +@@ -2068,10 +2068,10 @@ retry: + } + + if (volinfo->is_snap_volume) { +- snprintf (volfile, PATH_MAX,"/%s/%s/%s.%s.%s", ++ snprintf (volfile, PATH_MAX, "/%s/%s/%s/%s.%s.%s", + GLUSTERD_VOL_SNAP_DIR_PREFIX, + volinfo->snapshot->snapname, volinfo->volname, +- brickinfo->hostname, exp_path); ++ volinfo->volname, brickinfo->hostname, exp_path); + } else { + snprintf (volfile, PATH_MAX, "%s.%s.%s", volinfo->volname, + brickinfo->hostname, exp_path); +@@ -5676,10 +5676,11 @@ attach_brick (xlator_t *this, + GLUSTERD_GET_BRICK_PIDFILE (pidfile2, volinfo, brickinfo, conf); + + if (volinfo->is_snap_volume) { +- snprintf (full_id, sizeof(full_id), "/%s/%s/%s.%s.%s", ++ snprintf (full_id, sizeof(full_id), "/%s/%s/%s/%s.%s.%s", + GLUSTERD_VOL_SNAP_DIR_PREFIX, + volinfo->snapshot->snapname, +- volinfo->volname, brickinfo->hostname, unslashed); ++ volinfo->volname, volinfo->volname, ++ brickinfo->hostname, unslashed); + } else { + snprintf (full_id, sizeof(full_id), "%s.%s.%s", + volinfo->volname, brickinfo->hostname, unslashed); +-- +1.8.3.1 + diff --git a/0434-cluster-afr-s-uuid_is_null-gf_uuid_is_null.patch b/0434-cluster-afr-s-uuid_is_null-gf_uuid_is_null.patch new file mode 100644 index 0000000..22c3883 --- /dev/null +++ b/0434-cluster-afr-s-uuid_is_null-gf_uuid_is_null.patch @@ -0,0 +1,32 @@ +From f0914172f481bb32b202612b080f7902ac31ad30 Mon Sep 17 00:00:00 2001 +From: Pranith Kumar K +Date: Tue, 6 Nov 2018 21:32:55 +0530 +Subject: [PATCH 434/444] cluster/afr: s/uuid_is_null/gf_uuid_is_null + +BUG: 1619357 +Upstream-patch: https://review.gluster.org/c/glusterfs/+/21571 +Change-Id: I006116d329ac96268db132ae3aac06cc2be70e75 +Signed-off-by: Pranith Kumar K +Reviewed-on: https://code.engineering.redhat.com/gerrit/155128 +Reviewed-by: Atin Mukherjee +Tested-by: RHGS Build Bot +--- + xlators/cluster/afr/src/afr-common.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c +index ce2b17a..10d9620 100644 +--- a/xlators/cluster/afr/src/afr-common.c ++++ b/xlators/cluster/afr/src/afr-common.c +@@ -2768,7 +2768,7 @@ afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this) + continue; + + if (replies[i].op_ret == 0) { +- if (uuid_is_null (gfid)) { ++ if (gf_uuid_is_null (gfid)) { + gf_uuid_copy (gfid, + replies[i].poststat.ia_gfid); + } +-- +1.8.3.1 + diff --git a/0435-geo-rep-Fix-traceback-with-symlink-metadata-sync.patch b/0435-geo-rep-Fix-traceback-with-symlink-metadata-sync.patch new file mode 100644 index 0000000..5399ba4 --- /dev/null +++ b/0435-geo-rep-Fix-traceback-with-symlink-metadata-sync.patch @@ -0,0 +1,93 @@ +From 7e7ffc4cc56b6b6ed460a49344082c3c25c1a23d Mon Sep 17 00:00:00 2001 +From: Kotresh HR +Date: Mon, 5 Nov 2018 11:46:41 +0530 +Subject: [PATCH 435/444] geo-rep: Fix traceback with symlink metadata sync + +While syncing metadata, 'os.chmod', 'os.chown', +'os.utime' should be used without de-reference. 
+But python supports only 'os.chown' without +de-reference. That's mostly because Linux +doesn't support 'chmod' on symlink file itself +but it does support 'chown'. + +So while syncing metadata ops, if it's symlink +we should only sync 'chown' and not do 'chmod' +and 'utime'. It will lead to tracebacks with +errors like EROFS, EPERM, ACCESS, ENOENT. +All the three errors (EPERM, ACCESS, ENOENT) +were handled except EROFS. But the way it was +handled was not fool proof. The operation is +tried and failure was handled based on the errors. +All the errors with symlink file for 'chown', +'utime' had to be passed to safe errors list of +'errno_wrap'. This patch handles it better by +avoiding 'chmod' and 'utime' if it's symlink +file. + +Backport of: + > Patch: https://review.gluster.org/21546 + > fixes: bz#1646104 + > Change-Id: Ic354206455cdc7ab2a87d741d81f4efe1f19d77d + > Signed-off-by: Kotresh HR + +BUG: 1645916 +Change-Id: Ic354206455cdc7ab2a87d741d81f4efe1f19d77d +Signed-off-by: Kotresh HR +Reviewed-on: https://code.engineering.redhat.com/gerrit/155049 +Tested-by: RHGS Build Bot +Reviewed-by: Aravinda Vishwanathapura Krishna Murthy +Reviewed-by: Sunny Kumar +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + geo-replication/syncdaemon/resource.py | 26 +++++++++++--------------- + 1 file changed, 11 insertions(+), 15 deletions(-) + +diff --git a/geo-replication/syncdaemon/resource.py b/geo-replication/syncdaemon/resource.py +index eb696f3..b289b3b 100644 +--- a/geo-replication/syncdaemon/resource.py ++++ b/geo-replication/syncdaemon/resource.py +@@ -790,10 +790,8 @@ class Server(object): + # 'lchown' 'lchmod' 'utime with no-deference' blindly. + # But since 'lchmod' and 'utime with no de-reference' is + # not supported in python3, we have to rely on 'chmod' +- # and 'utime with de-reference'. But 'chmod' +- # de-reference the symlink and gets ENOENT, EACCES, +- # EPERM errors, hence ignoring those errors if it's on +- # symlink file. ++ # and 'utime with de-reference'. Hence avoiding 'chmod' ++ # and 'utime' if it's symlink file. 
+ + is_symlink = False + cmd_ret = errno_wrap(os.lchown, [go, uid, gid], [ENOENT], +@@ -801,19 +799,17 @@ class Server(object): + if isinstance(cmd_ret, int): + continue + +- cmd_ret = errno_wrap(os.chmod, [go, mode], +- [ENOENT, EACCES, EPERM], [ESTALE, EINVAL]) +- if isinstance(cmd_ret, int): +- is_symlink = os.path.islink(go) +- if not is_symlink: ++ is_symlink = os.path.islink(go) ++ ++ if not is_symlink: ++ cmd_ret = errno_wrap(os.chmod, [go, mode], ++ [ENOENT, EACCES, EPERM], [ESTALE, EINVAL]) ++ if isinstance(cmd_ret, int): + failures.append((e, cmd_ret, "chmod")) + +- cmd_ret = errno_wrap(os.utime, [go, (atime, mtime)], +- [ENOENT, EACCES, EPERM], [ESTALE, EINVAL]) +- if isinstance(cmd_ret, int): +- if not is_symlink: +- is_symlink = os.path.islink(go) +- if not is_symlink: ++ cmd_ret = errno_wrap(os.utime, [go, (atime, mtime)], ++ [ENOENT, EACCES, EPERM], [ESTALE, EINVAL]) ++ if isinstance(cmd_ret, int): + failures.append((e, cmd_ret, "utime")) + return failures + +-- +1.8.3.1 + diff --git a/0436-geo-rep-Fix-issue-in-gfid-conflict-resolution.patch b/0436-geo-rep-Fix-issue-in-gfid-conflict-resolution.patch new file mode 100644 index 0000000..33b8721 --- /dev/null +++ b/0436-geo-rep-Fix-issue-in-gfid-conflict-resolution.patch @@ -0,0 +1,204 @@ +From f42b8789cdcd93cb9fa93f35ed067268ce75f789 Mon Sep 17 00:00:00 2001 +From: Kotresh HR +Date: Thu, 25 Oct 2018 03:23:56 -0400 +Subject: [PATCH 436/444] geo-rep: Fix issue in gfid-conflict-resolution + +Problem: +During gfid-conflict-resolution, geo-rep crashes +with 'ValueError: list.remove(x): x not in list' + +Cause and Analysis: +During gfid-conflict-resolution, the entry blob is +passed back to master along with additional +information to verify it's integrity. If everything +looks fine, the entry creation is ignored and is +deleted from the original list. But it is crashing +during removal of entry from the list saying entry +not in list. The reason is that the stat information +in the entry blob was modified and sent back to +master if present. + +Fix: +Send back the correct stat information for +gfid-conflict-resolution. + +Backport of: + > Patch: https://review.gluster.org/21483 + > fixes: bz#1642865 + > Change-Id: I47a6aa60b2a495465aa9314eebcb4085f0b1c4fd + > Signed-off-by: Kotresh HR + +BUG: 1640347 +Change-Id: I47a6aa60b2a495465aa9314eebcb4085f0b1c4fd +Signed-off-by: Kotresh HR +Reviewed-on: https://code.engineering.redhat.com/gerrit/155038 +Tested-by: RHGS Build Bot +Reviewed-by: Aravinda Vishwanathapura Krishna Murthy +Reviewed-by: Sunny Kumar +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + geo-replication/syncdaemon/resource.py | 42 +++++++++++++++++++--------------- + 1 file changed, 24 insertions(+), 18 deletions(-) + +diff --git a/geo-replication/syncdaemon/resource.py b/geo-replication/syncdaemon/resource.py +index b289b3b..f16066e 100644 +--- a/geo-replication/syncdaemon/resource.py ++++ b/geo-replication/syncdaemon/resource.py +@@ -456,7 +456,7 @@ class Server(object): + st['uid'], st['gid'], + gf, st['mode'], bn, lnk) + +- def entry_purge(op, entry, gfid, e): ++ def entry_purge(op, entry, gfid, e, uid, gid): + # This is an extremely racy code and needs to be fixed ASAP. + # The GFID check here is to be sure that the pargfid/bname + # to be purged is the GFID gotten from the changelog. 
+@@ -470,7 +470,7 @@ class Server(object): + return + + if not matching_disk_gfid(gfid, entry): +- collect_failure(e, EEXIST) ++ collect_failure(e, EEXIST, uid, gid) + return + + if op == 'UNLINK': +@@ -486,7 +486,7 @@ class Server(object): + if er == ENOTEMPTY: + return er + +- def collect_failure(e, cmd_ret, dst=False): ++ def collect_failure(e, cmd_ret, uid, gid, dst=False): + slv_entry_info = {} + slv_entry_info['gfid_mismatch'] = False + slv_entry_info['name_mismatch'] = False +@@ -499,6 +499,11 @@ class Server(object): + if cmd_ret is None: + return False + ++ if e.get("stat", {}): ++ # Copy actual UID/GID value back to entry stat ++ e['stat']['uid'] = uid ++ e['stat']['gid'] = gid ++ + if cmd_ret == EEXIST: + if dst: + en = e['entry1'] +@@ -559,7 +564,7 @@ class Server(object): + + errno_wrap(os.rmdir, [path], [ENOENT, ESTALE], [EBUSY]) + +- def rename_with_disk_gfid_confirmation(gfid, entry, en): ++ def rename_with_disk_gfid_confirmation(gfid, entry, en, uid, gid): + if not matching_disk_gfid(gfid, entry): + logging.error(lf("RENAME ignored: source entry does not match " + "with on-disk gfid", +@@ -567,14 +572,13 @@ class Server(object): + gfid=gfid, + disk_gfid=get_gfid_from_mnt(entry), + target=en)) +- collect_failure(e, EEXIST) ++ collect_failure(e, EEXIST, uid, gid) + return + + cmd_ret = errno_wrap(os.rename, + [entry, en], + [ENOENT, EEXIST], [ESTALE, EBUSY]) +- collect_failure(e, cmd_ret) +- ++ collect_failure(e, cmd_ret, uid, gid) + + for e in entries: + blob = None +@@ -595,7 +599,7 @@ class Server(object): + if op in ['RMDIR', 'UNLINK']: + # Try once, if rmdir failed with ENOTEMPTY + # then delete recursively. +- er = entry_purge(op, entry, gfid, e) ++ er = entry_purge(op, entry, gfid, e, uid, gid) + if isinstance(er, int): + if er == ENOTEMPTY and op == 'RMDIR': + # Retry if ENOTEMPTY, ESTALE +@@ -632,7 +636,7 @@ class Server(object): + cmd_ret = errno_wrap(os.link, + [slink, entry], + [ENOENT, EEXIST], [ESTALE]) +- collect_failure(e, cmd_ret) ++ collect_failure(e, cmd_ret, uid, gid) + elif op == 'MKDIR': + en = e['entry'] + slink = os.path.join(pfx, gfid) +@@ -676,7 +680,7 @@ class Server(object): + cmd_ret = errno_wrap(os.link, + [slink, entry], + [ENOENT, EEXIST], [ESTALE]) +- collect_failure(e, cmd_ret) ++ collect_failure(e, cmd_ret, uid, gid) + elif op == 'SYMLINK': + en = e['entry'] + st = lstat(entry) +@@ -684,7 +688,7 @@ class Server(object): + blob = entry_pack_symlink(gfid, bname, e['link'], + e['stat']) + elif not matching_disk_gfid(gfid, en): +- collect_failure(e, EEXIST) ++ collect_failure(e, EEXIST, uid, gid) + elif op == 'RENAME': + en = e['entry1'] + # The matching disk gfid check validates two things +@@ -704,7 +708,7 @@ class Server(object): + blob = entry_pack_symlink(gfid, bname, + e['link'], e['stat']) + elif not matching_disk_gfid(gfid, en): +- collect_failure(e, EEXIST, True) ++ collect_failure(e, EEXIST, uid, gid, True) + else: + slink = os.path.join(pfx, gfid) + st = lstat(slink) +@@ -716,12 +720,13 @@ class Server(object): + else: + cmd_ret = errno_wrap(os.link, [slink, en], + [ENOENT, EEXIST], [ESTALE]) +- collect_failure(e, cmd_ret) ++ collect_failure(e, cmd_ret, uid, gid) + else: + st = lstat(entry) + st1 = lstat(en) + if isinstance(st1, int): +- rename_with_disk_gfid_confirmation(gfid, entry, en) ++ rename_with_disk_gfid_confirmation(gfid, entry, en, ++ uid, gid) + else: + if st.st_ino == st1.st_ino: + # we have a hard link, we can now unlink source +@@ -746,15 +751,16 @@ class Server(object): + else: + raise + elif not 
matching_disk_gfid(gfid, en): +- collect_failure(e, EEXIST, True) ++ collect_failure(e, EEXIST, uid, gid, True) + else: +- rename_with_disk_gfid_confirmation(gfid, entry, en) ++ rename_with_disk_gfid_confirmation(gfid, entry, en, ++ uid, gid) + if blob: + cmd_ret = errno_wrap(Xattr.lsetxattr, + [pg, 'glusterfs.gfid.newfile', blob], + [EEXIST, ENOENT], + [ESTALE, EINVAL, EBUSY]) +- failed = collect_failure(e, cmd_ret) ++ collect_failure(e, cmd_ret, uid, gid) + + # If UID/GID is different than zero that means we are trying + # create Entry with different UID/GID. Create Entry with +@@ -763,7 +769,7 @@ class Server(object): + path = os.path.join(pfx, gfid) + cmd_ret = errno_wrap(os.lchown, [path, uid, gid], [ENOENT], + [ESTALE, EINVAL]) +- collect_failure(e, cmd_ret) ++ collect_failure(e, cmd_ret, uid, gid) + + return failures + +-- +1.8.3.1 + diff --git a/0437-geo-rep-Add-more-intelligence-to-automatic-error-han.patch b/0437-geo-rep-Add-more-intelligence-to-automatic-error-han.patch new file mode 100644 index 0000000..d05394e --- /dev/null +++ b/0437-geo-rep-Add-more-intelligence-to-automatic-error-han.patch @@ -0,0 +1,144 @@ +From 85da98b9c54889139822b5c3d351a0249abf75b0 Mon Sep 17 00:00:00 2001 +From: Kotresh HR +Date: Fri, 26 Oct 2018 03:45:46 -0400 +Subject: [PATCH 437/444] geo-rep: Add more intelligence to automatic error + handling + +Geo-rep's automatic error handling does gfid conflict +resolution. But if there are ENOENT errors because the +parent is not synced to slave, it doesn' handle them. +This patch adds the intelligence to create missing +parent directories on slave. It can create the missing +directories upto the depth of 10. + +Backport of: + > Patch: https://review.gluster.org/21498 + > fixes: bz#1643402 + > Change-Id: Ic97ed1fa5899c087e404d559e04f7963ed7bb54c + > Signed-off-by: Kotresh HR + +BUG: 1638069 +Change-Id: Ic97ed1fa5899c087e404d559e04f7963ed7bb54c +Signed-off-by: Kotresh HR +Reviewed-on: https://code.engineering.redhat.com/gerrit/155039 +Tested-by: RHGS Build Bot +Reviewed-by: Aravinda Vishwanathapura Krishna Murthy +Reviewed-by: Sunny Kumar +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + geo-replication/syncdaemon/master.py | 68 ++++++++++++++++++++++++------------ + 1 file changed, 46 insertions(+), 22 deletions(-) + +diff --git a/geo-replication/syncdaemon/master.py b/geo-replication/syncdaemon/master.py +index cd135df..bdb4da2 100644 +--- a/geo-replication/syncdaemon/master.py ++++ b/geo-replication/syncdaemon/master.py +@@ -693,7 +693,7 @@ class GMasterChangelogMixin(GMasterCommon): + TYPE_ENTRY = "E " + + MAX_EF_RETRIES = 10 +- MAX_OE_RETRIES = 5 ++ MAX_OE_RETRIES = 10 + + # flat directory hierarchy for gfid based access + FLAT_DIR_HIERARCHY = '.' +@@ -836,11 +836,12 @@ class GMasterChangelogMixin(GMasterCommon): + # The file exists on master but with different name. + # Probably renamed and got missed during xsync crawl. 
+ elif failure[2]['slave_isdir']: +- realpath = os.readlink(os.path.join(gconf.local_path, +- ".glusterfs", +- slave_gfid[0:2], +- slave_gfid[2:4], +- slave_gfid)) ++ realpath = os.readlink(os.path.join( ++ gconf.local_path, ++ ".glusterfs", ++ slave_gfid[0:2], ++ slave_gfid[2:4], ++ slave_gfid)) + dst_entry = os.path.join(pfx, realpath.split('/')[-2], + realpath.split('/')[-1]) + src_entry = pbname +@@ -881,25 +882,37 @@ class GMasterChangelogMixin(GMasterCommon): + gfid=failure[2]['slave_gfid'], + entry=pbname)) + elif failure[1] == ENOENT: +- # Ignore ENOENT error for fix_entry_ops aka retry_count > 1 +- if retry_count > 1: +- logging.info(lf('ENOENT error while fixing entry ops. ' +- 'Safe to ignore, take out entry', ++ if op in ['RENAME']: ++ pbname = failure[0]['entry1'] ++ else: ++ pbname = failure[0]['entry'] ++ ++ pargfid = pbname.split('/')[1] ++ st = lstat(os.path.join(pfx, pargfid)) ++ # Safe to ignore the failure as master doesn't contain ++ # parent directory. ++ if isinstance(st, int): ++ logging.info(lf('Fixing ENOENT error in slave. Parent ' ++ 'does not exist on master. Safe to ' ++ 'ignore, take out entry', + retry_count=retry_count, + entry=repr(failure))) + entries.remove(failure[0]) +- elif op in ('MKNOD', 'CREATE', 'MKDIR'): +- pargfid = pbname.split('/')[1] +- st = lstat(os.path.join(pfx, pargfid)) +- # Safe to ignore the failure as master doesn't contain +- # parent directory. +- if isinstance(st, int): +- logging.info(lf('Fixing ENOENT error in slave. Parent ' +- 'does not exist on master. Safe to ' +- 'ignore, take out entry', +- retry_count=retry_count, +- entry=repr(failure))) +- entries.remove(failure[0]) ++ else: ++ logging.info(lf('Fixing ENOENT error in slave. Create ' ++ 'parent directory on slave.', ++ retry_count=retry_count, ++ entry=repr(failure))) ++ realpath = os.readlink(os.path.join(gconf.local_path, ++ ".glusterfs", ++ pargfid[0:2], ++ pargfid[2:4], ++ pargfid)) ++ dir_entry = os.path.join(pfx, realpath.split('/')[-2], ++ realpath.split('/')[-1]) ++ fix_entry_ops.append( ++ edct('MKDIR', gfid=pargfid, entry=dir_entry, ++ mode=st.st_mode, uid=st.st_uid, gid=st.st_gid)) + + if fix_entry_ops: + # Process deletions of entries whose gfids are mismatched +@@ -1077,6 +1090,11 @@ class GMasterChangelogMixin(GMasterCommon): + os.path.join(pfx, ec[self.POS_ENTRY1 - 1])) + entries.append(edct(ty, gfid=gfid, entry=e1, entry1=en, + stat=st, link=rl)) ++ # If src doesn't exist while doing rename, destination ++ # is created. If data is not followed by rename, this ++ # remains zero byte file on slave. Hence add data entry ++ # for renames ++ datas.add(os.path.join(pfx, gfid)) + else: + # stat() to get mode and other information + if not matching_disk_gfid(gfid, en): +@@ -1100,6 +1118,12 @@ class GMasterChangelogMixin(GMasterCommon): + rl = None + entries.append(edct(ty, stat=st, entry=en, gfid=gfid, + link=rl)) ++ # If src doesn't exist while doing link, destination ++ # is created based on file type. If data is not ++ # followed by link, this remains zero byte file on ++ # slave. 
Hence add data entry for links ++ if rl is None: ++ datas.add(os.path.join(pfx, gfid)) + elif ty == 'SYMLINK': + rl = errno_wrap(os.readlink, [en], [ENOENT], + [ESTALE, EINTR]) +-- +1.8.3.1 + diff --git a/0438-cluster-dht-In-rename-unlink-after-creating-linkto-f.patch b/0438-cluster-dht-In-rename-unlink-after-creating-linkto-f.patch new file mode 100644 index 0000000..7c9ffe8 --- /dev/null +++ b/0438-cluster-dht-In-rename-unlink-after-creating-linkto-f.patch @@ -0,0 +1,365 @@ +From e76a777f3820e62948256a45a38d5e97f3eb08a9 Mon Sep 17 00:00:00 2001 +From: N Balachandran +Date: Tue, 28 Aug 2018 12:00:33 +0530 +Subject: [PATCH 438/444] cluster/dht: In rename, unlink after creating linkto + file + +The linkto file creation for the dst was done in parallel with +the unlink of the old src linkto. If these operations reached +the brick out of order, we end up with a dst linkto file without +a .glusterfs handle. + +Fixed by unlinking only after the linkto file creation has +completed. + +upstream patch: https://review.gluster.org/#/c/glusterfs/+/21023/ + +> Change-Id: I4246f7655f5bc180f5ded7fd34d263b7828a8110 +> fixes: bz#1621981 +> Signed-off-by: N Balachandran + +Change-Id: Ia845a68bb314997cadab57887a84dff9373400c4 +BUG: 1622001 +Signed-off-by: N Balachandran +Reviewed-on: https://code.engineering.redhat.com/gerrit/154933 +Tested-by: RHGS Build Bot +Reviewed-by: Raghavendra Gowdappa +--- + tests/bugs/posix/bug-1619720.t | 1 + + xlators/cluster/dht/src/dht-rename.c | 254 ++++++++++++++++++----------------- + 2 files changed, 133 insertions(+), 122 deletions(-) + +diff --git a/tests/bugs/posix/bug-1619720.t b/tests/bugs/posix/bug-1619720.t +index 5e0d0f7..bfd304d 100755 +--- a/tests/bugs/posix/bug-1619720.t ++++ b/tests/bugs/posix/bug-1619720.t +@@ -48,6 +48,7 @@ TEST mv $M0/tmp/file-2 $M0/tmp/file-3 + + TEST mv -f $M0/tmp/file-1 $M0/tmp/file-3 + ++ + TEST getfattr -n $pgfid_xattr_name $B0/${V0}0/tmp/file-3 + TEST getfattr -n $pgfid_xattr_name $B0/${V0}1/tmp/file-3 + +diff --git a/xlators/cluster/dht/src/dht-rename.c b/xlators/cluster/dht/src/dht-rename.c +index 1d0c2bb..378cb0a 100644 +--- a/xlators/cluster/dht/src/dht-rename.c ++++ b/xlators/cluster/dht/src/dht-rename.c +@@ -849,8 +849,8 @@ dht_rename_cleanup (call_frame_t *frame) + if (src_cached == dst_cached) + goto nolinks; + +- if (local->linked && (dst_hashed != src_hashed )&& +- (dst_hashed != src_cached)) { ++ if (local->linked && (dst_hashed != src_hashed) && ++ (dst_hashed != src_cached)) { + call_cnt++; + } + +@@ -935,6 +935,120 @@ nolinks: + + + int ++dht_rename_unlink (call_frame_t *frame, xlator_t *this) ++{ ++ dht_local_t *local = NULL; ++ xlator_t *src_hashed = NULL; ++ xlator_t *src_cached = NULL; ++ xlator_t *dst_hashed = NULL; ++ xlator_t *dst_cached = NULL; ++ xlator_t *rename_subvol = NULL; ++ dict_t *xattr = NULL; ++ ++ local = frame->local; ++ ++ src_hashed = local->src_hashed; ++ src_cached = local->src_cached; ++ dst_hashed = local->dst_hashed; ++ dst_cached = local->dst_cached; ++ ++ local->call_cnt = 0; ++ ++ /* NOTE: rename_subvol is the same subvolume from which dht_rename_cbk ++ * is called. since rename has already happened on rename_subvol, ++ * unlink shouldn't be sent for oldpath (either linkfile or cached-file) ++ * on rename_subvol. 
*/ ++ if (src_cached == dst_cached) ++ rename_subvol = src_cached; ++ else ++ rename_subvol = dst_hashed; ++ ++ /* TODO: delete files in background */ ++ ++ if (src_cached != dst_hashed && src_cached != dst_cached) ++ local->call_cnt++; ++ ++ if (src_hashed != rename_subvol && src_hashed != src_cached) ++ local->call_cnt++; ++ ++ if (dst_cached && dst_cached != dst_hashed && dst_cached != src_cached) ++ local->call_cnt++; ++ ++ if (local->call_cnt == 0) ++ goto unwind; ++ ++ DHT_MARK_FOP_INTERNAL (xattr); ++ ++ if (src_cached != dst_hashed && src_cached != dst_cached) { ++ dict_t *xattr_new = NULL; ++ ++ xattr_new = dict_copy_with_ref (xattr, NULL); ++ ++ gf_msg_trace (this->name, 0, ++ "deleting old src datafile %s @ %s", ++ local->loc.path, src_cached->name); ++ ++ if (gf_uuid_compare (local->loc.pargfid, ++ local->loc2.pargfid) == 0) { ++ DHT_MARKER_DONT_ACCOUNT(xattr_new); ++ } ++ ++ DHT_CHANGELOG_TRACK_AS_RENAME(xattr_new, &local->loc, ++ &local->loc2); ++ STACK_WIND_COOKIE (frame, dht_rename_unlink_cbk, src_cached, ++ src_cached, src_cached->fops->unlink, ++ &local->loc, 0, xattr_new); ++ ++ dict_unref (xattr_new); ++ xattr_new = NULL; ++ } ++ ++ if (src_hashed != rename_subvol && src_hashed != src_cached) { ++ dict_t *xattr_new = NULL; ++ ++ xattr_new = dict_copy_with_ref (xattr, NULL); ++ ++ gf_msg_trace (this->name, 0, ++ "deleting old src linkfile %s @ %s", ++ local->loc.path, src_hashed->name); ++ ++ DHT_MARKER_DONT_ACCOUNT(xattr_new); ++ ++ STACK_WIND_COOKIE (frame, dht_rename_unlink_cbk, src_hashed, ++ src_hashed, src_hashed->fops->unlink, ++ &local->loc, 0, xattr_new); ++ ++ dict_unref (xattr_new); ++ xattr_new = NULL; ++ } ++ ++ if (dst_cached && ++ (dst_cached != dst_hashed) && ++ (dst_cached != src_cached)) { ++ gf_msg_trace (this->name, 0, ++ "deleting old dst datafile %s @ %s", ++ local->loc2.path, dst_cached->name); ++ ++ STACK_WIND_COOKIE (frame, dht_rename_unlink_cbk, dst_cached, ++ dst_cached, dst_cached->fops->unlink, ++ &local->loc2, 0, xattr); ++ } ++ if (xattr) ++ dict_unref (xattr); ++ return 0; ++ ++unwind: ++ WIPE (&local->preoldparent); ++ WIPE (&local->postoldparent); ++ WIPE (&local->preparent); ++ WIPE (&local->postparent); ++ ++ dht_rename_done (frame, this); ++ ++ return 0; ++} ++ ++int + dht_rename_links_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct iatt *stbuf, +@@ -947,6 +1061,7 @@ dht_rename_links_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + prev = cookie; + local = frame->local; + ++ /* TODO: Handle this case in lookup-optimize */ + if (op_ret == -1) { + gf_msg (this->name, GF_LOG_WARNING, op_errno, + DHT_MSG_CREATE_LINK_FAILED, +@@ -958,8 +1073,8 @@ dht_rename_links_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + local->linked = _gf_false; + dht_linkfile_attr_heal (frame, this); + } +- DHT_STACK_DESTROY (frame); + ++ dht_rename_unlink (frame, this); + return 0; + } + +@@ -973,19 +1088,14 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + { + dht_local_t *local = NULL; + xlator_t *prev = NULL; +- xlator_t *src_hashed = NULL; + xlator_t *src_cached = NULL; + xlator_t *dst_hashed = NULL; + xlator_t *dst_cached = NULL; +- xlator_t *rename_subvol = NULL; +- call_frame_t *link_frame = NULL; +- dht_local_t *link_local = NULL; +- dict_t *xattr = NULL; ++ loc_t link_loc = {0}; + + local = frame->local; + prev = cookie; + +- src_hashed = local->src_hashed; + src_cached = local->src_cached; + dst_hashed = local->dst_hashed; + 
dst_cached = local->dst_cached; +@@ -1043,31 +1153,6 @@ dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + local->xattr = dict_copy_with_ref (xdata, local->xattr); + } + +- if ((src_cached == dst_cached) && (dst_hashed != dst_cached)) { +- link_frame = copy_frame (frame); +- if (!link_frame) { +- goto err; +- } +- +- /* fop value sent as maxvalue because it is not used +- anywhere in this case */ +- link_local = dht_local_init (link_frame, &local->loc2, NULL, +- GF_FOP_MAXVALUE); +- if (!link_local) { +- goto err; +- } +- +- if (link_local->loc.inode) +- inode_unref (link_local->loc.inode); +- link_local->loc.inode = inode_ref (local->loc.inode); +- gf_uuid_copy (link_local->gfid, local->loc.inode->gfid); +- +- dht_linkfile_create (link_frame, dht_rename_links_create_cbk, +- this, src_cached, dst_hashed, +- &link_local->loc); +- } +- +-err: + /* Merge attrs only from src_cached. In case there of src_cached != + * dst_hashed, this ignores linkfile attrs. */ + if (prev == src_cached) { +@@ -1080,98 +1165,23 @@ err: + dht_iatt_merge (this, &local->postparent, postnewparent, prev); + } + ++ /* Create the linkto file for the dst file */ ++ if ((src_cached == dst_cached) && (dst_hashed != dst_cached)) { + +- /* NOTE: rename_subvol is the same subvolume from which dht_rename_cbk +- * is called. since rename has already happened on rename_subvol, +- * unlink should not be sent for oldpath (either linkfile or cached-file) +- * on rename_subvol. */ +- if (src_cached == dst_cached) +- rename_subvol = src_cached; +- else +- rename_subvol = dst_hashed; +- +- /* TODO: delete files in background */ +- +- if (src_cached != dst_hashed && src_cached != dst_cached) +- local->call_cnt++; +- +- if (src_hashed != rename_subvol && src_hashed != src_cached) +- local->call_cnt++; +- +- if (dst_cached && dst_cached != dst_hashed && dst_cached != src_cached) +- local->call_cnt++; +- +- if (local->call_cnt == 0) +- goto unwind; +- +- DHT_MARK_FOP_INTERNAL (xattr); +- +- if (src_cached != dst_hashed && src_cached != dst_cached) { +- dict_t *xattr_new = NULL; +- +- xattr_new = dict_copy_with_ref (xattr, NULL); +- +- gf_msg_trace (this->name, 0, +- "deleting old src datafile %s @ %s", +- local->loc.path, src_cached->name); +- +- if (gf_uuid_compare (local->loc.pargfid, +- local->loc2.pargfid) == 0) { +- DHT_MARKER_DONT_ACCOUNT(xattr_new); +- } +- +- DHT_CHANGELOG_TRACK_AS_RENAME(xattr_new, &local->loc, +- &local->loc2); +- STACK_WIND_COOKIE (frame, dht_rename_unlink_cbk, src_cached, +- src_cached, src_cached->fops->unlink, +- &local->loc, 0, xattr_new); +- +- dict_unref (xattr_new); +- xattr_new = NULL; +- } +- +- if (src_hashed != rename_subvol && src_hashed != src_cached) { +- dict_t *xattr_new = NULL; +- +- xattr_new = dict_copy_with_ref (xattr, NULL); +- +- gf_msg_trace (this->name, 0, +- "deleting old src linkfile %s @ %s", +- local->loc.path, src_hashed->name); +- +- DHT_MARKER_DONT_ACCOUNT(xattr_new); +- +- STACK_WIND_COOKIE (frame, dht_rename_unlink_cbk, src_hashed, +- src_hashed, src_hashed->fops->unlink, +- &local->loc, 0, xattr_new); +- +- dict_unref (xattr_new); +- xattr_new = NULL; +- } ++ loc_copy (&link_loc, &local->loc2); ++ if (link_loc.inode) ++ inode_unref (link_loc.inode); ++ link_loc.inode = inode_ref (local->loc.inode); ++ gf_uuid_copy (local->gfid, local->loc.inode->gfid); ++ gf_uuid_copy (link_loc.gfid, local->loc.inode->gfid); + +- if (dst_cached +- && (dst_cached != dst_hashed) +- && (dst_cached != src_cached)) { +- gf_msg_trace (this->name, 0, +- "deleting old dst datafile %s 
@ %s", +- local->loc2.path, dst_cached->name); +- +- STACK_WIND_COOKIE (frame, dht_rename_unlink_cbk, dst_cached, +- dst_cached, dst_cached->fops->unlink, +- &local->loc2, 0, xattr); ++ dht_linkfile_create (frame, dht_rename_links_create_cbk, ++ this, src_cached, dst_hashed, ++ &link_loc); ++ return 0; + } +- if (xattr) +- dict_unref (xattr); +- return 0; +- +-unwind: +- WIPE (&local->preoldparent); +- WIPE (&local->postoldparent); +- WIPE (&local->preparent); +- WIPE (&local->postparent); +- +- dht_rename_done (frame, this); + ++ dht_rename_unlink (frame, this); + return 0; + + cleanup: +-- +1.8.3.1 + diff --git a/0439-cluster-dht-fixes-to-unlinking-invalid-linkto-file.patch b/0439-cluster-dht-fixes-to-unlinking-invalid-linkto-file.patch new file mode 100644 index 0000000..e508553 --- /dev/null +++ b/0439-cluster-dht-fixes-to-unlinking-invalid-linkto-file.patch @@ -0,0 +1,70 @@ +From 07ae526af10de814d174189ff41709cf781ace9c Mon Sep 17 00:00:00 2001 +From: Raghavendra Gowdappa +Date: Tue, 30 Oct 2018 12:15:35 +0530 +Subject: [PATCH 439/444] cluster/dht: fixes to unlinking invalid linkto file + +If unlinking of an invalid linkto file failed in lookup-everywhere +codepath, lookup was failed with EIO. The rational as per the comment +was, + + + +/* When dht_lookup_everywhere is performed, one cached + *and one hashed file was found and hashed file does + *not point to the above mentioned cached node. So it + *was considered as stale and an unlink was performed. + *But unlink fails. So may be rebalance is in progress. + *now ideally we have two data-files. One obtained during + *lookup_everywhere and one where unlink-failed. So + *at this point in time we cannot decide which one to + *choose because there are chances of first cached + *file is truncated after rebalance and if it is chosen + *as cached node, application will fail. So return EIO. +*/ + + + +However, this reasoning is only valid when +* op_errno is EBUSY, indicating rebalance is in progress +* op_errno is ENOTCONN as wecannot determine what was the status of + file on brick. + +Hence this patch doesn't fail lookup unless unlink fails with an +either EBUSY or ENOTCONN + +>Change-Id: Ife55f3d97fe557f3db05beae0c2d786df31e8e55 +>Fixes: bz#1635145 +>Signed-off-by: Raghavendra Gowdappa + +Change-Id: Ife55f3d97fe557f3db05beae0c2d786df31e8e55 +BUG: 1634649 +upstream patch: http://review.gluster.org/r/Ife55f3d97fe557f3db05beae0c2d786df31e8e55 +Signed-off-by: Raghavendra Gowdappa +Reviewed-on: https://code.engineering.redhat.com/gerrit/155102 +Tested-by: RHGS Build Bot +Reviewed-by: Nithya Balachandran +--- + xlators/cluster/dht/src/dht-common.c | 7 +++---- + 1 file changed, 3 insertions(+), 4 deletions(-) + +diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c +index 0984f8f..d3a0c8b 100644 +--- a/xlators/cluster/dht/src/dht-common.c ++++ b/xlators/cluster/dht/src/dht-common.c +@@ -2104,10 +2104,9 @@ dht_lookup_unlink_of_false_linkto_cbk (call_frame_t *frame, void *cookie, + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { +- +- if (op_ret == 0) { +- dht_lookup_everywhere_done (frame, this); +- } else { ++ if ((op_ret == 0) || ((op_errno != EBUSY) && (op_errno != ENOTCONN))) { ++ dht_lookup_everywhere_done (frame, this); ++ } else { + /*When dht_lookup_everywhere is performed, one cached + *and one hashed file was found and hashed file does + *not point to the above mentioned cached node. 
So it +-- +1.8.3.1 + diff --git a/0440-features-locks-Use-pthread_mutex_unlock-instead-of-p.patch b/0440-features-locks-Use-pthread_mutex_unlock-instead-of-p.patch new file mode 100644 index 0000000..62b4f4f --- /dev/null +++ b/0440-features-locks-Use-pthread_mutex_unlock-instead-of-p.patch @@ -0,0 +1,34 @@ +From efd713e5f9067a743f532923c529416fc5f5189e Mon Sep 17 00:00:00 2001 +From: Susant Palai +Date: Wed, 7 Nov 2018 13:06:07 +0530 +Subject: [PATCH 440/444] features/locks:Use pthread_mutex_unlock() instead of + pthread_mutex_lock() + +Upstream patch: https://review.gluster.org/#/c/glusterfs/+/21579/ + +Change-Id: I85ea6e351f07cc289245cfb501f027942ee3176c +BUG: 1647675 +Signed-off-by: Susant Palai +Reviewed-on: https://code.engineering.redhat.com/gerrit/155326 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/features/locks/src/posix.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c +index b434a08..2cc2837 100644 +--- a/xlators/features/locks/src/posix.c ++++ b/xlators/features/locks/src/posix.c +@@ -2966,7 +2966,7 @@ pl_metalk (call_frame_t *frame, xlator_t *this, inode_t *inode) + ret = -1; + } + } +- pthread_mutex_lock (&pl_inode->mutex); ++ pthread_mutex_unlock (&pl_inode->mutex); + + if (ret == -1) { + goto out; +-- +1.8.3.1 + diff --git a/0441-features-shard-Make-lru-limit-of-inode-list-configur.patch b/0441-features-shard-Make-lru-limit-of-inode-list-configur.patch new file mode 100644 index 0000000..019251b --- /dev/null +++ b/0441-features-shard-Make-lru-limit-of-inode-list-configur.patch @@ -0,0 +1,201 @@ +From 7b12a7ea7a6b4945ad52f218b187ca440dfbef63 Mon Sep 17 00:00:00 2001 +From: Krutika Dhananjay +Date: Fri, 20 Jul 2018 10:52:22 +0530 +Subject: [PATCH 441/444] features/shard: Make lru limit of inode list + configurable + +> Upstream: https://review.gluster.org/20544 +> BUG: 1605056 +> Change-Id: Ifdcc2099f634314fafe8444e2d676e192e89e295 + +Currently this lru limit is hard-coded to 16384. This patch makes it +configurable to make it easier to hit the lru limit and enable testing +of different cases that arise when the limit is reached. + +The option is features.shard-lru-limit. It is by design allowed to +be configured only in init() but not in reconfigure(). This is to avoid +all the complexity associated with eviction of least recently used shards +when the list is shrunk. 
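+
+As a rough usage sketch (volume name, mount point and server below are
+placeholders, not part of this patch), the new knob is set like any other
+volume option, but an already-mounted client keeps the limit it read at
+init() until it is remounted:
+
+    gluster volume set myvol features.shard-lru-limit 25
+    # an existing mount still runs with its previous limit;
+    # remount the client to pick up the new value
+    umount /mnt/myvol
+    mount -t glusterfs server1:/myvol /mnt/myvol
+    # a client statedump should then report: lru-max-limit=25
+
+The test added below exercises exactly this by comparing statedumps taken
+before and after a remount.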
+ +Change-Id: Ifdcc2099f634314fafe8444e2d676e192e89e295 +BUG: 1603118 +Signed-off-by: Krutika Dhananjay +Reviewed-on: https://code.engineering.redhat.com/gerrit/155126 +Reviewed-by: Xavi Hernandez +Tested-by: RHGS Build Bot +--- + libglusterfs/src/globals.h | 4 ++- + tests/bugs/shard/configure-lru-limit.t | 48 +++++++++++++++++++++++++ + xlators/features/shard/src/shard.c | 19 ++++++++-- + xlators/features/shard/src/shard.h | 3 +- + xlators/mgmt/glusterd/src/glusterd-volume-set.c | 6 ++++ + 5 files changed, 75 insertions(+), 5 deletions(-) + create mode 100644 tests/bugs/shard/configure-lru-limit.t + +diff --git a/libglusterfs/src/globals.h b/libglusterfs/src/globals.h +index 97c4fad..555f44b 100644 +--- a/libglusterfs/src/globals.h ++++ b/libglusterfs/src/globals.h +@@ -43,7 +43,7 @@ + */ + #define GD_OP_VERSION_MIN 1 /* MIN is the fresh start op-version, mostly + should not change */ +-#define GD_OP_VERSION_MAX GD_OP_VERSION_3_13_3 /* MAX VERSION is the maximum ++#define GD_OP_VERSION_MAX GD_OP_VERSION_4_2_0 /* MAX VERSION is the maximum + count in VME table, should + keep changing with + introduction of newer +@@ -111,6 +111,8 @@ + + #define GD_OP_VERSION_3_13_3 31303 /* Op-version for GlusterFS 3.13.3 */ + ++#define GD_OP_VERSION_4_2_0 40200 /* Op-version for GlusterFS 4.2.0 */ ++ + /* Downstream only change */ + #define GD_OP_VERSION_3_11_2 31102 /* Op-version for RHGS 3.3.1-async */ + #define GD_OP_VERSION_3_13_3 31303 /* Op-version for RHGS-3.4-Batch Update-1*/ +diff --git a/tests/bugs/shard/configure-lru-limit.t b/tests/bugs/shard/configure-lru-limit.t +new file mode 100644 +index 0000000..a8ba8ed +--- /dev/null ++++ b/tests/bugs/shard/configure-lru-limit.t +@@ -0,0 +1,48 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++cleanup ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} ++TEST $CLI volume set $V0 features.shard on ++TEST $CLI volume set $V0 features.shard-block-size 4MB ++TEST $CLI volume set $V0 features.shard-lru-limit 25 ++TEST $CLI volume start $V0 ++ ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0 ++ ++# Perform a write that would cause 25 shards to be created, 24 of them under .shard ++TEST dd if=/dev/zero of=$M0/foo bs=1M count=100 ++ ++statedump=$(generate_mount_statedump $V0) ++sleep 1 ++EXPECT "25" echo $(grep "lru-max-limit" $statedump | cut -f2 -d'=' | tail -1) ++ ++# Base shard is never added to this list. So all other shards should make up for 24 inodes in lru list ++EXPECT "24" echo $(grep "inode-count" $statedump | cut -f2 -d'=' | tail -1) ++ ++rm -f $statedump ++ ++# Test to ensure there's no "reconfiguration" of the value once set. 
++TEST $CLI volume set $V0 features.shard-lru-limit 30 ++statedump=$(generate_mount_statedump $V0) ++sleep 1 ++EXPECT "25" echo $(grep "lru-max-limit" $statedump | cut -f2 -d'=' | tail -1) ++rm -f $statedump ++ ++EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 ++ ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0 ++statedump=$(generate_mount_statedump $V0) ++sleep 1 ++EXPECT "30" echo $(grep "lru-max-limit" $statedump | cut -f2 -d'=' | tail -1) ++rm -f $statedump ++ ++EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 ++TEST $CLI volume stop $V0 ++TEST $CLI volume delete $V0 ++ ++cleanup +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index 6066a54..eb32168 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -668,7 +668,7 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this, + shard_inode_ctx_get (linked_inode, this, &ctx); + + if (list_empty (&ctx->ilist)) { +- if (priv->inode_count + 1 <= SHARD_MAX_INODES) { ++ if (priv->inode_count + 1 <= priv->lru_limit) { + /* If this inode was linked here for the first time (indicated + * by empty list), and if there is still space in the priv list, + * add this ctx to the tail of the list. +@@ -6690,6 +6690,8 @@ init (xlator_t *this) + + GF_OPTION_INIT ("shard-deletion-rate", priv->deletion_rate, uint32, out); + ++ GF_OPTION_INIT ("shard-lru-limit", priv->lru_limit, uint64, out); ++ + this->local_pool = mem_pool_new (shard_local_t, 128); + if (!this->local_pool) { + ret = -1; +@@ -6808,7 +6810,7 @@ shard_priv_dump (xlator_t *this) + gf_uint64_2human_readable (priv->block_size)); + gf_proc_dump_write ("inode-count", "%d", priv->inode_count); + gf_proc_dump_write ("ilist_head", "%p", &priv->ilist_head); +- gf_proc_dump_write ("lru-max-limit", "%d", SHARD_MAX_INODES); ++ gf_proc_dump_write ("lru-max-limit", "%d", priv->lru_limit); + + return 0; + } +@@ -6877,5 +6879,18 @@ struct volume_options options[] = { + .max = INT_MAX, + .description = "The number of shards to send deletes on at a time", + }, ++ { .key = {"shard-lru-limit"}, ++ .type = GF_OPTION_TYPE_INT, ++ .default_value = "16384", ++ .min = 20, ++ .max = INT_MAX, ++ .description = "The number of resolved shard inodes to keep in " ++ "memory. A higher number means shards that are " ++ "resolved will remain in memory longer, avoiding " ++ "frequent lookups on them when they participate in " ++ "file operations. The option also has a bearing on " ++ "amount of memory consumed by these inodes and their " ++ "internal metadata", ++ }, + { .key = {NULL} }, + }; +diff --git a/xlators/features/shard/src/shard.h b/xlators/features/shard/src/shard.h +index 5de098a..ac3813c 100644 +--- a/xlators/features/shard/src/shard.h ++++ b/xlators/features/shard/src/shard.h +@@ -23,8 +23,6 @@ + #define SHARD_MAX_BLOCK_SIZE (4 * GF_UNIT_TB) + #define SHARD_XATTR_PREFIX "trusted.glusterfs.shard." 
+ #define GF_XATTR_SHARD_BLOCK_SIZE "trusted.glusterfs.shard.block-size" +-#define SHARD_INODE_LRU_LIMIT 4096 +-#define SHARD_MAX_INODES 16384 + /** + * Bit masks for the valid flag, which is used while updating ctx + **/ +@@ -216,6 +214,7 @@ typedef struct shard_priv { + struct list_head ilist_head; + uint32_t deletion_rate; + shard_first_lookup_state_t first_lookup; ++ uint64_t lru_limit; + } shard_priv_t; + + typedef struct { +diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c +index a825f52..d442fe0 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c ++++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c +@@ -3298,6 +3298,12 @@ struct volopt_map_entry glusterd_volopt_map[] = { + .op_version = GD_OP_VERSION_3_7_0, + .flags = OPT_FLAG_CLIENT_OPT + }, ++ { .key = "features.shard-lru-limit", ++ .voltype = "features/shard", ++ .op_version = GD_OP_VERSION_4_2_0, ++ .flags = OPT_FLAG_CLIENT_OPT, ++ .type = NO_DOC, ++ }, + { .key = "features.shard-deletion-rate", + .voltype = "features/shard", + .op_version = GD_OP_VERSION_3_13_4, +-- +1.8.3.1 + diff --git a/0442-glusterd-Reset-op-version-for-features.shard-lru-lim.patch b/0442-glusterd-Reset-op-version-for-features.shard-lru-lim.patch new file mode 100644 index 0000000..f12bdd6 --- /dev/null +++ b/0442-glusterd-Reset-op-version-for-features.shard-lru-lim.patch @@ -0,0 +1,63 @@ +From dd1d565505d1f9c41dd6f151341f9337d89aa7cf Mon Sep 17 00:00:00 2001 +From: Krutika Dhananjay +Date: Tue, 6 Nov 2018 18:44:55 +0530 +Subject: [PATCH 442/444] glusterd: Reset op-version for + "features.shard-lru-limit" + +The op-version for the "features.shard-lru-limit" option was set to +4.2.0 in the upstream patch and backported at +41e7e33c6512e98a1567e5a5532d3898b59cfa98 + +This commit reverts the op-version for this option to 3.13.4. 
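+
+For reference, the GD_OP_VERSION_* macros that appear in this series encode
+the release as MAJOR*10000 + MINOR*100 + PATCH (31303 for 3.13.3, 40200 for
+4.2.0), so the hunks below swap the 4.2.0 value for the 3.13.4 one. A quick
+illustrative check of that scheme:
+
+    echo $(( 3*10000 + 13*100 + 3 ))   # 31303 == GD_OP_VERSION_3_13_3
+    echo $(( 4*10000 +  2*100 + 0 ))   # 40200 == GD_OP_VERSION_4_2_0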
+ +Label: DOWNSTREAM ONLY + +Change-Id: I7d3ed6b373851267c78fc6815a83bee2c0906413 +BUG: 1603118 +Signed-off-by: Krutika Dhananjay +Reviewed-on: https://code.engineering.redhat.com/gerrit/155127 +Tested-by: RHGS Build Bot +Reviewed-by: Xavi Hernandez +--- + libglusterfs/src/globals.h | 4 +--- + xlators/mgmt/glusterd/src/glusterd-volume-set.c | 2 +- + 2 files changed, 2 insertions(+), 4 deletions(-) + +diff --git a/libglusterfs/src/globals.h b/libglusterfs/src/globals.h +index 555f44b..1bede2e 100644 +--- a/libglusterfs/src/globals.h ++++ b/libglusterfs/src/globals.h +@@ -43,7 +43,7 @@ + */ + #define GD_OP_VERSION_MIN 1 /* MIN is the fresh start op-version, mostly + should not change */ +-#define GD_OP_VERSION_MAX GD_OP_VERSION_4_2_0 /* MAX VERSION is the maximum ++#define GD_OP_VERSION_MAX GD_OP_VERSION_3_13_4 /* MAX VERSION is the maximum + count in VME table, should + keep changing with + introduction of newer +@@ -111,8 +111,6 @@ + + #define GD_OP_VERSION_3_13_3 31303 /* Op-version for GlusterFS 3.13.3 */ + +-#define GD_OP_VERSION_4_2_0 40200 /* Op-version for GlusterFS 4.2.0 */ +- + /* Downstream only change */ + #define GD_OP_VERSION_3_11_2 31102 /* Op-version for RHGS 3.3.1-async */ + #define GD_OP_VERSION_3_13_3 31303 /* Op-version for RHGS-3.4-Batch Update-1*/ +diff --git a/xlators/mgmt/glusterd/src/glusterd-volume-set.c b/xlators/mgmt/glusterd/src/glusterd-volume-set.c +index d442fe0..1175f1d 100644 +--- a/xlators/mgmt/glusterd/src/glusterd-volume-set.c ++++ b/xlators/mgmt/glusterd/src/glusterd-volume-set.c +@@ -3300,7 +3300,7 @@ struct volopt_map_entry glusterd_volopt_map[] = { + }, + { .key = "features.shard-lru-limit", + .voltype = "features/shard", +- .op_version = GD_OP_VERSION_4_2_0, ++ .op_version = GD_OP_VERSION_3_13_4, + .flags = OPT_FLAG_CLIENT_OPT, + .type = NO_DOC, + }, +-- +1.8.3.1 + diff --git a/0443-features-shard-Hold-a-ref-on-base-inode-when-adding-.patch b/0443-features-shard-Hold-a-ref-on-base-inode-when-adding-.patch new file mode 100644 index 0000000..0bfe143 --- /dev/null +++ b/0443-features-shard-Hold-a-ref-on-base-inode-when-adding-.patch @@ -0,0 +1,367 @@ +From 72ce80749fca03ab97a63af79d4e6bc76a49ab64 Mon Sep 17 00:00:00 2001 +From: Krutika Dhananjay +Date: Fri, 5 Oct 2018 11:32:21 +0530 +Subject: [PATCH 443/444] features/shard: Hold a ref on base inode when adding + a shard to lru list + + > Upstream: https://review.gluster.org/21454 + > BUG: 1605056 + > Change-Id: Ic15ca41444dd04684a9458bd4a526b1d3e160499 + +In __shard_update_shards_inode_list(), previously shard translator +was not holding a ref on the base inode whenever a shard was added to +the lru list. But if the base shard is forgotten and destroyed either +by fuse due to memory pressure or due to the file being deleted at some +point by a different client with this client still containing stale +shards in its lru list, the client would crash at the time of locking +lru_base_inode->lock owing to illegal memory access. + +So now the base shard is ref'd into the inode ctx of every shard that +is added to lru list until it gets lru'd out. + +The patch also handles the case where none of the shards associated +with a file that is about to be deleted are part of the LRU list and +where an unlink at the beginning of the operation destroys the base +inode (because there are no refkeepers) and hence all of the shards +that are about to be deleted will be resolved without the existence +of a base shard in-memory. This, if not handled properly, could lead +to a crash. 
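+
+A condensed outline of the crash scenario (paraphrasing the bug-1605056.t
+test added below; $M0 and $M1 are two fuse mounts of the same sharded
+volume, with shard-block-size 4MB and shard-lru-limit 25):
+
+    dd if=/dev/zero of=$M0/foo bs=1M count=104   # creates 25 shards under .shard
+    cat $M1/foo > /dev/null                      # fills $M1's shard lru list
+    unlink $M0/foo                               # file deleted via $M0
+    stat $M1/foo                                 # fails; $M1 forgets foo's base inode
+    dd if=/dev/zero of=$M1/bar bs=1M count=104   # evicts foo's stale shards from lru
+
+Without the reference taken on the base inode when a shard is added to the
+lru list, that last eviction step dereferences an already-destroyed base
+inode on $M1.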
+ +Change-Id: Ic15ca41444dd04684a9458bd4a526b1d3e160499 +BUG: 1603118 +Signed-off-by: Krutika Dhananjay +Reviewed-on: https://code.engineering.redhat.com/gerrit/155318 +Reviewed-by: Xavi Hernandez +Tested-by: RHGS Build Bot +--- + tests/bugs/shard/bug-1605056-2.t | 34 +++++++++++++++ + tests/bugs/shard/bug-1605056.t | 63 ++++++++++++++++++++++++++++ + tests/bugs/shard/shard-inode-refcount-test.t | 2 +- + tests/volume.rc | 12 ++++-- + xlators/features/shard/src/shard.c | 48 +++++++++++++++------ + 5 files changed, 141 insertions(+), 18 deletions(-) + create mode 100644 tests/bugs/shard/bug-1605056-2.t + create mode 100644 tests/bugs/shard/bug-1605056.t + +diff --git a/tests/bugs/shard/bug-1605056-2.t b/tests/bugs/shard/bug-1605056-2.t +new file mode 100644 +index 0000000..a9c10fe +--- /dev/null ++++ b/tests/bugs/shard/bug-1605056-2.t +@@ -0,0 +1,34 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++ ++cleanup ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} ++TEST $CLI volume set $V0 features.shard on ++TEST $CLI volume set $V0 features.shard-block-size 4MB ++TEST $CLI volume set $V0 features.shard-lru-limit 25 ++TEST $CLI volume set $V0 performance.write-behind off ++ ++TEST $CLI volume start $V0 ++ ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0 ++ ++# Perform a write that would cause 25 shards to be created under .shard ++TEST dd if=/dev/zero of=$M0/foo bs=1M count=104 ++ ++# Write into another file bar to ensure all of foo's shards are evicted from lru list of $M0 ++TEST dd if=/dev/zero of=$M0/bar bs=1M count=104 ++ ++# Delete foo from $M0. If there's a bug, the mount will crash. ++TEST unlink $M0/foo ++ ++EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 ++ ++TEST $CLI volume stop $V0 ++TEST $CLI volume delete $V0 ++ ++cleanup +diff --git a/tests/bugs/shard/bug-1605056.t b/tests/bugs/shard/bug-1605056.t +new file mode 100644 +index 0000000..c2329ea +--- /dev/null ++++ b/tests/bugs/shard/bug-1605056.t +@@ -0,0 +1,63 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++ ++SHARD_COUNT_TIME=5 ++ ++cleanup ++ ++TEST glusterd ++TEST pidof glusterd ++TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2} ++TEST $CLI volume set $V0 features.shard on ++TEST $CLI volume set $V0 features.shard-block-size 4MB ++TEST $CLI volume set $V0 features.shard-lru-limit 25 ++TEST $CLI volume set $V0 performance.write-behind off ++ ++TEST $CLI volume start $V0 ++ ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0 ++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M1 ++ ++# Perform a write that would cause 25 shards to be created under .shard ++TEST dd if=/dev/zero of=$M0/foo bs=1M count=104 ++ ++# Read the file from $M1, indirectly filling up the lru list. ++TEST `cat $M1/foo > /dev/null` ++statedump=$(generate_mount_statedump $V0 $M1) ++sleep 1 ++EXPECT "25" echo $(grep "inode-count" $statedump | cut -f2 -d'=' | tail -1) ++rm -f $statedump ++ ++# Delete foo from $M0. ++TEST unlink $M0/foo ++ ++# Send stat on foo from $M1 to force $M1 to "forget" inode associated with foo. ++# Now the ghost shards associated with "foo" are still in lru list of $M1. ++TEST ! stat $M1/foo ++ ++# Let's force the ghost shards of "foo" out of lru list by looking up more shards ++# through I/O on a file named "bar" from $M1. This should crash if the base inode ++# had been destroyed by now. 
++ ++TEST dd if=/dev/zero of=$M1/bar bs=1M count=104 ++ ++############################################### ++#### Now for some inode ref-leak tests ... #### ++############################################### ++ ++# Expect there to be 29 active inodes - 26 belonging to "bar", 1 for .shard, ++# 1 for .shard/remove_me and 1 for '/' ++EXPECT_WITHIN $SHARD_COUNT_TIME `expr 26 + 3` get_mount_active_size_value $V0 $M1 ++ ++TEST rm -f $M1/bar ++EXPECT_WITHIN $SHARD_COUNT_TIME 3 get_mount_active_size_value $V0 $M1 ++ ++EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 ++EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M1 ++ ++TEST $CLI volume stop $V0 ++TEST $CLI volume delete $V0 ++ ++cleanup +diff --git a/tests/bugs/shard/shard-inode-refcount-test.t b/tests/bugs/shard/shard-inode-refcount-test.t +index 087c8ba..3fd181b 100644 +--- a/tests/bugs/shard/shard-inode-refcount-test.t ++++ b/tests/bugs/shard/shard-inode-refcount-test.t +@@ -21,7 +21,7 @@ TEST dd if=/dev/zero conv=fsync of=$M0/one-plus-five-shards bs=1M count=23 + ACTIVE_INODES_BEFORE=$(get_mount_active_size_value $V0) + TEST rm -f $M0/one-plus-five-shards + # Expect 5 inodes less. But one inode more than before because .remove_me would be created. +-EXPECT_WITHIN $SHARD_COUNT_TIME `expr $ACTIVE_INODES_BEFORE - 5 + 1` get_mount_active_size_value $V0 ++EXPECT_WITHIN $SHARD_COUNT_TIME `expr $ACTIVE_INODES_BEFORE - 5 + 1` get_mount_active_size_value $V0 $M0 + + EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 + TEST $CLI volume stop $V0 +diff --git a/tests/volume.rc b/tests/volume.rc +index bba7e4e..6a983fd 100644 +--- a/tests/volume.rc ++++ b/tests/volume.rc +@@ -93,7 +93,8 @@ function remove_brick_status_completed_field { + + function get_mount_process_pid { + local vol=$1 +- ps auxww | grep glusterfs | grep -E "volfile-id[ =]/?$vol " | awk '{print $2}' | head -1 ++ local mnt=$2 ++ ps auxww | grep glusterfs | grep -E "volfile-id[ =]/?$vol .*$mnt" | awk '{print $2}' | head -1 + } + + function get_nfs_pid () +@@ -126,7 +127,8 @@ function generate_statedump { + + function generate_mount_statedump { + local vol=$1 +- generate_statedump $(get_mount_process_pid $vol) ++ local mnt=$2 ++ generate_statedump $(get_mount_process_pid $vol $mnt) + } + + function cleanup_mount_statedump { +@@ -850,7 +852,8 @@ function get_active_fd_count { + + function get_mount_active_size_value { + local vol=$1 +- local statedump=$(generate_mount_statedump $vol) ++ local mount=$2 ++ local statedump=$(generate_mount_statedump $vol $mount) + sleep 1 + local val=$(grep "active_size" $statedump | cut -f2 -d'=' | tail -1) + rm -f $statedump +@@ -859,7 +862,8 @@ function get_mount_active_size_value { + + function get_mount_lru_size_value { + local vol=$1 +- local statedump=$(generate_mount_statedump $vol) ++ local mount=$2 ++ local statedump=$(generate_mount_statedump $vol $mount) + sleep 1 + local val=$(grep "lru_size" $statedump | cut -f2 -d'=' | tail -1) + rm -f $statedump +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index eb32168..fb88315 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -651,7 +651,8 @@ out: + + inode_t * + __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this, +- inode_t *base_inode, int block_num) ++ inode_t *base_inode, int block_num, ++ uuid_t gfid) + { + char block_bname[256] = {0,}; + inode_t *lru_inode = NULL; +@@ -679,10 +680,13 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this, + inode_ref (linked_inode); + if 
(base_inode) + gf_uuid_copy (ctx->base_gfid, base_inode->gfid); ++ else ++ gf_uuid_copy(ctx->base_gfid, gfid); + ctx->block_num = block_num; + list_add_tail (&ctx->ilist, &priv->ilist_head); + priv->inode_count++; +- ctx->base_inode = base_inode; ++ if (base_inode) ++ ctx->base_inode = inode_ref (base_inode); + } else { + /*If on the other hand there is no available slot for this inode + * in the list, delete the lru inode from the head of the list, +@@ -701,6 +705,8 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this, + * deleted from fsync list and fsync'd in a new frame, + * and then unlinked in memory and forgotten. + */ ++ if (!lru_base_inode) ++ goto after_fsync_check; + LOCK (&lru_base_inode->lock); + LOCK (&lru_inode->lock); + { +@@ -715,6 +721,7 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this, + UNLOCK (&lru_inode->lock); + UNLOCK (&lru_base_inode->lock); + ++after_fsync_check: + if (!do_fsync) { + shard_make_block_bname (lru_inode_ctx->block_num, + lru_inode_ctx->base_gfid, +@@ -729,20 +736,31 @@ __shard_update_shards_inode_list (inode_t *linked_inode, xlator_t *this, + inode_forget (lru_inode, 0); + } else { + fsync_inode = lru_inode; +- inode_unref (lru_base_inode); ++ if (lru_base_inode) ++ inode_unref (lru_base_inode); + } + /* The following unref corresponds to the ref + * held by inode_find() above. + */ + inode_unref (lru_inode); ++ ++ /* The following unref corresponds to the ref held on ++ * the base shard at the time of adding shard inode to ++ * lru list ++ */ ++ if (lru_base_inode) ++ inode_unref (lru_base_inode); + /* For as long as an inode is in lru list, we try to + * keep it alive by holding a ref on it. + */ + inode_ref (linked_inode); + if (base_inode) + gf_uuid_copy (ctx->base_gfid, base_inode->gfid); ++ else ++ gf_uuid_copy (ctx->base_gfid, gfid); + ctx->block_num = block_num; +- ctx->base_inode = base_inode; ++ if (base_inode) ++ ctx->base_inode = inode_ref (base_inode); + list_add_tail (&ctx->ilist, &priv->ilist_head); + } + } else { +@@ -1027,7 +1045,7 @@ shard_common_resolve_shards (call_frame_t *frame, xlator_t *this, + fsync_inode = __shard_update_shards_inode_list (inode, + this, + res_inode, +- shard_idx_iter); ++ shard_idx_iter, gfid); + } + UNLOCK(&priv->lock); + shard_idx_iter++; +@@ -2173,7 +2191,8 @@ shard_link_block_inode (shard_local_t *local, int block_num, inode_t *inode, + fsync_inode = __shard_update_shards_inode_list (linked_inode, + this, + local->loc.inode, +- block_num); ++ block_num, ++ gfid); + } + UNLOCK(&priv->lock); + if (fsync_inode) +@@ -2881,6 +2900,7 @@ void + shard_unlink_block_inode (shard_local_t *local, int shard_block_num) + { + char block_bname[256] = {0,}; ++ int unref_base_inode = 0; + uuid_t gfid = {0,}; + inode_t *inode = NULL; + inode_t *base_inode = NULL; +@@ -2894,11 +2914,12 @@ shard_unlink_block_inode (shard_local_t *local, int shard_block_num) + priv = this->private; + + inode = local->inode_list[shard_block_num - local->first_block]; +- base_inode = local->resolver_base_inode; ++ shard_inode_ctx_get (inode, this, &ctx); ++ base_inode = ctx->base_inode; + if (base_inode) + gf_uuid_copy (gfid, base_inode->gfid); + else +- gf_uuid_copy (gfid, local->base_gfid); ++ gf_uuid_copy (gfid, ctx->base_gfid); + + shard_make_block_bname (shard_block_num, gfid, + block_bname, sizeof (block_bname)); +@@ -2912,17 +2933,16 @@ shard_unlink_block_inode (shard_local_t *local, int shard_block_num) + if (!list_empty (&ctx->ilist)) { + list_del_init (&ctx->ilist); + 
priv->inode_count--; ++ unref_base_inode++; + GF_ASSERT (priv->inode_count >= 0); + unlink_unref_forget = _gf_true; + } + if (ctx->fsync_needed) { +- if (base_inode) +- inode_unref (base_inode); ++ unref_base_inode++; + list_del_init (&ctx->to_fsync_list); +- if (base_inode) { ++ if (base_inode) + __shard_inode_ctx_get (base_inode, this, &base_ictx); +- base_ictx->fsync_count--; +- } ++ base_ictx->fsync_count--; + } + } + UNLOCK(&inode->lock); +@@ -2933,6 +2953,8 @@ shard_unlink_block_inode (shard_local_t *local, int shard_block_num) + inode_unref (inode); + inode_forget (inode, 0); + } ++ if (base_inode && unref_base_inode) ++ inode_ref_reduce_by_n (base_inode, unref_base_inode); + UNLOCK(&priv->lock); + } + +-- +1.8.3.1 + diff --git a/0444-features-shard-fix-formatting-warning.patch b/0444-features-shard-fix-formatting-warning.patch new file mode 100644 index 0000000..8e29c61 --- /dev/null +++ b/0444-features-shard-fix-formatting-warning.patch @@ -0,0 +1,31 @@ +From 6dbeac0371f3f2b42c0b428ba9f95b4ac3bc889d Mon Sep 17 00:00:00 2001 +From: Xavi Hernandez +Date: Thu, 8 Nov 2018 18:42:26 +0100 +Subject: [PATCH 444/444] features/shard: fix formatting warning + +BUG: 1603118 +Change-Id: I6191351f824901a45416ffe7610ad2b964645012 +Signed-off-by: Xavi Hernandez +Reviewed-on: https://code.engineering.redhat.com/gerrit/155395 +Tested-by: RHGS Build Bot +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya +--- + xlators/features/shard/src/shard.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c +index fb88315..5b72399 100644 +--- a/xlators/features/shard/src/shard.c ++++ b/xlators/features/shard/src/shard.c +@@ -6832,7 +6832,7 @@ shard_priv_dump (xlator_t *this) + gf_uint64_2human_readable (priv->block_size)); + gf_proc_dump_write ("inode-count", "%d", priv->inode_count); + gf_proc_dump_write ("ilist_head", "%p", &priv->ilist_head); +- gf_proc_dump_write ("lru-max-limit", "%d", priv->lru_limit); ++ gf_proc_dump_write ("lru-max-limit", "%" PRIu64, priv->lru_limit); + + return 0; + } +-- +1.8.3.1 + diff --git a/glusterfs.spec b/glusterfs.spec index c89d438..861089b 100644 --- a/glusterfs.spec +++ b/glusterfs.spec @@ -192,7 +192,7 @@ Release: 0.1%{?prereltag:.%{prereltag}}%{?dist} %else Name: glusterfs Version: 3.12.2 -Release: 25%{?dist} +Release: 26%{?dist} %endif License: GPLv2 or LGPLv3+ Group: System Environment/Base @@ -672,6 +672,43 @@ Patch0404: 0404-core-Resolve-some-warnings-to-release-a-build.patch Patch0405: 0405-glusterfsd-add-missing-UNLOCK.patch Patch0406: 0406-glusterd-improve-logging-for-stage_deleted-flag.patch Patch0407: 0407-spec-update-RHGS-version-for-RHGSWA.patch +# Patch0408: 0408-Update-rfc.sh-to-rhgs-3.4.2.patch +Patch0409: 0409-Update-database-profile-group.patch +Patch0410: 0410-cli-fix-glusterd-memory-leak-cause-by-gluster-v-stat.patch +Patch0411: 0411-glusterd-ensure-volinfo-caps-is-set-to-correct-value.patch +Patch0412: 0412-glusterd-set-fsid-while-performing-replace-brick.patch +Patch0413: 0413-glusterfind-add-logs-to-identify-parsing-phases.patch +Patch0414: 0414-logrotate-utilize-the-new-maxsize-option.patch +Patch0415: 0415-statedump-fix-clang-null-dereference-error.patch +Patch0416: 0416-glusterd-ignore-RPC-events-when-glusterd-is-shutting.patch +Patch0417: 0417-cli-Add-warning-message-while-converting-to-replica-.patch +Patch0418: 0418-cli-correct-rebalance-status-elapsed-check.patch +Patch0419: 0419-glusterfs-During-reconfigure-set-log-level-per-xlato.patch +Patch0420: 
0420-Modify-log-message-DH-ciphers-are-disabled-from-ERRO.patch +Patch0421: 0421-rpc-handle-EAGAIN-when-SSL_ERROR_SYSCALL-is-returned.patch +Patch0422: 0422-glusterd-raise-default-transport.listen-backlog.patch +Patch0423: 0423-glusterd-acquire-lock-to-update-volinfo-structure.patch +Patch0424: 0424-cluster-afr-Delegate-metadata-heal-with-pending-xatt.patch +Patch0425: 0425-cluster-afr-Delegate-name-heal-when-possible.patch +Patch0426: 0426-features-shard-Make-operations-on-internal-directori.patch +Patch0427: 0427-features-shard-Add-option-to-barrier-parallel-lookup.patch +Patch0428: 0428-libglusterfs-syncop-Handle-barrier_-init-destroy-in-.patch +Patch0429: 0429-features-shard-Introducing-.shard-.remove_me-for-ato.patch +Patch0430: 0430-features-shard-Perform-shards-deletion-in-the-backgr.patch +Patch0431: 0431-glusterd-Reset-op-version-for-features.shard-deletio.patch +Patch0432: 0432-features-shard-Fix-crash-and-test-case-in-RENAME-fop.patch +Patch0433: 0433-mgmt-glusterd-use-proper-path-to-the-volfile.patch +Patch0434: 0434-cluster-afr-s-uuid_is_null-gf_uuid_is_null.patch +Patch0435: 0435-geo-rep-Fix-traceback-with-symlink-metadata-sync.patch +Patch0436: 0436-geo-rep-Fix-issue-in-gfid-conflict-resolution.patch +Patch0437: 0437-geo-rep-Add-more-intelligence-to-automatic-error-han.patch +Patch0438: 0438-cluster-dht-In-rename-unlink-after-creating-linkto-f.patch +Patch0439: 0439-cluster-dht-fixes-to-unlinking-invalid-linkto-file.patch +Patch0440: 0440-features-locks-Use-pthread_mutex_unlock-instead-of-p.patch +Patch0441: 0441-features-shard-Make-lru-limit-of-inode-list-configur.patch +Patch0442: 0442-glusterd-Reset-op-version-for-features.shard-lru-lim.patch +Patch0443: 0443-features-shard-Hold-a-ref-on-base-inode-when-adding-.patch +Patch0444: 0444-features-shard-fix-formatting-warning.patch %description GlusterFS is a distributed file-system capable of scaling to several @@ -2620,6 +2657,12 @@ fi %endif %changelog +* Fri Nov 09 2018 Milind Changire - 3.12.2-26 +- fixes bugs bz#1479446 bz#1520882 bz#1579758 bz#1598407 bz#1599808 + bz#1603118 bz#1619357 bz#1622001 bz#1622308 bz#1631166 bz#1631418 bz#1632563 + bz#1634649 bz#1635071 bz#1635100 bz#1635136 bz#1636291 bz#1638069 bz#1640347 + bz#1642854 bz#1643035 bz#1644120 bz#1644279 bz#1645916 bz#1647675 + * Thu Oct 25 2018 Milind Changire - 3.12.2-25 - fixes bugs bz#1641586