6adc3cae7f
Resolves: bz#1488120 bz#1565577 bz#1568297 bz#1570586 bz#1572043 Resolves: bz#1572075 bz#1575840 bz#1575877 Signed-off-by: Milind Changire <mchangir@redhat.com>
1202 lines
51 KiB
Diff
1202 lines
51 KiB
Diff
From 3b37bb9892cd89169d8b4bd308cdca2543fee08c Mon Sep 17 00:00:00 2001
|
|
From: Raghavendra G <rgowdapp@redhat.com>
|
|
Date: Thu, 8 Feb 2018 17:12:41 +0530
|
|
Subject: [PATCH 264/271] cluster/dht: fixes to parallel renames to same
|
|
destination codepath
|
|
|
|
Test case:
|
|
# while true; do uuid="`uuidgen`"; echo "some data" > "test$uuid"; mv
|
|
"test$uuid" "test" -f || break; echo "done:$uuid"; done
|
|
|
|
This script was run in parallel from multiple mountpoints
|
|
|
|
Along the course of getting the above usecase working, many issues
|
|
were found:
|
|
|
|
Issue 1:
|
|
=======
|
|
consider a case of rename (src, dst). We can encounter a situation
|
|
where,
|
|
* dst is a file present at the time of lookup
|
|
* dst is removed by the time rename fop reaches glusterfs
|
|
|
|
In this scenario, acquring inodelk on dst fails with ESTALE resulting
|
|
in failure of rename. However, as per POSIX irrespective of whether
|
|
dst is present or not, rename should be successful. Acquiring entrylk
|
|
provides synchronization even in races like this.
|
|
|
|
Algorithm:
|
|
1. Take inodelks on src and dst (if dst is present) on respective
|
|
cached subvols. These inodelks are done to preserve backward
|
|
compatibility with older clients, so that synchronization is
|
|
preserved when a volume is mounted by clients of different
|
|
versions. Once relevant older versions (3.10, 3.12, 3.13) reach
|
|
EOL, this code can be removed.
|
|
2. Ignore ENOENT/ESTALE errors of inodelk on dst.
|
|
3. protect namespace of src and dst. To protect namespace of a file,
|
|
take inodelk on parent on hashed subvol, then take entrylk on the
|
|
same subvol on parent with basename of file. inodelk on parent is
|
|
done to guard against changes to parent layout so that hashed
|
|
subvol won't change during rename.
|
|
4. <rest of rename continues>
|
|
5. unlock all locks
|
|
|
|
Issue 2:
|
|
========
|
|
linkfile creation in lookup codepath can race with a rename. Imagine
|
|
the following scenario:
|
|
* lookup finds a data-file with gfid - gfid-dst - without a
|
|
corresponding linkto file on hashed-subvol. It decides to create
|
|
linkto file with gfid - gfid-dst.
|
|
- Note that some codepaths of dht-rename deletes linkto file of
|
|
dst as first step. So, a lookup racing with an in-progress
|
|
rename can easily run into this situation.
|
|
* a rename (src-path:gfid-src, dst-path:gfid-dst) renames data-file
|
|
and hence gfid of data-file changes to gfid-src with path dst-path.
|
|
* lookup proceeds and creates linkto file - dst-path - with gfid -
|
|
dst-gfid - on hashed-subvol.
|
|
* rename tries to create a linkto file dst-path with src-gfid on
|
|
hashed-subvol, but it fails with EEXIST. But EEXIST is ignored
|
|
during linkto file creation.
|
|
|
|
Now we've ended with dst-path having different gfids - dst-gfid on
|
|
linkto file and src-gfid on data file. Future lookups on dst-path will
|
|
always fail with ESTALE, due to differing gfids.
|
|
|
|
The fix is to synchronize linkfile creation in lookup path with rename
|
|
using the same mechanism of protecting namespace explained in solution
|
|
of Issue 1. Once locks are acquired, before proceeding with linkfile
|
|
creation, we check whether conditions for linkto file creation are
|
|
still valid. If not, we skip linkto file creation.
|
|
|
|
Issue 3:
|
|
========
|
|
gfid of dst-path can change by the time locks are acquired. This
|
|
means, either another rename overwrote dst-path or dst-path was
|
|
deleted and recreated by a different client. When this happens,
|
|
cached-subvol for dst can change. If rename proceeds with old-gfid and
|
|
old-cached subvol, we'll end up in inconsistent state(s) like dst-path
|
|
with different gfids on different subvols, more than one data-file
|
|
being present etc.
|
|
|
|
Fix is to do the lookup with a new inode after protecting namespace of
|
|
dst. Post lookup, we've to compare gfids and correct local state
|
|
appropriately to be in sync with backend.
|
|
|
|
Issue 4:
|
|
========
|
|
During revalidate lookup, if following a linkto file doesn't lead to a
|
|
valid data-file, local->cached-subvol was not reset to NULL. This
|
|
means we would be operating on a stale state which can lead to
|
|
inconsistency. As a fix, reset it to NULL before proceeding with
|
|
lookup everywhere.
|
|
|
|
Issue 5:
|
|
========
|
|
Stale dentries left out in inode table on brick resulted in failures
|
|
of link fop even though the file/dentry didn't exist on backend fs. A
|
|
patch is submitted to fix this issue. Please check the dependency tree
|
|
of current patch on gerrit for details
|
|
|
|
In short, we fix the problem by not blindly trusting the
|
|
inode-table. Instead we validate whether dentry is present by doing
|
|
lookup on backend fs.
|
|
|
|
>Change-Id: I832e5c47d232f90c4edb1fafc512bf19bebde165
|
|
>updates: bz#1543279
|
|
>BUG: 1543279
|
|
>Signed-off-by: Raghavendra G <rgowdapp@redhat.com>
|
|
|
|
upstream patch: https://review.gluster.org/19547/
|
|
Change-Id: Ief74bd920e807e88eef3f5cf33ba0bf2f0f248f6
|
|
BUG: 1488120
|
|
Signed-off-by: Raghavendra G <rgowdapp@redhat.com>
|
|
Reviewed-on: https://code.engineering.redhat.com/gerrit/138154
|
|
Tested-by: RHGS Build Bot <nigelb@redhat.com>
|
|
Reviewed-by: Nithya Balachandran <nbalacha@redhat.com>
|
|
---
|
|
tests/bugs/distribute/bug-1543279.t | 65 ++++++
|
|
tests/include.rc | 3 +-
|
|
xlators/cluster/dht/src/dht-common.c | 175 ++++++++++++++--
|
|
xlators/cluster/dht/src/dht-common.h | 10 +-
|
|
xlators/cluster/dht/src/dht-helper.c | 1 +
|
|
xlators/cluster/dht/src/dht-lock.c | 29 ++-
|
|
xlators/cluster/dht/src/dht-rebalance.c | 63 +++++-
|
|
xlators/cluster/dht/src/dht-rename.c | 361 +++++++++++++++++++++++++++-----
|
|
8 files changed, 625 insertions(+), 82 deletions(-)
|
|
create mode 100644 tests/bugs/distribute/bug-1543279.t
|
|
|
|
diff --git a/tests/bugs/distribute/bug-1543279.t b/tests/bugs/distribute/bug-1543279.t
|
|
new file mode 100644
|
|
index 0000000..67cc0f5
|
|
--- /dev/null
|
|
+++ b/tests/bugs/distribute/bug-1543279.t
|
|
@@ -0,0 +1,65 @@
|
|
+#!/bin/bash
|
|
+
|
|
+. $(dirname $0)/../../include.rc
|
|
+. $(dirname $0)/../../volume.rc
|
|
+. $(dirname $0)/../../dht.rc
|
|
+
|
|
+TESTS_EXPECTED_IN_LOOP=44
|
|
+SCRIPT_TIMEOUT=600
|
|
+
|
|
+rename_files() {
|
|
+ MOUNT=$1
|
|
+ ITERATIONS=$2
|
|
+ for i in $(seq 1 $ITERATIONS); do uuid="`uuidgen`"; echo "some data" > $MOUNT/test$uuid; mv $MOUNT/test$uuid $MOUNT/test -f || return $?; done
|
|
+}
|
|
+
|
|
+run_test_for_volume() {
|
|
+ VOLUME=$1
|
|
+ ITERATIONS=$2
|
|
+ TEST_IN_LOOP $CLI volume start $VOLUME
|
|
+
|
|
+ TEST_IN_LOOP glusterfs -s $H0 --volfile-id $VOLUME $M0
|
|
+ TEST_IN_LOOP glusterfs -s $H0 --volfile-id $VOLUME $M1
|
|
+ TEST_IN_LOOP glusterfs -s $H0 --volfile-id $VOLUME $M2
|
|
+ TEST_IN_LOOP glusterfs -s $H0 --volfile-id $VOLUME $M3
|
|
+
|
|
+ rename_files $M0 $ITERATIONS &
|
|
+ M0_RENAME_PID=$!
|
|
+
|
|
+ rename_files $M1 $ITERATIONS &
|
|
+ M1_RENAME_PID=$!
|
|
+
|
|
+ rename_files $M2 $ITERATIONS &
|
|
+ M2_RENAME_PID=$!
|
|
+
|
|
+ rename_files $M3 $ITERATIONS &
|
|
+ M3_RENAME_PID=$!
|
|
+
|
|
+ TEST_IN_LOOP wait $M0_RENAME_PID
|
|
+ TEST_IN_LOOP wait $M1_RENAME_PID
|
|
+ TEST_IN_LOOP wait $M2_RENAME_PID
|
|
+ TEST_IN_LOOP wait $M3_RENAME_PID
|
|
+
|
|
+ TEST_IN_LOOP $CLI volume stop $VOLUME
|
|
+ TEST_IN_LOOP $CLI volume delete $VOLUME
|
|
+ umount $M0 $M1 $M2 $M3
|
|
+}
|
|
+
|
|
+cleanup
|
|
+
|
|
+TEST glusterd
|
|
+TEST pidof glusterd
|
|
+
|
|
+TEST $CLI volume create $V0 $H0:$B0/${V0}{0..8} force
|
|
+run_test_for_volume $V0 200
|
|
+
|
|
+TEST $CLI volume create $V0 replica 3 arbiter 1 $H0:$B0/${V0}{0..8} force
|
|
+run_test_for_volume $V0 200
|
|
+
|
|
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0..8} force
|
|
+run_test_for_volume $V0 200
|
|
+
|
|
+TEST $CLI volume create $V0 disperse 6 redundancy 2 $H0:$B0/${V0}{0..5} force
|
|
+run_test_for_volume $V0 200
|
|
+
|
|
+cleanup
|
|
diff --git a/tests/include.rc b/tests/include.rc
|
|
index 45392e0..aca4c4a 100644
|
|
--- a/tests/include.rc
|
|
+++ b/tests/include.rc
|
|
@@ -1,6 +1,7 @@
|
|
M0=${M0:=/mnt/glusterfs/0}; # 0th mount point for FUSE
|
|
M1=${M1:=/mnt/glusterfs/1}; # 1st mount point for FUSE
|
|
M2=${M2:=/mnt/glusterfs/2}; # 2nd mount point for FUSE
|
|
+M3=${M3:=/mnt/glusterfs/3}; # 3rd mount point for FUSE
|
|
N0=${N0:=/mnt/nfs/0}; # 0th mount point for NFS
|
|
N1=${N1:=/mnt/nfs/1}; # 1st mount point for NFS
|
|
V0=${V0:=patchy}; # volume name to use in tests
|
|
@@ -8,7 +9,7 @@ V1=${V1:=patchy1}; # volume name to use in tests
|
|
GMV0=${GMV0:=master}; # master volume name to use in geo-rep tests
|
|
GSV0=${GSV0:=slave}; # slave volume name to use in geo-rep tests
|
|
B0=${B0:=/d/backends}; # top level of brick directories
|
|
-WORKDIRS="$B0 $M0 $M1 $M2 $N0 $N1"
|
|
+WORKDIRS="$B0 $M0 $M1 $M2 $M3 $N0 $N1"
|
|
|
|
ROOT_GFID="00000000-0000-0000-0000-000000000001"
|
|
DOT_SHARD_GFID="be318638-e8a0-4c6d-977d-7a937aa84806"
|
|
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
|
|
index 5b2c897..ec1628a 100644
|
|
--- a/xlators/cluster/dht/src/dht-common.c
|
|
+++ b/xlators/cluster/dht/src/dht-common.c
|
|
@@ -1931,7 +1931,6 @@ dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie,
|
|
GF_VALIDATE_OR_GOTO ("dht", this, out);
|
|
GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
|
|
GF_VALIDATE_OR_GOTO ("dht", this->private, out);
|
|
- GF_VALIDATE_OR_GOTO ("dht", cookie, out);
|
|
|
|
local = frame->local;
|
|
cached_subvol = local->cached_subvol;
|
|
@@ -1939,6 +1938,9 @@ dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie,
|
|
|
|
gf_uuid_unparse(local->loc.gfid, gfid);
|
|
|
|
+ if (local->locked)
|
|
+ dht_unlock_namespace (frame, &local->lock[0]);
|
|
+
|
|
ret = dht_layout_preset (this, local->cached_subvol, local->loc.inode);
|
|
if (ret < 0) {
|
|
gf_msg_debug (this->name, EINVAL,
|
|
@@ -1962,6 +1964,7 @@ dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie,
|
|
postparent, 1);
|
|
}
|
|
|
|
+
|
|
unwind:
|
|
gf_msg_debug (this->name, 0,
|
|
"creation of linkto on hashed subvol:%s, "
|
|
@@ -2133,6 +2136,134 @@ err:
|
|
return -1;
|
|
|
|
}
|
|
+
|
|
+int32_t
|
|
+dht_linkfile_create_lookup_cbk (call_frame_t *frame, void *cookie,
|
|
+ xlator_t *this, int32_t op_ret,
|
|
+ int32_t op_errno, inode_t *inode,
|
|
+ struct iatt *buf, dict_t *xdata,
|
|
+ struct iatt *postparent)
|
|
+{
|
|
+ dht_local_t *local = NULL;
|
|
+ int call_cnt = 0, ret = 0;
|
|
+ xlator_t *subvol = NULL;
|
|
+ uuid_t gfid = {0, };
|
|
+ char gfid_str[GF_UUID_BUF_SIZE] = {0};
|
|
+
|
|
+ subvol = cookie;
|
|
+ local = frame->local;
|
|
+
|
|
+ if (subvol == local->hashed_subvol) {
|
|
+ if ((op_ret == 0) || (op_errno != ENOENT))
|
|
+ local->dont_create_linkto = _gf_true;
|
|
+ } else {
|
|
+ if (gf_uuid_is_null (local->gfid))
|
|
+ gf_uuid_copy (gfid, local->loc.gfid);
|
|
+ else
|
|
+ gf_uuid_copy (gfid, local->gfid);
|
|
+
|
|
+ if ((op_ret == 0) && gf_uuid_compare (gfid, buf->ia_gfid)) {
|
|
+ gf_uuid_unparse (gfid, gfid_str);
|
|
+ gf_msg_debug (this->name, 0,
|
|
+ "gfid (%s) different on cached subvol "
|
|
+ "(%s) and looked up inode (%s), not "
|
|
+ "creating linkto",
|
|
+ uuid_utoa (buf->ia_gfid), subvol->name,
|
|
+ gfid_str);
|
|
+ local->dont_create_linkto = _gf_true;
|
|
+ } else if (op_ret == -1) {
|
|
+ local->dont_create_linkto = _gf_true;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ call_cnt = dht_frame_return (frame);
|
|
+ if (is_last_call (call_cnt)) {
|
|
+ if (local->dont_create_linkto)
|
|
+ goto no_linkto;
|
|
+ else {
|
|
+ gf_msg_debug (this->name, 0,
|
|
+ "Creating linkto file on %s(hash) to "
|
|
+ "%s on %s (gfid = %s)",
|
|
+ local->hashed_subvol->name,
|
|
+ local->loc.path,
|
|
+ local->cached_subvol->name, gfid);
|
|
+
|
|
+ ret = dht_linkfile_create
|
|
+ (frame, dht_lookup_linkfile_create_cbk,
|
|
+ this, local->cached_subvol,
|
|
+ local->hashed_subvol, &local->loc);
|
|
+
|
|
+ if (ret < 0)
|
|
+ goto no_linkto;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+
|
|
+no_linkto:
|
|
+ gf_msg_debug (this->name, 0,
|
|
+ "skipped linkto creation (path:%s) (gfid:%s) "
|
|
+ "(hashed-subvol:%s) (cached-subvol:%s)",
|
|
+ local->loc.path, gfid_str, local->hashed_subvol->name,
|
|
+ local->cached_subvol->name);
|
|
+
|
|
+ dht_lookup_linkfile_create_cbk (frame, NULL, this, 0, 0,
|
|
+ local->loc.inode, &local->stbuf,
|
|
+ &local->preparent, &local->postparent,
|
|
+ local->xattr);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+
|
|
+int32_t
|
|
+dht_call_lookup_linkfile_create (call_frame_t *frame, void *cookie,
|
|
+ xlator_t *this, int32_t op_ret,
|
|
+ int32_t op_errno, dict_t *xdata)
|
|
+{
|
|
+ dht_local_t *local = NULL;
|
|
+ char gfid[GF_UUID_BUF_SIZE] = {0};
|
|
+ int i = 0;
|
|
+ xlator_t *subvol = NULL;
|
|
+
|
|
+ local = frame->local;
|
|
+ if (gf_uuid_is_null (local->gfid))
|
|
+ gf_uuid_unparse (local->loc.gfid, gfid);
|
|
+ else
|
|
+ gf_uuid_unparse (local->gfid, gfid);
|
|
+
|
|
+ if (op_ret < 0) {
|
|
+ gf_log (this->name, GF_LOG_WARNING,
|
|
+ "protecting namespace failed, skipping linkto "
|
|
+ "creation (path:%s)(gfid:%s)(hashed-subvol:%s)"
|
|
+ "(cached-subvol:%s)", local->loc.path, gfid,
|
|
+ local->hashed_subvol->name, local->cached_subvol->name);
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ local->locked = _gf_true;
|
|
+
|
|
+
|
|
+ local->call_cnt = 2;
|
|
+
|
|
+ for (i = 0; i < 2; i++) {
|
|
+ subvol = (subvol == NULL) ? local->hashed_subvol
|
|
+ : local->cached_subvol;
|
|
+
|
|
+ STACK_WIND_COOKIE (frame, dht_linkfile_create_lookup_cbk,
|
|
+ subvol, subvol, subvol->fops->lookup,
|
|
+ &local->loc, NULL);
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+
|
|
+err:
|
|
+ dht_lookup_linkfile_create_cbk (frame, NULL, this, 0, 0,
|
|
+ local->loc.inode,
|
|
+ &local->stbuf, &local->preparent,
|
|
+ &local->postparent, local->xattr);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
/* Rebalance is performed from cached_node to hashed_node. Initial cached_node
|
|
* contains a non-linkto file. After migration it is converted to linkto and
|
|
* then unlinked. And at hashed_subvolume, first a linkto file is present,
|
|
@@ -2176,12 +2307,12 @@ err:
|
|
int
|
|
dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this)
|
|
{
|
|
- int ret = 0;
|
|
- dht_local_t *local = NULL;
|
|
- xlator_t *hashed_subvol = NULL;
|
|
- xlator_t *cached_subvol = NULL;
|
|
- dht_layout_t *layout = NULL;
|
|
- char gfid[GF_UUID_BUF_SIZE] = {0};
|
|
+ int ret = 0;
|
|
+ dht_local_t *local = NULL;
|
|
+ xlator_t *hashed_subvol = NULL;
|
|
+ xlator_t *cached_subvol = NULL;
|
|
+ dht_layout_t *layout = NULL;
|
|
+ char gfid[GF_UUID_BUF_SIZE] = {0};
|
|
gf_boolean_t found_non_linkto_on_hashed = _gf_false;
|
|
|
|
local = frame->local;
|
|
@@ -2273,8 +2404,8 @@ dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this)
|
|
"unlink on hashed is not skipped %s",
|
|
local->loc.path);
|
|
|
|
- DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL, NULL,
|
|
- NULL, NULL);
|
|
+ DHT_STACK_UNWIND (lookup, frame, -1, ENOENT,
|
|
+ NULL, NULL, NULL, NULL);
|
|
}
|
|
return 0;
|
|
}
|
|
@@ -2490,14 +2621,23 @@ preset_layout:
|
|
return 0;
|
|
}
|
|
|
|
- gf_msg_debug (this->name, 0,
|
|
- "Creating linkto file on %s(hash) to %s on %s (gfid = %s)",
|
|
- hashed_subvol->name, local->loc.path,
|
|
- cached_subvol->name, gfid);
|
|
+ if (frame->root->op != GF_FOP_RENAME) {
|
|
+ local->current = &local->lock[0];
|
|
+ ret = dht_protect_namespace (frame, &local->loc, hashed_subvol,
|
|
+ &local->current->ns,
|
|
+ dht_call_lookup_linkfile_create);
|
|
+ } else {
|
|
+ gf_msg_debug (this->name, 0,
|
|
+ "Creating linkto file on %s(hash) to %s on %s "
|
|
+ "(gfid = %s)",
|
|
+ hashed_subvol->name, local->loc.path,
|
|
+ cached_subvol->name, gfid);
|
|
|
|
- ret = dht_linkfile_create (frame,
|
|
- dht_lookup_linkfile_create_cbk, this,
|
|
- cached_subvol, hashed_subvol, &local->loc);
|
|
+ ret = dht_linkfile_create (frame,
|
|
+ dht_lookup_linkfile_create_cbk, this,
|
|
+ cached_subvol, hashed_subvol,
|
|
+ &local->loc);
|
|
+ }
|
|
|
|
return ret;
|
|
|
|
@@ -2800,6 +2940,7 @@ dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie,
|
|
removed, which can take away the namespace, and subvol is
|
|
anyways down. */
|
|
|
|
+ local->cached_subvol = NULL;
|
|
if (op_errno != ENOTCONN)
|
|
goto err;
|
|
else
|
|
@@ -8175,7 +8316,7 @@ out:
|
|
|
|
int
|
|
dht_build_parent_loc (xlator_t *this, loc_t *parent, loc_t *child,
|
|
- int32_t *op_errno)
|
|
+ int32_t *op_errno)
|
|
{
|
|
inode_table_t *table = NULL;
|
|
int ret = -1;
|
|
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
|
|
index fbc1e29..10b7c7e 100644
|
|
--- a/xlators/cluster/dht/src/dht-common.h
|
|
+++ b/xlators/cluster/dht/src/dht-common.h
|
|
@@ -175,7 +175,8 @@ typedef enum {
|
|
typedef enum {
|
|
REACTION_INVALID,
|
|
FAIL_ON_ANY_ERROR,
|
|
- IGNORE_ENOENT_ESTALE
|
|
+ IGNORE_ENOENT_ESTALE,
|
|
+ IGNORE_ENOENT_ESTALE_EIO,
|
|
} dht_reaction_type_t;
|
|
|
|
struct dht_skip_linkto_unlink {
|
|
@@ -367,6 +368,10 @@ struct dht_local {
|
|
|
|
dht_dir_transaction_t lock[2], *current;
|
|
|
|
+ /* inodelks during filerename for backward compatibility */
|
|
+ dht_lock_t **rename_inodelk_backward_compatible;
|
|
+ int rename_inodelk_bc_count;
|
|
+
|
|
short lock_type;
|
|
|
|
call_stub_t *stub;
|
|
@@ -385,6 +390,9 @@ struct dht_local {
|
|
int32_t valid;
|
|
gf_boolean_t heal_layout;
|
|
int32_t mds_heal_fresh_lookup;
|
|
+ loc_t loc2_copy;
|
|
+ gf_boolean_t locked;
|
|
+ gf_boolean_t dont_create_linkto;
|
|
};
|
|
typedef struct dht_local dht_local_t;
|
|
|
|
diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c
|
|
index 6e20aea..09ca966 100644
|
|
--- a/xlators/cluster/dht/src/dht-helper.c
|
|
+++ b/xlators/cluster/dht/src/dht-helper.c
|
|
@@ -735,6 +735,7 @@ dht_local_wipe (xlator_t *this, dht_local_t *local)
|
|
|
|
loc_wipe (&local->loc);
|
|
loc_wipe (&local->loc2);
|
|
+ loc_wipe (&local->loc2_copy);
|
|
|
|
if (local->xattr)
|
|
dict_unref (local->xattr);
|
|
diff --git a/xlators/cluster/dht/src/dht-lock.c b/xlators/cluster/dht/src/dht-lock.c
|
|
index 3e82c98..3f389ea 100644
|
|
--- a/xlators/cluster/dht/src/dht-lock.c
|
|
+++ b/xlators/cluster/dht/src/dht-lock.c
|
|
@@ -1015,10 +1015,11 @@ static int32_t
|
|
dht_blocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
|
int32_t op_ret, int32_t op_errno, dict_t *xdata)
|
|
{
|
|
- int lk_index = 0;
|
|
- int i = 0;
|
|
- dht_local_t *local = NULL;
|
|
- char gfid[GF_UUID_BUF_SIZE] = {0,};
|
|
+ int lk_index = 0;
|
|
+ int i = 0;
|
|
+ dht_local_t *local = NULL;
|
|
+ char gfid[GF_UUID_BUF_SIZE] = {0,};
|
|
+ dht_reaction_type_t reaction = 0;
|
|
|
|
lk_index = (long) cookie;
|
|
|
|
@@ -1029,8 +1030,9 @@ dht_blocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
|
switch (op_errno) {
|
|
case ESTALE:
|
|
case ENOENT:
|
|
- if (local->lock[0].layout.my_layout.locks[lk_index]->do_on_failure
|
|
- != IGNORE_ENOENT_ESTALE) {
|
|
+ reaction = local->lock[0].layout.my_layout.locks[lk_index]->do_on_failure;
|
|
+ if ((reaction != IGNORE_ENOENT_ESTALE) &&
|
|
+ (reaction != IGNORE_ENOENT_ESTALE_EIO)) {
|
|
gf_uuid_unparse (local->lock[0].layout.my_layout.locks[lk_index]->loc.gfid, gfid);
|
|
local->lock[0].layout.my_layout.op_ret = -1;
|
|
local->lock[0].layout.my_layout.op_errno = op_errno;
|
|
@@ -1042,6 +1044,21 @@ dht_blocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
|
goto cleanup;
|
|
}
|
|
break;
|
|
+ case EIO:
|
|
+ reaction = local->lock[0].layout.my_layout.locks[lk_index]->do_on_failure;
|
|
+ if (reaction != IGNORE_ENOENT_ESTALE_EIO) {
|
|
+ gf_uuid_unparse (local->lock[0].layout.my_layout.locks[lk_index]->loc.gfid, gfid);
|
|
+ local->lock[0].layout.my_layout.op_ret = -1;
|
|
+ local->lock[0].layout.my_layout.op_errno = op_errno;
|
|
+ gf_msg (this->name, GF_LOG_ERROR, op_errno,
|
|
+ DHT_MSG_INODELK_FAILED,
|
|
+ "inodelk failed on subvol %s. gfid:%s",
|
|
+ local->lock[0].layout.my_layout.locks[lk_index]->xl->name,
|
|
+ gfid);
|
|
+ goto cleanup;
|
|
+ }
|
|
+ break;
|
|
+
|
|
default:
|
|
gf_uuid_unparse (local->lock[0].layout.my_layout.locks[lk_index]->loc.gfid, gfid);
|
|
local->lock[0].layout.my_layout.op_ret = -1;
|
|
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
|
|
index 51af11c..f03931f 100644
|
|
--- a/xlators/cluster/dht/src/dht-rebalance.c
|
|
+++ b/xlators/cluster/dht/src/dht-rebalance.c
|
|
@@ -1470,7 +1470,9 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
|
|
struct gf_flock flock = {0, };
|
|
struct gf_flock plock = {0, };
|
|
loc_t tmp_loc = {0, };
|
|
- gf_boolean_t locked = _gf_false;
|
|
+ loc_t parent_loc = {0, };
|
|
+ gf_boolean_t inodelk_locked = _gf_false;
|
|
+ gf_boolean_t entrylk_locked = _gf_false;
|
|
gf_boolean_t p_locked = _gf_false;
|
|
int lk_ret = -1;
|
|
gf_defrag_info_t *defrag = NULL;
|
|
@@ -1484,6 +1486,7 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
|
|
gf_boolean_t target_changed = _gf_false;
|
|
xlator_t *new_target = NULL;
|
|
xlator_t *old_target = NULL;
|
|
+ xlator_t *hashed_subvol = NULL;
|
|
fd_t *linkto_fd = NULL;
|
|
|
|
|
|
@@ -1552,6 +1555,28 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
|
|
" for file: %s", loc->path);
|
|
}
|
|
|
|
+ ret = dht_build_parent_loc (this, &parent_loc, loc, fop_errno);
|
|
+ if (ret < 0) {
|
|
+ ret = -1;
|
|
+ gf_msg (this->name, GF_LOG_WARNING, *fop_errno,
|
|
+ DHT_MSG_MIGRATE_FILE_FAILED,
|
|
+ "%s: failed to build parent loc, which is needed to "
|
|
+ "acquire entrylk to synchronize with renames on this "
|
|
+ "path. Skipping migration", loc->path);
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ hashed_subvol = dht_subvol_get_hashed (this, loc);
|
|
+ if (hashed_subvol == NULL) {
|
|
+ ret = -1;
|
|
+ gf_msg (this->name, GF_LOG_WARNING, EINVAL,
|
|
+ DHT_MSG_MIGRATE_FILE_FAILED,
|
|
+ "%s: cannot find hashed subvol which is needed to "
|
|
+ "synchronize with renames on this path. "
|
|
+ "Skipping migration", loc->path);
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
flock.l_type = F_WRLCK;
|
|
|
|
tmp_loc.inode = inode_ref (loc->inode);
|
|
@@ -1576,7 +1601,26 @@ dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
|
|
goto out;
|
|
}
|
|
|
|
- locked = _gf_true;
|
|
+ inodelk_locked = _gf_true;
|
|
+
|
|
+ /* dht_rename has changed to use entrylk on hashed subvol for
|
|
+ * synchronization. So, rebalance too has to acquire an entrylk on
|
|
+ * hashed subvol.
|
|
+ */
|
|
+ ret = syncop_entrylk (hashed_subvol, DHT_ENTRY_SYNC_DOMAIN, &parent_loc,
|
|
+ loc->name, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL,
|
|
+ NULL);
|
|
+ if (ret < 0) {
|
|
+ *fop_errno = -ret;
|
|
+ ret = -1;
|
|
+ gf_msg (this->name, GF_LOG_WARNING, *fop_errno,
|
|
+ DHT_MSG_MIGRATE_FILE_FAILED,
|
|
+ "%s: failed to acquire entrylk on subvol %s",
|
|
+ loc->path, hashed_subvol->name);
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ entrylk_locked = _gf_true;
|
|
|
|
/* Phase 1 - Data migration is in progress from now on */
|
|
ret = syncop_lookup (from, loc, &stbuf, NULL, dict, &xattr_rsp);
|
|
@@ -2231,7 +2275,7 @@ out:
|
|
}
|
|
}
|
|
|
|
- if (locked) {
|
|
+ if (inodelk_locked) {
|
|
flock.l_type = F_UNLCK;
|
|
|
|
lk_ret = syncop_inodelk (from, DHT_FILE_MIGRATE_DOMAIN,
|
|
@@ -2244,6 +2288,18 @@ out:
|
|
}
|
|
}
|
|
|
|
+ if (entrylk_locked) {
|
|
+ lk_ret = syncop_entrylk (hashed_subvol, DHT_ENTRY_SYNC_DOMAIN,
|
|
+ &parent_loc, loc->name, ENTRYLK_UNLOCK,
|
|
+ ENTRYLK_UNLOCK, NULL, NULL);
|
|
+ if (lk_ret < 0) {
|
|
+ gf_msg (this->name, GF_LOG_WARNING, -lk_ret,
|
|
+ DHT_MSG_MIGRATE_FILE_FAILED,
|
|
+ "%s: failed to unlock entrylk on %s",
|
|
+ loc->path, hashed_subvol->name);
|
|
+ }
|
|
+ }
|
|
+
|
|
if (p_locked) {
|
|
plock.l_type = F_UNLCK;
|
|
lk_ret = syncop_lk (from, src_fd, F_SETLK, &plock, NULL, NULL);
|
|
@@ -2272,6 +2328,7 @@ out:
|
|
syncop_close (linkto_fd);
|
|
|
|
loc_wipe (&tmp_loc);
|
|
+ loc_wipe (&parent_loc);
|
|
|
|
return ret;
|
|
}
|
|
diff --git a/xlators/cluster/dht/src/dht-rename.c b/xlators/cluster/dht/src/dht-rename.c
|
|
index 3dc042e..d311ac6 100644
|
|
--- a/xlators/cluster/dht/src/dht-rename.c
|
|
+++ b/xlators/cluster/dht/src/dht-rename.c
|
|
@@ -18,6 +18,9 @@
|
|
#include "defaults.h"
|
|
|
|
int dht_rename_unlock (call_frame_t *frame, xlator_t *this);
|
|
+int32_t
|
|
+dht_rename_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
|
+ int32_t op_ret, int32_t op_errno, dict_t *xdata);
|
|
|
|
int
|
|
dht_rename_unlock_cbk (call_frame_t *frame, void *cookie,
|
|
@@ -44,7 +47,7 @@ dht_rename_unlock_cbk (call_frame_t *frame, void *cookie,
|
|
}
|
|
|
|
static void
|
|
-dht_rename_unlock_src (call_frame_t *frame, xlator_t *this)
|
|
+dht_rename_dir_unlock_src (call_frame_t *frame, xlator_t *this)
|
|
{
|
|
dht_local_t *local = NULL;
|
|
|
|
@@ -54,7 +57,7 @@ dht_rename_unlock_src (call_frame_t *frame, xlator_t *this)
|
|
}
|
|
|
|
static void
|
|
-dht_rename_unlock_dst (call_frame_t *frame, xlator_t *this)
|
|
+dht_rename_dir_unlock_dst (call_frame_t *frame, xlator_t *this)
|
|
{
|
|
dht_local_t *local = NULL;
|
|
int op_ret = -1;
|
|
@@ -107,8 +110,8 @@ static int
|
|
dht_rename_dir_unlock (call_frame_t *frame, xlator_t *this)
|
|
{
|
|
|
|
- dht_rename_unlock_src (frame, this);
|
|
- dht_rename_unlock_dst (frame, this);
|
|
+ dht_rename_dir_unlock_src (frame, this);
|
|
+ dht_rename_dir_unlock_dst (frame, this);
|
|
return 0;
|
|
}
|
|
int
|
|
@@ -721,12 +724,13 @@ dht_rename_unlock (call_frame_t *frame, xlator_t *this)
|
|
int op_ret = -1;
|
|
char src_gfid[GF_UUID_BUF_SIZE] = {0};
|
|
char dst_gfid[GF_UUID_BUF_SIZE] = {0};
|
|
+ dht_ilock_wrap_t inodelk_wrapper = {0, };
|
|
|
|
local = frame->local;
|
|
- op_ret = dht_unlock_inodelk (frame,
|
|
- local->lock[0].layout.parent_layout.locks,
|
|
- local->lock[0].layout.parent_layout.lk_count,
|
|
- dht_rename_unlock_cbk);
|
|
+ inodelk_wrapper.locks = local->rename_inodelk_backward_compatible;
|
|
+ inodelk_wrapper.lk_count = local->rename_inodelk_bc_count;
|
|
+
|
|
+ op_ret = dht_unlock_inodelk_wrapper (frame, &inodelk_wrapper);
|
|
if (op_ret < 0) {
|
|
uuid_utoa_r (local->loc.inode->gfid, src_gfid);
|
|
|
|
@@ -752,10 +756,13 @@ dht_rename_unlock (call_frame_t *frame, xlator_t *this)
|
|
"stale locks left on bricks",
|
|
local->loc.path, src_gfid,
|
|
local->loc2.path, dst_gfid);
|
|
-
|
|
- dht_rename_unlock_cbk (frame, NULL, this, 0, 0, NULL);
|
|
}
|
|
|
|
+ dht_unlock_namespace (frame, &local->lock[0]);
|
|
+ dht_unlock_namespace (frame, &local->lock[1]);
|
|
+
|
|
+ dht_rename_unlock_cbk (frame, NULL, this, local->op_ret,
|
|
+ local->op_errno, NULL);
|
|
return 0;
|
|
}
|
|
|
|
@@ -1470,6 +1477,8 @@ dht_rename_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
|
char gfid_local[GF_UUID_BUF_SIZE] = {0};
|
|
char gfid_server[GF_UUID_BUF_SIZE] = {0};
|
|
int child_index = -1;
|
|
+ gf_boolean_t is_src = _gf_false;
|
|
+ loc_t *loc = NULL;
|
|
|
|
|
|
child_index = (long)cookie;
|
|
@@ -1477,22 +1486,98 @@ dht_rename_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
|
local = frame->local;
|
|
conf = this->private;
|
|
|
|
+ is_src = (child_index == 0);
|
|
+ if (is_src)
|
|
+ loc = &local->loc;
|
|
+ else
|
|
+ loc = &local->loc2;
|
|
+
|
|
+ if (op_ret >= 0) {
|
|
+ if (is_src)
|
|
+ local->src_cached
|
|
+ = dht_subvol_get_cached (this,
|
|
+ local->loc.inode);
|
|
+ else {
|
|
+ if (loc->inode)
|
|
+ gf_uuid_unparse (loc->inode->gfid, gfid_local);
|
|
+
|
|
+ gf_msg_debug (this->name, 0,
|
|
+ "dst_cached before lookup: %s, "
|
|
+ "(path:%s)(gfid:%s),",
|
|
+ local->loc2.path,
|
|
+ local->dst_cached
|
|
+ ? local->dst_cached->name :
|
|
+ NULL,
|
|
+ local->dst_cached ? gfid_local : NULL);
|
|
+
|
|
+ local->dst_cached
|
|
+ = dht_subvol_get_cached (this,
|
|
+ local->loc2_copy.inode);
|
|
+
|
|
+ gf_uuid_unparse (stbuf->ia_gfid, gfid_local);
|
|
+
|
|
+ gf_msg_debug (this->name, GF_LOG_WARNING,
|
|
+ "dst_cached after lookup: %s, "
|
|
+ "(path:%s)(gfid:%s)",
|
|
+ local->loc2.path,
|
|
+ local->dst_cached
|
|
+ ? local->dst_cached->name :
|
|
+ NULL,
|
|
+ local->dst_cached ? gfid_local : NULL);
|
|
+
|
|
+
|
|
+ if ((local->loc2.inode == NULL)
|
|
+ || gf_uuid_compare (stbuf->ia_gfid,
|
|
+ local->loc2.inode->gfid)) {
|
|
+ if (local->loc2.inode != NULL) {
|
|
+ inode_unlink (local->loc2.inode,
|
|
+ local->loc2.parent,
|
|
+ local->loc2.name);
|
|
+ inode_unref (local->loc2.inode);
|
|
+ }
|
|
+
|
|
+ local->loc2.inode
|
|
+ = inode_link (local->loc2_copy.inode,
|
|
+ local->loc2_copy.parent,
|
|
+ local->loc2_copy.name,
|
|
+ stbuf);
|
|
+ gf_uuid_copy (local->loc2.gfid,
|
|
+ stbuf->ia_gfid);
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
if (op_ret < 0) {
|
|
- /* The meaning of is_linkfile is overloaded here. For locking
|
|
- * to work properly both rebalance and rename should acquire
|
|
- * lock on datafile. The reason for sending this lookup is to
|
|
- * find out whether we've acquired a lock on data file.
|
|
- * Between the lookup before rename and this rename, the
|
|
- * file could be migrated by a rebalance process and now this
|
|
- * file this might be a linkto file. We verify that by sending
|
|
- * this lookup. However, if this lookup fails we cannot really
|
|
- * say whether we've acquired lock on a datafile or linkto file.
|
|
- * So, we act conservatively and _assume_
|
|
- * that this is a linkfile and fail the rename operation.
|
|
- */
|
|
- local->is_linkfile = _gf_true;
|
|
- local->op_errno = op_errno;
|
|
- } else if (xattr && check_is_linkfile (inode, stbuf, xattr,
|
|
+ if (is_src) {
|
|
+ /* The meaning of is_linkfile is overloaded here. For locking
|
|
+ * to work properly both rebalance and rename should acquire
|
|
+ * lock on datafile. The reason for sending this lookup is to
|
|
+ * find out whether we've acquired a lock on data file.
|
|
+ * Between the lookup before rename and this rename, the
|
|
+ * file could be migrated by a rebalance process and now this
|
|
+ * file this might be a linkto file. We verify that by sending
|
|
+ * this lookup. However, if this lookup fails we cannot really
|
|
+ * say whether we've acquired lock on a datafile or linkto file.
|
|
+ * So, we act conservatively and _assume_
|
|
+ * that this is a linkfile and fail the rename operation.
|
|
+ */
|
|
+ local->is_linkfile = _gf_true;
|
|
+ local->op_errno = op_errno;
|
|
+ } else {
|
|
+ if (local->dst_cached)
|
|
+ gf_msg_debug (this->name, op_errno,
|
|
+ "file %s (gfid:%s) was present "
|
|
+ "(hashed-subvol=%s, "
|
|
+ "cached-subvol=%s) before rename,"
|
|
+ " but lookup failed",
|
|
+ local->loc2.path,
|
|
+ uuid_utoa (local->loc2.inode->gfid),
|
|
+ local->dst_hashed->name,
|
|
+ local->dst_cached->name);
|
|
+ if (dht_inode_missing (op_errno))
|
|
+ local->dst_cached = NULL;
|
|
+ }
|
|
+ } else if (is_src && xattr && check_is_linkfile (inode, stbuf, xattr,
|
|
conf->link_xattr_name)) {
|
|
local->is_linkfile = _gf_true;
|
|
/* Found linkto file instead of data file, passdown ENOENT
|
|
@@ -1500,11 +1585,9 @@ dht_rename_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
|
local->op_errno = ENOENT;
|
|
}
|
|
|
|
- if (!local->is_linkfile &&
|
|
- gf_uuid_compare (local->lock[0].layout.parent_layout.locks[child_index]->loc.gfid,
|
|
- stbuf->ia_gfid)) {
|
|
- gf_uuid_unparse (local->lock[0].layout.parent_layout.locks[child_index]->loc.gfid,
|
|
- gfid_local);
|
|
+ if (!local->is_linkfile && (op_ret >= 0) &&
|
|
+ gf_uuid_compare (loc->gfid, stbuf->ia_gfid)) {
|
|
+ gf_uuid_unparse (loc->gfid, gfid_local);
|
|
gf_uuid_unparse (stbuf->ia_gfid, gfid_server);
|
|
|
|
gf_msg (this->name, GF_LOG_WARNING, 0,
|
|
@@ -1537,6 +1620,123 @@ fail:
|
|
return 0;
|
|
}
|
|
|
|
+int
|
|
+dht_rename_file_lock1_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
|
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
|
|
+{
|
|
+ dht_local_t *local = NULL;
|
|
+ char src_gfid[GF_UUID_BUF_SIZE] = {0};
|
|
+ char dst_gfid[GF_UUID_BUF_SIZE] = {0};
|
|
+ int ret = 0;
|
|
+ loc_t *loc = NULL;
|
|
+ xlator_t *subvol = NULL;
|
|
+
|
|
+ local = frame->local;
|
|
+
|
|
+ if (op_ret < 0) {
|
|
+ uuid_utoa_r (local->loc.inode->gfid, src_gfid);
|
|
+
|
|
+ if (local->loc2.inode)
|
|
+ uuid_utoa_r (local->loc2.inode->gfid, dst_gfid);
|
|
+
|
|
+ gf_msg (this->name, GF_LOG_WARNING, op_errno,
|
|
+ DHT_MSG_INODE_LK_ERROR,
|
|
+ "protecting namespace of %s failed"
|
|
+ "rename (%s:%s:%s %s:%s:%s)",
|
|
+ local->current == &local->lock[0] ? local->loc.path
|
|
+ : local->loc2.path,
|
|
+ local->loc.path, src_gfid, local->src_hashed->name,
|
|
+ local->loc2.path, dst_gfid,
|
|
+ local->dst_hashed ? local->dst_hashed->name : NULL);
|
|
+
|
|
+ local->op_ret = -1;
|
|
+ local->op_errno = op_errno;
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ if (local->current == &local->lock[0]) {
|
|
+ loc = &local->loc2;
|
|
+ subvol = local->dst_hashed;
|
|
+ local->current = &local->lock[1];
|
|
+ } else {
|
|
+ loc = &local->loc;
|
|
+ subvol = local->src_hashed;
|
|
+ local->current = &local->lock[0];
|
|
+ }
|
|
+
|
|
+ ret = dht_protect_namespace (frame, loc, subvol, &local->current->ns,
|
|
+ dht_rename_lock_cbk);
|
|
+ if (ret < 0) {
|
|
+ op_errno = EINVAL;
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+err:
|
|
+ /* No harm in calling an extra unlock */
|
|
+ dht_rename_unlock (frame, this);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+int32_t
|
|
+dht_rename_file_protect_namespace (call_frame_t *frame, void *cookie,
|
|
+ xlator_t *this, int32_t op_ret,
|
|
+ int32_t op_errno, dict_t *xdata)
|
|
+{
|
|
+ dht_local_t *local = NULL;
|
|
+ char src_gfid[GF_UUID_BUF_SIZE] = {0};
|
|
+ char dst_gfid[GF_UUID_BUF_SIZE] = {0};
|
|
+ int ret = 0;
|
|
+ loc_t *loc = NULL;
|
|
+ xlator_t *subvol = NULL;
|
|
+
|
|
+ local = frame->local;
|
|
+
|
|
+ if (op_ret < 0) {
|
|
+ uuid_utoa_r (local->loc.inode->gfid, src_gfid);
|
|
+
|
|
+ if (local->loc2.inode)
|
|
+ uuid_utoa_r (local->loc2.inode->gfid, dst_gfid);
|
|
+
|
|
+ gf_msg (this->name, GF_LOG_WARNING, op_errno,
|
|
+ DHT_MSG_INODE_LK_ERROR,
|
|
+ "acquiring inodelk failed "
|
|
+ "rename (%s:%s:%s %s:%s:%s)",
|
|
+ local->loc.path, src_gfid, local->src_cached->name,
|
|
+ local->loc2.path, dst_gfid,
|
|
+ local->dst_cached ? local->dst_cached->name : NULL);
|
|
+
|
|
+ local->op_ret = -1;
|
|
+ local->op_errno = op_errno;
|
|
+
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ /* Locks on src and dst needs to ordered which otherwise might cause
|
|
+ * deadlocks when rename (src, dst) and rename (dst, src) is done from
|
|
+ * two different clients
|
|
+ */
|
|
+ dht_order_rename_lock (frame, &loc, &subvol);
|
|
+
|
|
+ ret = dht_protect_namespace (frame, loc, subvol,
|
|
+ &local->current->ns,
|
|
+ dht_rename_file_lock1_cbk);
|
|
+ if (ret < 0) {
|
|
+ op_errno = EINVAL;
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+
|
|
+err:
|
|
+ /* Its fine to call unlock even when no locks are acquired, as we check
|
|
+ * for lock->locked before winding a unlock call.
|
|
+ */
|
|
+ dht_rename_unlock (frame, this);
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
int32_t
|
|
dht_rename_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
|
int32_t op_ret, int32_t op_errno, dict_t *xdata)
|
|
@@ -1547,8 +1747,8 @@ dht_rename_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
|
dict_t *xattr_req = NULL;
|
|
dht_conf_t *conf = NULL;
|
|
int i = 0;
|
|
- int count = 0;
|
|
-
|
|
+ xlator_t *subvol = NULL;
|
|
+ dht_lock_t *lock = NULL;
|
|
|
|
local = frame->local;
|
|
conf = this->private;
|
|
@@ -1561,11 +1761,13 @@ dht_rename_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
|
|
|
gf_msg (this->name, GF_LOG_WARNING, op_errno,
|
|
DHT_MSG_INODE_LK_ERROR,
|
|
- "acquiring inodelk failed "
|
|
+ "protecting namespace of %s failed. "
|
|
"rename (%s:%s:%s %s:%s:%s)",
|
|
- local->loc.path, src_gfid, local->src_cached->name,
|
|
+ local->current == &local->lock[0] ? local->loc.path
|
|
+ : local->loc2.path,
|
|
+ local->loc.path, src_gfid, local->src_hashed->name,
|
|
local->loc2.path, dst_gfid,
|
|
- local->dst_cached ? local->dst_cached->name : NULL);
|
|
+ local->dst_hashed ? local->dst_hashed->name : NULL);
|
|
|
|
local->op_ret = -1;
|
|
local->op_errno = op_errno;
|
|
@@ -1588,7 +1790,19 @@ dht_rename_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
|
goto done;
|
|
}
|
|
|
|
- count = local->call_cnt = local->lock[0].layout.parent_layout.lk_count;
|
|
+ /* dst_cached might've changed. This normally happens for two reasons:
|
|
+ * 1. rebalance migrated dst
|
|
+ * 2. Another parallel rename was done overwriting dst
|
|
+ *
|
|
+ * Doing a lookup on local->loc2 when dst exists, but is associated
|
|
+ * with a different gfid will result in an ESTALE error. So, do a fresh
|
|
+ * lookup with a new inode on dst-path and handle change of dst-cached
|
|
+ * in the cbk. Also, to identify dst-cached changes we do a lookup on
|
|
+ * "this" rather than the subvol.
|
|
+ */
|
|
+ loc_copy (&local->loc2_copy, &local->loc2);
|
|
+ inode_unref (local->loc2_copy.inode);
|
|
+ local->loc2_copy.inode = inode_new (local->loc.inode->table);
|
|
|
|
/* Why not use local->lock.locks[?].loc for lookup post lock phase
|
|
* ---------------------------------------------------------------
|
|
@@ -1608,13 +1822,26 @@ dht_rename_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
|
|
* exists with the name that the client requested with.
|
|
* */
|
|
|
|
- for (i = 0; i < count; i++) {
|
|
- STACK_WIND_COOKIE (frame, dht_rename_lookup_cbk, (void *)(long)i,
|
|
- local->lock[0].layout.parent_layout.locks[i]->xl,
|
|
- local->lock[0].layout.parent_layout.locks[i]->xl->fops->lookup,
|
|
- ((gf_uuid_compare (local->loc.gfid, \
|
|
- local->lock[0].layout.parent_layout.locks[i]->loc.gfid) == 0) ?
|
|
- &local->loc : &local->loc2), xattr_req);
|
|
+ local->call_cnt = 2;
|
|
+ for (i = 0; i < 2; i++) {
|
|
+ if (i == 0) {
|
|
+ lock = local->rename_inodelk_backward_compatible[0];
|
|
+ if (gf_uuid_compare (local->loc.gfid,
|
|
+ lock->loc.gfid) == 0)
|
|
+ subvol = lock->xl;
|
|
+ else {
|
|
+ lock = local->rename_inodelk_backward_compatible[1];
|
|
+ subvol = lock->xl;
|
|
+ }
|
|
+ } else {
|
|
+ subvol = this;
|
|
+ }
|
|
+
|
|
+ STACK_WIND_COOKIE (frame, dht_rename_lookup_cbk,
|
|
+ (void *)(long)i, subvol,
|
|
+ subvol->fops->lookup,
|
|
+ (i == 0) ? &local->loc : &local->loc2_copy,
|
|
+ xattr_req);
|
|
}
|
|
|
|
dict_unref (xattr_req);
|
|
@@ -1644,7 +1871,8 @@ dht_rename_lock (call_frame_t *frame)
|
|
if (local->dst_cached)
|
|
count++;
|
|
|
|
- lk_array = GF_CALLOC (count, sizeof (*lk_array), gf_common_mt_pointer);
|
|
+ lk_array = GF_CALLOC (count, sizeof (*lk_array),
|
|
+ gf_common_mt_pointer);
|
|
if (lk_array == NULL)
|
|
goto err;
|
|
|
|
@@ -1655,22 +1883,40 @@ dht_rename_lock (call_frame_t *frame)
|
|
goto err;
|
|
|
|
if (local->dst_cached) {
|
|
+ /* dst might be removed by the time inodelk reaches bricks,
|
|
+ * which can result in ESTALE errors. POSIX imposes no
|
|
+ * restriction for dst to be present for renames to be
|
|
+ * successful. So, we'll ignore ESTALE errors. As far as
|
|
+ * synchronization on dst goes, we'll achieve the same by
|
|
+ * holding entrylk on parent directory of dst in the namespace
|
|
+ * of basename(dst). Also, there might not be quorum in cluster
|
|
+ * xlators like EC/disperse on errno, in which case they return
|
|
+ * EIO. For eg., in a disperse (4 + 2), 3 might return success
|
|
+ * and three might return ESTALE. Disperse, having no Quorum
|
|
+ * unwinds inodelk with EIO. So, ignore EIO too.
|
|
+ */
|
|
lk_array[1] = dht_lock_new (frame->this, local->dst_cached,
|
|
&local->loc2, F_WRLCK,
|
|
DHT_FILE_MIGRATE_DOMAIN, NULL,
|
|
- FAIL_ON_ANY_ERROR);
|
|
+ IGNORE_ENOENT_ESTALE_EIO);
|
|
if (lk_array[1] == NULL)
|
|
goto err;
|
|
}
|
|
|
|
- local->lock[0].layout.parent_layout.locks = lk_array;
|
|
- local->lock[0].layout.parent_layout.lk_count = count;
|
|
+ local->rename_inodelk_backward_compatible = lk_array;
|
|
+ local->rename_inodelk_bc_count = count;
|
|
|
|
+ /* retaining inodelks for the sake of backward compatibility. Please
|
|
+ * make sure to remove this inodelk once all of 3.10, 3.12 and 3.13
|
|
+ * reach EOL. Better way of getting synchronization would be to acquire
|
|
+ * entrylks on src and dst parent directories in the namespace of
|
|
+ * basenames of src and dst
|
|
+ */
|
|
ret = dht_blocking_inodelk (frame, lk_array, count,
|
|
- dht_rename_lock_cbk);
|
|
+ dht_rename_file_protect_namespace);
|
|
if (ret < 0) {
|
|
- local->lock[0].layout.parent_layout.locks = NULL;
|
|
- local->lock[0].layout.parent_layout.lk_count = 0;
|
|
+ local->rename_inodelk_backward_compatible = NULL;
|
|
+ local->rename_inodelk_bc_count = 0;
|
|
goto err;
|
|
}
|
|
|
|
@@ -1701,6 +1947,7 @@ dht_rename (call_frame_t *frame, xlator_t *this,
|
|
dht_local_t *local = NULL;
|
|
dht_conf_t *conf = NULL;
|
|
char gfid[GF_UUID_BUF_SIZE] = {0};
|
|
+ char newgfid[GF_UUID_BUF_SIZE] = {0};
|
|
|
|
VALIDATE_OR_GOTO (frame, err);
|
|
VALIDATE_OR_GOTO (this, err);
|
|
@@ -1772,11 +2019,15 @@ dht_rename (call_frame_t *frame, xlator_t *this,
|
|
if (xdata)
|
|
local->xattr_req = dict_ref (xdata);
|
|
|
|
+ if (newloc->inode)
|
|
+ gf_uuid_unparse(newloc->inode->gfid, newgfid);
|
|
+
|
|
gf_msg (this->name, GF_LOG_INFO, 0,
|
|
DHT_MSG_RENAME_INFO,
|
|
- "renaming %s (hash=%s/cache=%s) => %s (hash=%s/cache=%s)",
|
|
- oldloc->path, src_hashed->name, src_cached->name,
|
|
- newloc->path, dst_hashed->name,
|
|
+ "renaming %s (%s) (hash=%s/cache=%s) => %s (%s) "
|
|
+ "(hash=%s/cache=%s) ",
|
|
+ oldloc->path, gfid, src_hashed->name, src_cached->name,
|
|
+ newloc->path, newloc->inode ? newgfid : NULL, dst_hashed->name,
|
|
dst_cached ? dst_cached->name : "<nul>");
|
|
|
|
if (IA_ISDIR (oldloc->inode->ia_type)) {
|
|
@@ -1784,8 +2035,10 @@ dht_rename (call_frame_t *frame, xlator_t *this,
|
|
} else {
|
|
local->op_ret = 0;
|
|
ret = dht_rename_lock (frame);
|
|
- if (ret < 0)
|
|
+ if (ret < 0) {
|
|
+ op_errno = ENOMEM;
|
|
goto err;
|
|
+ }
|
|
}
|
|
|
|
return 0;
|
|
--
|
|
1.8.3.1
|
|
|