713 lines
23 KiB
Diff
713 lines
23 KiB
Diff
|
From cb0d240004e6d40f8d7f30d177d5970ebc8e25fb Mon Sep 17 00:00:00 2001
|
||
|
From: Vinayak hariharmath <65405035+VHariharmath-rh@users.noreply.github.com>
|
||
|
Date: Wed, 3 Feb 2021 17:04:25 +0530
|
||
|
Subject: [PATCH 574/584] features/shard: delay unlink of a file that has
|
||
|
fd_count > 0
|
||
|
|
||
|
When there are multiple processes working on a file and if any
|
||
|
process unlinks that file then unlink operation shouldn't harm
|
||
|
other processes working on it. This is a posix a compliant
|
||
|
behavior and this should be supported when shard feature is
|
||
|
enabled also.
|
||
|
|
||
|
Problem description:
|
||
|
Let's consider 2 clients C1 and C2 working on a file F1 with 5
|
||
|
shards on gluster mount and gluster server has 4 bricks
|
||
|
B1, B2, B3, B4.
|
||
|
|
||
|
Assume that base file/shard is present on B1, 1st, 2nd shards
|
||
|
on B2, 3rd and 4th shards on B3 and 5th shard falls on B4 C1
|
||
|
has opened the F1 in append mode and is writing to it. The
|
||
|
write FOP goes to 5th shard in this case. So the
|
||
|
inode->fd_count = 1 on B1(base file) and B4 (5th shard).
|
||
|
|
||
|
C2 at the same time issued unlink to F1. On the server, the
|
||
|
base file has fd_count = 1 (since C1 has opened the file),
|
||
|
the base file is renamed under .glusterfs/unlink and
|
||
|
returned to C2. Then unlink will be sent to shards on all
|
||
|
bricks and shards on B2 and B3 will be deleted which have
|
||
|
no open reference yet. C1 starts getting errors while
|
||
|
accessing the remaining shards though it has open references
|
||
|
for the file.
|
||
|
|
||
|
This is one such undefined behavior. Likewise we will
|
||
|
encounter many such undefined behaviors as we dont have one
|
||
|
global lock to access all shards as one. Of Course having such
|
||
|
global lock will lead to performance hit as it reduces window
|
||
|
for parallel access of shards.
|
||
|
|
||
|
Solution:
|
||
|
The above undefined behavior can be addressed by delaying the
|
||
|
unlink of a file when there are open references on it.
|
||
|
File unlink happens in 2 steps.
|
||
|
step 1: client creates marker file under .shard/remove_me and
|
||
|
sends unlink on base file to the server
|
||
|
step 2: on return from the server, the associated shards will
|
||
|
be cleaned up and finally marker file will be removed.
|
||
|
|
||
|
In step 2, the back ground deletion process does nameless
|
||
|
lookup using marker file name (marker file is named after the
|
||
|
gfid of the base file) in glusterfs/unlink dir. If the nameless
|
||
|
look up is successful then that means the gfid still has open
|
||
|
fds and deletion of shards has to be delayed. If nameless
|
||
|
lookup fails then that indicates the gfid is unlinked and no
|
||
|
open fds on that file (the gfid path is unlinked during final
|
||
|
close on the file). The shards on which deletion is delayed
|
||
|
are unlinked one the all open fds are closed and this is
|
||
|
done through a thread which wakes up every 10 mins.
|
||
|
|
||
|
Also removed active_fd_count from inode structure and
|
||
|
referring fd_count wherever active_fd_count was used.
|
||
|
|
||
|
Backport of:
|
||
|
> Upstream-patch: https://github.com/gluster/glusterfs/pull/1563
|
||
|
> Fixes: #1358
|
||
|
> Change-Id: I8985093386e26215e0b0dce294c534a66f6ca11c
|
||
|
> Signed-off-by: Vinayakswami Hariharmath <vharihar@redhat.com>
|
||
|
|
||
|
BUG: 1782428
|
||
|
Change-Id: I8985093386e26215e0b0dce294c534a66f6ca11c
|
||
|
Signed-off-by: Vinayakswami Hariharmath <vharihar@redhat.com>
|
||
|
Reviewed-on: https://code.engineering.redhat.com/gerrit/c/rhs-glusterfs/+/244967
|
||
|
Tested-by: RHGS Build Bot <nigelb@redhat.com>
|
||
|
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
|
||
|
---
|
||
|
libglusterfs/src/glusterfs/glusterfs.h | 1 +
|
||
|
tests/bugs/shard/issue-1358.t | 100 +++++++++++++
|
||
|
tests/bugs/shard/unlinks-and-renames.t | 5 +
|
||
|
xlators/features/shard/src/shard.c | 199 ++++++++++++++++++++++++-
|
||
|
xlators/features/shard/src/shard.h | 11 ++
|
||
|
xlators/storage/posix/src/posix-entry-ops.c | 36 +++++
|
||
|
xlators/storage/posix/src/posix-inode-fd-ops.c | 64 +++++---
|
||
|
7 files changed, 391 insertions(+), 25 deletions(-)
|
||
|
create mode 100644 tests/bugs/shard/issue-1358.t
|
||
|
|
||
|
diff --git a/libglusterfs/src/glusterfs/glusterfs.h b/libglusterfs/src/glusterfs/glusterfs.h
|
||
|
index d3400bf..4401cf6 100644
|
||
|
--- a/libglusterfs/src/glusterfs/glusterfs.h
|
||
|
+++ b/libglusterfs/src/glusterfs/glusterfs.h
|
||
|
@@ -261,6 +261,7 @@ enum gf_internal_fop_indicator {
|
||
|
#define GF_XATTROP_PURGE_INDEX "glusterfs.xattrop-purge-index"
|
||
|
|
||
|
#define GF_GFIDLESS_LOOKUP "gfidless-lookup"
|
||
|
+#define GF_UNLINKED_LOOKUP "unlinked-lookup"
|
||
|
/* replace-brick and pump related internal xattrs */
|
||
|
#define RB_PUMP_CMD_START "glusterfs.pump.start"
|
||
|
#define RB_PUMP_CMD_PAUSE "glusterfs.pump.pause"
|
||
|
diff --git a/tests/bugs/shard/issue-1358.t b/tests/bugs/shard/issue-1358.t
|
||
|
new file mode 100644
|
||
|
index 0000000..1838e06
|
||
|
--- /dev/null
|
||
|
+++ b/tests/bugs/shard/issue-1358.t
|
||
|
@@ -0,0 +1,100 @@
|
||
|
+#!/bin/bash
|
||
|
+
|
||
|
+. $(dirname $0)/../../include.rc
|
||
|
+. $(dirname $0)/../../volume.rc
|
||
|
+
|
||
|
+cleanup;
|
||
|
+
|
||
|
+FILE_COUNT_TIME=5
|
||
|
+
|
||
|
+function get_file_count {
|
||
|
+ ls $1* | wc -l
|
||
|
+}
|
||
|
+
|
||
|
+TEST glusterd
|
||
|
+TEST pidof glusterd
|
||
|
+TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1}
|
||
|
+TEST $CLI volume set $V0 features.shard on
|
||
|
+TEST $CLI volume set $V0 features.shard-block-size 4MB
|
||
|
+TEST $CLI volume set $V0 performance.quick-read off
|
||
|
+TEST $CLI volume set $V0 performance.io-cache off
|
||
|
+TEST $CLI volume set $V0 performance.read-ahead off
|
||
|
+TEST $CLI volume set $V0 performance.write-behind off
|
||
|
+TEST $CLI volume start $V0
|
||
|
+
|
||
|
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
|
||
|
+
|
||
|
+TEST mkdir $M0/dir
|
||
|
+TEST dd if=/dev/urandom of=$M0/dir/foo bs=4M count=5
|
||
|
+gfid_new=$(get_gfid_string $M0/dir/foo)
|
||
|
+
|
||
|
+# Ensure its shards dir is created now.
|
||
|
+TEST stat $B0/${V0}0/.shard/$gfid_new.1
|
||
|
+TEST stat $B0/${V0}1/.shard/$gfid_new.1
|
||
|
+TEST stat $B0/${V0}0/.shard/$gfid_new.2
|
||
|
+TEST stat $B0/${V0}1/.shard/$gfid_new.2
|
||
|
+
|
||
|
+# Open a file and store descriptor in fd = 5
|
||
|
+exec 5>$M0/dir/foo
|
||
|
+
|
||
|
+# Write something on the file using the open fd = 5
|
||
|
+echo "issue-1358" >&5
|
||
|
+
|
||
|
+# Write on the descriptor should be succesful
|
||
|
+EXPECT 0 echo $?
|
||
|
+
|
||
|
+# Unlink the same file which is opened in prev step
|
||
|
+TEST unlink $M0/dir/foo
|
||
|
+
|
||
|
+# Check the base file
|
||
|
+TEST ! stat $M0/dir/foo
|
||
|
+TEST ! stat $B0/${V0}0/foo
|
||
|
+TEST ! stat $B0/${V0}1/foo
|
||
|
+
|
||
|
+# Write something on the file using the open fd = 5
|
||
|
+echo "issue-1281" >&5
|
||
|
+
|
||
|
+# Write on the descriptor should be succesful
|
||
|
+EXPECT 0 echo $?
|
||
|
+
|
||
|
+# Check ".shard/.remove_me"
|
||
|
+EXPECT_WITHIN $FILE_COUNT_TIME 1 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_new
|
||
|
+EXPECT_WITHIN $FILE_COUNT_TIME 1 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_new
|
||
|
+
|
||
|
+# Close the fd = 5
|
||
|
+exec 5>&-
|
||
|
+
|
||
|
+###### To see the shards deleted, wait for 10 mins or repeat the same steps i.e open a file #####
|
||
|
+###### write something to it, unlink it and close it. This will wake up the thread that is ######
|
||
|
+###### responsible to delete the shards
|
||
|
+
|
||
|
+TEST touch $M0/dir/new
|
||
|
+exec 6>$M0/dir/new
|
||
|
+echo "issue-1358" >&6
|
||
|
+EXPECT 0 echo $?
|
||
|
+TEST unlink $M0/dir/new
|
||
|
+exec 6>&-
|
||
|
+
|
||
|
+# Now check the ".shard/remove_me" and the gfid will not be there
|
||
|
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}0/.shard/.remove_me/$gfid_new
|
||
|
+EXPECT_WITHIN $FILE_COUNT_TIME 0 get_file_count $B0/${V0}1/.shard/.remove_me/$gfid_new
|
||
|
+
|
||
|
+# check for the absence of shards
|
||
|
+TEST ! stat $B0/${V0}0/.shard/$gfid_new.1
|
||
|
+TEST ! stat $B0/${V0}1/.shard/$gfid_new.1
|
||
|
+TEST ! stat $B0/${V0}0/.shard/$gfid_new.2
|
||
|
+TEST ! stat $B0/${V0}1/.shard/$gfid_new.2
|
||
|
+
|
||
|
+#### Create the file with same name and check creation and deletion works fine ######
|
||
|
+TEST dd if=/dev/urandom of=$M0/dir/foo bs=4M count=5
|
||
|
+gfid_new=$(get_gfid_string $M0/dir/foo)
|
||
|
+
|
||
|
+# Ensure its shards dir is created now.
|
||
|
+TEST stat $B0/${V0}0/.shard/$gfid_new.1
|
||
|
+TEST stat $B0/${V0}1/.shard/$gfid_new.1
|
||
|
+TEST stat $B0/${V0}0/.shard/$gfid_new.2
|
||
|
+TEST stat $B0/${V0}1/.shard/$gfid_new.2
|
||
|
+
|
||
|
+TEST unlink $M0/dir/foo
|
||
|
+cleanup
|
||
|
+
|
||
|
diff --git a/tests/bugs/shard/unlinks-and-renames.t b/tests/bugs/shard/unlinks-and-renames.t
|
||
|
index 990ca69..3280fcb 100644
|
||
|
--- a/tests/bugs/shard/unlinks-and-renames.t
|
||
|
+++ b/tests/bugs/shard/unlinks-and-renames.t
|
||
|
@@ -24,6 +24,11 @@ TEST pidof glusterd
|
||
|
TEST $CLI volume create $V0 replica 2 $H0:$B0/${V0}{0,1}
|
||
|
TEST $CLI volume set $V0 features.shard on
|
||
|
TEST $CLI volume set $V0 features.shard-block-size 4MB
|
||
|
+TEST $CLI volume set $V0 performance.quick-read off
|
||
|
+TEST $CLI volume set $V0 performance.io-cache off
|
||
|
+TEST $CLI volume set $V0 performance.read-ahead off
|
||
|
+TEST $CLI volume set $V0 performance.write-behind off
|
||
|
+
|
||
|
TEST $CLI volume start $V0
|
||
|
TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 $M0
|
||
|
|
||
|
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
|
||
|
index 8d4a970..b828ff9 100644
|
||
|
--- a/xlators/features/shard/src/shard.c
|
||
|
+++ b/xlators/features/shard/src/shard.c
|
||
|
@@ -1242,7 +1242,8 @@ out:
|
||
|
|
||
|
static inode_t *
|
||
|
shard_link_internal_dir_inode(shard_local_t *local, inode_t *inode,
|
||
|
- struct iatt *buf, shard_internal_dir_type_t type)
|
||
|
+ xlator_t *this, struct iatt *buf,
|
||
|
+ shard_internal_dir_type_t type)
|
||
|
{
|
||
|
inode_t *linked_inode = NULL;
|
||
|
shard_priv_t *priv = NULL;
|
||
|
@@ -1250,7 +1251,7 @@ shard_link_internal_dir_inode(shard_local_t *local, inode_t *inode,
|
||
|
inode_t **priv_inode = NULL;
|
||
|
inode_t *parent = NULL;
|
||
|
|
||
|
- priv = THIS->private;
|
||
|
+ priv = this->private;
|
||
|
|
||
|
switch (type) {
|
||
|
case SHARD_INTERNAL_DIR_DOT_SHARD:
|
||
|
@@ -1294,7 +1295,7 @@ shard_refresh_internal_dir_cbk(call_frame_t *frame, void *cookie,
|
||
|
/* To-Do: Fix refcount increment per call to
|
||
|
* shard_link_internal_dir_inode().
|
||
|
*/
|
||
|
- linked_inode = shard_link_internal_dir_inode(local, inode, buf, type);
|
||
|
+ linked_inode = shard_link_internal_dir_inode(local, inode, this, buf, type);
|
||
|
shard_inode_ctx_mark_dir_refreshed(linked_inode, this);
|
||
|
out:
|
||
|
shard_common_resolve_shards(frame, this, local->post_res_handler);
|
||
|
@@ -1383,7 +1384,7 @@ shard_lookup_internal_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
|
||
|
goto unwind;
|
||
|
}
|
||
|
|
||
|
- link_inode = shard_link_internal_dir_inode(local, inode, buf, type);
|
||
|
+ link_inode = shard_link_internal_dir_inode(local, inode, this, buf, type);
|
||
|
if (link_inode != inode) {
|
||
|
shard_refresh_internal_dir(frame, this, type);
|
||
|
} else {
|
||
|
@@ -3586,7 +3587,8 @@ shard_resolve_internal_dir(xlator_t *this, shard_local_t *local,
|
||
|
"Lookup on %s failed, exiting", bname);
|
||
|
goto err;
|
||
|
} else {
|
||
|
- shard_link_internal_dir_inode(local, loc->inode, &stbuf, type);
|
||
|
+ shard_link_internal_dir_inode(local, loc->inode, this, &stbuf,
|
||
|
+ type);
|
||
|
}
|
||
|
}
|
||
|
ret = 0;
|
||
|
@@ -3633,6 +3635,45 @@ err:
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
+static int
|
||
|
+shard_nameless_lookup_base_file(xlator_t *this, char *gfid)
|
||
|
+{
|
||
|
+ int ret = 0;
|
||
|
+ loc_t loc = {
|
||
|
+ 0,
|
||
|
+ };
|
||
|
+ dict_t *xattr_req = dict_new();
|
||
|
+ if (!xattr_req) {
|
||
|
+ ret = -1;
|
||
|
+ goto out;
|
||
|
+ }
|
||
|
+
|
||
|
+ loc.inode = inode_new(this->itable);
|
||
|
+ if (loc.inode == NULL) {
|
||
|
+ ret = -1;
|
||
|
+ goto out;
|
||
|
+ }
|
||
|
+
|
||
|
+ ret = gf_uuid_parse(gfid, loc.gfid);
|
||
|
+ if (ret < 0)
|
||
|
+ goto out;
|
||
|
+
|
||
|
+ ret = dict_set_uint32(xattr_req, GF_UNLINKED_LOOKUP, 1);
|
||
|
+ if (ret < 0)
|
||
|
+ goto out;
|
||
|
+
|
||
|
+ ret = syncop_lookup(FIRST_CHILD(this), &loc, NULL, NULL, xattr_req, NULL);
|
||
|
+ if (ret < 0)
|
||
|
+ goto out;
|
||
|
+
|
||
|
+out:
|
||
|
+ if (xattr_req)
|
||
|
+ dict_unref(xattr_req);
|
||
|
+ loc_wipe(&loc);
|
||
|
+
|
||
|
+ return ret;
|
||
|
+}
|
||
|
+
|
||
|
int
|
||
|
shard_delete_shards(void *opaque)
|
||
|
{
|
||
|
@@ -3734,6 +3775,11 @@ shard_delete_shards(void *opaque)
|
||
|
if (ret < 0)
|
||
|
continue;
|
||
|
}
|
||
|
+
|
||
|
+ ret = shard_nameless_lookup_base_file(this, entry->d_name);
|
||
|
+ if (!ret)
|
||
|
+ continue;
|
||
|
+
|
||
|
link_inode = inode_link(entry->inode, local->fd->inode,
|
||
|
entry->d_name, &entry->d_stat);
|
||
|
|
||
|
@@ -4105,6 +4151,9 @@ err:
|
||
|
int
|
||
|
shard_unlock_entrylk(call_frame_t *frame, xlator_t *this);
|
||
|
|
||
|
+static int
|
||
|
+shard_unlink_handler_spawn(xlator_t *this);
|
||
|
+
|
||
|
int
|
||
|
shard_unlink_base_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
|
||
|
int32_t op_ret, int32_t op_errno,
|
||
|
@@ -4126,7 +4175,7 @@ shard_unlink_base_file_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
|
||
|
if (xdata)
|
||
|
local->xattr_rsp = dict_ref(xdata);
|
||
|
if (local->cleanup_required)
|
||
|
- shard_start_background_deletion(this);
|
||
|
+ shard_unlink_handler_spawn(this);
|
||
|
}
|
||
|
|
||
|
if (local->entrylk_frame) {
|
||
|
@@ -5785,7 +5834,7 @@ shard_mkdir_internal_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
|
||
|
}
|
||
|
}
|
||
|
|
||
|
- link_inode = shard_link_internal_dir_inode(local, inode, buf, type);
|
||
|
+ link_inode = shard_link_internal_dir_inode(local, inode, this, buf, type);
|
||
|
if (link_inode != inode) {
|
||
|
shard_refresh_internal_dir(frame, this, type);
|
||
|
} else {
|
||
|
@@ -7098,6 +7147,132 @@ shard_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
+static void
|
||
|
+shard_unlink_wait(shard_unlink_thread_t *ti)
|
||
|
+{
|
||
|
+ struct timespec wait_till = {
|
||
|
+ 0,
|
||
|
+ };
|
||
|
+
|
||
|
+ pthread_mutex_lock(&ti->mutex);
|
||
|
+ {
|
||
|
+ /* shard_unlink_handler() runs every 10 mins of interval */
|
||
|
+ wait_till.tv_sec = time(NULL) + 600;
|
||
|
+
|
||
|
+ while (!ti->rerun) {
|
||
|
+ if (pthread_cond_timedwait(&ti->cond, &ti->mutex, &wait_till) ==
|
||
|
+ ETIMEDOUT)
|
||
|
+ break;
|
||
|
+ }
|
||
|
+ ti->rerun = _gf_false;
|
||
|
+ }
|
||
|
+ pthread_mutex_unlock(&ti->mutex);
|
||
|
+}
|
||
|
+
|
||
|
+static void *
|
||
|
+shard_unlink_handler(void *data)
|
||
|
+{
|
||
|
+ shard_unlink_thread_t *ti = data;
|
||
|
+ xlator_t *this = ti->this;
|
||
|
+
|
||
|
+ THIS = this;
|
||
|
+
|
||
|
+ while (!ti->stop) {
|
||
|
+ shard_start_background_deletion(this);
|
||
|
+ shard_unlink_wait(ti);
|
||
|
+ }
|
||
|
+ return NULL;
|
||
|
+}
|
||
|
+
|
||
|
+static int
|
||
|
+shard_unlink_handler_spawn(xlator_t *this)
|
||
|
+{
|
||
|
+ int ret = 0;
|
||
|
+ shard_priv_t *priv = this->private;
|
||
|
+ shard_unlink_thread_t *ti = &priv->thread_info;
|
||
|
+
|
||
|
+ ti->this = this;
|
||
|
+
|
||
|
+ pthread_mutex_lock(&ti->mutex);
|
||
|
+ {
|
||
|
+ if (ti->running) {
|
||
|
+ pthread_cond_signal(&ti->cond);
|
||
|
+ } else {
|
||
|
+ ret = gf_thread_create(&ti->thread, NULL, shard_unlink_handler, ti,
|
||
|
+ "shard_unlink");
|
||
|
+ if (ret < 0) {
|
||
|
+ gf_log(this->name, GF_LOG_ERROR,
|
||
|
+ "Failed to create \"shard_unlink\" thread");
|
||
|
+ goto unlock;
|
||
|
+ }
|
||
|
+ ti->running = _gf_true;
|
||
|
+ }
|
||
|
+
|
||
|
+ ti->rerun = _gf_true;
|
||
|
+ }
|
||
|
+unlock:
|
||
|
+ pthread_mutex_unlock(&ti->mutex);
|
||
|
+ return ret;
|
||
|
+}
|
||
|
+
|
||
|
+static int
|
||
|
+shard_unlink_handler_init(shard_unlink_thread_t *ti)
|
||
|
+{
|
||
|
+ int ret = 0;
|
||
|
+ xlator_t *this = THIS;
|
||
|
+
|
||
|
+ ret = pthread_mutex_init(&ti->mutex, NULL);
|
||
|
+ if (ret) {
|
||
|
+ gf_log(this->name, GF_LOG_ERROR,
|
||
|
+ "Failed to init mutex for \"shard_unlink\" thread");
|
||
|
+ goto out;
|
||
|
+ }
|
||
|
+
|
||
|
+ ret = pthread_cond_init(&ti->cond, NULL);
|
||
|
+ if (ret) {
|
||
|
+ gf_log(this->name, GF_LOG_ERROR,
|
||
|
+ "Failed to init cond var for \"shard_unlink\" thread");
|
||
|
+ pthread_mutex_destroy(&ti->mutex);
|
||
|
+ goto out;
|
||
|
+ }
|
||
|
+
|
||
|
+ ti->running = _gf_false;
|
||
|
+ ti->rerun = _gf_false;
|
||
|
+ ti->stop = _gf_false;
|
||
|
+
|
||
|
+out:
|
||
|
+ return -ret;
|
||
|
+}
|
||
|
+
|
||
|
+static void
|
||
|
+shard_unlink_handler_fini(shard_unlink_thread_t *ti)
|
||
|
+{
|
||
|
+ int ret = 0;
|
||
|
+ xlator_t *this = THIS;
|
||
|
+ if (!ti)
|
||
|
+ return;
|
||
|
+
|
||
|
+ pthread_mutex_lock(&ti->mutex);
|
||
|
+ if (ti->running) {
|
||
|
+ ti->rerun = _gf_true;
|
||
|
+ ti->stop = _gf_true;
|
||
|
+ pthread_cond_signal(&ti->cond);
|
||
|
+ }
|
||
|
+ pthread_mutex_unlock(&ti->mutex);
|
||
|
+
|
||
|
+ if (ti->running) {
|
||
|
+ ret = pthread_join(ti->thread, NULL);
|
||
|
+ if (ret)
|
||
|
+ gf_msg(this->name, GF_LOG_WARNING, 0, 0,
|
||
|
+ "Failed to clean up shard unlink thread.");
|
||
|
+ ti->running = _gf_false;
|
||
|
+ }
|
||
|
+ ti->thread = 0;
|
||
|
+
|
||
|
+ pthread_cond_destroy(&ti->cond);
|
||
|
+ pthread_mutex_destroy(&ti->mutex);
|
||
|
+}
|
||
|
+
|
||
|
int32_t
|
||
|
mem_acct_init(xlator_t *this)
|
||
|
{
|
||
|
@@ -7164,6 +7339,14 @@ init(xlator_t *this)
|
||
|
this->private = priv;
|
||
|
LOCK_INIT(&priv->lock);
|
||
|
INIT_LIST_HEAD(&priv->ilist_head);
|
||
|
+
|
||
|
+ ret = shard_unlink_handler_init(&priv->thread_info);
|
||
|
+ if (ret) {
|
||
|
+ gf_log(this->name, GF_LOG_ERROR,
|
||
|
+ "Failed to initialize resources for \"shard_unlink\" thread");
|
||
|
+ goto out;
|
||
|
+ }
|
||
|
+
|
||
|
ret = 0;
|
||
|
out:
|
||
|
if (ret) {
|
||
|
@@ -7188,6 +7371,8 @@ fini(xlator_t *this)
|
||
|
if (!priv)
|
||
|
goto out;
|
||
|
|
||
|
+ shard_unlink_handler_fini(&priv->thread_info);
|
||
|
+
|
||
|
this->private = NULL;
|
||
|
LOCK_DESTROY(&priv->lock);
|
||
|
GF_FREE(priv);
|
||
|
diff --git a/xlators/features/shard/src/shard.h b/xlators/features/shard/src/shard.h
|
||
|
index 4fe181b..3dcb112 100644
|
||
|
--- a/xlators/features/shard/src/shard.h
|
||
|
+++ b/xlators/features/shard/src/shard.h
|
||
|
@@ -207,6 +207,16 @@ typedef enum {
|
||
|
|
||
|
/* rm = "remove me" */
|
||
|
|
||
|
+typedef struct shard_unlink_thread {
|
||
|
+ pthread_mutex_t mutex;
|
||
|
+ pthread_cond_t cond;
|
||
|
+ pthread_t thread;
|
||
|
+ gf_boolean_t running;
|
||
|
+ gf_boolean_t rerun;
|
||
|
+ gf_boolean_t stop;
|
||
|
+ xlator_t *this;
|
||
|
+} shard_unlink_thread_t;
|
||
|
+
|
||
|
typedef struct shard_priv {
|
||
|
uint64_t block_size;
|
||
|
uuid_t dot_shard_gfid;
|
||
|
@@ -220,6 +230,7 @@ typedef struct shard_priv {
|
||
|
shard_bg_deletion_state_t bg_del_state;
|
||
|
gf_boolean_t first_lookup_done;
|
||
|
uint64_t lru_limit;
|
||
|
+ shard_unlink_thread_t thread_info;
|
||
|
} shard_priv_t;
|
||
|
|
||
|
typedef struct {
|
||
|
diff --git a/xlators/storage/posix/src/posix-entry-ops.c b/xlators/storage/posix/src/posix-entry-ops.c
|
||
|
index b3a5381..1511e68 100644
|
||
|
--- a/xlators/storage/posix/src/posix-entry-ops.c
|
||
|
+++ b/xlators/storage/posix/src/posix-entry-ops.c
|
||
|
@@ -183,6 +183,11 @@ posix_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
|
||
|
struct posix_private *priv = NULL;
|
||
|
posix_inode_ctx_t *ctx = NULL;
|
||
|
int ret = 0;
|
||
|
+ uint32_t lookup_unlink_dir = 0;
|
||
|
+ char *unlink_path = NULL;
|
||
|
+ struct stat lstatbuf = {
|
||
|
+ 0,
|
||
|
+ };
|
||
|
|
||
|
VALIDATE_OR_GOTO(frame, out);
|
||
|
VALIDATE_OR_GOTO(this, out);
|
||
|
@@ -208,7 +213,36 @@ posix_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
|
||
|
op_ret = -1;
|
||
|
if (gf_uuid_is_null(loc->pargfid) || (loc->name == NULL)) {
|
||
|
/* nameless lookup */
|
||
|
+ op_ret = op_errno = errno = 0;
|
||
|
MAKE_INODE_HANDLE(real_path, this, loc, &buf);
|
||
|
+
|
||
|
+ /* The gfid will be renamed to ".glusterfs/unlink" in case
|
||
|
+ * there are any open fds on the file in posix_unlink path.
|
||
|
+ * So client can request server to do nameless lookup with
|
||
|
+ * xdata = GF_UNLINKED_LOOKUP in ".glusterfs/unlink"
|
||
|
+ * dir if a client wants to know the status of the all open fds
|
||
|
+ * on the unlinked file. If the file still present in the
|
||
|
+ * ".glusterfs/unlink" dir then it indicates there still
|
||
|
+ * open fds present on the file and the file is still under
|
||
|
+ * unlink process */
|
||
|
+ if (op_ret < 0 && errno == ENOENT) {
|
||
|
+ ret = dict_get_uint32(xdata, GF_UNLINKED_LOOKUP,
|
||
|
+ &lookup_unlink_dir);
|
||
|
+ if (!ret && lookup_unlink_dir) {
|
||
|
+ op_ret = op_errno = errno = 0;
|
||
|
+ POSIX_GET_FILE_UNLINK_PATH(priv->base_path, loc->gfid,
|
||
|
+ unlink_path);
|
||
|
+ ret = sys_lstat(unlink_path, &lstatbuf);
|
||
|
+ if (ret) {
|
||
|
+ op_ret = -1;
|
||
|
+ op_errno = errno;
|
||
|
+ } else {
|
||
|
+ iatt_from_stat(&buf, &lstatbuf);
|
||
|
+ buf.ia_nlink = 0;
|
||
|
+ }
|
||
|
+ goto nameless_lookup_unlink_dir_out;
|
||
|
+ }
|
||
|
+ }
|
||
|
} else {
|
||
|
MAKE_ENTRY_HANDLE(real_path, par_path, this, loc, &buf);
|
||
|
if (!real_path || !par_path) {
|
||
|
@@ -328,6 +362,8 @@ out:
|
||
|
|
||
|
if (op_ret == 0)
|
||
|
op_errno = 0;
|
||
|
+
|
||
|
+nameless_lookup_unlink_dir_out:
|
||
|
STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno,
|
||
|
(loc) ? loc->inode : NULL, &buf, xattr, &postparent);
|
||
|
|
||
|
diff --git a/xlators/storage/posix/src/posix-inode-fd-ops.c b/xlators/storage/posix/src/posix-inode-fd-ops.c
|
||
|
index 761e018..4c2983a 100644
|
||
|
--- a/xlators/storage/posix/src/posix-inode-fd-ops.c
|
||
|
+++ b/xlators/storage/posix/src/posix-inode-fd-ops.c
|
||
|
@@ -2504,6 +2504,39 @@ out:
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
+static int
|
||
|
+posix_unlink_renamed_file(xlator_t *this, inode_t *inode)
|
||
|
+{
|
||
|
+ int ret = 0;
|
||
|
+ char *unlink_path = NULL;
|
||
|
+ uint64_t ctx_uint = 0;
|
||
|
+ posix_inode_ctx_t *ctx = NULL;
|
||
|
+ struct posix_private *priv = this->private;
|
||
|
+
|
||
|
+ ret = inode_ctx_get(inode, this, &ctx_uint);
|
||
|
+
|
||
|
+ if (ret < 0)
|
||
|
+ goto out;
|
||
|
+
|
||
|
+ ctx = (posix_inode_ctx_t *)(uintptr_t)ctx_uint;
|
||
|
+
|
||
|
+ if (ctx->unlink_flag == GF_UNLINK_TRUE) {
|
||
|
+ POSIX_GET_FILE_UNLINK_PATH(priv->base_path, inode->gfid, unlink_path);
|
||
|
+ if (!unlink_path) {
|
||
|
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, P_MSG_UNLINK_FAILED,
|
||
|
+ "Failed to remove gfid :%s", uuid_utoa(inode->gfid));
|
||
|
+ ret = -1;
|
||
|
+ } else {
|
||
|
+ ret = sys_unlink(unlink_path);
|
||
|
+ if (!ret)
|
||
|
+ ctx->unlink_flag = GF_UNLINK_FALSE;
|
||
|
+ }
|
||
|
+ }
|
||
|
+
|
||
|
+out:
|
||
|
+ return ret;
|
||
|
+}
|
||
|
+
|
||
|
int32_t
|
||
|
posix_release(xlator_t *this, fd_t *fd)
|
||
|
{
|
||
|
@@ -2514,6 +2547,9 @@ posix_release(xlator_t *this, fd_t *fd)
|
||
|
VALIDATE_OR_GOTO(this, out);
|
||
|
VALIDATE_OR_GOTO(fd, out);
|
||
|
|
||
|
+ if (fd->inode->active_fd_count == 0)
|
||
|
+ posix_unlink_renamed_file(this, fd->inode);
|
||
|
+
|
||
|
ret = fd_ctx_del(fd, this, &tmp_pfd);
|
||
|
if (ret < 0) {
|
||
|
gf_msg(this->name, GF_LOG_WARNING, 0, P_MSG_PFD_NULL,
|
||
|
@@ -5881,41 +5917,33 @@ posix_forget(xlator_t *this, inode_t *inode)
|
||
|
uint64_t ctx_uint1 = 0;
|
||
|
uint64_t ctx_uint2 = 0;
|
||
|
posix_inode_ctx_t *ctx = NULL;
|
||
|
- posix_mdata_t *mdata = NULL;
|
||
|
- struct posix_private *priv_posix = NULL;
|
||
|
-
|
||
|
- priv_posix = (struct posix_private *)this->private;
|
||
|
- if (!priv_posix)
|
||
|
- return 0;
|
||
|
+ struct posix_private *priv = this->private;
|
||
|
|
||
|
ret = inode_ctx_del2(inode, this, &ctx_uint1, &ctx_uint2);
|
||
|
+
|
||
|
+ if (ctx_uint2)
|
||
|
+ GF_FREE((posix_mdata_t *)(uintptr_t)ctx_uint2);
|
||
|
+
|
||
|
if (!ctx_uint1)
|
||
|
- goto check_ctx2;
|
||
|
+ return 0;
|
||
|
|
||
|
ctx = (posix_inode_ctx_t *)(uintptr_t)ctx_uint1;
|
||
|
|
||
|
if (ctx->unlink_flag == GF_UNLINK_TRUE) {
|
||
|
- POSIX_GET_FILE_UNLINK_PATH(priv_posix->base_path, inode->gfid,
|
||
|
- unlink_path);
|
||
|
+ POSIX_GET_FILE_UNLINK_PATH(priv->base_path, inode->gfid, unlink_path);
|
||
|
if (!unlink_path) {
|
||
|
gf_msg(this->name, GF_LOG_ERROR, ENOMEM, P_MSG_UNLINK_FAILED,
|
||
|
"Failed to remove gfid :%s", uuid_utoa(inode->gfid));
|
||
|
ret = -1;
|
||
|
- goto ctx_free;
|
||
|
+ } else {
|
||
|
+ ret = sys_unlink(unlink_path);
|
||
|
}
|
||
|
- ret = sys_unlink(unlink_path);
|
||
|
}
|
||
|
-ctx_free:
|
||
|
+
|
||
|
pthread_mutex_destroy(&ctx->xattrop_lock);
|
||
|
pthread_mutex_destroy(&ctx->write_atomic_lock);
|
||
|
pthread_mutex_destroy(&ctx->pgfid_lock);
|
||
|
GF_FREE(ctx);
|
||
|
|
||
|
-check_ctx2:
|
||
|
- if (ctx_uint2) {
|
||
|
- mdata = (posix_mdata_t *)(uintptr_t)ctx_uint2;
|
||
|
- }
|
||
|
-
|
||
|
- GF_FREE(mdata);
|
||
|
return ret;
|
||
|
}
|
||
|
--
|
||
|
1.8.3.1
|
||
|
|