2f3c8edfdf
Resolves: bz#1390151 bz#1410145 bz#1429190 bz#1510752 bz#1511779 Resolves: bz#1570958 bz#1574490 bz#1595246 bz#1618669 bz#1661393 Resolves: bz#1668989 bz#1669020 Signed-off-by: Milind Changire <mchangir@redhat.com>
1029 lines
37 KiB
Diff
1029 lines
37 KiB
Diff
From 55e67fb41ae3b4388839723ac929cd239280a0fc Mon Sep 17 00:00:00 2001
|
|
From: Amar Tumballi <amarts@redhat.com>
|
|
Date: Thu, 7 Feb 2019 18:06:43 +0530
|
|
Subject: [PATCH 522/529] fuse: add --lru-limit option
|
|
|
|
The inode LRU mechanism is moot in fuse xlator (ie. there is no
|
|
limit for the LRU list), as fuse inodes are referenced from
|
|
kernel context, and thus they can only be dropped on request of
|
|
the kernel. This might result in a high number of passive
|
|
inodes which are useless for the glusterfs client, causing a
|
|
significant memory overhead.
|
|
|
|
This change tries to remedy this by extending the LRU semantics
|
|
and allowing a finite limit to be set on the fuse inode LRU.
|
|
|
|
A brief history of problem:
|
|
|
|
When gluster's inode table was designed, fuse didn't have any
|
|
'invalidate' method, which means, userspace application could
|
|
never ask kernel to send a 'forget()' fop, instead had to wait
|
|
for kernel to send it based on kernel's parameters. Inode table
|
|
remembers the number of times kernel has cached the inode based
|
|
on the 'nlookup' parameter. And 'nlookup' field is not used by
|
|
any other entry points (like server-protocol, gfapi etc).
|
|
|
|
Hence the inode_table of fuse module always has to have lru-limit
|
|
as '0', which means no limit. GlusterFS always had to keep all
|
|
inodes in memory as kernel would have had a reference to it.
|
|
Again, the reason for this is, kernel's glusterfs inode reference
|
|
was pointer of 'inode_t' structure in glusterfs. As it is a
|
|
pointer, we could never free it (to prevent segfault, or memory
|
|
corruption).
|
|
|
|
Solution:
|
|
|
|
In the inode table, handle the prune case of inodes with 'nlookup'
|
|
differently, and call an 'invalidator' method, which in this case is
|
|
fuse_invalidate(), and it sends the request to kernel for getting
|
|
the forget request.
|
|
|
|
When the kernel sends the forget, it means, it has dropped all
|
|
the references to the inode, and it will send the forget with the
|
|
'nlookup' parameter too. We just need to make sure to reduce the
|
|
'nlookup' value we have when we get forget. That automatically
|
|
causes the relevant prune to happen.
|
|
|
|
Credits: Csaba Henk, Xavier Hernandez, Raghavendra Gowdappa, Nithya B
|
|
|
|
Upstream:
|
|
> URL: https://review.gluster.org/19778
|
|
|
|
BUG: 1511779
|
|
Change-Id: Iabe22a62e0f819b7eb67d4ecb850dd559b0c937f
|
|
Signed-off-by: Amar Tumballi <amarts@redhat.com>
|
|
Reviewed-on: https://code.engineering.redhat.com/gerrit/162494
|
|
Reviewed-by: Nithya Balachandran <nbalacha@redhat.com>
|
|
Tested-by: RHGS Build Bot <nigelb@redhat.com>
|
|
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
|
|
---
|
|
doc/mount.glusterfs.8 | 4 +
|
|
glusterfsd/src/glusterfsd.c | 24 +++
|
|
glusterfsd/src/glusterfsd.h | 1 +
|
|
libglusterfs/src/glusterfs.h | 1 +
|
|
libglusterfs/src/inode.c | 256 ++++++++++++++++++++++++----
|
|
libglusterfs/src/inode.h | 17 +-
|
|
tests/features/fuse-lru-limit.t | 42 +++++
|
|
xlators/mount/fuse/src/fuse-bridge.c | 121 ++++++++-----
|
|
xlators/mount/fuse/src/fuse-bridge.h | 3 +
|
|
xlators/mount/fuse/utils/mount.glusterfs.in | 7 +
|
|
10 files changed, 393 insertions(+), 83 deletions(-)
|
|
create mode 100644 tests/features/fuse-lru-limit.t
|
|
|
|
diff --git a/doc/mount.glusterfs.8 b/doc/mount.glusterfs.8
|
|
index 95aad02..ed6b410 100644
|
|
--- a/doc/mount.glusterfs.8
|
|
+++ b/doc/mount.glusterfs.8
|
|
@@ -119,6 +119,10 @@ Provide list of backup volfile servers in the following format [default: None]
|
|
\fBDeprecated\fR option - placed here for backward compatibility [default: 1]
|
|
.TP
|
|
.TP
|
|
+\fBlru-limit=\fRN
|
|
+Set fuse module's limit for number of inodes kept in LRU list to N [default: 131072]
|
|
+.TP
|
|
+.TP
|
|
\fBbackground-qlen=\fRN
|
|
Set fuse module's background queue length to N [default: 64]
|
|
.TP
|
|
diff --git a/glusterfsd/src/glusterfsd.c b/glusterfsd/src/glusterfsd.c
|
|
index 990036c..2e2cd77 100644
|
|
--- a/glusterfsd/src/glusterfsd.c
|
|
+++ b/glusterfsd/src/glusterfsd.c
|
|
@@ -203,6 +203,9 @@ static struct argp_option gf_options[] = {
|
|
"[default: 300]"},
|
|
{"resolve-gids", ARGP_RESOLVE_GIDS_KEY, 0, 0,
|
|
"Resolve all auxiliary groups in fuse translator (max 32 otherwise)"},
|
|
+ {"lru-limit", ARGP_FUSE_LRU_LIMIT_KEY, "N", 0,
|
|
+ "Set fuse module's limit for number of inodes kept in LRU list to N "
|
|
+ "[default: 131072]"},
|
|
{"background-qlen", ARGP_FUSE_BACKGROUND_QLEN_KEY, "N", 0,
|
|
"Set fuse module's background queue length to N "
|
|
"[default: 64]"},
|
|
@@ -462,6 +465,15 @@ set_fuse_mount_options (glusterfs_ctx_t *ctx, dict_t *options)
|
|
}
|
|
}
|
|
|
|
+ if (cmd_args->lru_limit >= 0) {
|
|
+ ret = dict_set_int32(options, "lru-limit", cmd_args->lru_limit);
|
|
+ if (ret < 0) {
|
|
+ gf_msg("glusterfsd", GF_LOG_ERROR, 0, glusterfsd_msg_4,
|
|
+ "lru-limit");
|
|
+ goto err;
|
|
+ }
|
|
+ }
|
|
+
|
|
if (cmd_args->background_qlen) {
|
|
ret = dict_set_int32 (options, "background-qlen",
|
|
cmd_args->background_qlen);
|
|
@@ -1169,6 +1181,13 @@ parse_opts (int key, char *arg, struct argp_state *state)
|
|
cmd_args->resolve_gids = 1;
|
|
break;
|
|
|
|
+ case ARGP_FUSE_LRU_LIMIT_KEY:
|
|
+ if (!gf_string2int32(arg, &cmd_args->lru_limit))
|
|
+ break;
|
|
+
|
|
+ argp_failure(state, -1, 0, "unknown LRU limit option %s", arg);
|
|
+ break;
|
|
+
|
|
case ARGP_FUSE_BACKGROUND_QLEN_KEY:
|
|
if (!gf_string2int (arg, &cmd_args->background_qlen))
|
|
break;
|
|
@@ -1937,6 +1956,11 @@ parse_cmdline (int argc, char *argv[], glusterfs_ctx_t *ctx)
|
|
ctx->ssl_cert_depth = glusterfs_read_secure_access_file ();
|
|
}
|
|
|
|
+ /* Need to set lru_limit to below 0 to indicate there was nothing
|
|
+ specified. This is needed as 0 is a valid option, and may not be
|
|
+ default value. */
|
|
+ cmd_args->lru_limit = -1;
|
|
+
|
|
argp_parse (&argp, argc, argv, ARGP_IN_ORDER, NULL, cmd_args);
|
|
if (cmd_args->print_netgroups) {
|
|
/* When this option is set we don't want to do anything else
|
|
diff --git a/glusterfsd/src/glusterfsd.h b/glusterfsd/src/glusterfsd.h
|
|
index 75cb1d8..1550a30 100644
|
|
--- a/glusterfsd/src/glusterfsd.h
|
|
+++ b/glusterfsd/src/glusterfsd.h
|
|
@@ -100,6 +100,7 @@ enum argp_option_keys {
|
|
ARGP_SUBDIR_MOUNT_KEY = 178,
|
|
ARGP_FUSE_EVENT_HISTORY_KEY = 179,
|
|
ARGP_READER_THREAD_COUNT_KEY = 180,
|
|
+ ARGP_FUSE_LRU_LIMIT_KEY = 190,
|
|
};
|
|
|
|
struct _gfd_vol_top_priv {
|
|
diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h
|
|
index 157437c..2690306 100644
|
|
--- a/libglusterfs/src/glusterfs.h
|
|
+++ b/libglusterfs/src/glusterfs.h
|
|
@@ -413,6 +413,7 @@ struct _cmd_args {
|
|
pid_t client_pid;
|
|
int client_pid_set;
|
|
unsigned uid_map_root;
|
|
+ int32_t lru_limit;
|
|
int background_qlen;
|
|
int congestion_threshold;
|
|
char *fuse_mountopts;
|
|
diff --git a/libglusterfs/src/inode.c b/libglusterfs/src/inode.c
|
|
index 29d3c8f..f57020a 100644
|
|
--- a/libglusterfs/src/inode.c
|
|
+++ b/libglusterfs/src/inode.c
|
|
@@ -24,6 +24,100 @@
|
|
move latest accessed dentry to list_head of inode
|
|
*/
|
|
|
|
+/* clang-format off */
|
|
+/*
|
|
+
|
|
+Details as per Xavi:
|
|
+
|
|
+ I think we should have 3 lists: active, lru and invalidate.
|
|
+
|
|
+We'll need 3 things: refs, nlookups and invalidate_sent flag. Any change of
|
|
+refs, invalidate_sent flag and moving from one list to another must be done
|
|
+atomically.
|
|
+
|
|
+With this information, these are the states that cause a transition:
|
|
+
|
|
+ refs nlookups inv_sent op
|
|
+ 1 0 0 unref -> refs = 0, active--->destroy
|
|
+ 1 1 0 unref -> refs = 0, active--->lru
|
|
+ 1 1 0 forget -> nlookups = 0, active--->active
|
|
+ *0 1 0 forget -> nlookups = 0, lru--->destroy
|
|
+ *0 1 1 forget -> nlookups = 0, invalidate--->destroy
|
|
+ 0 1 0 ref -> refs = 1, lru--->active
|
|
+ 0 1 1 ref -> refs = 1, inv_sent = 0, invalidate--->active
|
|
+ 0 1 0 overflow -> refs = 1, inv_sent = 1, lru--->invalidate
|
|
+ 1 1 1 unref -> refs = 0, invalidate--->invalidate
|
|
+ 1 1 1 forget -> nlookups = 0, inv_sent = 0, invalidate--->active
|
|
+
|
|
+(*) technically these combinations cannot happen because a forget sent by the
|
|
+kernel first calls ref() and then unref(). However it's equivalent.
|
|
+
|
|
+overflow means that lru list has grown beyond the limit and the inode needs to
|
|
+be invalidated. All other combinations do not cause a change in state or are not
|
|
+possible.
|
|
+
|
|
+Based on this, the code could be similar to this:
|
|
+
|
|
+ ref(inode, inv)
|
|
+ {
|
|
+ if (refs == 0) {
|
|
+ if (inv_sent) {
|
|
+ invalidate_count--;
|
|
+ inv_sent = 0;
|
|
+ } else {
|
|
+ lru_count--;
|
|
+ }
|
|
+ if (inv) {
|
|
+ inv_sent = 1;
|
|
+ invalidate_count++;
|
|
+ list_move(inode, invalidate);
|
|
+ } else {
|
|
+ active_count++;
|
|
+ list_move(inode, active);
|
|
+ }
|
|
+ }
|
|
+ refs++;
|
|
+ }
|
|
+
|
|
+ unref(inode, clear)
|
|
+ {
|
|
+ if (clear && inv_sent) {
|
|
+ // there is a case of fuse itself sending forget, without
|
|
+ // invalidate, after entry delete, like unlink(), rmdir().
|
|
+ inv_sent = 0;
|
|
+ invalidate_count--;
|
|
+ active_count++;
|
|
+ list_move(inode, active);
|
|
+ }
|
|
+ refs--;
|
|
+ if ((refs == 0) && !inv_sent) {
|
|
+ active_count--;
|
|
+ if (nlookups == 0) {
|
|
+ destroy(inode);
|
|
+ } else {
|
|
+ lru_count++;
|
|
+ list_move(inode, lru);
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ forget(inode)
|
|
+ {
|
|
+ ref(inode, false);
|
|
+ nlookups--;
|
|
+ unref(inode, true);
|
|
+ }
|
|
+
|
|
+ overflow(inode)
|
|
+ {
|
|
+ ref(inode, true);
|
|
+ invalidator(inode);
|
|
+ unref(inode, false);
|
|
+ }
|
|
+
|
|
+*/
|
|
+/* clang-format on */
|
|
+
|
|
#define INODE_DUMP_LIST(head, key_buf, key_prefix, list_type) \
|
|
{ \
|
|
int i = 1; \
|
|
@@ -37,7 +131,7 @@
|
|
}
|
|
|
|
static inode_t *
|
|
-__inode_unref (inode_t *inode);
|
|
+__inode_unref (inode_t *inode, gf_boolean_t clear);
|
|
|
|
static int
|
|
inode_table_prune (inode_table_t *table);
|
|
@@ -138,7 +232,7 @@ __dentry_unset (dentry_t *dentry)
|
|
dentry->name = NULL;
|
|
|
|
if (dentry->parent) {
|
|
- __inode_unref (dentry->parent);
|
|
+ __inode_unref (dentry->parent, _gf_false);
|
|
dentry->parent = NULL;
|
|
}
|
|
|
|
@@ -465,7 +559,7 @@ out:
|
|
|
|
|
|
static inode_t *
|
|
-__inode_unref (inode_t *inode)
|
|
+__inode_unref (inode_t *inode, gf_boolean_t clear)
|
|
{
|
|
int index = 0;
|
|
xlator_t *this = NULL;
|
|
@@ -473,8 +567,6 @@ __inode_unref (inode_t *inode)
|
|
if (!inode)
|
|
return NULL;
|
|
|
|
- this = THIS;
|
|
-
|
|
/*
|
|
* Root inode should always be in active list of inode table. So unrefs
|
|
* on root inode are no-ops.
|
|
@@ -482,6 +574,14 @@ __inode_unref (inode_t *inode)
|
|
if (__is_root_gfid(inode->gfid))
|
|
return inode;
|
|
|
|
+ this = THIS;
|
|
+
|
|
+ if (clear && inode->invalidate_sent) {
|
|
+ inode->invalidate_sent = _gf_false;
|
|
+ inode->table->invalidate_size--;
|
|
+ __inode_activate(inode);
|
|
+ }
|
|
+
|
|
GF_ASSERT (inode->ref);
|
|
|
|
--inode->ref;
|
|
@@ -492,7 +592,7 @@ __inode_unref (inode_t *inode)
|
|
inode->_ctx[index].ref--;
|
|
}
|
|
|
|
- if (!inode->ref) {
|
|
+ if (!inode->ref && !inode->invalidate_sent) {
|
|
inode->table->active_size--;
|
|
|
|
if (inode->nlookup)
|
|
@@ -506,7 +606,7 @@ __inode_unref (inode_t *inode)
|
|
|
|
|
|
static inode_t *
|
|
-__inode_ref (inode_t *inode)
|
|
+__inode_ref (inode_t *inode, gf_boolean_t is_invalidate)
|
|
{
|
|
int index = 0;
|
|
xlator_t *this = NULL;
|
|
@@ -516,11 +616,6 @@ __inode_ref (inode_t *inode)
|
|
|
|
this = THIS;
|
|
|
|
- if (!inode->ref) {
|
|
- inode->table->lru_size--;
|
|
- __inode_activate (inode);
|
|
- }
|
|
-
|
|
/*
|
|
* Root inode should always be in active list of inode table. So unrefs
|
|
* on root inode are no-ops. If we do not allow unrefs but allow refs,
|
|
@@ -532,6 +627,22 @@ __inode_ref (inode_t *inode)
|
|
if (__is_root_gfid(inode->gfid) && inode->ref)
|
|
return inode;
|
|
|
|
+ if (!inode->ref) {
|
|
+ if (inode->invalidate_sent) {
|
|
+ inode->invalidate_sent = _gf_false;
|
|
+ inode->table->invalidate_size--;
|
|
+ } else {
|
|
+ inode->table->lru_size--;
|
|
+ }
|
|
+ if (is_invalidate) {
|
|
+ inode->invalidate_sent = _gf_true;
|
|
+ inode->table->invalidate_size++;
|
|
+ list_move_tail(&inode->list, &inode->table->invalidate);
|
|
+ } else {
|
|
+ __inode_activate(inode);
|
|
+ }
|
|
+ }
|
|
+
|
|
inode->ref++;
|
|
|
|
index = __inode_get_xl_index (inode, this);
|
|
@@ -556,7 +667,7 @@ inode_unref (inode_t *inode)
|
|
|
|
pthread_mutex_lock (&table->lock);
|
|
{
|
|
- inode = __inode_unref (inode);
|
|
+ inode = __inode_unref (inode, _gf_false);
|
|
}
|
|
pthread_mutex_unlock (&table->lock);
|
|
|
|
@@ -578,7 +689,7 @@ inode_ref (inode_t *inode)
|
|
|
|
pthread_mutex_lock (&table->lock);
|
|
{
|
|
- inode = __inode_ref (inode);
|
|
+ inode = __inode_ref (inode, _gf_false);
|
|
}
|
|
pthread_mutex_unlock (&table->lock);
|
|
|
|
@@ -614,7 +725,7 @@ __dentry_create (inode_t *inode, inode_t *parent, const char *name)
|
|
}
|
|
|
|
if (parent)
|
|
- newd->parent = __inode_ref (parent);
|
|
+ newd->parent = __inode_ref (parent, _gf_false);
|
|
|
|
list_add (&newd->inode_list, &inode->dentry_list);
|
|
newd->inode = inode;
|
|
@@ -685,7 +796,7 @@ inode_new (inode_table_t *table)
|
|
{
|
|
inode = __inode_create (table);
|
|
if (inode != NULL) {
|
|
- __inode_ref (inode);
|
|
+ __inode_ref (inode, _gf_false);
|
|
}
|
|
}
|
|
pthread_mutex_unlock (&table->lock);
|
|
@@ -802,7 +913,7 @@ inode_grep (inode_table_t *table, inode_t *parent, const char *name)
|
|
inode = dentry->inode;
|
|
|
|
if (inode)
|
|
- __inode_ref (inode);
|
|
+ __inode_ref (inode, _gf_false);
|
|
}
|
|
pthread_mutex_unlock (&table->lock);
|
|
|
|
@@ -947,7 +1058,7 @@ inode_find (inode_table_t *table, uuid_t gfid)
|
|
{
|
|
inode = __inode_find (table, gfid);
|
|
if (inode)
|
|
- __inode_ref (inode);
|
|
+ __inode_ref (inode, _gf_false);
|
|
}
|
|
pthread_mutex_unlock (&table->lock);
|
|
|
|
@@ -1096,7 +1207,7 @@ inode_link (inode_t *inode, inode_t *parent, const char *name,
|
|
linked_inode = __inode_link (inode, parent, name, iatt);
|
|
|
|
if (linked_inode)
|
|
- __inode_ref (linked_inode);
|
|
+ __inode_ref (linked_inode, _gf_false);
|
|
}
|
|
pthread_mutex_unlock (&table->lock);
|
|
|
|
@@ -1178,6 +1289,31 @@ inode_forget (inode_t *inode, uint64_t nlookup)
|
|
return 0;
|
|
}
|
|
|
|
+int
|
|
+inode_forget_with_unref(inode_t *inode, uint64_t nlookup)
|
|
+{
|
|
+ inode_table_t *table = NULL;
|
|
+
|
|
+ if (!inode) {
|
|
+ gf_msg_callingfn(THIS->name, GF_LOG_WARNING, 0, LG_MSG_INODE_NOT_FOUND,
|
|
+ "inode not found");
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ table = inode->table;
|
|
+
|
|
+ pthread_mutex_lock(&table->lock);
|
|
+ {
|
|
+ __inode_forget(inode, nlookup);
|
|
+ __inode_unref(inode, _gf_true);
|
|
+ }
|
|
+ pthread_mutex_unlock(&table->lock);
|
|
+
|
|
+ inode_table_prune(table);
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
/*
|
|
* Invalidate an inode. This is invoked when a translator decides that an inode's
|
|
* cache is no longer valid. Any translator interested in taking action in this
|
|
@@ -1356,7 +1492,7 @@ inode_parent (inode_t *inode, uuid_t pargfid, const char *name)
|
|
parent = dentry->parent;
|
|
|
|
if (parent)
|
|
- __inode_ref (parent);
|
|
+ __inode_ref (parent, _gf_false);
|
|
}
|
|
pthread_mutex_unlock (&table->lock);
|
|
|
|
@@ -1540,6 +1676,7 @@ inode_table_prune (inode_table_t *table)
|
|
inode_t *del = NULL;
|
|
inode_t *tmp = NULL;
|
|
inode_t *entry = NULL;
|
|
+ int64_t lru_size = 0;
|
|
|
|
if (!table)
|
|
return -1;
|
|
@@ -1548,8 +1685,11 @@ inode_table_prune (inode_table_t *table)
|
|
|
|
pthread_mutex_lock (&table->lock);
|
|
{
|
|
- while (table->lru_limit
|
|
- && table->lru_size > (table->lru_limit)) {
|
|
+ if (!table->lru_limit)
|
|
+ goto purge_list;
|
|
+
|
|
+ lru_size = table->lru_size;
|
|
+ while (lru_size > (table->lru_limit)) {
|
|
if (list_empty (&table->lru)) {
|
|
gf_msg_callingfn (THIS->name, GF_LOG_WARNING, 0,
|
|
LG_MSG_INVALID_INODE_LIST,
|
|
@@ -1559,7 +1699,18 @@ inode_table_prune (inode_table_t *table)
|
|
break;
|
|
}
|
|
|
|
+ lru_size--;
|
|
entry = list_entry (table->lru.next, inode_t, list);
|
|
+ /* The logic of invalidation is required only if invalidator_fn
|
|
+ is present */
|
|
+ if (table->invalidator_fn) {
|
|
+ /* check for valid inode with 'nlookup' */
|
|
+ if (entry->nlookup) {
|
|
+ __inode_ref(entry, _gf_true);
|
|
+ tmp = entry;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
|
|
table->lru_size--;
|
|
__inode_retire (entry);
|
|
@@ -1567,17 +1718,25 @@ inode_table_prune (inode_table_t *table)
|
|
ret++;
|
|
}
|
|
|
|
+ purge_list:
|
|
list_splice_init (&table->purge, &purge);
|
|
table->purge_size = 0;
|
|
}
|
|
pthread_mutex_unlock (&table->lock);
|
|
|
|
- {
|
|
- list_for_each_entry_safe (del, tmp, &purge, list) {
|
|
- list_del_init (&del->list);
|
|
- __inode_forget (del, 0);
|
|
- __inode_destroy (del);
|
|
- }
|
|
+ /* Pick 1 inode for invalidation */
|
|
+ if (tmp) {
|
|
+ xlator_t *old_THIS = THIS;
|
|
+ THIS = table->invalidator_xl;
|
|
+ table->invalidator_fn(table->invalidator_xl, tmp);
|
|
+ THIS = old_THIS;
|
|
+ inode_unref(tmp);
|
|
+ }
|
|
+
|
|
+ list_for_each_entry_safe (del, tmp, &purge, list) {
|
|
+ list_del_init (&del->list);
|
|
+ __inode_forget (del, 0);
|
|
+ __inode_destroy (del);
|
|
}
|
|
|
|
return ret;
|
|
@@ -1605,9 +1764,12 @@ __inode_table_init_root (inode_table_t *table)
|
|
|
|
|
|
inode_table_t *
|
|
-inode_table_new (size_t lru_limit, xlator_t *xl)
|
|
+inode_table_with_invalidator(uint32_t lru_limit, xlator_t *xl,
|
|
+ int32_t (*invalidator_fn)(xlator_t *, inode_t *),
|
|
+ xlator_t *invalidator_xl)
|
|
{
|
|
inode_table_t *new = NULL;
|
|
+ uint32_t mem_pool_size = lru_limit;
|
|
int ret = -1;
|
|
int i = 0;
|
|
|
|
@@ -1619,20 +1781,19 @@ inode_table_new (size_t lru_limit, xlator_t *xl)
|
|
new->ctxcount = xl->graph->xl_count + 1;
|
|
|
|
new->lru_limit = lru_limit;
|
|
+ new->invalidator_fn = invalidator_fn;
|
|
+ new->invalidator_xl = invalidator_xl;
|
|
|
|
new->hashsize = 14057; /* TODO: Random Number?? */
|
|
|
|
- /* In case FUSE is initing the inode table. */
|
|
- if (lru_limit == 0)
|
|
- lru_limit = DEFAULT_INODE_MEMPOOL_ENTRIES;
|
|
-
|
|
- new->inode_pool = mem_pool_new (inode_t, lru_limit);
|
|
+ if (!mem_pool_size || (mem_pool_size > DEFAULT_INODE_MEMPOOL_ENTRIES))
|
|
+ mem_pool_size = DEFAULT_INODE_MEMPOOL_ENTRIES;
|
|
|
|
+ new->inode_pool = mem_pool_new(inode_t, mem_pool_size);
|
|
if (!new->inode_pool)
|
|
goto out;
|
|
|
|
- new->dentry_pool = mem_pool_new (dentry_t, lru_limit);
|
|
-
|
|
+ new->dentry_pool = mem_pool_new (dentry_t, mem_pool_size);
|
|
if (!new->dentry_pool)
|
|
goto out;
|
|
|
|
@@ -1667,6 +1828,7 @@ inode_table_new (size_t lru_limit, xlator_t *xl)
|
|
INIT_LIST_HEAD (&new->active);
|
|
INIT_LIST_HEAD (&new->lru);
|
|
INIT_LIST_HEAD (&new->purge);
|
|
+ INIT_LIST_HEAD(&new->invalidate);
|
|
|
|
ret = gf_asprintf (&new->name, "%s/inode", xl->name);
|
|
if (-1 == ret) {
|
|
@@ -1696,6 +1858,14 @@ out:
|
|
return new;
|
|
}
|
|
|
|
+inode_table_t *
|
|
+inode_table_new(uint32_t lru_limit, xlator_t *xl)
|
|
+{
|
|
+ /* Only fuse for now requires the inode table with invalidator */
|
|
+ return inode_table_with_invalidator(lru_limit, xl, NULL, NULL);
|
|
+}
|
|
+
|
|
+
|
|
int
|
|
inode_table_ctx_free (inode_table_t *table)
|
|
{
|
|
@@ -1830,6 +2000,15 @@ inode_table_destroy (inode_table_t *inode_table) {
|
|
inode_table->lru_size--;
|
|
}
|
|
|
|
+ /* Same logic for invalidate list */
|
|
+ while (!list_empty(&inode_table->invalidate)) {
|
|
+ trav = list_first_entry(&inode_table->invalidate,
|
|
+ inode_t, list);
|
|
+ __inode_forget(trav, 0);
|
|
+ __inode_retire(trav);
|
|
+ inode_table->invalidate_size--;
|
|
+ }
|
|
+
|
|
while (!list_empty (&inode_table->active)) {
|
|
trav = list_first_entry (&inode_table->active,
|
|
inode_t, list);
|
|
@@ -2347,6 +2526,8 @@ inode_dump (inode_t *inode, char *prefix)
|
|
gf_proc_dump_write("active-fd-count", "%u",
|
|
inode->active_fd_count);
|
|
gf_proc_dump_write("ref", "%u", inode->ref);
|
|
+ gf_proc_dump_write("invalidate-sent", "%d",
|
|
+ inode->invalidate_sent);
|
|
gf_proc_dump_write("ia_type", "%d", inode->ia_type);
|
|
if (inode->_ctx) {
|
|
inode_ctx = GF_CALLOC (inode->table->ctxcount,
|
|
@@ -2427,10 +2608,13 @@ inode_table_dump (inode_table_t *itable, char *prefix)
|
|
gf_proc_dump_write(key, "%d", itable->lru_size);
|
|
gf_proc_dump_build_key(key, prefix, "purge_size");
|
|
gf_proc_dump_write(key, "%d", itable->purge_size);
|
|
+ gf_proc_dump_build_key(key, prefix, "invalidate_size");
|
|
+ gf_proc_dump_write(key, "%d", itable->invalidate_size);
|
|
|
|
INODE_DUMP_LIST(&itable->active, key, prefix, "active");
|
|
INODE_DUMP_LIST(&itable->lru, key, prefix, "lru");
|
|
INODE_DUMP_LIST(&itable->purge, key, prefix, "purge");
|
|
+ INODE_DUMP_LIST(&itable->invalidate, key, prefix, "invalidate");
|
|
|
|
pthread_mutex_unlock(&itable->lock);
|
|
}
|
|
diff --git a/libglusterfs/src/inode.h b/libglusterfs/src/inode.h
|
|
index 7a87748..6a96447 100644
|
|
--- a/libglusterfs/src/inode.h
|
|
+++ b/libglusterfs/src/inode.h
|
|
@@ -55,6 +55,13 @@ struct _inode_table {
|
|
struct mem_pool *dentry_pool; /* memory pool for dentrys */
|
|
struct mem_pool *fd_mem_pool; /* memory pool for fd_t */
|
|
int ctxcount; /* number of slots in inode->ctx */
|
|
+
|
|
+ /* This is required for 'invalidation' when 'nlookup' would be used,
|
|
+ specially in case of fuse-bridge */
|
|
+ int32_t (*invalidator_fn)(xlator_t *, inode_t *);
|
|
+ xlator_t *invalidator_xl;
|
|
+ struct list_head invalidate; /* inodes which are in invalidation queue */
|
|
+ uint32_t invalidate_size; /* count of inodes in invalidation list */
|
|
};
|
|
|
|
|
|
@@ -102,6 +109,7 @@ struct _inode {
|
|
struct list_head list; /* active/lru/purge */
|
|
|
|
struct _inode_ctx *_ctx; /* replacement for dict_t *(inode->ctx) */
|
|
+ gf_boolean_t invalidate_sent; /* Set it if invalidator_fn is called for inode */
|
|
};
|
|
|
|
|
|
@@ -110,7 +118,14 @@ struct _inode {
|
|
#define GFID_STR_PFX_LEN (sizeof (GFID_STR_PFX) - 1)
|
|
|
|
inode_table_t *
|
|
-inode_table_new (size_t lru_limit, xlator_t *xl);
|
|
+inode_table_new(uint32_t lru_limit, xlator_t *xl);
|
|
+
|
|
+inode_table_t *
|
|
+inode_table_with_invalidator(uint32_t lru_limit, xlator_t *xl,
|
|
+ int32_t (*invalidator_fn)(xlator_t *, inode_t *),
|
|
+ xlator_t *invalidator_xl);
|
|
+int
|
|
+inode_forget_with_unref(inode_t *inode, uint64_t nlookup);
|
|
|
|
void
|
|
inode_table_destroy_all (glusterfs_ctx_t *ctx);
|
|
diff --git a/tests/features/fuse-lru-limit.t b/tests/features/fuse-lru-limit.t
|
|
new file mode 100644
|
|
index 0000000..9f12116
|
|
--- /dev/null
|
|
+++ b/tests/features/fuse-lru-limit.t
|
|
@@ -0,0 +1,42 @@
|
|
+#!/bin/bash
|
|
+
|
|
+. $(dirname $0)/../include.rc
|
|
+. $(dirname $0)/../volume.rc
|
|
+
|
|
+cleanup
|
|
+
|
|
+TEST glusterd
|
|
+TEST pidof glusterd
|
|
+TEST $CLI volume create $V0 $H0:$B0/${V0}{0,1}
|
|
+TEST $CLI volume start $V0
|
|
+TEST glusterfs -s $H0 --volfile-id $V0 $M0
|
|
+
|
|
+EXPECT "1" get_mount_active_size_value $V0 $M0
|
|
+EXPECT "0" get_mount_lru_size_value $V0 $M0
|
|
+
|
|
+mkdir ${M0}/dir-{1..9}
|
|
+for i in {1..9}; do
|
|
+ for j in {1..1000}; do
|
|
+ echo "Test file" > ${M0}/dir-$i/file-$j;
|
|
+ done;
|
|
+done
|
|
+lc=$(get_mount_lru_size_value $V0 ${M0})
|
|
+# ideally it should be 9000+
|
|
+TEST [ $lc -ge 9000 ]
|
|
+
|
|
+TEST umount $M0
|
|
+
|
|
+TEST glusterfs -s $H0 --volfile-id $V0 --lru-limit 1000 $M0
|
|
+
|
|
+TEST find $M0
|
|
+lc=$(get_mount_lru_size_value $V0 ${M0})
|
|
+# ideally it should be <1000
|
|
+# Not sure if there are any possibilities of buffer need.
|
|
+TEST [ $lc -le 1000 ]
|
|
+
|
|
+TEST rm -rf $M0/*
|
|
+
|
|
+EXPECT "1" get_mount_active_size_value $V0 $M0
|
|
+EXPECT "0" get_mount_lru_size_value $V0 $M0
|
|
+
|
|
+cleanup
|
|
diff --git a/xlators/mount/fuse/src/fuse-bridge.c b/xlators/mount/fuse/src/fuse-bridge.c
|
|
index 8d1e3a0..f3188d6 100644
|
|
--- a/xlators/mount/fuse/src/fuse-bridge.c
|
|
+++ b/xlators/mount/fuse/src/fuse-bridge.c
|
|
@@ -279,29 +279,31 @@ send_fuse_data (xlator_t *this, fuse_in_header_t *finh, void *data, size_t size)
|
|
send_fuse_data (this, finh, obj, sizeof (*(obj)))
|
|
|
|
|
|
-#if FUSE_KERNEL_MINOR_VERSION >= 11
|
|
static void
|
|
fuse_invalidate_entry (xlator_t *this, uint64_t fuse_ino)
|
|
{
|
|
+#if FUSE_KERNEL_MINOR_VERSION >= 11
|
|
struct fuse_out_header *fouh = NULL;
|
|
struct fuse_notify_inval_entry_out *fnieo = NULL;
|
|
fuse_private_t *priv = NULL;
|
|
dentry_t *dentry = NULL;
|
|
+ dentry_t *tmp = NULL;
|
|
inode_t *inode = NULL;
|
|
size_t nlen = 0;
|
|
fuse_invalidate_node_t *node = NULL;
|
|
+ char gfid_str[UUID_CANONICAL_FORM_LEN + 1];
|
|
|
|
priv = this->private;
|
|
|
|
if (!priv->reverse_fuse_thread_started)
|
|
return;
|
|
|
|
- inode = fuse_ino_to_inode(fuse_ino, this);
|
|
+ inode = (inode_t *)(unsigned long)fuse_ino;
|
|
if (inode == NULL) {
|
|
return;
|
|
}
|
|
|
|
- list_for_each_entry (dentry, &inode->dentry_list, inode_list) {
|
|
+ list_for_each_entry_safe (dentry, tmp, &inode->dentry_list, inode_list) {
|
|
node = GF_CALLOC (1, sizeof (*node),
|
|
gf_fuse_mt_invalidate_node_t);
|
|
if (node == NULL)
|
|
@@ -315,14 +317,31 @@ fuse_invalidate_entry (xlator_t *this, uint64_t fuse_ino)
|
|
fouh->unique = 0;
|
|
fouh->error = FUSE_NOTIFY_INVAL_ENTRY;
|
|
|
|
- nlen = strlen (dentry->name);
|
|
- fouh->len = sizeof (*fouh) + sizeof (*fnieo) + nlen + 1;
|
|
- fnieo->parent = inode_to_fuse_nodeid (dentry->parent);
|
|
+ if (dentry->name) {
|
|
+ nlen = strlen (dentry->name);
|
|
+ fouh->len = sizeof (*fouh) + sizeof (*fnieo) + nlen + 1;
|
|
+ fnieo->parent = inode_to_fuse_nodeid (dentry->parent);
|
|
+
|
|
+ fnieo->namelen = nlen;
|
|
+ strcpy (node->inval_buf + sizeof (*fouh) + sizeof (*fnieo),
|
|
+ dentry->name);
|
|
+ }
|
|
|
|
- fnieo->namelen = nlen;
|
|
- strcpy (node->inval_buf + sizeof (*fouh) + sizeof (*fnieo),
|
|
- dentry->name);
|
|
+ gf_log ("glusterfs-fuse", GF_LOG_TRACE, "INVALIDATE entry: "
|
|
+ "%"PRIu64"/%s (gfid:%s)", fnieo->parent, dentry->name,
|
|
+ uuid_utoa(inode->gfid));
|
|
|
|
+ if (dentry->parent) {
|
|
+ fuse_log_eh (this, "Invalidated entry %s (parent: %s)"
|
|
+ "(gfid: %s)", dentry->name,
|
|
+ uuid_utoa (dentry->parent->gfid),
|
|
+ uuid_utoa_r(inode->gfid, gfid_str));
|
|
+ } else {
|
|
+ fuse_log_eh (this, "Invalidated entry %s(nodeid: %"
|
|
+ PRIu64 ") gfid: %s",
|
|
+ dentry->name, fnieo->parent,
|
|
+ uuid_utoa (inode->gfid));
|
|
+ }
|
|
pthread_mutex_lock (&priv->invalidate_mutex);
|
|
{
|
|
list_add_tail (&node->next, &priv->invalidate_list);
|
|
@@ -330,23 +349,10 @@ fuse_invalidate_entry (xlator_t *this, uint64_t fuse_ino)
|
|
}
|
|
pthread_mutex_unlock (&priv->invalidate_mutex);
|
|
|
|
- gf_log ("glusterfs-fuse", GF_LOG_TRACE, "INVALIDATE entry: "
|
|
- "%"PRIu64"/%s", fnieo->parent, dentry->name);
|
|
-
|
|
- if (dentry->parent) {
|
|
- fuse_log_eh (this, "Invalidated entry %s (parent: %s)",
|
|
- dentry->name,
|
|
- uuid_utoa (dentry->parent->gfid));
|
|
- } else {
|
|
- fuse_log_eh (this, "Invalidated entry %s(nodeid: %" PRIu64 ")",
|
|
- dentry->name, fnieo->parent);
|
|
- }
|
|
}
|
|
-
|
|
- if (inode)
|
|
- inode_unref (inode);
|
|
+#endif /* KERNEL_VERSION */
|
|
+ return;
|
|
}
|
|
-#endif
|
|
|
|
/*
|
|
* Send an inval inode notification to fuse. This causes an invalidation of the
|
|
@@ -367,6 +373,10 @@ fuse_invalidate_inode(xlator_t *this, uint64_t fuse_ino)
|
|
if (!priv->reverse_fuse_thread_started)
|
|
return;
|
|
|
|
+ inode = (inode_t *)(unsigned long)fuse_ino;
|
|
+ if (inode == NULL)
|
|
+ return;
|
|
+
|
|
node = GF_CALLOC (1, sizeof (*node), gf_fuse_mt_invalidate_node_t);
|
|
if (node == NULL)
|
|
return;
|
|
@@ -386,7 +396,11 @@ fuse_invalidate_inode(xlator_t *this, uint64_t fuse_ino)
|
|
fniio->off = 0;
|
|
fniio->len = -1;
|
|
|
|
- inode = fuse_ino_to_inode (fuse_ino, this);
|
|
+ fuse_log_eh(this, "Invalidated inode %" PRIu64 " (gfid: %s)", fuse_ino,
|
|
+ uuid_utoa(inode->gfid));
|
|
+ gf_log("glusterfs-fuse", GF_LOG_TRACE,
|
|
+ "INVALIDATE inode: %" PRIu64 "(gfid:%s)", fuse_ino,
|
|
+ uuid_utoa(inode->gfid));
|
|
|
|
pthread_mutex_lock (&priv->invalidate_mutex);
|
|
{
|
|
@@ -395,24 +409,23 @@ fuse_invalidate_inode(xlator_t *this, uint64_t fuse_ino)
|
|
}
|
|
pthread_mutex_unlock (&priv->invalidate_mutex);
|
|
|
|
- gf_log ("glusterfs-fuse", GF_LOG_TRACE, "INVALIDATE inode: %" PRIu64,
|
|
- fuse_ino);
|
|
-
|
|
- if (inode) {
|
|
- fuse_log_eh (this, "Invalidated inode %" PRIu64 " (gfid: %s)",
|
|
- fuse_ino, uuid_utoa (inode->gfid));
|
|
- } else {
|
|
- fuse_log_eh (this, "Invalidated inode %" PRIu64, fuse_ino);
|
|
- }
|
|
-
|
|
- if (inode)
|
|
- inode_unref (inode);
|
|
#else
|
|
gf_log ("glusterfs-fuse", GF_LOG_WARNING,
|
|
- "fuse_invalidate_inode not implemented on OS X due to missing FUSE notification");
|
|
+ "fuse_invalidate_inode not implemented on this system");
|
|
#endif
|
|
+ return;
|
|
}
|
|
|
|
+#if FUSE_KERNEL_MINOR_VERSION >= 11
|
|
+/* Need this function for the signature (inode_t *, instead of uint64_t) */
|
|
+static int32_t
|
|
+fuse_inode_invalidate_fn(xlator_t *this, inode_t *inode)
|
|
+{
|
|
+ fuse_invalidate_entry(this, (uint64_t)inode);
|
|
+ return 0;
|
|
+}
|
|
+#endif
|
|
+
|
|
|
|
int
|
|
send_fuse_err (xlator_t *this, fuse_in_header_t *finh, int error)
|
|
@@ -686,11 +699,14 @@ do_forget(xlator_t *this, uint64_t unique, uint64_t nodeid, uint64_t nlookup)
|
|
{
|
|
inode_t *fuse_inode = fuse_ino_to_inode(nodeid, this);
|
|
|
|
+ gf_log("fuse", GF_LOG_TRACE,
|
|
+ "%" PRIu64 ": FORGET %" PRIu64 "/%" PRIu64 " gfid: (%s)", unique,
|
|
+ nodeid, nlookup, uuid_utoa(fuse_inode->gfid));
|
|
+
|
|
fuse_log_eh(this, "%"PRIu64": FORGET %"PRIu64"/%"PRIu64" gfid: (%s)",
|
|
unique, nodeid, nlookup, uuid_utoa(fuse_inode->gfid));
|
|
|
|
- inode_forget(fuse_inode, nlookup);
|
|
- inode_unref(fuse_inode);
|
|
+ inode_forget_with_unref(fuse_inode, nlookup);
|
|
}
|
|
|
|
static void
|
|
@@ -705,10 +721,6 @@ fuse_forget (xlator_t *this, fuse_in_header_t *finh, void *msg,
|
|
return;
|
|
}
|
|
|
|
- gf_log ("glusterfs-fuse", GF_LOG_TRACE,
|
|
- "%"PRIu64": FORGET %"PRIu64"/%"PRIu64,
|
|
- finh->unique, finh->nodeid, ffi->nlookup);
|
|
-
|
|
do_forget(this, finh->unique, finh->nodeid, ffi->nlookup);
|
|
|
|
GF_FREE (finh);
|
|
@@ -4940,7 +4952,9 @@ fuse_thread_proc (void *data)
|
|
fuse_in_header_t *finh = NULL;
|
|
struct iovec iov_in[2];
|
|
void *msg = NULL;
|
|
- const size_t msg0_size = sizeof (*finh) + 128;
|
|
+ /* we need 512 extra buffer size for BATCH_FORGET fop. By tests, it is
|
|
+ found to reduce 'REALLOC()' calls in the loop */
|
|
+ const size_t msg0_size = sizeof (*finh) + 512;
|
|
fuse_handler_t **fuse_ops = NULL;
|
|
struct pollfd pfd[2] = {{0,}};
|
|
|
|
@@ -5283,7 +5297,12 @@ fuse_graph_setup (xlator_t *this, glusterfs_graph_t *graph)
|
|
goto unlock;
|
|
}
|
|
|
|
- itable = inode_table_new (0, graph->top);
|
|
+#if FUSE_KERNEL_MINOR_VERSION >= 11
|
|
+ itable = inode_table_with_invalidator(priv->lru_limit, graph->top,
|
|
+ fuse_inode_invalidate_fn, this);
|
|
+#else
|
|
+ itable = inode_table_new(0, graph->top);
|
|
+#endif
|
|
if (!itable) {
|
|
ret = -1;
|
|
goto unlock;
|
|
@@ -5740,6 +5759,8 @@ init (xlator_t *this_xl)
|
|
}
|
|
}
|
|
|
|
+ GF_OPTION_INIT("lru-limit", priv->lru_limit, uint32, cleanup_exit);
|
|
+
|
|
GF_OPTION_INIT("event-history", priv->event_history, bool,
|
|
cleanup_exit);
|
|
|
|
@@ -6061,5 +6082,13 @@ struct volume_options options[] = {
|
|
.max = 64,
|
|
.description = "Sets fuse reader thread count.",
|
|
},
|
|
+ {
|
|
+ .key = {"lru-limit"},
|
|
+ .type = GF_OPTION_TYPE_INT,
|
|
+ .default_value = "131072",
|
|
+ .min = 0,
|
|
+ .description = "makes glusterfs invalidate kernel inodes after "
|
|
+ "reaching this limit (0 means 'unlimited')",
|
|
+ },
|
|
{ .key = {NULL} },
|
|
};
|
|
diff --git a/xlators/mount/fuse/src/fuse-bridge.h b/xlators/mount/fuse/src/fuse-bridge.h
|
|
index 4ca76e9..4e32a7f 100644
|
|
--- a/xlators/mount/fuse/src/fuse-bridge.h
|
|
+++ b/xlators/mount/fuse/src/fuse-bridge.h
|
|
@@ -144,6 +144,9 @@ struct fuse_private {
|
|
gf_boolean_t mount_finished;
|
|
gf_boolean_t handle_graph_switch;
|
|
pthread_cond_t migrate_cond;
|
|
+
|
|
+ /* LRU Limit, if not set, default is 128k for now */
|
|
+ uint32_t lru_limit;
|
|
};
|
|
typedef struct fuse_private fuse_private_t;
|
|
|
|
diff --git a/xlators/mount/fuse/utils/mount.glusterfs.in b/xlators/mount/fuse/utils/mount.glusterfs.in
|
|
index 817619e..9a0404f 100755
|
|
--- a/xlators/mount/fuse/utils/mount.glusterfs.in
|
|
+++ b/xlators/mount/fuse/utils/mount.glusterfs.in
|
|
@@ -245,6 +245,10 @@ start_glusterfs ()
|
|
cmd_line=$(echo "$cmd_line --gid-timeout=$gid_timeout");
|
|
fi
|
|
|
|
+ if [ -n "$lru_limit" ]; then
|
|
+ cmd_line=$(echo "$cmd_line --lru-limit=$lru_limit");
|
|
+ fi
|
|
+
|
|
if [ -n "$bg_qlen" ]; then
|
|
cmd_line=$(echo "$cmd_line --background-qlen=$bg_qlen");
|
|
fi
|
|
@@ -467,6 +471,9 @@ with_options()
|
|
"gid-timeout")
|
|
gid_timeout=$value
|
|
;;
|
|
+ "lru-limit")
|
|
+ lru_limit=$value
|
|
+ ;;
|
|
"background-qlen")
|
|
bg_qlen=$value
|
|
;;
|
|
--
|
|
1.8.3.1
|
|
|