glusterfs
glusterfs copied to clipboard
core[WIP] Readdir improvement for fuse
There are multiple readdir improvement with this patch for fuse.
- Set the buffer size 128KB during wind a readdir call by fuse
- Wind a lookup call for all entries parallel by fuse_readdir_cbk
- Keep the inode in active list after taking extra reference for 10m(600s) sothat in next lookup is served by md-cache
- Take the reference only while table->active size is less than 1M
Fixes: #4176 Change-Id: Ic9c75799883d94ca473c230e6213cbf00b383684
CLANG-FORMAT FAILURE: Before merging the patch, this diff needs to be considered for passing clang-format
index a95a9ce0c..077d0be76 100644
--- a/glusterfsd/src/glusterfsd.c
+++ b/glusterfsd/src/glusterfsd.c
@@ -232,7 +232,8 @@ static struct argp_option gf_options[] = {
{"use-readdirp", ARGP_FUSE_USE_READDIRP_KEY, "BOOL", OPTION_ARG_OPTIONAL,
"Use readdirp mode in fuse kernel module"
" [default: \"yes\"]"},
- {"readdir-optimize", ARGP_FUSE_USE_READDIR_OPTIMIZE, "BOOL", OPTION_ARG_OPTIONAL,
+ {"readdir-optimize", ARGP_FUSE_USE_READDIR_OPTIMIZE, "BOOL",
+ OPTION_ARG_OPTIONAL,
"Enable readdir optimize mode for fuse "
" [default: \"no\"]"},
{"secure-mgmt", ARGP_SECURE_MGMT_KEY, "BOOL", OPTION_ARG_OPTIONAL,
@@ -1254,8 +1255,8 @@ parse_opts(int key, char *arg, struct argp_state *state)
break;
}
- argp_failure(state, -1, 0, "unknown readdir-optimize setting \"%s\"",
- arg);
+ argp_failure(state, -1, 0,
+ "unknown readdir-optimize setting \"%s\"", arg);
break;
case ARGP_LOGGER:
diff --git a/libglusterfs/src/glusterfs/glusterfs.h b/libglusterfs/src/glusterfs/glusterfs.h
index b8f66d861..3d709085e 100644
--- a/libglusterfs/src/glusterfs/glusterfs.h
+++ b/libglusterfs/src/glusterfs/glusterfs.h
@@ -63,7 +63,6 @@
#define O_PATH 010000000 /* from asm-generic/fcntl.h */
#endif
-
#ifndef EBADFD
/* Mac OS X does not have EBADFD */
#define EBADFD EBADF
diff --git a/xlators/features/upcall/src/upcall.c b/xlators/features/upcall/src/upcall.c
index d13ac364f..fbddfd0af 100644
--- a/xlators/features/upcall/src/upcall.c
+++ b/xlators/features/upcall/src/upcall.c
@@ -745,14 +745,13 @@ out:
return 0;
}
-static void
+static void
up_inode_unref(void *data)
{
inode_t *inode = data;
if (inode)
inode_unref(inode);
-
}
static int32_t
@@ -774,8 +773,8 @@ up_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
goto err;
}
- //delay.tv_sec = 600;
- //delay.tv_nsec = 0;
+ // delay.tv_sec = 600;
+ // delay.tv_nsec = 0;
if (!dict_get_int32_sizen(xattr_req, "inode-unref-delay", &delay_time)) {
delay.tv_sec = delay_time;
delay.tv_nsec = 0;
diff --git a/xlators/mount/fuse/src/fuse-bridge.c b/xlators/mount/fuse/src/fuse-bridge.c
index de1905156..822a6bcd1 100644
--- a/xlators/mount/fuse/src/fuse-bridge.c
+++ b/xlators/mount/fuse/src/fuse-bridge.c
@@ -1042,14 +1042,14 @@ fuse_entry_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
feo.attr_valid = calc_timeout_sec(priv->attribute_timeout);
feo.attr_valid_nsec = calc_timeout_nsec(priv->attribute_timeout);
if (!state->wind_from_readdir) {
- #if FUSE_KERNEL_MINOR_VERSION >= 9
- priv->proto_minor >= 9
- ? send_fuse_obj(this, finh, &feo)
- : send_fuse_data(this, finh, &feo, FUSE_COMPAT_ENTRY_OUT_SIZE);
- #else
- send_fuse_obj(this, finh, &feo);
- #endif
- }
+#if FUSE_KERNEL_MINOR_VERSION >= 9
+ priv->proto_minor >= 9
+ ? send_fuse_obj(this, finh, &feo)
+ : send_fuse_data(this, finh, &feo, FUSE_COMPAT_ENTRY_OUT_SIZE);
+#else
+ send_fuse_obj(this, finh, &feo);
+#endif
+ }
} else {
gf_log("glusterfs-fuse",
(op_errno == ENOENT ? GF_LOG_TRACE : GF_LOG_WARNING),
@@ -1060,7 +1060,8 @@ fuse_entry_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
if (!state->wind_from_readdir) {
if ((op_errno == ENOENT) && (priv->negative_timeout != 0)) {
feo.entry_valid = calc_timeout_sec(priv->negative_timeout);
- feo.entry_valid_nsec = calc_timeout_nsec(priv->negative_timeout);
+ feo.entry_valid_nsec = calc_timeout_nsec(
+ priv->negative_timeout);
send_fuse_obj(this, finh, &feo);
} else {
send_fuse_err(this, state->finh, op_errno);
@@ -1153,54 +1154,54 @@ fuse_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
}
UNLOCK(&old_state->lock);
if ((lkup_recvd == (lkup_sent - 1)) && (total_lookup == lkup_sent)) {
+ /* The above condition is true it means this is the last lookup
+ fop wind by readdir fop
+ */
+ fdctx = fd_ctx_get_ptr(old_state->fd, old_state->this);
+ ecache = fdctx->equeue;
+ main_frame = state->main_frame;
+ priv = old_state->this->private;
+
+ if (ecache) {
+ INIT_LIST_HEAD(&local_entries.list);
+ list_for_each_entry_safe(entry, tmp, &ecache->list, list)
+ {
+ size_t fde_size = FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET +
+ (entry->d_len));
+ max_size += fde_size;
- /* The above condition is true it means this is the last lookup
- fop wind by readdir fop
- */
- fdctx = fd_ctx_get_ptr(old_state->fd, old_state->this);
- ecache = fdctx->equeue;
- main_frame = state->main_frame;
- priv = old_state->this->private;
+ if (max_size > old_state->size) {
+ /* we received too many entries to fit in the reply */
+ max_size -= fde_size;
+ break;
+ }
+ }
- if (ecache) {
- INIT_LIST_HEAD(&local_entries.list);
- list_for_each_entry_safe(entry, tmp, &ecache->list, list)
- {
- size_t fde_size = FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (entry->d_len));
- max_size += fde_size;
+ if (max_size == 0)
+ goto skip;
- if (max_size > old_state->size) {
- /* we received too many entries to fit in the reply */
- max_size -= fde_size;
- break;
+ buf = GF_CALLOC(1, max_size, gf_fuse_mt_char);
+ if (!buf) {
+ gf_log("glusterfs-fuse", GF_LOG_DEBUG,
+ ": READDIR => -1 (%s)", strerror(ENOMEM));
+ goto skip;
}
- }
- if (max_size == 0)
- goto skip;
-
- buf = GF_CALLOC(1, max_size, gf_fuse_mt_char);
- if (!buf) {
- gf_log("glusterfs-fuse", GF_LOG_DEBUG,
- ": READDIR => -1 (%s)", strerror(ENOMEM));
- goto skip;
- }
+ list_for_each_entry_safe(entry, tmp, &ecache->list, list)
+ {
+ fde = (struct fuse_dirent *)(buf + size);
+ gf_fuse_fill_dirent(entry, fde, priv->enable_ino32);
+ size += FUSE_DIRENT_SIZE(fde);
+ list_del_init(&entry->list);
+ list_add_tail(&entry->list, &local_entries.list);
- list_for_each_entry_safe(entry, tmp, &ecache->list, list)
- {
- fde = (struct fuse_dirent *)(buf + size);
- gf_fuse_fill_dirent(entry, fde, priv->enable_ino32);
- size += FUSE_DIRENT_SIZE(fde);
- list_del_init(&entry->list);
- list_add_tail(&entry->list, &local_entries.list);
+ if (size == max_size)
+ break;
+ }
- if (size == max_size)
- break;
+ send_fuse_data(old_state->this, old_state->finh, buf, size);
+ gf_dirent_free(&local_entries);
}
-
- send_fuse_data(old_state->this, old_state->finh, buf, size);
- gf_dirent_free(&local_entries);
- }
skip:
state->old_state = NULL;
free_fuse_state(old_state);
@@ -1316,17 +1317,17 @@ fuse_lookup_resume(fuse_state_t *state)
fuse_gfid_set(state);
/* It means the lookup operations winds from readdir */
if (state->wind_from_readdir) {
- /* TODO Current value is 600s fixed but we can provide a configurable
- option
+ /* TODO Current value is 600s fixed but we can provide a
+ configurable option
*/
delay.tv_sec = 600;
delay.tv_nsec = 0;
- /* Take extra ref so that inode will be keep in active list */
+ /* Take extra ref so that inode will be keep in active list */
inode = inode_new(state->loc.parent->table);
table = state->loc.parent->table;
if (table->active_size < 1000000) {
- //if (table->active_size < 1000000) {
- dict_set_int32(state->xdata, "inode-unref-delay", 600);
+ // if (table->active_size < 1000000) {
+ dict_set_int32(state->xdata, "inode-unref-delay", 600);
state->loc.inode = inode_ref(inode);
gf_timer_call_after(this->ctx, delay, fuse_inode_unref, inode);
} else {
@@ -1348,7 +1349,7 @@ fuse_lookup_resume(fuse_state_t *state)
out:
if (!state->wind_from_readdir)
send_fuse_err(state->this, finh, op_errno);
- free_fuse_state(state);
+ free_fuse_state(state);
}
static void
@@ -3727,9 +3728,9 @@ fuse_opendir(xlator_t *this, fuse_in_header_t *finh, void *msg,
fuse_resolve_and_resume(state, fuse_opendir_resume);
}
-
static void
-fuse_readdir_lookup(xlator_t *this, fuse_in_header_t *finh, void *msg, void *old_state, void *main_frame)
+fuse_readdir_lookup(xlator_t *this, fuse_in_header_t *finh, void *msg,
+ void *old_state, void *main_frame)
{
char *name = msg;
fuse_state_t *state = NULL;
@@ -3738,15 +3739,15 @@ fuse_readdir_lookup(xlator_t *this, fuse_in_header_t *finh, void *msg, void *old
if (state) {
state->old_state = old_state;
state->main_frame = main_frame;
- state->wind_from_readdir = _gf_true;
- (void)fuse_resolve_entry_init(state, &state->resolve, finh->nodeid, name);
+ state->wind_from_readdir = _gf_true;
+ (void)fuse_resolve_entry_init(state, &state->resolve, finh->nodeid,
+ name);
fuse_resolve_and_resume(state, fuse_lookup_resume);
}
return;
}
-
static int
fuse_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
@@ -3789,19 +3790,20 @@ fuse_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
readdir_optimize = priv->readdir_optimize;
if (readdir_optimize) {
- fdctx = fd_ctx_get_ptr(state->fd, this);
+ fdctx = fd_ctx_get_ptr(state->fd, this);
ecache = fdctx->equeue;
INIT_LIST_HEAD(&local_entries.list);
list_for_each_entry(orig_entry, (&entries->list), list)
{
- if (!strcmp(orig_entry->d_name, ".") || !strcmp(orig_entry->d_name, ".."))
+ if (!strcmp(orig_entry->d_name, ".") ||
+ !strcmp(orig_entry->d_name, ".."))
continue;
total_lookup++;
}
state->total_lookup = total_lookup;
if ((!total_lookup) || (table->active_size >= 1000000)) {
- //if (!total_lookup) {
+ // if (!total_lookup) {
readdir_optimize = 0;
goto skip_lookup;
}
@@ -3809,12 +3811,13 @@ fuse_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
flag = 1;
list_for_each_entry(orig_entry, (&entries->list), list)
{
- new_entry = gf_dirent_for_name2(orig_entry->d_name, orig_entry->d_len,
- orig_entry->d_ino, orig_entry->d_off,
- orig_entry->d_type, NULL);
+ new_entry = gf_dirent_for_name2(
+ orig_entry->d_name, orig_entry->d_len, orig_entry->d_ino,
+ orig_entry->d_off, orig_entry->d_type, NULL);
list_add_tail(&new_entry->list, &ecache->list);
if (flag) {
- fde_size = FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + new_entry->d_len);
+ fde_size = FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET +
+ new_entry->d_len);
max_size += fde_size;
}
@@ -3822,17 +3825,17 @@ fuse_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
max_size -= fde_size;
flag = 0;
}
- if (!strcmp(orig_entry->d_name, ".") || !strcmp(orig_entry->d_name, ".."))
+ if (!strcmp(orig_entry->d_name, ".") ||
+ !strcmp(orig_entry->d_name, ".."))
continue;
/* Wind a lookup fop for every entry */
fuse_readdir_lookup(this, finh, new_entry->d_name, state, frame);
- }
- return 0;
+ }
+ return 0;
}
skip_lookup:
if (!readdir_optimize) {
-
gf_log("glusterfs-fuse", GF_LOG_TRACE,
"%" PRIu64 ": READDIR => %d/%" GF_PRI_SIZET ",%" PRId64,
frame->root->unique, op_ret, state->size, state->off);
@@ -3876,8 +3879,8 @@ skip_lookup:
send_fuse_data(this, finh, buf, size);
- /* TODO: */
- /* gf_link_inodes_from_dirent (state->fd->inode, entries); */
+ /* TODO: */
+ /* gf_link_inodes_from_dirent (state->fd->inode, entries); */
}
out:
free_fuse_state(state);
@@ -3903,7 +3906,7 @@ fuse_readdir_resume(fuse_state_t *state)
size_t new_size = state->size;
if (priv->readdir_optimize) {
- fdctx = fd_ctx_get_ptr(state->fd, this);
+ fdctx = fd_ctx_get_ptr(state->fd, this);
ecache = fdctx->equeue;
new_size = 131072;
@@ -3917,7 +3920,8 @@ fuse_readdir_resume(fuse_state_t *state)
INIT_LIST_HEAD(&local_entries.list);
list_for_each_entry(entry, &ecache->list, list)
{
- size_t fde_size = FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (entry->d_len));
+ size_t fde_size = FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET +
+ (entry->d_len));
max_size += fde_size;
if (max_size > state->size) {
@@ -3932,8 +3936,8 @@ fuse_readdir_resume(fuse_state_t *state)
buf = GF_CALLOC(1, max_size, gf_fuse_mt_char);
if (!buf) {
- gf_log("glusterfs-fuse", GF_LOG_DEBUG,
- ": READDIR => -1 (%s)", strerror(ENOMEM));
+ gf_log("glusterfs-fuse", GF_LOG_DEBUG, ": READDIR => -1 (%s)",
+ strerror(ENOMEM));
send_fuse_err(this, state->finh, ENOMEM);
goto out;
}
@@ -5460,7 +5464,8 @@ fuse_init(xlator_t *this, fuse_in_header_t *finh, void *msg,
priv->init_recvd = 1;
- gf_log("MY_LOG", GF_LOG_INFO, "readdir-optimize is %d", priv->readdir_optimize);
+ gf_log("MY_LOG", GF_LOG_INFO, "readdir-optimize is %d",
+ priv->readdir_optimize);
if (fini->major != FUSE_KERNEL_VERSION) {
gf_log("glusterfs-fuse", GF_LOG_ERROR,
@@ -6673,7 +6678,8 @@ fuse_priv_dump(xlator_t *this)
if (!this)
return -1;
- private = this->private;
+ private
+ = this->private;
if (!private)
return -1;
@@ -6827,7 +6833,8 @@ notify(xlator_t *this, int32_t event, void *data, ...)
glusterfs_graph_t *graph = NULL;
struct pollfd pfd = {0};
- private = this->private;
+ private
+ = this->private;
graph = data;
@@ -6849,7 +6856,8 @@ notify(xlator_t *this, int32_t event, void *data, ...)
(event == GF_EVENT_CHILD_DOWN)) {
pthread_mutex_lock(&private->sync_mutex);
{
- private->event_recvd = 1;
+ private
+ ->event_recvd = 1;
pthread_cond_broadcast(&private->sync_cond);
}
pthread_mutex_unlock(&private->sync_mutex);
@@ -6858,16 +6866,18 @@ notify(xlator_t *this, int32_t event, void *data, ...)
pthread_mutex_lock(&private->sync_mutex);
{
if (!private->fuse_thread_started) {
- private->fuse_thread_started = 1;
+ private
+ ->fuse_thread_started = 1;
start_thread = _gf_true;
}
}
pthread_mutex_unlock(&private->sync_mutex);
if (start_thread) {
- private->fuse_thread = GF_CALLOC(private->reader_thread_count,
- sizeof(pthread_t),
- gf_fuse_mt_pthread_t);
+ private
+ ->fuse_thread = GF_CALLOC(private->reader_thread_count,
+ sizeof(pthread_t),
+ gf_fuse_mt_pthread_t);
for (i = 0; i < private->reader_thread_count; i++) {
ret = gf_thread_create(&private->fuse_thread[i], NULL,
fuse_thread_proc, this, "fuseproc");
@@ -6901,7 +6911,8 @@ notify(xlator_t *this, int32_t event, void *data, ...)
if (fuse_get_mount_status(this) != 0) {
goto auth_fail_unlock;
}
- private->mount_finished = _gf_true;
+ private
+ ->mount_finished = _gf_true;
} else if (pfd.revents) {
gf_log(this->name, GF_LOG_ERROR,
"mount pipe closed without status");
@@ -7144,7 +7155,8 @@ init(xlator_t *this_xl)
GF_OPTION_INIT("use-readdirp", priv->use_readdirp, bool, cleanup_exit);
if (!priv->use_readdirp)
- GF_OPTION_INIT("readdir-optimize", priv->readdir_optimize, bool, cleanup_exit);
+ GF_OPTION_INIT("readdir-optimize", priv->readdir_optimize, bool,
+ cleanup_exit);
priv->fuse_dump_fd = -1;
ret = dict_get_str(options, "dump-fuse", &value_string);
diff --git a/xlators/mount/fuse/src/fuse-bridge.h b/xlators/mount/fuse/src/fuse-bridge.h
index 4a1f7367c..e3fc38b20 100644
--- a/xlators/mount/fuse/src/fuse-bridge.h
+++ b/xlators/mount/fuse/src/fuse-bridge.h
@@ -130,7 +130,6 @@ struct fuse_private {
/* for using fuse-kernel readdirp*/
gf_boolean_t use_readdirp;
-
/* The option to enable/disable readdir_optimize at fuse level*/
gf_boolean_t readdir_optimize;
/run regression
I haven't checked the patch in detail, but this change was tested by a user (#4176) and the results weren't good at all.
thanks for the information. I will keep it intact for now till we see more testing