glusterfs icon indicating copy to clipboard operation
glusterfs copied to clipboard

core[WIP] Readdir improvement for fuse

Open mohit84 opened this issue 1 year ago • 3 comments

There are multiple readdir improvement with this patch for fuse.

  1. Set the buffer size 128KB during wind a readdir call by fuse
  2. Wind a lookup call for all entries parallel by fuse_readdir_cbk
  3. Keep the inode in active list after taking extra reference for 10m(600s) sothat in next lookup is served by md-cache
  4. Take the reference only while table->active size is less than 1M

Fixes: #4176 Change-Id: Ic9c75799883d94ca473c230e6213cbf00b383684

mohit84 avatar Jul 28 '23 12:07 mohit84

CLANG-FORMAT FAILURE: Before merging the patch, this diff needs to be considered for passing clang-format

index a95a9ce0c..077d0be76 100644
--- a/glusterfsd/src/glusterfsd.c
+++ b/glusterfsd/src/glusterfsd.c
@@ -232,7 +232,8 @@ static struct argp_option gf_options[] = {
     {"use-readdirp", ARGP_FUSE_USE_READDIRP_KEY, "BOOL", OPTION_ARG_OPTIONAL,
      "Use readdirp mode in fuse kernel module"
      " [default: \"yes\"]"},
-    {"readdir-optimize", ARGP_FUSE_USE_READDIR_OPTIMIZE, "BOOL", OPTION_ARG_OPTIONAL,
+    {"readdir-optimize", ARGP_FUSE_USE_READDIR_OPTIMIZE, "BOOL",
+     OPTION_ARG_OPTIONAL,
      "Enable readdir optimize mode for fuse "
      " [default: \"no\"]"},
     {"secure-mgmt", ARGP_SECURE_MGMT_KEY, "BOOL", OPTION_ARG_OPTIONAL,
@@ -1254,8 +1255,8 @@ parse_opts(int key, char *arg, struct argp_state *state)
                 break;
             }
 
-            argp_failure(state, -1, 0, "unknown readdir-optimize setting \"%s\"",
-                         arg);
+            argp_failure(state, -1, 0,
+                         "unknown readdir-optimize setting \"%s\"", arg);
             break;
 
         case ARGP_LOGGER:
diff --git a/libglusterfs/src/glusterfs/glusterfs.h b/libglusterfs/src/glusterfs/glusterfs.h
index b8f66d861..3d709085e 100644
--- a/libglusterfs/src/glusterfs/glusterfs.h
+++ b/libglusterfs/src/glusterfs/glusterfs.h
@@ -63,7 +63,6 @@
 #define O_PATH 010000000 /* from asm-generic/fcntl.h */
 #endif
 
-
 #ifndef EBADFD
 /* Mac OS X does not have EBADFD */
 #define EBADFD EBADF
diff --git a/xlators/features/upcall/src/upcall.c b/xlators/features/upcall/src/upcall.c
index d13ac364f..fbddfd0af 100644
--- a/xlators/features/upcall/src/upcall.c
+++ b/xlators/features/upcall/src/upcall.c
@@ -745,14 +745,13 @@ out:
     return 0;
 }
 
-static void 
+static void
 up_inode_unref(void *data)
 {
     inode_t *inode = data;
 
     if (inode)
         inode_unref(inode);
-
 }
 
 static int32_t
@@ -774,8 +773,8 @@ up_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
         goto err;
     }
 
-    //delay.tv_sec = 600;
-    //delay.tv_nsec = 0;
+    // delay.tv_sec = 600;
+    // delay.tv_nsec = 0;
     if (!dict_get_int32_sizen(xattr_req, "inode-unref-delay", &delay_time)) {
         delay.tv_sec = delay_time;
         delay.tv_nsec = 0;
diff --git a/xlators/mount/fuse/src/fuse-bridge.c b/xlators/mount/fuse/src/fuse-bridge.c
index de1905156..822a6bcd1 100644
--- a/xlators/mount/fuse/src/fuse-bridge.c
+++ b/xlators/mount/fuse/src/fuse-bridge.c
@@ -1042,14 +1042,14 @@ fuse_entry_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
         feo.attr_valid = calc_timeout_sec(priv->attribute_timeout);
         feo.attr_valid_nsec = calc_timeout_nsec(priv->attribute_timeout);
         if (!state->wind_from_readdir) {
-            #if FUSE_KERNEL_MINOR_VERSION >= 9
-                priv->proto_minor >= 9
-                  ? send_fuse_obj(this, finh, &feo)
-                  : send_fuse_data(this, finh, &feo, FUSE_COMPAT_ENTRY_OUT_SIZE);
-            #else
-                send_fuse_obj(this, finh, &feo);
-           #endif
-       }
+#if FUSE_KERNEL_MINOR_VERSION >= 9
+            priv->proto_minor >= 9
+                ? send_fuse_obj(this, finh, &feo)
+                : send_fuse_data(this, finh, &feo, FUSE_COMPAT_ENTRY_OUT_SIZE);
+#else
+            send_fuse_obj(this, finh, &feo);
+#endif
+        }
     } else {
         gf_log("glusterfs-fuse",
                (op_errno == ENOENT ? GF_LOG_TRACE : GF_LOG_WARNING),
@@ -1060,7 +1060,8 @@ fuse_entry_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
         if (!state->wind_from_readdir) {
             if ((op_errno == ENOENT) && (priv->negative_timeout != 0)) {
                 feo.entry_valid = calc_timeout_sec(priv->negative_timeout);
-                feo.entry_valid_nsec = calc_timeout_nsec(priv->negative_timeout);
+                feo.entry_valid_nsec = calc_timeout_nsec(
+                    priv->negative_timeout);
                 send_fuse_obj(this, finh, &feo);
             } else {
                 send_fuse_err(this, state->finh, op_errno);
@@ -1153,54 +1154,54 @@ fuse_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
         }
         UNLOCK(&old_state->lock);
         if ((lkup_recvd == (lkup_sent - 1)) && (total_lookup == lkup_sent)) {
+            /* The above condition is true it means this is the last lookup
+               fop wind by readdir fop
+            */
+            fdctx = fd_ctx_get_ptr(old_state->fd, old_state->this);
+            ecache = fdctx->equeue;
+            main_frame = state->main_frame;
+            priv = old_state->this->private;
+
+            if (ecache) {
+                INIT_LIST_HEAD(&local_entries.list);
+                list_for_each_entry_safe(entry, tmp, &ecache->list, list)
+                {
+                    size_t fde_size = FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET +
+                                                        (entry->d_len));
+                    max_size += fde_size;
 
-        /* The above condition is true it means this is the last lookup
-           fop wind by readdir fop
-        */
-        fdctx =  fd_ctx_get_ptr(old_state->fd, old_state->this);
-        ecache = fdctx->equeue;
-        main_frame = state->main_frame;
-        priv = old_state->this->private;
+                    if (max_size > old_state->size) {
+                        /* we received too many entries to fit in the reply */
+                        max_size -= fde_size;
+                        break;
+                    }
+                }
 
-        if (ecache) {
-            INIT_LIST_HEAD(&local_entries.list);
-            list_for_each_entry_safe(entry, tmp, &ecache->list, list)
-            {
-                size_t fde_size = FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (entry->d_len));
-                max_size += fde_size;
+                if (max_size == 0)
+                    goto skip;
 
-                if (max_size > old_state->size) {
-                    /* we received too many entries to fit in the reply */
-                    max_size -= fde_size;
-                    break;
+                buf = GF_CALLOC(1, max_size, gf_fuse_mt_char);
+                if (!buf) {
+                    gf_log("glusterfs-fuse", GF_LOG_DEBUG,
+                           ": READDIR => -1 (%s)", strerror(ENOMEM));
+                    goto skip;
                 }
-            }
 
-            if (max_size == 0)
-               goto skip;
-
-            buf = GF_CALLOC(1, max_size, gf_fuse_mt_char);
-            if (!buf) {
-                gf_log("glusterfs-fuse", GF_LOG_DEBUG,
-                       ": READDIR => -1 (%s)", strerror(ENOMEM));
-                goto skip;
-            }
+                list_for_each_entry_safe(entry, tmp, &ecache->list, list)
+                {
+                    fde = (struct fuse_dirent *)(buf + size);
+                    gf_fuse_fill_dirent(entry, fde, priv->enable_ino32);
+                    size += FUSE_DIRENT_SIZE(fde);
+                    list_del_init(&entry->list);
+                    list_add_tail(&entry->list, &local_entries.list);
 
-            list_for_each_entry_safe(entry, tmp, &ecache->list, list)
-            {
-                fde = (struct fuse_dirent *)(buf + size);
-                gf_fuse_fill_dirent(entry, fde, priv->enable_ino32);
-                size += FUSE_DIRENT_SIZE(fde);
-                list_del_init(&entry->list);
-                list_add_tail(&entry->list, &local_entries.list);
+                    if (size == max_size)
+                        break;
+                }
 
-                if (size == max_size)
-                    break;
+                send_fuse_data(old_state->this, old_state->finh, buf, size);
+                gf_dirent_free(&local_entries);
             }
-
-            send_fuse_data(old_state->this, old_state->finh, buf, size);
-            gf_dirent_free(&local_entries);
-        }
         skip:
             state->old_state = NULL;
             free_fuse_state(old_state);
@@ -1316,17 +1317,17 @@ fuse_lookup_resume(fuse_state_t *state)
         fuse_gfid_set(state);
         /* It means the lookup operations winds from readdir */
         if (state->wind_from_readdir) {
-            /* TODO Current value is 600s fixed but we can provide a configurable
-               option
+            /* TODO Current value is 600s fixed but we can provide a
+               configurable option
             */
             delay.tv_sec = 600;
             delay.tv_nsec = 0;
-            /* Take extra ref so that inode will be keep in active list */ 
+            /* Take extra ref so that inode will be keep in active list */
             inode = inode_new(state->loc.parent->table);
             table = state->loc.parent->table;
             if (table->active_size < 1000000) {
-            //if (table->active_size < 1000000) {
-              dict_set_int32(state->xdata, "inode-unref-delay", 600);
+                // if (table->active_size < 1000000) {
+                dict_set_int32(state->xdata, "inode-unref-delay", 600);
                 state->loc.inode = inode_ref(inode);
                 gf_timer_call_after(this->ctx, delay, fuse_inode_unref, inode);
             } else {
@@ -1348,7 +1349,7 @@ fuse_lookup_resume(fuse_state_t *state)
 out:
     if (!state->wind_from_readdir)
         send_fuse_err(state->this, finh, op_errno);
-    free_fuse_state(state);   
+    free_fuse_state(state);
 }
 
 static void
@@ -3727,9 +3728,9 @@ fuse_opendir(xlator_t *this, fuse_in_header_t *finh, void *msg,
     fuse_resolve_and_resume(state, fuse_opendir_resume);
 }
 
-
 static void
-fuse_readdir_lookup(xlator_t *this, fuse_in_header_t *finh, void *msg, void *old_state, void *main_frame)
+fuse_readdir_lookup(xlator_t *this, fuse_in_header_t *finh, void *msg,
+                    void *old_state, void *main_frame)
 {
     char *name = msg;
     fuse_state_t *state = NULL;
@@ -3738,15 +3739,15 @@ fuse_readdir_lookup(xlator_t *this, fuse_in_header_t *finh, void *msg, void *old
     if (state) {
         state->old_state = old_state;
         state->main_frame = main_frame;
-        state->wind_from_readdir = _gf_true; 
-        (void)fuse_resolve_entry_init(state, &state->resolve, finh->nodeid, name);
+        state->wind_from_readdir = _gf_true;
+        (void)fuse_resolve_entry_init(state, &state->resolve, finh->nodeid,
+                                      name);
 
         fuse_resolve_and_resume(state, fuse_lookup_resume);
     }
     return;
 }
 
-
 static int
 fuse_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
                  int32_t op_ret, int32_t op_errno, gf_dirent_t *entries,
@@ -3789,19 +3790,20 @@ fuse_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
 
     readdir_optimize = priv->readdir_optimize;
     if (readdir_optimize) {
-        fdctx =  fd_ctx_get_ptr(state->fd, this);
+        fdctx = fd_ctx_get_ptr(state->fd, this);
         ecache = fdctx->equeue;
         INIT_LIST_HEAD(&local_entries.list);
 
         list_for_each_entry(orig_entry, (&entries->list), list)
         {
-            if (!strcmp(orig_entry->d_name, ".") || !strcmp(orig_entry->d_name, ".."))
+            if (!strcmp(orig_entry->d_name, ".") ||
+                !strcmp(orig_entry->d_name, ".."))
                 continue;
             total_lookup++;
         }
         state->total_lookup = total_lookup;
         if ((!total_lookup) || (table->active_size >= 1000000)) {
-        //if (!total_lookup) {
+            // if (!total_lookup) {
             readdir_optimize = 0;
             goto skip_lookup;
         }
@@ -3809,12 +3811,13 @@ fuse_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
         flag = 1;
         list_for_each_entry(orig_entry, (&entries->list), list)
         {
-            new_entry = gf_dirent_for_name2(orig_entry->d_name, orig_entry->d_len,
-                                            orig_entry->d_ino, orig_entry->d_off,
-                                            orig_entry->d_type, NULL);
+            new_entry = gf_dirent_for_name2(
+                orig_entry->d_name, orig_entry->d_len, orig_entry->d_ino,
+                orig_entry->d_off, orig_entry->d_type, NULL);
             list_add_tail(&new_entry->list, &ecache->list);
             if (flag) {
-                fde_size = FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + new_entry->d_len);
+                fde_size = FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET +
+                                             new_entry->d_len);
                 max_size += fde_size;
             }
 
@@ -3822,17 +3825,17 @@ fuse_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
                 max_size -= fde_size;
                 flag = 0;
             }
-            if (!strcmp(orig_entry->d_name, ".") || !strcmp(orig_entry->d_name, ".."))
+            if (!strcmp(orig_entry->d_name, ".") ||
+                !strcmp(orig_entry->d_name, ".."))
                 continue;
             /* Wind a lookup fop for every entry */
             fuse_readdir_lookup(this, finh, new_entry->d_name, state, frame);
-       }
-       return 0;
+        }
+        return 0;
     }
 
 skip_lookup:
     if (!readdir_optimize) {
-
         gf_log("glusterfs-fuse", GF_LOG_TRACE,
                "%" PRIu64 ": READDIR => %d/%" GF_PRI_SIZET ",%" PRId64,
                frame->root->unique, op_ret, state->size, state->off);
@@ -3876,8 +3879,8 @@ skip_lookup:
 
         send_fuse_data(this, finh, buf, size);
 
-       /* TODO: */
-       /* gf_link_inodes_from_dirent (state->fd->inode, entries); */
+        /* TODO: */
+        /* gf_link_inodes_from_dirent (state->fd->inode, entries); */
     }
 out:
     free_fuse_state(state);
@@ -3903,7 +3906,7 @@ fuse_readdir_resume(fuse_state_t *state)
     size_t new_size = state->size;
 
     if (priv->readdir_optimize) {
-        fdctx =  fd_ctx_get_ptr(state->fd, this);
+        fdctx = fd_ctx_get_ptr(state->fd, this);
         ecache = fdctx->equeue;
 
         new_size = 131072;
@@ -3917,7 +3920,8 @@ fuse_readdir_resume(fuse_state_t *state)
         INIT_LIST_HEAD(&local_entries.list);
         list_for_each_entry(entry, &ecache->list, list)
         {
-            size_t fde_size = FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (entry->d_len));
+            size_t fde_size = FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET +
+                                                (entry->d_len));
             max_size += fde_size;
 
             if (max_size > state->size) {
@@ -3932,8 +3936,8 @@ fuse_readdir_resume(fuse_state_t *state)
 
         buf = GF_CALLOC(1, max_size, gf_fuse_mt_char);
         if (!buf) {
-            gf_log("glusterfs-fuse", GF_LOG_DEBUG,
-                   ": READDIR => -1 (%s)", strerror(ENOMEM));
+            gf_log("glusterfs-fuse", GF_LOG_DEBUG, ": READDIR => -1 (%s)",
+                   strerror(ENOMEM));
             send_fuse_err(this, state->finh, ENOMEM);
             goto out;
         }
@@ -5460,7 +5464,8 @@ fuse_init(xlator_t *this, fuse_in_header_t *finh, void *msg,
 
     priv->init_recvd = 1;
 
-    gf_log("MY_LOG", GF_LOG_INFO, "readdir-optimize is %d", priv->readdir_optimize);
+    gf_log("MY_LOG", GF_LOG_INFO, "readdir-optimize is %d",
+           priv->readdir_optimize);
 
     if (fini->major != FUSE_KERNEL_VERSION) {
         gf_log("glusterfs-fuse", GF_LOG_ERROR,
@@ -6673,7 +6678,8 @@ fuse_priv_dump(xlator_t *this)
     if (!this)
         return -1;
 
-    private = this->private;
+   private
+    = this->private;
 
     if (!private)
         return -1;
@@ -6827,7 +6833,8 @@ notify(xlator_t *this, int32_t event, void *data, ...)
     glusterfs_graph_t *graph = NULL;
     struct pollfd pfd = {0};
 
-    private = this->private;
+   private
+    = this->private;
 
     graph = data;
 
@@ -6849,7 +6856,8 @@ notify(xlator_t *this, int32_t event, void *data, ...)
                 (event == GF_EVENT_CHILD_DOWN)) {
                 pthread_mutex_lock(&private->sync_mutex);
                 {
-                    private->event_recvd = 1;
+                   private
+                    ->event_recvd = 1;
                     pthread_cond_broadcast(&private->sync_cond);
                 }
                 pthread_mutex_unlock(&private->sync_mutex);
@@ -6858,16 +6866,18 @@ notify(xlator_t *this, int32_t event, void *data, ...)
             pthread_mutex_lock(&private->sync_mutex);
             {
                 if (!private->fuse_thread_started) {
-                    private->fuse_thread_started = 1;
+                   private
+                    ->fuse_thread_started = 1;
                     start_thread = _gf_true;
                 }
             }
             pthread_mutex_unlock(&private->sync_mutex);
 
             if (start_thread) {
-                private->fuse_thread = GF_CALLOC(private->reader_thread_count,
-                                                 sizeof(pthread_t),
-                                                 gf_fuse_mt_pthread_t);
+               private
+                ->fuse_thread = GF_CALLOC(private->reader_thread_count,
+                                          sizeof(pthread_t),
+                                          gf_fuse_mt_pthread_t);
                 for (i = 0; i < private->reader_thread_count; i++) {
                     ret = gf_thread_create(&private->fuse_thread[i], NULL,
                                            fuse_thread_proc, this, "fuseproc");
@@ -6901,7 +6911,8 @@ notify(xlator_t *this, int32_t event, void *data, ...)
                         if (fuse_get_mount_status(this) != 0) {
                             goto auth_fail_unlock;
                         }
-                        private->mount_finished = _gf_true;
+                       private
+                        ->mount_finished = _gf_true;
                     } else if (pfd.revents) {
                         gf_log(this->name, GF_LOG_ERROR,
                                "mount pipe closed without status");
@@ -7144,7 +7155,8 @@ init(xlator_t *this_xl)
 
     GF_OPTION_INIT("use-readdirp", priv->use_readdirp, bool, cleanup_exit);
     if (!priv->use_readdirp)
-        GF_OPTION_INIT("readdir-optimize", priv->readdir_optimize, bool, cleanup_exit);
+        GF_OPTION_INIT("readdir-optimize", priv->readdir_optimize, bool,
+                       cleanup_exit);
 
     priv->fuse_dump_fd = -1;
     ret = dict_get_str(options, "dump-fuse", &value_string);
diff --git a/xlators/mount/fuse/src/fuse-bridge.h b/xlators/mount/fuse/src/fuse-bridge.h
index 4a1f7367c..e3fc38b20 100644
--- a/xlators/mount/fuse/src/fuse-bridge.h
+++ b/xlators/mount/fuse/src/fuse-bridge.h
@@ -130,7 +130,6 @@ struct fuse_private {
     /* for using fuse-kernel readdirp*/
     gf_boolean_t use_readdirp;
 
-
     /* The option to enable/disable readdir_optimize at fuse level*/
     gf_boolean_t readdir_optimize;
 

gluster-ant avatar Jul 28 '23 12:07 gluster-ant

/run regression

amarts avatar Sep 22 '23 08:09 amarts

I haven't checked the patch in detail, but this change was tested by a user (#4176) and the results weren't good at all.

thanks for the information. I will keep it intact for now till we see more testing

amarts avatar Sep 22 '23 14:09 amarts