illumos-gate Cdiff usr/src/lib/libzfs/common/libzfs

Print this page

8115 parallel zfs mount


*** 20,30 ****
   */
  
  /*
   * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
!  * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
   * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
   * Copyright 2017 Joyent, Inc.
   * Copyright 2017 RackTop Systems.
   */
  
--- 20,30 ----
   */
  
  /*
   * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
!  * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
   * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
   * Copyright 2017 Joyent, Inc.
   * Copyright 2017 RackTop Systems.
   */
  
*** 77,95 ****
--- 77,99 ----
  #include <zone.h>
  #include <sys/mntent.h>
  #include <sys/mount.h>
  #include <sys/stat.h>
  #include <sys/statvfs.h>
+ #include <sys/taskq.h>
  
  #include <libzfs.h>
  
  #include "libzfs_impl.h"
  
  #include <libshare.h>
  #include <sys/systeminfo.h>
  #define MAXISALEN       257     /* based on sysinfo(2) man page */
  
+ static int mount_tq_nthr = 512; /* taskq threads for multi-threaded mounting */
+ 
+ static void zfs_mount_task(void *);
  static int zfs_share_proto(zfs_handle_t *, zfs_share_proto_t *);
  zfs_share_type_t zfs_is_shared_proto(zfs_handle_t *, char **,
      zfs_share_proto_t);
  
  /*
*** 1075,1103 ****
                   */
                  (void) rmdir(mountpoint);
          }
  }
  
  void
  libzfs_add_handle(get_all_cb_t *cbp, zfs_handle_t *zhp)
  {
          if (cbp->cb_alloc == cbp->cb_used) {
                  size_t newsz;
!                 void *ptr;
  
!                 newsz = cbp->cb_alloc ? cbp->cb_alloc * 2 : 64;
!                 ptr = zfs_realloc(zhp->zfs_hdl,
!                     cbp->cb_handles, cbp->cb_alloc * sizeof (void *),
!                     newsz * sizeof (void *));
!                 cbp->cb_handles = ptr;
                  cbp->cb_alloc = newsz;
          }
          cbp->cb_handles[cbp->cb_used++] = zhp;
  }
  
  static int
! mount_cb(zfs_handle_t *zhp, void *data)
  {
          get_all_cb_t *cbp = data;
  
          if (!(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM)) {
                  zfs_close(zhp);
--- 1079,1114 ----
                   */
                  (void) rmdir(mountpoint);
          }
  }
  
+ /*
+  * Add the given zfs handle to the cb_handles array, dynamically reallocating
+  * the array if it is out of space.
+  *
+  * The array is grown only when cb_used has caught up with cb_alloc: the
+  * first allocation holds 64 entries and every later one doubles the
+  * capacity. The allocation is made through zhp's libzfs handle. The handle
+  * pointer itself is stored; it is neither copied nor closed here.
+  *
+  * NOTE(review): the pointer returned by zfs_realloc() is stored and then
+  * indexed without a NULL check -- confirm that zfs_realloc() cannot return
+  * NULL to this caller.
+  */
  void
  libzfs_add_handle(get_all_cb_t *cbp, zfs_handle_t *zhp)
  {
          if (cbp->cb_alloc == cbp->cb_used) {
                  size_t newsz;
!                 zfs_handle_t **newhandles;
  
!                 newsz = cbp->cb_alloc != 0 ? cbp->cb_alloc * 2 : 64;
!                 newhandles = zfs_realloc(zhp->zfs_hdl,
!                     cbp->cb_handles, cbp->cb_alloc * sizeof (zfs_handle_t *),
!                     newsz * sizeof (zfs_handle_t *));
!                 cbp->cb_handles = newhandles;
                  cbp->cb_alloc = newsz;
          }
          cbp->cb_handles[cbp->cb_used++] = zhp;
  }
  
+ /*
+  * Recursive helper function used during file system enumeration
+  */
  static int
! zfs_iter_cb(zfs_handle_t *zhp, void *data)
  {
          get_all_cb_t *cbp = data;
  
          if (!(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM)) {
                  zfs_close(zhp);
*** 1119,1226 ****
                  zfs_close(zhp);
                  return (0);
          }
  
          libzfs_add_handle(cbp, zhp);
!         if (zfs_iter_filesystems(zhp, mount_cb, cbp) != 0) {
                  zfs_close(zhp);
                  return (-1);
          }
          return (0);
  }
  
  int
! libzfs_dataset_cmp(const void *a, const void *b)
  {
!         zfs_handle_t **za = (zfs_handle_t **)a;
!         zfs_handle_t **zb = (zfs_handle_t **)b;
          char mounta[MAXPATHLEN];
          char mountb[MAXPATHLEN];
          boolean_t gota, gotb;
  
!         if ((gota = (zfs_get_type(*za) == ZFS_TYPE_FILESYSTEM)) != 0)
!                 verify(zfs_prop_get(*za, ZFS_PROP_MOUNTPOINT, mounta,
                      sizeof (mounta), NULL, NULL, 0, B_FALSE) == 0);
!         if ((gotb = (zfs_get_type(*zb) == ZFS_TYPE_FILESYSTEM)) != 0)
!                 verify(zfs_prop_get(*zb, ZFS_PROP_MOUNTPOINT, mountb,
                      sizeof (mountb), NULL, NULL, 0, B_FALSE) == 0);
  
!         if (gota && gotb)
!                 return (strcmp(mounta, mountb));
  
          if (gota)
                  return (-1);
          if (gotb)
                  return (1);
  
!         return (strcmp(zfs_get_name(a), zfs_get_name(b)));
  }
  
  /*
   * Mount and share all datasets within the given pool.  This assumes that no
!  * datasets within the pool are currently mounted.  Because users can create
!  * complicated nested hierarchies of mountpoints, we first gather all the
!  * datasets and mountpoints within the pool, and sort them by mountpoint.  Once
!  * we have the list of all filesystems, we iterate over them in order and mount
!  * and/or share each one.
   */
  #pragma weak zpool_mount_datasets = zpool_enable_datasets
  int
  zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags)
  {
          get_all_cb_t cb = { 0 };
!         libzfs_handle_t *hdl = zhp->zpool_hdl;
          zfs_handle_t *zfsp;
!         int i, ret = -1;
!         int *good;
  
!         /*
!          * Gather all non-snap datasets within the pool.
!          */
!         if ((zfsp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_DATASET)) == NULL)
                  goto out;
  
-         libzfs_add_handle(&cb, zfsp);
-         if (zfs_iter_filesystems(zfsp, mount_cb, &cb) != 0)
-                 goto out;
-         /*
-          * Sort the datasets by mountpoint.
-          */
-         qsort(cb.cb_handles, cb.cb_used, sizeof (void *),
-             libzfs_dataset_cmp);
  
          /*
!          * And mount all the datasets, keeping track of which ones
!          * succeeded or failed.
           */
!         if ((good = zfs_alloc(zhp->zpool_hdl,
!             cb.cb_used * sizeof (int))) == NULL)
                  goto out;
  
!         ret = 0;
!         for (i = 0; i < cb.cb_used; i++) {
!                 if (zfs_mount(cb.cb_handles[i], mntopts, flags) != 0)
!                         ret = -1;
!                 else
!                         good[i] = 1;
!         }
  
          /*
!          * Then share all the ones that need to be shared. This needs
!          * to be a separate pass in order to avoid excessive reloading
!          * of the configuration. Good should never be NULL since
!          * zfs_alloc is supposed to exit if memory isn't available.
           */
!         for (i = 0; i < cb.cb_used; i++) {
!                 if (good[i] && zfs_share(cb.cb_handles[i]) != 0)
!                         ret = -1;
!         }
  
!         free(good);
  
  out:
!         for (i = 0; i < cb.cb_used; i++)
                  zfs_close(cb.cb_handles[i]);
          free(cb.cb_handles);
  
          return (ret);
  }
--- 1130,1482 ----
                  zfs_close(zhp);
                  return (0);
          }
  
          libzfs_add_handle(cbp, zhp);
!         if (zfs_iter_filesystems(zhp, zfs_iter_cb, cbp) != 0) {
                  zfs_close(zhp);
                  return (-1);
          }
          return (0);
  }
  
+ /*
+  * Sort comparator that compares two mountpoint paths. We sort these paths so
+  * that subdirectories immediately follow their parents. This means that we
+  * effectively treat the '/' character as the lowest value non-nul char. An
+  * example sorted list using this comparator would look like:
+  *
+  * /foo
+  * /foo/bar
+  * /foo/bar/baz
+  * /foo/baz
+  * /foo.bar
+  *
+  * The mounting code depends on this ordering to deterministically iterate
+  * over filesystems in order to spawn parallel mount tasks.
+  *
+  * Both arguments point at elements of an array of zfs_handle_t pointers,
+  * as handed out by qsort(). Only handles whose type is exactly
+  * ZFS_TYPE_FILESYSTEM are compared by mountpoint; such handles sort before
+  * every other handle, and two handles that are both of another type are
+  * ordered by dataset name with strcmp().
+  *
+  * Returns a negative value, zero or a positive value when the first handle
+  * sorts before, equal to or after the second.
+  */
  int
! mountpoint_cmp(const void *arga, const void *argb)
  {
!         zfs_handle_t *const *zap = arga;
!         zfs_handle_t *za = *zap;
!         zfs_handle_t *const *zbp = argb;
!         zfs_handle_t *zb = *zbp;
          char mounta[MAXPATHLEN];
          char mountb[MAXPATHLEN];
+         const char *a = mounta;
+         const char *b = mountb;
          boolean_t gota, gotb;
  
!         gota = (zfs_get_type(za) == ZFS_TYPE_FILESYSTEM);
!         if (gota) {
!                 verify(zfs_prop_get(za, ZFS_PROP_MOUNTPOINT, mounta,
                      sizeof (mounta), NULL, NULL, 0, B_FALSE) == 0);
!         }
!         gotb = (zfs_get_type(zb) == ZFS_TYPE_FILESYSTEM);
!         if (gotb) {
!                 verify(zfs_prop_get(zb, ZFS_PROP_MOUNTPOINT, mountb,
                      sizeof (mountb), NULL, NULL, 0, B_FALSE) == 0);
+         }
  
!         if (gota && gotb) {
!                 while (*a != '\0' && (*a == *b)) {      /* skip common prefix */
!                         a++;
!                         b++;
!                 }
!                 if (*a == *b)           /* both ended: identical paths */
!                         return (0);
!                 if (*a == '\0')         /* a is a proper prefix of b */
!                         return (-1);
!                 if (*b == '\0')         /* b is a proper prefix of a */
!                         return (1);
!                 if (*a == '/')          /* '/' sorts below any other char */
!                         return (-1);
!                 if (*b == '/')
!                         return (1);
!                 return (*a < *b ? -1 : *a > *b);
!         }
  
          if (gota)
                  return (-1);
          if (gotb)
                  return (1);
  
!         /*
!          * If neither filesystem has a mountpoint, revert to sorting by
!          * dataset name.
!          */
!         return (strcmp(zfs_get_name(za), zfs_get_name(zb)));
  }
  
  /*
+  * Return true if path2 is a child of path1, that is, if path2 begins with
+  * path1 immediately followed by a '/' separator. A path is not a child of
+  * itself.
+  *
+  * The root directory "/" already ends in the separator, so the generic
+  * test (which looks for a '/' right after the prefix) would never match it
+  * and filesystems mounted below a root-mounted dataset would be treated as
+  * unrelated to it. It is therefore handled explicitly: every path that
+  * starts with '/' and is longer than "/" is a child of "/".
+  */
+ static boolean_t
+ libzfs_path_contains(const char *path1, const char *path2)
+ {
+         size_t len = strlen(path1);
+ 
+         if (strcmp(path1, "/") == 0)
+                 return (path2[0] == '/' && path2[1] != '\0');
+ 
+         /*
+          * Anchored prefix comparison. When it succeeds path2 is at least
+          * len bytes long, so reading path2[len] is within the string.
+          */
+         return (strncmp(path2, path1, len) == 0 && path2[len] == '/');
+ }
+ 
+ /*
+  * Given a mountpoint specified by idx in the handles array, find the first
+  * non-descendant of that mountpoint and return its index. Descendant paths
+  * start with the parent's path. This function relies on the ordering
+  * enforced by mountpoint_cmp().
+  *
+  * The scan starts at idx + 1 and stops at the first entry whose mountpoint
+  * is not contained in the mountpoint of handles[idx]. If every remaining
+  * entry is a descendant, num_handles is returned.
+  */
+ static int
+ non_descendant_idx(zfs_handle_t **handles, size_t num_handles, int idx)
+ {
+         char parent[ZFS_MAXPROPLEN];
+         char child[ZFS_MAXPROPLEN];
+         int i;
+ 
+         verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, parent,
+             sizeof (parent), NULL, NULL, 0, B_FALSE) == 0);
+ 
+         for (i = idx + 1; i < num_handles; i++) {
+                 verify(zfs_prop_get(handles[i], ZFS_PROP_MOUNTPOINT, child,
+                     sizeof (child), NULL, NULL, 0, B_FALSE) == 0);
+                 if (!libzfs_path_contains(parent, child))
+                         break;
+         }
+         return (i);
+ }
+ 
+ typedef struct mnt_param {      /* argument block for zfs_mount_task() */
+         libzfs_handle_t *mnt_hdl;       /* used to allocate child params */
+         taskq_t         *mnt_tq;        /* taskq child tasks are sent to */
+         zfs_handle_t    **mnt_zhps; /* filesystems to mount */
+         size_t          mnt_num_handles;        /* entries in mnt_zhps */
+         int             mnt_idx;        /* Index of selected entry to mount */
+         zfs_iter_f      mnt_func;       /* callback run on the entry */
+         void            *mnt_data;      /* opaque argument for mnt_func */
+ } mnt_param_t;
+ 
+ /*
+  * Allocate and populate the parameter struct for mount function, and
+  * schedule mounting of the entry selected by idx.
+  *
+  * The struct is allocated with zfs_alloc() on hdl and handed to
+  * zfs_mount_task() through the taskq; that task calls free() on it. The
+  * handles array and data pointer are stored by reference, not copied.
+  *
+  * NOTE(review): the value returned by taskq_dispatch() is discarded. If a
+  * TQ_SLEEP dispatch can fail here, the entry would not be processed and
+  * the parameter struct would not be freed -- confirm.
+  */
+ static void
+ zfs_dispatch_mount(libzfs_handle_t *hdl, zfs_handle_t **handles,
+     size_t num_handles, int idx, zfs_iter_f func, void *data, taskq_t *tq)
+ {
+         mnt_param_t *mnt_param = zfs_alloc(hdl, sizeof (mnt_param_t));
+ 
+         mnt_param->mnt_hdl = hdl;
+         mnt_param->mnt_tq = tq;
+         mnt_param->mnt_zhps = handles;
+         mnt_param->mnt_num_handles = num_handles;
+         mnt_param->mnt_idx = idx;
+         mnt_param->mnt_func = func;
+         mnt_param->mnt_data = data;
+ 
+         (void) taskq_dispatch(tq, zfs_mount_task, (void*)mnt_param, TQ_SLEEP);
+ }
+ 
+ /*
+  * This is the structure used to keep state of mounting or sharing operations
+  * during a call to zpool_enable_datasets(). A pointer to it is passed as the
+  * opaque callback argument to zfs_mount_one() and zfs_share_one().
+  */
+ typedef struct mount_state {
+         /*
+          * ms_mntstatus is set to -1 if any mount fails. While multiple threads
+          * could update this variable concurrently, no synchronization is
+          * needed as it's only ever set to -1. zfs_share_one() sets it to -1
+          * in the same way when a share fails.
+          */
+         int             ms_mntstatus;
+         int             ms_mntflags;    /* flags passed to zfs_mount() */
+         const char      *ms_mntopts;    /* options passed to zfs_mount() */
+ } mount_state_t;
+ 
+ static int      /* callback: mount zhp using the options and flags in arg */
+ zfs_mount_one(zfs_handle_t *zhp, void *arg)     /* arg is a mount_state_t */
+ {
+         mount_state_t *ms = arg;
+         int ret = 0;
+ 
+         if (zfs_mount(zhp, ms->ms_mntopts, ms->ms_mntflags) != 0)
+                 ret = ms->ms_mntstatus = -1;    /* also latch in shared state */
+         return (ret);   /* 0 on success, -1 if zfs_mount() failed */
+ }
+ 
+ static int      /* callback: share zhp and record a failure in arg */
+ zfs_share_one(zfs_handle_t *zhp, void *arg)     /* arg is a mount_state_t */
+ {
+         mount_state_t *ms = arg;
+         int ret = 0;
+ 
+         if (zfs_share(zhp) != 0)
+                 ret = ms->ms_mntstatus = -1;    /* also latch in shared state */
+         return (ret);   /* 0 on success, -1 if zfs_share() failed */
+ }
+ 
+ /*
+  * Task queue function to mount one file system. On completion, it finds and
+  * schedules its children to be mounted. This depends on the sorting done in
+  * zfs_foreach_mountpoint(). Note that the degenerate case (chain of entries
+  * each descending from the previous) will have no parallelism since we always
+  * have to wait for the parent to finish mounting before we can schedule
+  * its children.
+  *
+  * arg is the mnt_param_t allocated by zfs_dispatch_mount(). This task frees
+  * it on every exit path, including the one taken when the callback fails.
+  * When the callback returns non-zero, no task is dispatched for any entry
+  * whose mountpoint lies underneath this one.
+  */
+ static void
+ zfs_mount_task(void *arg)
+ {
+         mnt_param_t *mp = arg;
+         int idx = mp->mnt_idx;
+         zfs_handle_t **handles = mp->mnt_zhps;
+         size_t num_handles = mp->mnt_num_handles;
+         char mountpoint[ZFS_MAXPROPLEN];
+ 
+         verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, mountpoint,
+             sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0);
+ 
+         /*
+          * On failure, leave the descendants alone but still fall through
+          * to the cleanup below so the parameter block is not leaked.
+          */
+         if (mp->mnt_func(handles[idx], mp->mnt_data) != 0)
+                 goto out;
+ 
+         /*
+          * We dispatch tasks to mount filesystems with mountpoints underneath
+          * this one. We do this by dispatching the next filesystem with a
+          * descendant mountpoint of the one we just mounted, then skip all of
+          * its descendants, dispatch the next descendant mountpoint, and so on.
+          * The non_descendant_idx() function skips over filesystems that are
+          * descendants of the filesystem we just dispatched.
+          */
+         for (int i = idx + 1; i < num_handles;
+             i = non_descendant_idx(handles, num_handles, i)) {
+                 char child[ZFS_MAXPROPLEN];
+                 verify(zfs_prop_get(handles[i], ZFS_PROP_MOUNTPOINT,
+                     child, sizeof (child), NULL, NULL, 0, B_FALSE) == 0);
+ 
+                 if (!libzfs_path_contains(mountpoint, child))
+                         break; /* not a descendant, return */
+                 zfs_dispatch_mount(mp->mnt_hdl, handles, num_handles, i,
+                     mp->mnt_func, mp->mnt_data, mp->mnt_tq);
+         }
+ 
+ out:
+         free(mp);
+ }
+ 
+ /*
+  * Issue the func callback for each ZFS handle contained in the handles
+  * array. This function is used to mount all datasets, and so this function
+  * guarantees that filesystems for parent mountpoints are called before their
+  * children. As such, before issuing any callbacks, we first sort the array
+  * of handles by mountpoint. The sort is done in place, so the caller's
+  * array is reordered.
+  *
+  * Callbacks are issued in one of two ways:
+  *
+  * 1. Sequentially: If the parallel argument is B_FALSE or the ZFS_SERIAL_MOUNT
+  *    environment variable is set, then we issue callbacks sequentially. The
+  *    value returned by each callback is ignored, so every handle is visited
+  *    even if an earlier callback failed.
+  *
+  * 2. In parallel: If the parallel argument is B_TRUE and the ZFS_SERIAL_MOUNT
+  *    environment variable is not set, then we use a taskq to dispatch threads
+  *    to mount filesystems in parallel. This function dispatches tasks to mount
+  *    the filesystems at the top-level mountpoints, and these tasks in turn
+  *    are responsible for recursively mounting filesystems in their children
+  *    mountpoints.
+  *
+  * In both cases this function does not return until the work it started
+  * has finished. Nothing is returned; callers collect status through data.
+  *
+  * NOTE(review): the pointer returned by taskq_create() is used without a
+  * NULL check -- confirm that it cannot fail in this environment.
+  */
+ void
+ zfs_foreach_mountpoint(libzfs_handle_t *hdl, zfs_handle_t **handles,
+     size_t num_handles, zfs_iter_f func, void *data, boolean_t parallel)
+ {
+         /*
+          * The ZFS_SERIAL_MOUNT environment variable is an undocumented
+          * variable that can be used as a convenience to do a/b comparison
+          * of serial vs. parallel mounting.
+          */
+         boolean_t serial_mount = !parallel ||
+             (getenv("ZFS_SERIAL_MOUNT") != NULL);
+ 
+         /*
+          * Sort the datasets by mountpoint. See mountpoint_cmp for details
+          * of how these are sorted.
+          */
+         qsort(handles, num_handles, sizeof (zfs_handle_t *), mountpoint_cmp);
+ 
+         if (serial_mount) {
+                 for (int i = 0; i < num_handles; i++) {
+                         func(handles[i], data);
+                 }
+                 return;
+         }
+ 
+         /*
+          * Issue the callback function for each dataset using a parallel
+          * algorithm that uses a taskq to manage threads.
+          */
+         taskq_t *tq = taskq_create("mount_taskq", mount_tq_nthr, 0,
+             mount_tq_nthr, mount_tq_nthr, TASKQ_DYNAMIC | TASKQ_PREPOPULATE);
+ 
+         /*
+          * There may be multiple "top level" mountpoints outside of the pool's
+          * root mountpoint, e.g.: /foo /bar. Dispatch a mount task for each of
+          * these.
+          */
+         for (int i = 0; i < num_handles;
+             i = non_descendant_idx(handles, num_handles, i)) {
+                 zfs_dispatch_mount(hdl, handles, num_handles, i, func, data,
+                     tq);
+         }
+ 
+         taskq_wait(tq); /* wait for all scheduled mounts to complete */
+         taskq_destroy(tq);
+ }
+ 
+ /*
   * Mount and share all datasets within the given pool.  This assumes that no
!  * datasets within the pool are currently mounted.
!  *
!  * Returns 0 if the datasets were enumerated and every mount and share
!  * callback succeeded. Returns -1 if the pool's root dataset could not be
!  * opened, the child filesystems could not be enumerated, or any mount or
!  * share failed. If zfs_init_libshare_arg() fails, its non-zero return value
!  * is returned and no sharing is attempted.
!  *
!  * All handles gathered here are closed before returning.
   */
  #pragma weak zpool_mount_datasets = zpool_enable_datasets
  int
  zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags)
  {
          get_all_cb_t cb = { 0 };
!         mount_state_t ms = { 0 };
          zfs_handle_t *zfsp;
!         sa_init_selective_arg_t sharearg;
!         int ret = -1;   /* failure until the datasets have been gathered */
!         int share_err;
  
!         if ((zfsp = zfs_open(zhp->zpool_hdl, zhp->zpool_name,
!             ZFS_TYPE_DATASET)) == NULL)
                  goto out;
  
  
          /*
!          * Gather all non-snapshot datasets within the pool. Start by adding
!          * the root filesystem for this pool to the list, and then iterate
!          * over all child filesystems.
           */
!         libzfs_add_handle(&cb, zfsp);
!         if (zfs_iter_filesystems(zfsp, zfs_iter_cb, &cb) != 0)
                  goto out;
  
!         ms.ms_mntopts = mntopts;
!         ms.ms_mntflags = flags;
!         zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used,
!             zfs_mount_one, &ms, B_TRUE);
!         /* ms_mntstatus is 0 if every mount succeeded, -1 otherwise. */
!         ret = ms.ms_mntstatus;
  
          /*
!          * Share all filesystems that need to be shared. This needs to be
!          * a separate pass because libshare is not mt-safe, and so we need
!          * to share serially.
           */
!         sharearg.zhandle_arr = cb.cb_handles;
!         sharearg.zhandle_len = cb.cb_used;
!         /*
!          * Keep the libshare result in its own variable: a successful
!          * initialization must not erase a failure recorded by the mount
!          * pass above.
!          */
!         if ((share_err = zfs_init_libshare_arg(zhp->zpool_hdl,
!             SA_INIT_SHARE_API_SELECTIVE, &sharearg)) != 0) {
!                 ret = share_err;
!                 goto out;
!         }
  
!         ms.ms_mntstatus = 0;
!         zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used,
!             zfs_share_one, &ms, B_FALSE);
!         if (ms.ms_mntstatus != 0)
!                 ret = ms.ms_mntstatus;
  
  out:
!         for (int i = 0; i < cb.cb_used; i++)
                  zfs_close(cb.cb_handles[i]);
          free(cb.cb_handles);
  
          return (ret);
  }