Print this page
8115 parallel zfs mount — context diff (webrev) of the libzfs mount code (libzfs_impl.h / libzfs_mount paths visible in the hunks below)

*** 20,30 **** */ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ! * Copyright (c) 2014, 2016 by Delphix. All rights reserved. * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com> * Copyright 2017 Joyent, Inc. * Copyright 2017 RackTop Systems. */ --- 20,30 ---- */ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. ! * Copyright (c) 2014, 2017 by Delphix. All rights reserved. * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com> * Copyright 2017 Joyent, Inc. * Copyright 2017 RackTop Systems. */
*** 77,95 **** --- 77,99 ---- #include <zone.h> #include <sys/mntent.h> #include <sys/mount.h> #include <sys/stat.h> #include <sys/statvfs.h> + #include <sys/taskq.h> #include <libzfs.h> #include "libzfs_impl.h" #include <libshare.h> #include <sys/systeminfo.h> #define MAXISALEN 257 /* based on sysinfo(2) man page */ + static int mount_tq_nthr = 512; /* taskq threads for multi-threaded mounting */ + + static void zfs_mount_task(void *); static int zfs_share_proto(zfs_handle_t *, zfs_share_proto_t *); zfs_share_type_t zfs_is_shared_proto(zfs_handle_t *, char **, zfs_share_proto_t); /*
/*
 * Add the given zfs handle to the cb_handles array, dynamically reallocating
 * the array if it is out of space.
 */
void
libzfs_add_handle(get_all_cb_t *cbp, zfs_handle_t *zhp)
{
	if (cbp->cb_alloc == cbp->cb_used) {
		size_t newsz;
		zfs_handle_t **newhandles;

		/* Double the array, starting from 64 slots on first use. */
		newsz = cbp->cb_alloc != 0 ? cbp->cb_alloc * 2 : 64;
		/*
		 * NOTE(review): no NULL check here -- presumably
		 * zfs_realloc() exits on allocation failure the same way
		 * the pre-patch comment says zfs_alloc() does; confirm
		 * against libzfs_impl.
		 */
		newhandles = zfs_realloc(zhp->zfs_hdl,
		    cbp->cb_handles, cbp->cb_alloc * sizeof (zfs_handle_t *),
		    newsz * sizeof (zfs_handle_t *));
		cbp->cb_handles = newhandles;
		cbp->cb_alloc = newsz;
	}
	cbp->cb_handles[cbp->cb_used++] = zhp;
}

/*
 * Recursive helper function used during file system enumeration.
 * Collects every filesystem-type dataset into the get_all_cb_t handle
 * array (via libzfs_add_handle) and recurses into its children.
 * Handles that are filtered out are closed here; collected handles are
 * owned by the cb_handles array and closed by the caller.
 */
static int
zfs_iter_cb(zfs_handle_t *zhp, void *data)
{
	get_all_cb_t *cbp = data;

	if (!(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM)) {
		zfs_close(zhp);
		return (0);
	}

	/*
	 * NOTE(review): the webrev omits a run of unchanged lines here
	 * (old lines 1104-1118, between the two diff hunks).  The elided
	 * code ends in another "zfs_close(zhp); return (0);" bail-out,
	 * visible as the leading context of the next hunk -- confirm the
	 * elided eligibility checks against the full file.
	 */

	libzfs_add_handle(cbp, zhp);
	if (zfs_iter_filesystems(zhp, zfs_iter_cb, cbp) != 0) {
		zfs_close(zhp);
		return (-1);
	}
	return (0);
}
* succeeded or failed. */ ! if ((good = zfs_alloc(zhp->zpool_hdl, ! cb.cb_used * sizeof (int))) == NULL) goto out; ! ret = 0; ! for (i = 0; i < cb.cb_used; i++) { ! if (zfs_mount(cb.cb_handles[i], mntopts, flags) != 0) ! ret = -1; ! else ! good[i] = 1; ! } /* ! * Then share all the ones that need to be shared. This needs ! * to be a separate pass in order to avoid excessive reloading ! * of the configuration. Good should never be NULL since ! * zfs_alloc is supposed to exit if memory isn't available. */ ! for (i = 0; i < cb.cb_used; i++) { ! if (good[i] && zfs_share(cb.cb_handles[i]) != 0) ! ret = -1; ! } ! free(good); out: ! for (i = 0; i < cb.cb_used; i++) zfs_close(cb.cb_handles[i]); free(cb.cb_handles); return (ret); } --- 1130,1482 ---- zfs_close(zhp); return (0); } libzfs_add_handle(cbp, zhp); ! if (zfs_iter_filesystems(zhp, zfs_iter_cb, cbp) != 0) { zfs_close(zhp); return (-1); } return (0); } + /* + * Sort comparator that compares two mountpoint paths. We sort these paths so + * that subdirectories immediately follow their parents. This means that we + * effectively treat the '/' character as the lowest value non-nul char. An + * example sorted list using this comparator would look like: + * + * /foo + * /foo/bar + * /foo/bar/baz + * /foo/baz + * /foo.bar + * + * The mounting code depends on this ordering to deterministically iterate + * over filesystems in order to spawn parallel mount tasks. + */ int ! mountpoint_cmp(const void *arga, const void *argb) { ! zfs_handle_t *const *zap = arga; ! zfs_handle_t *za = *zap; ! zfs_handle_t *const *zbp = argb; ! zfs_handle_t *zb = *zbp; char mounta[MAXPATHLEN]; char mountb[MAXPATHLEN]; + const char *a = mounta; + const char *b = mountb; boolean_t gota, gotb; ! gota = (zfs_get_type(za) == ZFS_TYPE_FILESYSTEM); ! if (gota) { ! verify(zfs_prop_get(za, ZFS_PROP_MOUNTPOINT, mounta, sizeof (mounta), NULL, NULL, 0, B_FALSE) == 0); ! } ! gotb = (zfs_get_type(zb) == ZFS_TYPE_FILESYSTEM); ! if (gotb) { ! 
verify(zfs_prop_get(zb, ZFS_PROP_MOUNTPOINT, mountb, sizeof (mountb), NULL, NULL, 0, B_FALSE) == 0); + } ! if (gota && gotb) { ! while (*a != '\0' && (*a == *b)) { ! a++; ! b++; ! } ! if (*a == *b) ! return (0); ! if (*a == '\0') ! return (-1); ! if (*b == '\0') ! return (1); ! if (*a == '/') ! return (-1); ! if (*b == '/') ! return (1); ! return (*a < *b ? -1 : *a > *b); ! } if (gota) return (-1); if (gotb) return (1); ! /* ! * If neither filesystem has a mountpoint, revert to sorting by ! * dataset name. ! */ ! return (strcmp(zfs_get_name(za), zfs_get_name(zb))); } /* + * Return true if path2 is a child of path1. + */ + static boolean_t + libzfs_path_contains(const char *path1, const char *path2) + { + return (strstr(path2, path1) == path2 && path2[strlen(path1)] == '/'); + } + + /* + * Given a mountpoint specified by idx in the handles array, find the first + * non-descendent of that mountpoint and return its index. Descendant paths + * start with the parent's path. This function relies on the ordering + * enforced by mountpoint_cmp(). + */ + static int + non_descendant_idx(zfs_handle_t **handles, size_t num_handles, int idx) + { + char parent[ZFS_MAXPROPLEN]; + char child[ZFS_MAXPROPLEN]; + int i; + + verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, parent, + sizeof (parent), NULL, NULL, 0, B_FALSE) == 0); + + for (i = idx + 1; i < num_handles; i++) { + verify(zfs_prop_get(handles[i], ZFS_PROP_MOUNTPOINT, child, + sizeof (child), NULL, NULL, 0, B_FALSE) == 0); + if (!libzfs_path_contains(parent, child)) + break; + } + return (i); + } + + typedef struct mnt_param { + libzfs_handle_t *mnt_hdl; + taskq_t *mnt_tq; + zfs_handle_t **mnt_zhps; /* filesystems to mount */ + size_t mnt_num_handles; + int mnt_idx; /* Index of selected entry to mount */ + zfs_iter_f mnt_func; + void *mnt_data; + } mnt_param_t; + + /* + * Allocate and populate the parameter struct for mount function, and + * schedule mounting of the entry selected by idx. 
/*
 * Allocate and populate the parameter struct for mount function, and
 * schedule mounting of the entry selected by idx.  Ownership of the
 * mnt_param_t passes to the dispatched task, which frees it.
 */
static void
zfs_dispatch_mount(libzfs_handle_t *hdl, zfs_handle_t **handles,
    size_t num_handles, int idx, zfs_iter_f func, void *data, taskq_t *tq)
{
	mnt_param_t *mnt_param = zfs_alloc(hdl, sizeof (mnt_param_t));

	mnt_param->mnt_hdl = hdl;
	mnt_param->mnt_tq = tq;
	mnt_param->mnt_zhps = handles;
	mnt_param->mnt_num_handles = num_handles;
	mnt_param->mnt_idx = idx;
	mnt_param->mnt_func = func;
	mnt_param->mnt_data = data;

	/* TQ_SLEEP: block until the task can be queued rather than fail. */
	(void) taskq_dispatch(tq, zfs_mount_task, (void*)mnt_param, TQ_SLEEP);
}

/*
 * This is the structure used to keep state of mounting or sharing operations
 * during a call to zpool_enable_datasets().
 */
typedef struct mount_state {
	/*
	 * ms_mntstatus is set to -1 if any mount fails. While multiple threads
	 * could update this variable concurrently, no synchronization is
	 * needed as it's only ever set to -1.
	 */
	int ms_mntstatus;
	int ms_mntflags;
	const char *ms_mntopts;
} mount_state_t;

/*
 * zfs_iter_f adapter: mount one filesystem, recording any failure in the
 * shared mount_state_t.
 */
static int
zfs_mount_one(zfs_handle_t *zhp, void *arg)
{
	mount_state_t *ms = arg;
	int ret = 0;

	if (zfs_mount(zhp, ms->ms_mntopts, ms->ms_mntflags) != 0)
		ret = ms->ms_mntstatus = -1;
	return (ret);
}

/*
 * zfs_iter_f adapter: share one filesystem, recording any failure in the
 * shared mount_state_t.
 */
static int
zfs_share_one(zfs_handle_t *zhp, void *arg)
{
	mount_state_t *ms = arg;
	int ret = 0;

	if (zfs_share(zhp) != 0)
		ret = ms->ms_mntstatus = -1;
	return (ret);
}

/*
 * Task queue function to mount one file system. On completion, it finds and
 * schedules its children to be mounted. This depends on the sorting done in
 * zfs_foreach_mountpoint(). Note that the degenerate case (chain of entries
 * each descending from the previous) will have no parallelism since we always
 * have to wait for the parent to finish mounting before we can schedule
 * its children.
 */
static void
zfs_mount_task(void *arg)
{
	mnt_param_t *mp = arg;
	int idx = mp->mnt_idx;
	zfs_handle_t **handles = mp->mnt_zhps;
	size_t num_handles = mp->mnt_num_handles;
	char mountpoint[ZFS_MAXPROPLEN];

	verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, mountpoint,
	    sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0);

	/* If this mount fails, none of its descendants are dispatched. */
	if (mp->mnt_func(handles[idx], mp->mnt_data) != 0)
		return;

	/*
	 * We dispatch tasks to mount filesystems with mountpoints underneath
	 * this one. We do this by dispatching the next filesystem with a
	 * descendant mountpoint of the one we just mounted, then skip all of
	 * its descendants, dispatch the next descendant mountpoint, and so on.
	 * The non_descendant_idx() function skips over filesystems that are
	 * descendants of the filesystem we just dispatched.
	 */
	for (int i = idx + 1; i < num_handles;
	    i = non_descendant_idx(handles, num_handles, i)) {
		char child[ZFS_MAXPROPLEN];
		verify(zfs_prop_get(handles[i], ZFS_PROP_MOUNTPOINT,
		    child, sizeof (child), NULL, NULL, 0, B_FALSE) == 0);

		if (!libzfs_path_contains(mountpoint, child))
			break; /* not a descendant, return */
		zfs_dispatch_mount(mp->mnt_hdl, handles, num_handles, i,
		    mp->mnt_func, mp->mnt_data, mp->mnt_tq);
	}
	free(mp);
}
In parallel: If the parallel argument is B_TRUE and the ZFS_SERIAL_MOUNT + * environment variable is not set, then we use a taskq to dispatch threads + * to mount filesystems is parallel. This function dispatches tasks to mount + * the filesystems at the top-level mountpoints, and these tasks in turn + * are responsible for recursively mounting filesystems in their children + * mountpoints. + */ + void + zfs_foreach_mountpoint(libzfs_handle_t *hdl, zfs_handle_t **handles, + size_t num_handles, zfs_iter_f func, void *data, boolean_t parallel) + { + /* + * The ZFS_SERIAL_MOUNT environment variable is an undocumented + * variable that can be used as a convenience to do a/b comparison + * of serial vs. parallel mounting. + */ + boolean_t serial_mount = !parallel || + (getenv("ZFS_SERIAL_MOUNT") != NULL); + + /* + * Sort the datasets by mountpoint. See mountpoint_cmp for details + * of how these are sorted. + */ + qsort(handles, num_handles, sizeof (zfs_handle_t *), mountpoint_cmp); + + if (serial_mount) { + for (int i = 0; i < num_handles; i++) { + func(handles[i], data); + } + return; + } + + /* + * Issue the callback function for each dataset using a parallel + * algorithm that uses a taskq to manage threads. + */ + taskq_t *tq = taskq_create("mount_taskq", mount_tq_nthr, 0, + mount_tq_nthr, mount_tq_nthr, TASKQ_DYNAMIC | TASKQ_PREPOPULATE); + + /* + * There may be multiple "top level" mountpoints outside of the pool's + * root mountpoint, e.g.: /foo /bar. Dispatch a mount task for each of + * these. + */ + for (int i = 0; i < num_handles; + i = non_descendant_idx(handles, num_handles, i)) { + zfs_dispatch_mount(hdl, handles, num_handles, i, func, data, + tq); + } + + taskq_wait(tq); /* wait for all scheduled mounts to complete */ + taskq_destroy(tq); + } + + /* * Mount and share all datasets within the given pool. This assumes that no ! * datasets within the pool are currently mounted. 
*/ #pragma weak zpool_mount_datasets = zpool_enable_datasets int zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags) { get_all_cb_t cb = { 0 }; ! mount_state_t ms = { 0 }; zfs_handle_t *zfsp; ! sa_init_selective_arg_t sharearg; ! int ret = 0; ! if ((zfsp = zfs_open(zhp->zpool_hdl, zhp->zpool_name, ! ZFS_TYPE_DATASET)) == NULL) goto out; /* ! * Gather all non-snapshot datasets within the pool. Start by adding ! * the root filesystem for this pool to the list, and then iterate ! * over all child filesystems. */ ! libzfs_add_handle(&cb, zfsp); ! if (zfs_iter_filesystems(zfsp, zfs_iter_cb, &cb) != 0) goto out; ! ms.ms_mntopts = mntopts; ! ms.ms_mntflags = flags; ! zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used, ! zfs_mount_one, &ms, B_TRUE); ! if (ms.ms_mntstatus != 0) ! ret = ms.ms_mntstatus; /* ! * Share all filesystems that need to be shared. This needs to be ! * a separate pass because libshare is not mt-safe, and so we need ! * to share serially. */ ! sharearg.zhandle_arr = cb.cb_handles; ! sharearg.zhandle_len = cb.cb_used; ! if ((ret = zfs_init_libshare_arg(zhp->zpool_hdl, ! SA_INIT_SHARE_API_SELECTIVE, &sharearg)) != 0) ! goto out; ! ms.ms_mntstatus = 0; ! zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used, ! zfs_share_one, &ms, B_FALSE); ! if (ms.ms_mntstatus != 0) ! ret = ms.ms_mntstatus; out: ! for (int i = 0; i < cb.cb_used; i++) zfs_close(cb.cb_handles[i]); free(cb.cb_handles); return (ret); }