Print this page
5269 zfs: zpool import slow
PORTING: this code relies on the property of taskq_wait to wait
until no more tasks are queued and no more tasks are active. As
we always queue new tasks from within other tasks, taskq_wait
reliably waits for the full recursion to finish, even though we
enqueue new tasks after taskq_wait has been called.
On platforms other than illumos, taskq_wait may not have this
property.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Reviewed by: George Wilson <george.wilson@delphix.com>

*** 23,32 **** --- 23,33 ---- * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2015, STRATO AG, Inc. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ #include <sys/cred.h>
*** 47,63 **** --- 48,75 ---- #include <sys/dmu_impl.h> #include <sys/zfs_ioctl.h> #include <sys/sa.h> #include <sys/zfs_onexit.h> #include <sys/dsl_destroy.h> + #include <sys/vdev.h> /* * Needed to close a window in dnode_move() that allows the objset to be freed * before it can be safely accessed. */ krwlock_t os_lock; + /* + * Tunable to override the maximum number of threads for the parallelization + * of dmu_objset_find_dp, needed to speed up the import of pools with many + * datasets. + * Default is 4 times the number of leaf vdevs. + */ + int dmu_find_threads = 0; + + static void dmu_objset_find_dp_cb(void *arg); + void dmu_objset_init(void) { rw_init(&os_lock, NULL, RW_DEFAULT, NULL); }
*** 502,511 **** --- 514,542 ---- } return (err); } + static int + dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type, + boolean_t readonly, void *tag, objset_t **osp) + { + int err; + + err = dmu_objset_from_ds(ds, osp); + if (err != 0) { + dsl_dataset_disown(ds, tag); + } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { + dsl_dataset_disown(ds, tag); + return (SET_ERROR(EINVAL)); + } else if (!readonly && dsl_dataset_is_snapshot(ds)) { + dsl_dataset_disown(ds, tag); + return (SET_ERROR(EROFS)); + } + return (err); + } + /* * dsl_pool must not be held when this is called. * Upon successful return, there will be a longhold on the dataset, * and the dsl_pool will not be held. */
*** 523,547 **** err = dsl_dataset_own(dp, name, tag, &ds); if (err != 0) { dsl_pool_rele(dp, FTAG); return (err); } ! ! err = dmu_objset_from_ds(ds, osp); dsl_pool_rele(dp, FTAG); ! if (err != 0) { ! dsl_dataset_disown(ds, tag); ! } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { ! dsl_dataset_disown(ds, tag); ! return (SET_ERROR(EINVAL)); ! } else if (!readonly && ds->ds_is_snapshot) { ! dsl_dataset_disown(ds, tag); ! return (SET_ERROR(EROFS)); ! } return (err); } void dmu_objset_rele(objset_t *os, void *tag) { dsl_pool_t *dp = dmu_objset_pool(os); dsl_dataset_rele(os->os_dsl_dataset, tag); --- 554,583 ---- err = dsl_dataset_own(dp, name, tag, &ds); if (err != 0) { dsl_pool_rele(dp, FTAG); return (err); } ! err = dmu_objset_own_impl(ds, type, readonly, tag, osp); dsl_pool_rele(dp, FTAG); ! return (err); } + int + dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type, + boolean_t readonly, void *tag, objset_t **osp) + { + dsl_dataset_t *ds; + int err; + + err = dsl_dataset_own_obj(dp, obj, tag, &ds); + if (err != 0) + return (err); + + return (dmu_objset_own_impl(ds, type, readonly, tag, osp)); + } + void dmu_objset_rele(objset_t *os, void *tag) { dsl_pool_t *dp = dmu_objset_pool(os); dsl_dataset_rele(os->os_dsl_dataset, tag);
*** 1578,1646 **** zap_cursor_fini(&cursor); return (0); } ! /* ! * Find objsets under and including ddobj, call func(ds) on each. ! */ ! int ! dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj, ! int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags) { dsl_dir_t *dd; dsl_dataset_t *ds; zap_cursor_t zc; zap_attribute_t *attr; uint64_t thisobj; ! int err; ! ASSERT(dsl_pool_config_held(dp)); ! err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd); if (err != 0) ! return (err); /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ if (dd->dd_myname[0] == '$') { dsl_dir_rele(dd, FTAG); ! return (0); } thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj; attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); /* * Iterate over all children. */ ! if (flags & DS_FIND_CHILDREN) { for (zap_cursor_init(&zc, dp->dp_meta_objset, dsl_dir_phys(dd)->dd_child_dir_zapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { ASSERT3U(attr->za_integer_length, ==, sizeof (uint64_t)); ASSERT3U(attr->za_num_integers, ==, 1); ! err = dmu_objset_find_dp(dp, attr->za_first_integer, ! func, arg, flags); ! if (err != 0) ! break; } zap_cursor_fini(&zc); - - if (err != 0) { - dsl_dir_rele(dd, FTAG); - kmem_free(attr, sizeof (zap_attribute_t)); - return (err); - } } /* * Iterate over all snapshots. */ ! if (flags & DS_FIND_SNAPSHOTS) { dsl_dataset_t *ds; err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); if (err == 0) { uint64_t snapobj; --- 1614,1691 ---- zap_cursor_fini(&cursor); return (0); } ! typedef struct dmu_objset_find_ctx { ! taskq_t *dc_tq; ! dsl_pool_t *dc_dp; ! uint64_t dc_ddobj; ! int (*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *); ! void *dc_arg; ! int dc_flags; ! kmutex_t *dc_error_lock; ! int *dc_error; ! } dmu_objset_find_ctx_t; ! ! static void ! 
dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp) { + dsl_pool_t *dp = dcp->dc_dp; + dmu_objset_find_ctx_t *child_dcp; dsl_dir_t *dd; dsl_dataset_t *ds; zap_cursor_t zc; zap_attribute_t *attr; uint64_t thisobj; ! int err = 0; ! /* don't process if there already was an error */ ! if (*dcp->dc_error != 0) ! goto out; ! err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, NULL, FTAG, &dd); if (err != 0) ! goto out; /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ if (dd->dd_myname[0] == '$') { dsl_dir_rele(dd, FTAG); ! goto out; } thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj; attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); /* * Iterate over all children. */ ! if (dcp->dc_flags & DS_FIND_CHILDREN) { for (zap_cursor_init(&zc, dp->dp_meta_objset, dsl_dir_phys(dd)->dd_child_dir_zapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { ASSERT3U(attr->za_integer_length, ==, sizeof (uint64_t)); ASSERT3U(attr->za_num_integers, ==, 1); ! child_dcp = kmem_alloc(sizeof(*child_dcp), KM_SLEEP); ! *child_dcp = *dcp; ! child_dcp->dc_ddobj = attr->za_first_integer; ! if (dcp->dc_tq != NULL) ! (void) taskq_dispatch(dcp->dc_tq, ! dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP); ! else ! dmu_objset_find_dp_impl(child_dcp); } zap_cursor_fini(&zc); } /* * Iterate over all snapshots. */ ! if (dcp->dc_flags & DS_FIND_SNAPSHOTS) { dsl_dataset_t *ds; err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); if (err == 0) { uint64_t snapobj;
*** 1657,1667 **** err = dsl_dataset_hold_obj(dp, attr->za_first_integer, FTAG, &ds); if (err != 0) break; ! err = func(dp, ds, arg); dsl_dataset_rele(ds, FTAG); if (err != 0) break; } zap_cursor_fini(&zc); --- 1702,1712 ---- err = dsl_dataset_hold_obj(dp, attr->za_first_integer, FTAG, &ds); if (err != 0) break; ! err = dcp->dc_func(dp, ds, dcp->dc_arg); dsl_dataset_rele(ds, FTAG); if (err != 0) break; } zap_cursor_fini(&zc);
*** 1670,1690 **** dsl_dir_rele(dd, FTAG); kmem_free(attr, sizeof (zap_attribute_t)); if (err != 0) ! return (err); /* * Apply to self. */ err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); if (err != 0) ! return (err); ! err = func(dp, ds, arg); dsl_dataset_rele(ds, FTAG); ! return (err); } /* * Find all objsets under name, and for each, call 'func(child_name, arg)'. * The dp_config_rwlock must not be held when this is called, and it --- 1715,1833 ---- dsl_dir_rele(dd, FTAG); kmem_free(attr, sizeof (zap_attribute_t)); if (err != 0) ! goto out; /* * Apply to self. */ err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); if (err != 0) ! goto out; ! err = dcp->dc_func(dp, ds, dcp->dc_arg); dsl_dataset_rele(ds, FTAG); ! ! out: ! if (err != 0) { ! mutex_enter(dcp->dc_error_lock); ! /* only keep first error */ ! if (*dcp->dc_error == 0) ! *dcp->dc_error = err; ! mutex_exit(dcp->dc_error_lock); ! } ! ! kmem_free(dcp, sizeof(*dcp)); ! } ! ! static void ! dmu_objset_find_dp_cb(void *arg) ! { ! dmu_objset_find_ctx_t *dcp = arg; ! dsl_pool_t *dp = dcp->dc_dp; ! ! dsl_pool_config_enter(dp, FTAG); ! ! dmu_objset_find_dp_impl(dcp); ! ! dsl_pool_config_exit(dp, FTAG); ! } ! ! /* ! * Find objsets under and including ddobj, call func(ds) on each. ! * The order for the enumeration is completely undefined. ! * func is called with dsl_pool_config held. ! */ ! int ! dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj, ! int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags) ! { ! int error = 0; ! taskq_t *tq = NULL; ! int ntasks; ! dmu_objset_find_ctx_t *dcp; ! kmutex_t err_lock; ! ! mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL); ! dcp = kmem_alloc(sizeof(*dcp), KM_SLEEP); ! dcp->dc_tq = NULL; ! dcp->dc_dp = dp; ! dcp->dc_ddobj = ddobj; ! dcp->dc_func = func; ! dcp->dc_arg = arg; ! dcp->dc_flags = flags; ! dcp->dc_error_lock = &err_lock; ! dcp->dc_error = &error; ! ! if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) { ! /* ! 
* In case a write lock is held we can't make use of ! * parallelism, as down the stack of the worker threads ! * the lock is asserted via dsl_pool_config_held. ! * In case of a read lock this is solved by getting a read ! * lock in each worker thread, which isn't possible in case ! * of a writer lock. So we fall back to the synchronous path ! * here. ! * In the future it might be possible to get some magic into ! * dsl_pool_config_held in a way that it returns true for ! * the worker threads so that a single lock held from this ! * thread suffices. For now, stay single threaded. ! */ ! dmu_objset_find_dp_impl(dcp); ! ! return (error); ! } ! ! ntasks = dmu_find_threads; ! if (ntasks == 0) ! ntasks = vdev_count_leaves(dp->dp_spa) * 4; ! tq = taskq_create("dmu_objset_find", ntasks, minclsyspri, ntasks, ! INT_MAX, 0); ! if (tq == NULL) { ! kmem_free(dcp, sizeof(*dcp)); ! return (SET_ERROR(ENOMEM)); ! } ! dcp->dc_tq = tq; ! ! /* dcp will be freed by task */ ! (void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP); ! ! /* ! * PORTING: this code relies on the property of taskq_wait to wait ! * until no more tasks are queued and no more tasks are active. As ! * we always queue new tasks from within other tasks, taskq_wait ! * reliably waits for the full recursion to finish, even though we ! * enqueue new tasks after taskq_wait has been called. ! * On platforms other than illumos, taskq_wait may not have this ! * property. ! */ ! taskq_wait(tq); ! taskq_destroy(tq); ! mutex_destroy(&err_lock); ! ! return (error); ! } /* * Find all objsets under name, and for each, call 'func(child_name, arg)'. * The dp_config_rwlock must not be held when this is called, and it