Print this page
5269 zfs: zpool import slow
PORTING: this code relies on the property of taskq_wait to wait
until no more tasks are queued and no more tasks are active. As
we always queue new tasks from within other tasks, taskq_wait
reliably waits for the full recursion to finish, even though we
enqueue new tasks after taskq_wait has been called.
On platforms other than illumos, taskq_wait may not have this
property.
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
*** 23,32 ****
--- 23,33 ----
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2015, STRATO AG, Inc. All rights reserved.
*/
/* Portions Copyright 2010 Robert Milkowski */
#include <sys/cred.h>
*** 47,63 ****
--- 48,75 ----
#include <sys/dmu_impl.h>
#include <sys/zfs_ioctl.h>
#include <sys/sa.h>
#include <sys/zfs_onexit.h>
#include <sys/dsl_destroy.h>
+ #include <sys/vdev.h>
/*
* Needed to close a window in dnode_move() that allows the objset to be freed
* before it can be safely accessed.
*/
krwlock_t os_lock;
+ /*
+ * Tunable to override the maximum number of threads for the parallelization
+ * of dmu_objset_find_dp, needed to speed up the import of pools with many
+ * datasets.
+ * Default is 4 times the number of leaf vdevs.
+ */
+ int dmu_find_threads = 0;
+
+ static void dmu_objset_find_dp_cb(void *arg);
+
void
dmu_objset_init(void)
{
rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
}
*** 502,511 ****
--- 514,542 ----
}
return (err);
}
+ static int
+ dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
+ boolean_t readonly, void *tag, objset_t **osp)
+ {
+ int err;
+
+ err = dmu_objset_from_ds(ds, osp);
+ if (err != 0) {
+ dsl_dataset_disown(ds, tag);
+ } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
+ dsl_dataset_disown(ds, tag);
+ return (SET_ERROR(EINVAL));
+ } else if (!readonly && dsl_dataset_is_snapshot(ds)) {
+ dsl_dataset_disown(ds, tag);
+ return (SET_ERROR(EROFS));
+ }
+ return (err);
+ }
+
/*
* dsl_pool must not be held when this is called.
* Upon successful return, there will be a longhold on the dataset,
* and the dsl_pool will not be held.
*/
*** 523,547 ****
err = dsl_dataset_own(dp, name, tag, &ds);
if (err != 0) {
dsl_pool_rele(dp, FTAG);
return (err);
}
!
! err = dmu_objset_from_ds(ds, osp);
dsl_pool_rele(dp, FTAG);
! if (err != 0) {
! dsl_dataset_disown(ds, tag);
! } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
! dsl_dataset_disown(ds, tag);
! return (SET_ERROR(EINVAL));
! } else if (!readonly && ds->ds_is_snapshot) {
! dsl_dataset_disown(ds, tag);
! return (SET_ERROR(EROFS));
! }
return (err);
}
void
dmu_objset_rele(objset_t *os, void *tag)
{
dsl_pool_t *dp = dmu_objset_pool(os);
dsl_dataset_rele(os->os_dsl_dataset, tag);
--- 554,583 ----
err = dsl_dataset_own(dp, name, tag, &ds);
if (err != 0) {
dsl_pool_rele(dp, FTAG);
return (err);
}
! err = dmu_objset_own_impl(ds, type, readonly, tag, osp);
dsl_pool_rele(dp, FTAG);
!
return (err);
}
+ int
+ dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
+ boolean_t readonly, void *tag, objset_t **osp)
+ {
+ dsl_dataset_t *ds;
+ int err;
+
+ err = dsl_dataset_own_obj(dp, obj, tag, &ds);
+ if (err != 0)
+ return (err);
+
+ return (dmu_objset_own_impl(ds, type, readonly, tag, osp));
+ }
+
void
dmu_objset_rele(objset_t *os, void *tag)
{
dsl_pool_t *dp = dmu_objset_pool(os);
dsl_dataset_rele(os->os_dsl_dataset, tag);
*** 1578,1646 ****
zap_cursor_fini(&cursor);
return (0);
}
! /*
! * Find objsets under and including ddobj, call func(ds) on each.
! */
! int
! dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
! int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
{
dsl_dir_t *dd;
dsl_dataset_t *ds;
zap_cursor_t zc;
zap_attribute_t *attr;
uint64_t thisobj;
! int err;
! ASSERT(dsl_pool_config_held(dp));
! err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd);
if (err != 0)
! return (err);
/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
if (dd->dd_myname[0] == '$') {
dsl_dir_rele(dd, FTAG);
! return (0);
}
thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
/*
* Iterate over all children.
*/
! if (flags & DS_FIND_CHILDREN) {
for (zap_cursor_init(&zc, dp->dp_meta_objset,
dsl_dir_phys(dd)->dd_child_dir_zapobj);
zap_cursor_retrieve(&zc, attr) == 0;
(void) zap_cursor_advance(&zc)) {
ASSERT3U(attr->za_integer_length, ==,
sizeof (uint64_t));
ASSERT3U(attr->za_num_integers, ==, 1);
! err = dmu_objset_find_dp(dp, attr->za_first_integer,
! func, arg, flags);
! if (err != 0)
! break;
}
zap_cursor_fini(&zc);
-
- if (err != 0) {
- dsl_dir_rele(dd, FTAG);
- kmem_free(attr, sizeof (zap_attribute_t));
- return (err);
- }
}
/*
* Iterate over all snapshots.
*/
! if (flags & DS_FIND_SNAPSHOTS) {
dsl_dataset_t *ds;
err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
if (err == 0) {
uint64_t snapobj;
--- 1614,1691 ----
zap_cursor_fini(&cursor);
return (0);
}
! typedef struct dmu_objset_find_ctx {
! taskq_t *dc_tq;
! dsl_pool_t *dc_dp;
! uint64_t dc_ddobj;
! int (*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *);
! void *dc_arg;
! int dc_flags;
! kmutex_t *dc_error_lock;
! int *dc_error;
! } dmu_objset_find_ctx_t;
!
! static void
! dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp)
{
+ dsl_pool_t *dp = dcp->dc_dp;
+ dmu_objset_find_ctx_t *child_dcp;
dsl_dir_t *dd;
dsl_dataset_t *ds;
zap_cursor_t zc;
zap_attribute_t *attr;
uint64_t thisobj;
! int err = 0;
! /* don't process if there already was an error */
! if (*dcp->dc_error != 0)
! goto out;
! err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, NULL, FTAG, &dd);
if (err != 0)
! goto out;
/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
if (dd->dd_myname[0] == '$') {
dsl_dir_rele(dd, FTAG);
! goto out;
}
thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
/*
* Iterate over all children.
*/
! if (dcp->dc_flags & DS_FIND_CHILDREN) {
for (zap_cursor_init(&zc, dp->dp_meta_objset,
dsl_dir_phys(dd)->dd_child_dir_zapobj);
zap_cursor_retrieve(&zc, attr) == 0;
(void) zap_cursor_advance(&zc)) {
ASSERT3U(attr->za_integer_length, ==,
sizeof (uint64_t));
ASSERT3U(attr->za_num_integers, ==, 1);
! child_dcp = kmem_alloc(sizeof(*child_dcp), KM_SLEEP);
! *child_dcp = *dcp;
! child_dcp->dc_ddobj = attr->za_first_integer;
! if (dcp->dc_tq != NULL)
! (void) taskq_dispatch(dcp->dc_tq,
! dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP);
! else
! dmu_objset_find_dp_impl(child_dcp);
}
zap_cursor_fini(&zc);
}
/*
* Iterate over all snapshots.
*/
! if (dcp->dc_flags & DS_FIND_SNAPSHOTS) {
dsl_dataset_t *ds;
err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
if (err == 0) {
uint64_t snapobj;
*** 1657,1667 ****
err = dsl_dataset_hold_obj(dp,
attr->za_first_integer, FTAG, &ds);
if (err != 0)
break;
! err = func(dp, ds, arg);
dsl_dataset_rele(ds, FTAG);
if (err != 0)
break;
}
zap_cursor_fini(&zc);
--- 1702,1712 ----
err = dsl_dataset_hold_obj(dp,
attr->za_first_integer, FTAG, &ds);
if (err != 0)
break;
! err = dcp->dc_func(dp, ds, dcp->dc_arg);
dsl_dataset_rele(ds, FTAG);
if (err != 0)
break;
}
zap_cursor_fini(&zc);
*** 1670,1690 ****
dsl_dir_rele(dd, FTAG);
kmem_free(attr, sizeof (zap_attribute_t));
if (err != 0)
! return (err);
/*
* Apply to self.
*/
err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
if (err != 0)
! return (err);
! err = func(dp, ds, arg);
dsl_dataset_rele(ds, FTAG);
! return (err);
}
/*
* Find all objsets under name, and for each, call 'func(child_name, arg)'.
* The dp_config_rwlock must not be held when this is called, and it
--- 1715,1833 ----
dsl_dir_rele(dd, FTAG);
kmem_free(attr, sizeof (zap_attribute_t));
if (err != 0)
! goto out;
/*
* Apply to self.
*/
err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
if (err != 0)
! goto out;
! err = dcp->dc_func(dp, ds, dcp->dc_arg);
dsl_dataset_rele(ds, FTAG);
!
! out:
! if (err != 0) {
! mutex_enter(dcp->dc_error_lock);
! /* only keep first error */
! if (*dcp->dc_error == 0)
! *dcp->dc_error = err;
! mutex_exit(dcp->dc_error_lock);
! }
!
! kmem_free(dcp, sizeof(*dcp));
! }
!
! static void
! dmu_objset_find_dp_cb(void *arg)
! {
! dmu_objset_find_ctx_t *dcp = arg;
! dsl_pool_t *dp = dcp->dc_dp;
!
! dsl_pool_config_enter(dp, FTAG);
!
! dmu_objset_find_dp_impl(dcp);
!
! dsl_pool_config_exit(dp, FTAG);
! }
!
! /*
! * Find objsets under and including ddobj, call func(ds) on each.
! * The order for the enumeration is completely undefined.
! * func is called with dsl_pool_config held.
! */
! int
! dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
! int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
! {
! int error = 0;
! taskq_t *tq = NULL;
! int ntasks;
! dmu_objset_find_ctx_t *dcp;
! kmutex_t err_lock;
!
! mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL);
! dcp = kmem_alloc(sizeof(*dcp), KM_SLEEP);
! dcp->dc_tq = NULL;
! dcp->dc_dp = dp;
! dcp->dc_ddobj = ddobj;
! dcp->dc_func = func;
! dcp->dc_arg = arg;
! dcp->dc_flags = flags;
! dcp->dc_error_lock = &err_lock;
! dcp->dc_error = &error;
!
! if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) {
! /*
! * In case a write lock is held we can't make use of
! * parallelism, as down the stack of the worker threads
! * the lock is asserted via dsl_pool_config_held.
! * In case of a read lock this is solved by getting a read
! * lock in each worker thread, which isn't possible in case
! * of a writer lock. So we fall back to the synchronous path
! * here.
! * In the future it might be possible to get some magic into
! * dsl_pool_config_held in a way that it returns true for
! * the worker threads so that a single lock held from this
! * thread suffices. For now, stay single threaded.
! */
! dmu_objset_find_dp_impl(dcp);
!
! return (error);
! }
!
! ntasks = dmu_find_threads;
! if (ntasks == 0)
! ntasks = vdev_count_leaves(dp->dp_spa) * 4;
! tq = taskq_create("dmu_objset_find", ntasks, minclsyspri, ntasks,
! INT_MAX, 0);
! if (tq == NULL) {
! kmem_free(dcp, sizeof(*dcp));
! return (SET_ERROR(ENOMEM));
! }
! dcp->dc_tq = tq;
!
! /* dcp will be freed by task */
! (void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP);
!
! /*
! * PORTING: this code relies on the property of taskq_wait to wait
! * until no more tasks are queued and no more tasks are active. As
! * we always queue new tasks from within other tasks, taskq_wait
! * reliably waits for the full recursion to finish, even though we
! * enqueue new tasks after taskq_wait has been called.
! * On platforms other than illumos, taskq_wait may not have this
! * property.
! */
! taskq_wait(tq);
! taskq_destroy(tq);
! mutex_destroy(&err_lock);
!
! return (error);
}
/*
* Find all objsets under name, and for each, call 'func(child_name, arg)'.
* The dp_config_rwlock must not be held when this is called, and it