Print this page
5269 zfs: zpool import slow
While importing a pool, all objsets are enumerated twice: once to check
the ZIL chains and once to claim them. On pools with many datasets
this process can take a substantial amount of time.
Speed up the process by parallelizing it with a taskq. The number
of parallel tasks is limited to 4 times the number of leaf vdevs.

@@ -21,10 +21,11 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014, STRATO AG, Inc. All rights reserved.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #include <sys/cred.h>

@@ -45,10 +46,11 @@
 #include <sys/dmu_impl.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/sa.h>
 #include <sys/zfs_onexit.h>
 #include <sys/dsl_destroy.h>
+#include <sys/vdev.h>
 
 /*
  * Needed to close a window in dnode_move() that allows the objset to be freed
  * before it can be safely accessed.
  */

@@ -489,10 +491,29 @@
         }
 
         return (err);
 }
 
+static int
+dmu_objset_own_common(dsl_dataset_t *ds, dmu_objset_type_t type,
+    boolean_t readonly, void *tag, objset_t **osp)
+{
+        int err;
+
+        err = dmu_objset_from_ds(ds, osp);      /* *osp is read below if 0 */
+        if (err != 0) {
+                dsl_dataset_disown(ds, tag);    /* objset lookup failed */
+        } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
+                dsl_dataset_disown(ds, tag);    /* not the requested type */
+                return (SET_ERROR(EINVAL));
+        } else if (!readonly && dsl_dataset_is_snapshot(ds)) {
+                dsl_dataset_disown(ds, tag);    /* writable snapshot asked */
+                return (SET_ERROR(EROFS));
+        }
+        return (err);   /* ds is still owned by tag only when this is 0 */
+}
+
 /*
  * dsl_pool must not be held when this is called.
  * Upon successful return, there will be a longhold on the dataset,
  * and the dsl_pool will not be held.
  */

@@ -510,33 +531,44 @@
         err = dsl_dataset_own(dp, name, tag, &ds);
         if (err != 0) {
                 dsl_pool_rele(dp, FTAG);
                 return (err);
         }
-
-        err = dmu_objset_from_ds(ds, osp);
+        err = dmu_objset_own_common(ds, type, readonly, tag, osp);
         dsl_pool_rele(dp, FTAG);
-        if (err != 0) {
-                dsl_dataset_disown(ds, tag);
-        } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
-                dsl_dataset_disown(ds, tag);
-                return (SET_ERROR(EINVAL));
-        } else if (!readonly && dsl_dataset_is_snapshot(ds)) {
-                dsl_dataset_disown(ds, tag);
-                return (SET_ERROR(EROFS));
-        }
+
+        return (err);
+}
+
+int
+dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
+    boolean_t readonly, void *tag, objset_t **osp)
+{
+        dsl_dataset_t *ds;      /* set by dsl_dataset_own_obj() below */
+        int err;
+
+        err = dsl_dataset_own_obj(dp, obj, tag, &ds);   /* by object, no name */
+        if (err != 0)
         return (err);
+
+        return (dmu_objset_own_common(ds, type, readonly, tag, osp));
 }
 
 void
 dmu_objset_rele(objset_t *os, void *tag)
 {
         dsl_pool_t *dp = dmu_objset_pool(os);
         dsl_dataset_rele(os->os_dsl_dataset, tag);
         dsl_pool_rele(dp, tag);
 }
 
+void
+dmu_objset_rele_obj(objset_t *os, void *tag)
+{
+        dsl_dataset_rele(os->os_dsl_dataset, tag);      /* dataset hold only */
+}
+
 /*
  * When we are called, os MUST refer to an objset associated with a dataset
  * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner
  * == tag.  We will then release and reacquire ownership of the dataset while
  * holding the pool config_rwlock to avoid intervening namespace or ownership

@@ -1543,69 +1575,78 @@
         zap_cursor_fini(&cursor);
 
         return (0);
 }
 
-/*
- * Find objsets under and including ddobj, call func(ds) on each.
- */
-int
-dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
-    int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
+typedef struct dmu_objset_find_ctx {    /* one copy per dispatched task */
+        taskq_t         *dc_tq;         /* taskq child tasks are sent to */
+        dsl_pool_t      *dc_dp;         /* pool being searched */
+        uint64_t        dc_obj;         /* dsl_dir object to visit */
+        int             (*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *);
+        void            *dc_arg;        /* passed through to dc_func */
+        int             dc_flags;       /* DS_FIND_* flags */
+        kmutex_t        *dc_error_lock; /* guards writes to *dc_error */
+        int             *dc_error;      /* first error, shared by tasks */
+} dmu_objset_find_ctx_t;
+
+static void
+dmu_objset_find_dp_impl(void *arg)
 {
+        dmu_objset_find_ctx_t *dcp = arg;
+        dsl_pool_t *dp = dcp->dc_dp;
+        dmu_objset_find_ctx_t *child_dcp;
         dsl_dir_t *dd;
         dsl_dataset_t *ds;
         zap_cursor_t zc;
         zap_attribute_t *attr;
         uint64_t thisobj;
         int err;
 
-        ASSERT(dsl_pool_config_held(dp));
+        dsl_pool_config_enter(dp, FTAG);
+
+        /* don't process if there already was an error */
+        if (*dcp->dc_error)
+                goto out;
 
-        err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd);
+        err = dsl_dir_hold_obj(dp, dcp->dc_obj, NULL, FTAG, &dd);
         if (err != 0)
-                return (err);
+                goto fail;
 
         /* Don't visit hidden ($MOS & $ORIGIN) objsets. */
         if (dd->dd_myname[0] == '$') {
                 dsl_dir_rele(dd, FTAG);
-                return (0);
+                goto out;
         }
 
         thisobj = dd->dd_phys->dd_head_dataset_obj;
         attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
 
         /*
          * Iterate over all children.
          */
-        if (flags & DS_FIND_CHILDREN) {
+        if (dcp->dc_flags & DS_FIND_CHILDREN) {
                 for (zap_cursor_init(&zc, dp->dp_meta_objset,
                     dd->dd_phys->dd_child_dir_zapobj);
                     zap_cursor_retrieve(&zc, attr) == 0;
                     (void) zap_cursor_advance(&zc)) {
                         ASSERT3U(attr->za_integer_length, ==,
                             sizeof (uint64_t));
                         ASSERT3U(attr->za_num_integers, ==, 1);
 
-                        err = dmu_objset_find_dp(dp, attr->za_first_integer,
-                            func, arg, flags);
-                        if (err != 0)
-                                break;
+                        child_dcp = kmem_alloc(sizeof(*child_dcp), KM_SLEEP);
+                        *child_dcp = *dcp;
+                        child_dcp->dc_obj = attr->za_first_integer;
+                        taskq_dispatch(dcp->dc_tq, dmu_objset_find_dp_impl,
+                            child_dcp, TQ_SLEEP);
                 }
                 zap_cursor_fini(&zc);
-
-                if (err != 0) {
-                        dsl_dir_rele(dd, FTAG);
-                        kmem_free(attr, sizeof (zap_attribute_t));
-                        return (err);
-                }
         }
 
         /*
          * Iterate over all snapshots.
          */
-        if (flags & DS_FIND_SNAPSHOTS) {
+        if (dcp->dc_flags & DS_FIND_SNAPSHOTS) {
                 dsl_dataset_t *ds;
                 err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
 
                 if (err == 0) {
                         uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;

@@ -1620,11 +1661,11 @@
 
                                 err = dsl_dataset_hold_obj(dp,
                                     attr->za_first_integer, FTAG, &ds);
                                 if (err != 0)
                                         break;
-                                err = func(dp, ds, arg);
+                                err = dcp->dc_func(dp, ds, dcp->dc_arg);
                                 dsl_dataset_rele(ds, FTAG);
                                 if (err != 0)
                                         break;
                         }
                         zap_cursor_fini(&zc);

@@ -1633,21 +1674,74 @@
 
         dsl_dir_rele(dd, FTAG);
         kmem_free(attr, sizeof (zap_attribute_t));
 
         if (err != 0)
-                return (err);
+                goto fail;
 
         /*
          * Apply to self.
          */
         err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
         if (err != 0)
-                return (err);
-        err = func(dp, ds, arg);
+                goto fail;
+        err = dcp->dc_func(dp, ds, dcp->dc_arg);
         dsl_dataset_rele(ds, FTAG);
-        return (err);
+
+fail:
+        if (err) {
+                mutex_enter(dcp->dc_error_lock);
+                /* only keep first error */
+                if (*dcp->dc_error == 0)
+                        *dcp->dc_error = err;
+                mutex_exit(dcp->dc_error_lock);
+        }
+
+out:
+        dsl_pool_config_exit(dp, FTAG);
+        kmem_free(dcp, sizeof(*dcp));
+}
+
+/*
+ * Find objsets under and including ddobj, call func(ds) on each.
+ * func runs in taskq threads with dsl_pool_config held, in undefined order.
+ * Returns the first error recorded by a task, or 0; ENOMEM if no taskq.
+ */
+int
+dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
+    int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
+{
+        int error = 0;
+        taskq_t *tq = NULL;
+        int ntasks;
+        dmu_objset_find_ctx_t *dcp;
+        kmutex_t err_lock;
+
+        ntasks = vdev_count_leaves(dp->dp_spa) * 4;     /* 4 per leaf vdev */
+        tq = taskq_create("dmu_objset_find", ntasks, minclsyspri, ntasks,
+            INT_MAX, 0);
+        if (!tq)
+                return (SET_ERROR(ENOMEM));
+
+        mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL);
+        dcp = kmem_alloc(sizeof(*dcp), KM_SLEEP);
+        dcp->dc_tq = tq;
+        dcp->dc_dp = dp;
+        dcp->dc_obj = ddobj;
+        dcp->dc_func = func;
+        dcp->dc_arg = arg;
+        dcp->dc_flags = flags;
+        dcp->dc_error_lock = &err_lock;
+        dcp->dc_error = &error;
+        /* dcp will be freed by the task */
+        taskq_dispatch(tq, dmu_objset_find_dp_impl, dcp, TQ_SLEEP);
+
+        taskq_wait(tq);         /* child tasks are dispatched on this tq too */
+        taskq_destroy(tq);
+        mutex_destroy(&err_lock);
+
+        return (error);
 }
 
 /*
  * Find all objsets under name, and for each, call 'func(child_name, arg)'.
  * The dp_config_rwlock must not be held when this is called, and it