3956 ::vdev -r should work with pipelines
3957 ztest should update the cachefile before killing itself
3958 multiple scans can lead to partial resilvering
3959 ddt entries are not always resilvered
3960 dsl_scan can skip over dedup-ed blocks if physical birth != logical birth
3961 freed gang blocks are not resilvered and can cause pool to suspend
3962 ztest should print out zfs debug buffer before exiting
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>

======== ztest.c: old version (before this change) ========
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012 by Delphix. All rights reserved.
  24  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  25  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  26  */
  27 
  28 /*
  29  * The objective of this program is to provide a DMU/ZAP/SPA stress test
  30  * that runs entirely in userland, is easy to use, and easy to extend.
  31  *
  32  * The overall design of the ztest program is as follows:
  33  *
  34  * (1) For each major functional area (e.g. adding vdevs to a pool,
  35  *     creating and destroying datasets, reading and writing objects, etc)
  36  *     we have a simple routine to test that functionality.  These
  37  *     individual routines do not have to do anything "stressful".
  38  *
  39  * (2) We turn these simple functionality tests into a stress test by
  40  *     running them all in parallel, with as many threads as desired,
  41  *     and spread across as many datasets, objects, and vdevs as desired.
  42  *
  43  * (3) While all this is happening, we inject faults into the pool to
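
The three numbered points above are the entire architecture: simple
per-area routines, run concurrently, with faults injected underneath.
A minimal self-contained sketch of that pattern follows (hypothetical
names and stub bodies; ztest's real dispatcher is a richer table-driven
ztest_thread(), created later in this file):

#include <pthread.h>
#include <stdint.h>
#include <stdlib.h>

typedef void ztest_func_t(uint64_t id);

/* One simple, deliberately non-stressful routine per functional area. */
static void test_vdev_ops(uint64_t id)    { /* e.g. add a vdev */ }
static void test_dataset_ops(uint64_t id) { /* create/destroy a dataset */ }
static void test_object_io(uint64_t id)   { /* read/write an object */ }

static ztest_func_t *const test_table[] = {
	test_vdev_ops, test_dataset_ops, test_object_io,
};
#define	NTESTS	(sizeof (test_table) / sizeof (test_table[0]))

/* Stress comes from many threads looping over random table entries. */
static void *
worker(void *arg)
{
	for (int i = 0; i < 1000; i++)
		test_table[rand() % NTESTS]((uintptr_t)arg);
	return (NULL);
}

int
main(void)
{
	pthread_t tid[8];

	for (uintptr_t t = 0; t < 8; t++)
		(void) pthread_create(&tid[t], NULL, worker, (void *)t);
	for (int t = 0; t < 8; t++)
		(void) pthread_join(tid[t], NULL);
	return (0);
}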


 750                 if (0 != access(zo->zo_alt_ztest, X_OK)) {
 751                         ztest_dump_core = B_FALSE;
 752                         fatal(B_TRUE, "invalid alternate ztest: %s",
 753                             zo->zo_alt_ztest);
 754                 } else if (0 != access(zo->zo_alt_libpath, X_OK)) {
 755                         ztest_dump_core = B_FALSE;
 756                         fatal(B_TRUE, "invalid alternate lib directory %s",
 757                             zo->zo_alt_libpath);
 758                 }
 759 
 760                 umem_free(cmd, MAXPATHLEN);
 761                 umem_free(realaltdir, MAXPATHLEN);
 762         }
 763 }
 764 
 765 static void
 766 ztest_kill(ztest_shared_t *zs)
 767 {
 768         zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa));
 769         zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa));
 770         (void) kill(getpid(), SIGKILL);
 771 }
 772 
 773 static uint64_t
 774 ztest_random(uint64_t range)
 775 {
 776         uint64_t r;
 777 
 778         ASSERT3S(ztest_fd_rand, >=, 0);
 779 
 780         if (range == 0)
 781                 return (0);
 782 
 783         if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r))
 784                 fatal(1, "short read from /dev/urandom");
 785 
 786         return (r % range);
 787 }
 788 
 789 /* ARGSUSED */


2714 
2715 }
2716 
2717 /*
2718  * Verify that we can attach and detach devices.
2719  */
2720 /* ARGSUSED */
2721 void
2722 ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
2723 {
2724         ztest_shared_t *zs = ztest_shared;
2725         spa_t *spa = ztest_spa;
2726         spa_aux_vdev_t *sav = &spa->spa_spares;
2727         vdev_t *rvd = spa->spa_root_vdev;
2728         vdev_t *oldvd, *newvd, *pvd;
2729         nvlist_t *root;
2730         uint64_t leaves;
2731         uint64_t leaf, top;
2732         uint64_t ashift = ztest_get_ashift();
2733         uint64_t oldguid, pguid;
2734         size_t oldsize, newsize;
2735         char oldpath[MAXPATHLEN], newpath[MAXPATHLEN];
2736         int replacing;
2737         int oldvd_has_siblings = B_FALSE;
2738         int newvd_is_spare = B_FALSE;
2739         int oldvd_is_log;
2740         int error, expected_error;
2741 
2742         VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
2743         leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
2744 
2745         spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2746 
2747         /*
2748          * Decide whether to do an attach or a replace.
2749          */
2750         replacing = ztest_random(2);
2751 
2752         /*
2753          * Pick a random top-level vdev.
2754          */


2873 
2874         /*
2875          * If our parent was the replacing vdev, but the replace completed,
2876          * then instead of failing with ENOTSUP we may either succeed,
2877          * fail with ENODEV, or fail with EOVERFLOW.
2878          */
2879         if (expected_error == ENOTSUP &&
2880             (error == 0 || error == ENODEV || error == EOVERFLOW))
2881                 expected_error = error;
2882 
2883         /*
2884          * If someone grew the LUN, the replacement may be too small.
2885          */
2886         if (error == EOVERFLOW || error == EBUSY)
2887                 expected_error = error;
2888 
2889         /* XXX workaround 6690467 */
2890         if (error != expected_error && expected_error != EBUSY) {
2891                 fatal(0, "attach (%s %llu, %s %llu, %d) "
2892                     "returned %d, expected %d",
2893                     oldpath, (longlong_t)oldsize, newpath,
2894                     (longlong_t)newsize, replacing, error, expected_error);
2895         }
2896 
2897         VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
2898 }
2899 
2900 /*
2901  * Callback function which expands the physical size of the vdev.
2902  */
2903 vdev_t *
2904 grow_vdev(vdev_t *vd, void *arg)
2905 {
2906         spa_t *spa = vd->vdev_spa;
2907         size_t *newsize = arg;
2908         size_t fsize;
2909         int fd;
2910 
2911         ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
2912         ASSERT(vd->vdev_ops->vdev_op_leaf);
2913 
2914         if ((fd = open(vd->vdev_path, O_RDWR)) == -1)


4784                  * and we'll write random garbage to the randomly chosen leaf.
4785                  */
4786                 (void) snprintf(path0, sizeof (path0), ztest_dev_template,
4787                     ztest_opts.zo_dir, ztest_opts.zo_pool,
4788                     top * leaves + zs->zs_splits);
4789                 (void) snprintf(pathrand, sizeof (pathrand), ztest_dev_template,
4790                     ztest_opts.zo_dir, ztest_opts.zo_pool,
4791                     top * leaves + leaf);
4792 
4793                 vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0);
4794                 if (vd0 != NULL && vd0->vdev_top->vdev_islog)
4795                         islog = B_TRUE;
4796 
4797                 /*
4798                  * If the top-level vdev needs to be resilvered
4799                  * then we only allow faults on the device that is
4800                  * resilvering.
4801                  */
4802                 if (vd0 != NULL && maxfaults != 1 &&
4803                     (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) ||
4804                     vd0->vdev_resilvering)) {
4805                         /*
4806                          * Make vd0 explicitly claim to be unreadable,
4807                          * or unwriteable, or reach behind its back
4808                          * and close the underlying fd.  We can do this if
4809                          * maxfaults == 0 because we'll fail and reexecute,
4810                          * and we can do it if maxfaults >= 2 because we'll
4811                          * have enough redundancy.  If maxfaults == 1, the
4812                          * combination of this with injection of random data
4813                          * corruption below exceeds the pool's fault tolerance.
4814                          */
4815                         vdev_file_t *vf = vd0->vdev_tsd;
4816 
4817                         if (vf != NULL && ztest_random(3) == 0) {
4818                                 (void) close(vf->vf_vnode->v_fd);
4819                                 vf->vf_vnode->v_fd = -1;
4820                         } else if (ztest_random(2) == 0) {
4821                                 vd0->vdev_cant_read = B_TRUE;
4822                         } else {
4823                                 vd0->vdev_cant_write = B_TRUE;
4824                         }


5634                     ztest_dataset_open(t) != 0)
5635                         return;
5636                 VERIFY(thr_create(0, 0, ztest_thread, (void *)(uintptr_t)t,
5637                     THR_BOUND, &tid[t]) == 0);
5638         }
5639 
5640         /*
5641          * Wait for all of the tests to complete.  We go in reverse order
5642          * so we don't close datasets while threads are still using them.
5643          */
5644         for (int t = ztest_opts.zo_threads - 1; t >= 0; t--) {
5645                 VERIFY(thr_join(tid[t], NULL, NULL) == 0);
5646                 if (t < ztest_opts.zo_datasets)
5647                         ztest_dataset_close(t);
5648         }
5649 
5650         txg_wait_synced(spa_get_dsl(spa), 0);
5651 
5652         zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
5653         zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));

5654 
5655         umem_free(tid, ztest_opts.zo_threads * sizeof (thread_t));
5656 
5657         /* Kill the resume thread */
5658         ztest_exiting = B_TRUE;
5659         VERIFY(thr_join(resume_tid, NULL, NULL) == 0);
5660         ztest_resume(spa);
5661 
5662         /*
5663          * Right before closing the pool, kick off a bunch of async I/O;
5664          * spa_close() should wait for it to complete.
5665          */
5666         for (uint64_t object = 1; object < 50; object++)
5667                 dmu_prefetch(spa->spa_meta_objset, object, 0, 1ULL << 20);
5668 
5669         spa_close(spa, FTAG);
5670 
5671         /*
5672          * Verify that we can loop over all pools.
5673          */

======== ztest.c: new version (after this change) ========

   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2013 by Delphix. All rights reserved.
  24  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  25  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  26  */
  27 
  28 /*
  29  * The objective of this program is to provide a DMU/ZAP/SPA stress test
  30  * that runs entirely in userland, is easy to use, and easy to extend.
  31  *
  32  * The overall design of the ztest program is as follows:
  33  *
  34  * (1) For each major functional area (e.g. adding vdevs to a pool,
  35  *     creating and destroying datasets, reading and writing objects, etc)
  36  *     we have a simple routine to test that functionality.  These
  37  *     individual routines do not have to do anything "stressful".
  38  *
  39  * (2) We turn these simple functionality tests into a stress test by
  40  *     running them all in parallel, with as many threads as desired,
  41  *     and spread across as many datasets, objects, and vdevs as desired.
  42  *
  43  * (3) While all this is happening, we inject faults into the pool to


 750                 if (0 != access(zo->zo_alt_ztest, X_OK)) {
 751                         ztest_dump_core = B_FALSE;
 752                         fatal(B_TRUE, "invalid alternate ztest: %s",
 753                             zo->zo_alt_ztest);
 754                 } else if (0 != access(zo->zo_alt_libpath, X_OK)) {
 755                         ztest_dump_core = B_FALSE;
 756                         fatal(B_TRUE, "invalid alternate lib directory %s",
 757                             zo->zo_alt_libpath);
 758                 }
 759 
 760                 umem_free(cmd, MAXPATHLEN);
 761                 umem_free(realaltdir, MAXPATHLEN);
 762         }
 763 }
 764 
 765 static void
 766 ztest_kill(ztest_shared_t *zs)
 767 {
 768         zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa));
 769         zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa));
 770 
 771         /*
 772          * Before we kill off ztest, make sure that the config is updated.
 773          * See comment above spa_config_sync().
 774          */
 775         mutex_enter(&spa_namespace_lock);
 776         spa_config_sync(ztest_spa, B_FALSE, B_FALSE);
 777         mutex_exit(&spa_namespace_lock);
 778 
 779         zfs_dbgmsg_print(FTAG);
 780         (void) kill(getpid(), SIGKILL);
 781 }
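
This hunk is the fix for 3957 and 3962: push the final vdev config into
the cachefile under spa_namespace_lock, dump the ZFS debug buffer, and
only then deliver the deliberate SIGKILL. A sketch of the consumer side,
assuming the usual libzpool harness (kernel_init(), spa_open(),
spa_close(), and kernel_fini() are the real entry points; the wrapper
function itself is hypothetical):

static void
ztest_reimport_after_kill(void)
{
	spa_t *spa;

	/*
	 * A fresh ztest invocation can open the pool directly from the
	 * cachefile that spa_config_sync() wrote above; without that
	 * sync, any config change made since the last periodic sync
	 * would be missing and the import would see a stale vdev tree.
	 */
	kernel_init(FREAD | FWRITE);
	VERIFY(spa_open(ztest_opts.zo_pool, &spa, FTAG) == 0);
	spa_close(spa, FTAG);
	kernel_fini();
}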
 782 
 783 static uint64_t
 784 ztest_random(uint64_t range)
 785 {
 786         uint64_t r;
 787 
 788         ASSERT3S(ztest_fd_rand, >=, 0);
 789 
 790         if (range == 0)
 791                 return (0);
 792 
 793         if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r))
 794                 fatal(1, "short read from /dev/urandom");
 795 
 796         return (r % range);
 797 }
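
ztest_random() is the randomness primitive behind every decision in this
file. Two idioms cover nearly all callers, shown below in a hypothetical
helper; note that reducing a uniform 64-bit value mod range carries a
slight modulo bias, which is harmless for a stress test:

/* Hypothetical illustration of the two idioms used throughout. */
static void
ztest_random_example(uint64_t leaves)
{
	int replacing = ztest_random(2);	/* coin flip: attach vs. replace */
	uint64_t leaf = ztest_random(leaves);	/* bounded index: pick a leaf */

	(void) replacing;
	(void) leaf;
}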
 798 
 799 /* ARGSUSED */


2724 
2725 }
2726 
2727 /*
2728  * Verify that we can attach and detach devices.
2729  */
2730 /* ARGSUSED */
2731 void
2732 ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
2733 {
2734         ztest_shared_t *zs = ztest_shared;
2735         spa_t *spa = ztest_spa;
2736         spa_aux_vdev_t *sav = &spa->spa_spares;
2737         vdev_t *rvd = spa->spa_root_vdev;
2738         vdev_t *oldvd, *newvd, *pvd;
2739         nvlist_t *root;
2740         uint64_t leaves;
2741         uint64_t leaf, top;
2742         uint64_t ashift = ztest_get_ashift();
2743         uint64_t oldguid, pguid;
2744         uint64_t oldsize, newsize;
2745         char oldpath[MAXPATHLEN], newpath[MAXPATHLEN];
2746         int replacing;
2747         int oldvd_has_siblings = B_FALSE;
2748         int newvd_is_spare = B_FALSE;
2749         int oldvd_is_log;
2750         int error, expected_error;
2751 
2752         VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
2753         leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
2754 
2755         spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2756 
2757         /*
2758          * Decide whether to do an attach or a replace.
2759          */
2760         replacing = ztest_random(2);
2761 
2762         /*
2763          * Pick a random top-level vdev.
2764          */


2883 
2884         /*
2885          * If our parent was the replacing vdev, but the replace completed,
2886          * then instead of failing with ENOTSUP we may either succeed,
2887          * fail with ENODEV, or fail with EOVERFLOW.
2888          */
2889         if (expected_error == ENOTSUP &&
2890             (error == 0 || error == ENODEV || error == EOVERFLOW))
2891                 expected_error = error;
2892 
2893         /*
2894          * If someone grew the LUN, the replacement may be too small.
2895          */
2896         if (error == EOVERFLOW || error == EBUSY)
2897                 expected_error = error;
2898 
2899         /* XXX workaround 6690467 */
2900         if (error != expected_error && expected_error != EBUSY) {
2901                 fatal(0, "attach (%s %llu, %s %llu, %d) "
2902                     "returned %d, expected %d",
2903                     oldpath, oldsize, newpath,
2904                     newsize, replacing, error, expected_error);
2905         }
2906 
2907         VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
2908 }
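
The reconciliation above is the standard ztest pattern: predict an errno
from the vdev topology, perform the operation, then widen the
expectation to cover outcomes that legitimate races can produce.
Condensed, the core sequence is assumed to look like this (hypothetical
sketch; the elided middle of this function builds root and computes the
initial expected_error):

	error = spa_vdev_attach(spa, oldguid, root, replacing);

	/* Races may change the outcome; tolerated errors were folded in. */
	if (error != expected_error && expected_error != EBUSY)
		fatal(0, "attach returned %d, expected %d",
		    error, expected_error);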
2909 
2910 /*
2911  * Callback function which expands the physical size of the vdev.
2912  */
2913 vdev_t *
2914 grow_vdev(vdev_t *vd, void *arg)
2915 {
2916         spa_t *spa = vd->vdev_spa;
2917         size_t *newsize = arg;
2918         size_t fsize;
2919         int fd;
2920 
2921         ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
2922         ASSERT(vd->vdev_ops->vdev_op_leaf);
2923 
2924         if ((fd = open(vd->vdev_path, O_RDWR)) == -1)
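
The hunk cuts off at the open(2); the mechanism grow_vdev() exercises is
simply growing the file that backs the leaf vdev. A standalone sketch of
that mechanism (illustrative helper, not the elided body of grow_vdev()):

#include <fcntl.h>
#include <unistd.h>
#include <sys/types.h>

/*
 * Grow a file-backed vdev's store to newsize bytes; returns 0 on
 * success, -1 on failure, leaving the file untouched on error paths.
 */
static int
grow_backing_file(const char *path, off_t newsize)
{
	int fd, rc;

	if ((fd = open(path, O_RDWR)) == -1)
		return (-1);
	/* Only ever grow: shrinking would invalidate allocated blocks. */
	rc = (lseek(fd, 0, SEEK_END) < newsize) ?
	    ftruncate(fd, newsize) : 0;
	(void) close(fd);
	return (rc);
}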


4794                  * and we'll write random garbage to the randomly chosen leaf.
4795                  */
4796                 (void) snprintf(path0, sizeof (path0), ztest_dev_template,
4797                     ztest_opts.zo_dir, ztest_opts.zo_pool,
4798                     top * leaves + zs->zs_splits);
4799                 (void) snprintf(pathrand, sizeof (pathrand), ztest_dev_template,
4800                     ztest_opts.zo_dir, ztest_opts.zo_pool,
4801                     top * leaves + leaf);
4802 
4803                 vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0);
4804                 if (vd0 != NULL && vd0->vdev_top->vdev_islog)
4805                         islog = B_TRUE;
4806 
4807                 /*
4808                  * If the top-level vdev needs to be resilvered
4809                  * then we only allow faults on the device that is
4810                  * resilvering.
4811                  */
4812                 if (vd0 != NULL && maxfaults != 1 &&
4813                     (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) ||
4814                     vd0->vdev_resilver_txg != 0)) {
4815                         /*
4816                          * Make vd0 explicitly claim to be unreadable,
4817                          * or unwriteable, or reach behind its back
4818                          * and close the underlying fd.  We can do this if
4819                          * maxfaults == 0 because we'll fail and reexecute,
4820                          * and we can do it if maxfaults >= 2 because we'll
4821                          * have enough redundancy.  If maxfaults == 1, the
4822                          * combination of this with injection of random data
4823                          * corruption below exceeds the pool's fault tolerance.
4824                          */
4825                         vdev_file_t *vf = vd0->vdev_tsd;
4826 
4827                         if (vf != NULL && ztest_random(3) == 0) {
4828                                 (void) close(vf->vf_vnode->v_fd);
4829                                 vf->vf_vnode->v_fd = -1;
4830                         } else if (ztest_random(2) == 0) {
4831                                 vd0->vdev_cant_read = B_TRUE;
4832                         } else {
4833                                 vd0->vdev_cant_write = B_TRUE;
4834                         }
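
For context, maxfaults encodes the pool's guaranteed fault tolerance.
The exact expression is computed earlier in this function and is not
shown in this hunk; the sketch below assumes the standard derivation:

	/*
	 * Assumed form: killing one raidz leg outright takes parity + 1
	 * leaf failures, and every leg of the m-way mirror must die, so
	 * any m * (parity + 1) - 1 failures leave the pool importable.
	 */
	maxfaults = MAX(zs->zs_mirrors, 1) *
	    (ztest_opts.zo_raidz_parity + 1) - 1;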


5644                     ztest_dataset_open(t) != 0)
5645                         return;
5646                 VERIFY(thr_create(0, 0, ztest_thread, (void *)(uintptr_t)t,
5647                     THR_BOUND, &tid[t]) == 0);
5648         }
5649 
5650         /*
5651          * Wait for all of the tests to complete.  We go in reverse order
5652          * so we don't close datasets while threads are still using them.
5653          */
5654         for (int t = ztest_opts.zo_threads - 1; t >= 0; t--) {
5655                 VERIFY(thr_join(tid[t], NULL, NULL) == 0);
5656                 if (t < ztest_opts.zo_datasets)
5657                         ztest_dataset_close(t);
5658         }
5659 
5660         txg_wait_synced(spa_get_dsl(spa), 0);
5661 
5662         zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
5663         zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));
5664         zfs_dbgmsg_print(FTAG);
5665 
5666         umem_free(tid, ztest_opts.zo_threads * sizeof (thread_t));
5667 
5668         /* Kill the resume thread */
5669         ztest_exiting = B_TRUE;
5670         VERIFY(thr_join(resume_tid, NULL, NULL) == 0);
5671         ztest_resume(spa);
5672 
5673         /*
5674          * Right before closing the pool, kick off a bunch of async I/O;
5675          * spa_close() should wait for it to complete.
5676          */
5677         for (uint64_t object = 1; object < 50; object++)
5678                 dmu_prefetch(spa->spa_meta_objset, object, 0, 1ULL << 20);
5679 
5680         spa_close(spa, FTAG);
5681 
5682         /*
5683          * Verify that we can loop over all pools.
5684          */