Print this page
3949 ztest fault injection should avoid resilvering devices
3950 ztest: deadman fires when we're doing a scan
3951 ztest hang when running dedup test
3952 ztest: ztest_reguid test and ztest_fault_inject don't play nice together
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>

@@ -182,10 +182,11 @@
         .zo_metaslab_gang_bang = 32 << 10
 };
 
 extern uint64_t metaslab_gang_bang;
 extern uint64_t metaslab_df_alloc_threshold;
+extern uint64_t zfs_deadman_synctime;
 
 static ztest_shared_opts_t *ztest_shared_opts;
 static ztest_shared_opts_t ztest_opts;
 
 typedef struct ztest_shared_ds {

@@ -361,11 +362,11 @@
         { ztest_dmu_snapshot_create_destroy,    1,      &zopt_sometimes },
         { ztest_spa_create_destroy,             1,      &zopt_sometimes },
         { ztest_fault_inject,                   1,      &zopt_sometimes },
         { ztest_ddt_repair,                     1,      &zopt_sometimes },
         { ztest_dmu_snapshot_hold,              1,      &zopt_sometimes },
-        { ztest_reguid,                         1,      &zopt_sometimes },
+        { ztest_reguid,                         1,      &zopt_rarely    },
         { ztest_spa_rename,                     1,      &zopt_rarely    },
         { ztest_scrub,                          1,      &zopt_rarely    },
         { ztest_spa_upgrade,                    1,      &zopt_rarely    },
         { ztest_dsl_dataset_promote_busy,       1,      &zopt_rarely    },
         { ztest_vdev_attach_detach,             1,      &zopt_sometimes },

@@ -4752,10 +4753,18 @@
         VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
 
         ASSERT(leaves >= 1);
 
         /*
+         * Grab the name lock as reader. There are some operations
+         * which don't like to have their vdevs changed while
+         * they are in progress (e.g. spa_change_guid). Those
+         * operations will have grabbed the name lock as writer.
+         */
+        (void) rw_rdlock(&ztest_name_lock);
+
+        /*
          * We need SCL_STATE here because we're going to look at vd0->vdev_tsd.
          */
         spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 
         if (ztest_random(2) == 0) {

@@ -4780,12 +4789,19 @@
 
                 vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0);
                 if (vd0 != NULL && vd0->vdev_top->vdev_islog)
                         islog = B_TRUE;
 
-                if (vd0 != NULL && maxfaults != 1) {
                         /*
+                 * If the top-level vdev needs to be resilvered
+                 * then we only allow faults on the device that is
+                 * resilvering.
+                 */
+                if (vd0 != NULL && maxfaults != 1 &&
+                    (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) ||
+                    vd0->vdev_resilvering)) {
+                        /*
                          * Make vd0 explicitly claim to be unreadable,
                          * or unwriteable, or reach behind its back
                          * and close the underlying fd.  We can do this if
                          * maxfaults == 0 because we'll fail and reexecute,
                          * and we can do it if maxfaults >= 2 because we'll

@@ -4811,10 +4827,11 @@
                  */
                 spa_aux_vdev_t *sav = &spa->spa_l2cache;
 
                 if (sav->sav_count == 0) {
                         spa_config_exit(spa, SCL_STATE, FTAG);
+                        (void) rw_unlock(&ztest_name_lock);
                         return;
                 }
                 vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)];
                 guid0 = vd0->vdev_guid;
                 (void) strcpy(path0, vd0->vdev_path);

@@ -4824,10 +4841,11 @@
                 leaves = 1;
                 maxfaults = INT_MAX;    /* no limit on cache devices */
         }
 
         spa_config_exit(spa, SCL_STATE, FTAG);
+        (void) rw_unlock(&ztest_name_lock);
 
         /*
          * If we can tolerate two or more faults, or we're dealing
          * with a slog, randomly online/offline vd0.
          */

@@ -5288,20 +5306,37 @@
 
 static void *
 ztest_deadman_thread(void *arg)
 {
         ztest_shared_t *zs = arg;
-        int grace = 300;
-        hrtime_t delta;
+        spa_t *spa = ztest_spa;
+        hrtime_t delta, total = 0;
 
-        delta = (zs->zs_thread_stop - zs->zs_thread_start) / NANOSEC + grace;
+        for (;;) {
+                delta = (zs->zs_thread_stop - zs->zs_thread_start) /
+                    NANOSEC + zfs_deadman_synctime;
 
         (void) poll(NULL, 0, (int)(1000 * delta));
 
-        fatal(0, "failed to complete within %d seconds of deadline", grace);
-
+                /*
+                 * If the pool is suspended then fail immediately. Otherwise,
+                 * check to see if the pool is making any progress. If
+                 * vdev_deadman() discovers that there hasn't been any recent
+                 * I/O then it will end up aborting the tests.
+                 */
+                if (spa_suspended(spa)) {
+                        fatal(0, "aborting test after %llu seconds because "
+                            "pool has transitioned to a suspended state.",
+                            zfs_deadman_synctime);
         return (NULL);
+                }
+                vdev_deadman(spa->spa_root_vdev);
+
+                total += zfs_deadman_synctime;
+                (void) printf("ztest has been running for %lld seconds\n",
+                    total);
+        }
 }
 
 static void
 ztest_execute(int test, ztest_info_t *zi, uint64_t id)
 {

@@ -6022,10 +6057,11 @@
         char *fd_data_str = getenv("ZTEST_FD_DATA");
 
         (void) setvbuf(stdout, NULL, _IOLBF, 0);
 
         dprintf_setup(&argc, argv);
+        zfs_deadman_synctime = 300;
 
         ztest_fd_rand = open("/dev/urandom", O_RDONLY);
         ASSERT3S(ztest_fd_rand, >=, 0);
 
         if (!fd_data_str) {