8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
24 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
25 * Copyright (c) 2013 Steven Hartland. All rights reserved.
26 * Copyright (c) 2014 Integros [integros.com]
27 * Copyright 2017 Joyent, Inc.
28 */
29
30 /*
31 * The objective of this program is to provide a DMU/ZAP/SPA stress test
32 * that runs entirely in userland, is easy to use, and easy to extend.
33 *
34 * The overall design of the ztest program is as follows:
35 *
36 * (1) For each major functional area (e.g. adding vdevs to a pool,
37 * creating and destroying datasets, reading and writing objects, etc.)
38 * we have a simple routine to test that functionality. These
39 * individual routines do not have to do anything "stressful".
40 *
41 * (2) We turn these simple functionality tests into a stress test by
42 * running them all in parallel, with as many threads as desired,
43 * and spread across as many datasets, objects, and vdevs as desired.
44 *
45 * (3) While all this is happening, we inject faults into the pool to
46 * verify that self-healing data really works.
47 *
228 } ztest_block_tag_t;
229
230 typedef struct bufwad {
231 uint64_t bw_index;
232 uint64_t bw_txg;
233 uint64_t bw_data;
234 } bufwad_t;
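
/*
 * A bufwad stamps a chunk of test data with its logical index, the txg
 * in which it was written, and a data word, so that a later read can
 * verify exactly what was committed and when.
 */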
235
236 /*
237 * XXX -- fix zfs range locks to be generic so we can use them here.
238 */
239 typedef enum {
240 RL_READER,
241 RL_WRITER,
242 RL_APPEND
243 } rl_type_t;
244
245 typedef struct rll {
246 void *rll_writer;
247 int rll_readers;
248 mutex_t rll_lock;
249 cond_t rll_cv;
250 } rll_t;
251
252 typedef struct rl {
253 uint64_t rl_object;
254 uint64_t rl_offset;
255 uint64_t rl_size;
256 rll_t *rl_lock;
257 } rl_t;
258
259 #define ZTEST_RANGE_LOCKS 64
260 #define ZTEST_OBJECT_LOCKS 64
261
262 /*
263 * Object descriptor. Used as a template for object lookup/create/remove.
264 */
265 typedef struct ztest_od {
266 uint64_t od_dir;
267 uint64_t od_object;
268 dmu_object_type_t od_type;
269 dmu_object_type_t od_crtype;
270 uint64_t od_blocksize;
271 uint64_t od_crblocksize;
272 uint64_t od_gen;
273 uint64_t od_crgen;
274 char od_name[ZFS_MAX_DATASET_NAME_LEN];
275 } ztest_od_t;
276
277 /*
278 * Per-dataset state.
279 */
280 typedef struct ztest_ds {
281 ztest_shared_ds_t *zd_shared;
282 objset_t *zd_os;
283 rwlock_t zd_zilog_lock;
284 zilog_t *zd_zilog;
285 ztest_od_t *zd_od; /* debugging aid */
286 char zd_name[ZFS_MAX_DATASET_NAME_LEN];
287 mutex_t zd_dirobj_lock;
288 rll_t zd_object_lock[ZTEST_OBJECT_LOCKS];
289 rll_t zd_range_lock[ZTEST_RANGE_LOCKS];
290 } ztest_ds_t;
291
292 /*
293 * Per-iteration state.
294 */
295 typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id);
296
297 typedef struct ztest_info {
298 ztest_func_t *zi_func; /* test function */
299 uint64_t zi_iters; /* iterations per execution */
300 uint64_t *zi_interval; /* execute every <interval> seconds */
301 } ztest_info_t;
302
303 typedef struct ztest_shared_callstate {
304 uint64_t zc_count; /* per-pass count */
305 uint64_t zc_time; /* per-pass time */
306 uint64_t zc_next; /* next time to call this function */
307 } ztest_shared_callstate_t;
374 { ztest_reguid, 1, &zopt_rarely },
375 { ztest_spa_rename, 1, &zopt_rarely },
376 { ztest_scrub, 1, &zopt_rarely },
377 { ztest_spa_upgrade, 1, &zopt_rarely },
378 { ztest_dsl_dataset_promote_busy, 1, &zopt_rarely },
379 { ztest_vdev_attach_detach, 1, &zopt_sometimes },
380 { ztest_vdev_LUN_growth, 1, &zopt_rarely },
381 { ztest_vdev_add_remove, 1,
382 &ztest_opts.zo_vdevtime },
383 { ztest_vdev_aux_add_remove, 1,
384 &ztest_opts.zo_vdevtime },
385 };
386
387 #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t))
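
/*
 * A sketch of how this table is consumed (the real loop lives in
 * ztest_thread(); details such as the interval randomization differ):
 *
 *	ztest_info_t *zi = &ztest_info[f];
 *	ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(f);
 *	if (now >= zc->zc_next) {
 *		for (uint64_t i = 0; i < zi->zi_iters; i++)
 *			zi->zi_func(zd, id);
 *		zc->zc_next = now + *zi->zi_interval;
 *	}
 */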
388
389 /*
390 * The following struct is used to hold a list of uncalled commit callbacks.
391 * The callbacks are ordered by txg number.
392 */
393 typedef struct ztest_cb_list {
394 mutex_t zcl_callbacks_lock;
395 list_t zcl_callbacks;
396 } ztest_cb_list_t;
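
/*
 * For example, the commit callback test registers entries like this and,
 * once the txg is known, links them into zcl in txg order:
 *
 *	cb_data = ztest_create_cb_data(os, txg);
 *	dmu_tx_callback_register(tx, ztest_commit_callback, cb_data);
 */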
397
398 /*
399 * Stuff we need to share writably between parent and child.
400 */
401 typedef struct ztest_shared {
402 boolean_t zs_do_init;
403 hrtime_t zs_proc_start;
404 hrtime_t zs_proc_stop;
405 hrtime_t zs_thread_start;
406 hrtime_t zs_thread_stop;
407 hrtime_t zs_thread_kill;
408 uint64_t zs_enospc_count;
409 uint64_t zs_vdev_next_leaf;
410 uint64_t zs_vdev_aux;
411 uint64_t zs_alloc;
412 uint64_t zs_space;
413 uint64_t zs_splits;
414 uint64_t zs_mirrors;
415 uint64_t zs_metaslab_sz;
416 uint64_t zs_metaslab_df_alloc_threshold;
417 uint64_t zs_guid;
418 } ztest_shared_t;
419
420 #define ID_PARALLEL -1ULL
421
422 static char ztest_dev_template[] = "%s/%s.%llua";
423 static char ztest_aux_template[] = "%s/%s.%s.%llu";
424 ztest_shared_t *ztest_shared;
425
426 static spa_t *ztest_spa = NULL;
427 static ztest_ds_t *ztest_ds;
428
429 static mutex_t ztest_vdev_lock;
430
431 /*
432 * The ztest_name_lock protects the pool and dataset namespace used by
433 * the individual tests. To modify the namespace, consumers must grab
434 * this lock as writer. Grabbing the lock as reader will ensure that the
435 * namespace does not change while the lock is held.
436 */
437 static rwlock_t ztest_name_lock;
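
/*
 * For example, ztest_spa_rename() and ztest_reguid() take this lock as
 * writer, while tests that merely resolve names take it as reader:
 *
 *	(void) rw_rdlock(&ztest_name_lock);
 *	...
 *	(void) rw_unlock(&ztest_name_lock);
 */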
438
439 static boolean_t ztest_dump_core = B_TRUE;
440 static boolean_t ztest_exiting;
441
442 /* Global commit callback list */
443 static ztest_cb_list_t zcl;
444
445 enum ztest_object {
446 ZTEST_META_DNODE = 0,
447 ZTEST_DIROBJ,
448 ZTEST_OBJECTS
449 };
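
/*
 * ZTEST_DIROBJ is the ZAP directory that maps test object names to
 * object numbers; ztest_od_init() below sets od_dir to it.
 */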
450
451 static void usage(boolean_t) __NORETURN;
452
453 /*
454 * These libumem hooks provide a reasonable set of defaults for the allocator's
455 * debugging facilities.
456 */
457 const char *
1073 VERIFY(nvlist_add_uint64(props, zpool_prop_to_name(prop), value) == 0);
1074
1075 error = spa_prop_set(spa, props);
1076
1077 nvlist_free(props);
1078
1079 if (error == ENOSPC) {
1080 ztest_record_enospc(FTAG);
1081 return (error);
1082 }
1083 ASSERT0(error);
1084
1085 return (error);
1086 }
1087
1088 static void
1089 ztest_rll_init(rll_t *rll)
1090 {
1091 rll->rll_writer = NULL;
1092 rll->rll_readers = 0;
1093 VERIFY(_mutex_init(&rll->rll_lock, USYNC_THREAD, NULL) == 0);
1094 VERIFY(cond_init(&rll->rll_cv, USYNC_THREAD, NULL) == 0);
1095 }
1096
1097 static void
1098 ztest_rll_destroy(rll_t *rll)
1099 {
1100 ASSERT(rll->rll_writer == NULL);
1101 ASSERT(rll->rll_readers == 0);
1102 VERIFY(_mutex_destroy(&rll->rll_lock) == 0);
1103 VERIFY(cond_destroy(&rll->rll_cv) == 0);
1104 }
1105
1106 static void
1107 ztest_rll_lock(rll_t *rll, rl_type_t type)
1108 {
1109 VERIFY(mutex_lock(&rll->rll_lock) == 0);
1110
1111 if (type == RL_READER) {
1112 while (rll->rll_writer != NULL)
1113 (void) cond_wait(&rll->rll_cv, &rll->rll_lock);
1114 rll->rll_readers++;
1115 } else {
1116 while (rll->rll_writer != NULL || rll->rll_readers)
1117 (void) cond_wait(&rll->rll_cv, &rll->rll_lock);
1118 rll->rll_writer = curthread;
1119 }
1120
1121 VERIFY(mutex_unlock(&rll->rll_lock) == 0);
1122 }
1123
1124 static void
1125 ztest_rll_unlock(rll_t *rll)
1126 {
1127 VERIFY(mutex_lock(&rll->rll_lock) == 0);
1128
1129 if (rll->rll_writer) {
1130 ASSERT(rll->rll_readers == 0);
1131 rll->rll_writer = NULL;
1132 } else {
1133 ASSERT(rll->rll_readers != 0);
1134 ASSERT(rll->rll_writer == NULL);
1135 rll->rll_readers--;
1136 }
1137
1138 if (rll->rll_writer == NULL && rll->rll_readers == 0)
1139 VERIFY(cond_broadcast(&rll->rll_cv) == 0);
1140
1141 VERIFY(mutex_unlock(&rll->rll_lock) == 0);
1142 }
1143
1144 static void
1145 ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type)
1146 {
1147 rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
1148
1149 ztest_rll_lock(rll, type);
1150 }
1151
1152 static void
1153 ztest_object_unlock(ztest_ds_t *zd, uint64_t object)
1154 {
1155 rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
1156
1157 ztest_rll_unlock(rll);
1158 }
1159
1160 static rl_t *
1161 ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
1180 ztest_range_unlock(rl_t *rl)
1181 {
1182 rll_t *rll = rl->rl_lock;
1183
1184 ztest_rll_unlock(rll);
1185
1186 umem_free(rl, sizeof (*rl));
1187 }
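
/*
 * A typical caller brackets its I/O with these helpers; a sketch of the
 * pattern used by the write path:
 *
 *	ztest_object_lock(zd, object, RL_READER);
 *	rl = ztest_range_lock(zd, object, offset, size, RL_WRITER);
 *	... perform the write ...
 *	ztest_range_unlock(rl);
 *	ztest_object_unlock(zd, object);
 */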
1188
1189 static void
1190 ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os)
1191 {
1192 zd->zd_os = os;
1193 zd->zd_zilog = dmu_objset_zil(os);
1194 zd->zd_shared = szd;
1195 dmu_objset_name(os, zd->zd_name);
1196
1197 if (zd->zd_shared != NULL)
1198 zd->zd_shared->zd_seq = 0;
1199
1200 VERIFY(rwlock_init(&zd->zd_zilog_lock, USYNC_THREAD, NULL) == 0);
1201 VERIFY(_mutex_init(&zd->zd_dirobj_lock, USYNC_THREAD, NULL) == 0);
1202
1203 for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++)
1204 ztest_rll_init(&zd->zd_object_lock[l]);
1205
1206 for (int l = 0; l < ZTEST_RANGE_LOCKS; l++)
1207 ztest_rll_init(&zd->zd_range_lock[l]);
1208 }
1209
1210 static void
1211 ztest_zd_fini(ztest_ds_t *zd)
1212 {
1213 VERIFY(_mutex_destroy(&zd->zd_dirobj_lock) == 0);
1214
1215 for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++)
1216 ztest_rll_destroy(&zd->zd_object_lock[l]);
1217
1218 for (int l = 0; l < ZTEST_RANGE_LOCKS; l++)
1219 ztest_rll_destroy(&zd->zd_range_lock[l]);
1220 }
1221
1222 #define TXG_MIGHTWAIT (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT)
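
/*
 * TXG_MIGHTWAIT exercises both assignment modes: nine times in ten we
 * wait for a txg, one time in ten we must cope with a failed TXG_NOWAIT
 * assignment. Typical usage (sketch); a return of 0 means the tx has
 * already been aborted:
 *
 *	tx = dmu_tx_create(os);
 *	dmu_tx_hold_write(tx, object, offset, size);
 *	txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
 *	if (txg == 0)
 *		return (ENOSPC);
 */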
1223
1224 static uint64_t
1225 ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag)
1226 {
1227 uint64_t txg;
1228 int error;
1229
1230 /*
1231 * Attempt to assign tx to some transaction group.
1232 */
1233 error = dmu_tx_assign(tx, txg_how);
1948 return (lr);
1949 }
1950
1951 void
1952 ztest_lr_free(void *lr, size_t lrsize, char *name)
1953 {
1954 size_t namesize = name ? strlen(name) + 1 : 0;
1955
1956 umem_free(lr, lrsize + namesize);
1957 }
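
/*
 * ztest_lr_alloc() appends the name to the log record allocation, so the
 * alloc and free must agree on both the record size and the name, e.g.:
 *
 *	lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
 *	...
 *	ztest_lr_free(lr, sizeof (*lr), od->od_name);
 */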
1958
1959 /*
1960 * Lookup a bunch of objects. Returns the number of objects not found.
1961 */
1962 static int
1963 ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count)
1964 {
1965 int missing = 0;
1966 int error;
1967
1968 ASSERT(_mutex_held(&zd->zd_dirobj_lock));
1969
1970 for (int i = 0; i < count; i++, od++) {
1971 od->od_object = 0;
1972 error = zap_lookup(zd->zd_os, od->od_dir, od->od_name,
1973 sizeof (uint64_t), 1, &od->od_object);
1974 if (error) {
1975 ASSERT(error == ENOENT);
1976 ASSERT(od->od_object == 0);
1977 missing++;
1978 } else {
1979 dmu_buf_t *db;
1980 ztest_block_tag_t *bbt;
1981 dmu_object_info_t doi;
1982
1983 ASSERT(od->od_object != 0);
1984 ASSERT(missing == 0); /* there should be no gaps */
1985
1986 ztest_object_lock(zd, od->od_object, RL_READER);
1987 VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os,
1988 od->od_object, FTAG, &db));
1989 dmu_object_info_from_db(db, &doi);
1990 bbt = ztest_bt_bonus(db);
1991 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
1992 od->od_type = doi.doi_type;
1993 od->od_blocksize = doi.doi_data_block_size;
1994 od->od_gen = bbt->bt_gen;
1995 dmu_buf_rele(db, FTAG);
1996 ztest_object_unlock(zd, od->od_object);
1997 }
1998 }
1999
2000 return (missing);
2001 }
2002
2003 static int
2004 ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count)
2005 {
2006 int missing = 0;
2007
2008 ASSERT(_mutex_held(&zd->zd_dirobj_lock));
2009
2010 for (int i = 0; i < count; i++, od++) {
2011 if (missing) {
2012 od->od_object = 0;
2013 missing++;
2014 continue;
2015 }
2016
2017 lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
2018
2019 lr->lr_doid = od->od_dir;
2020 lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */
2021 lr->lrz_type = od->od_crtype;
2022 lr->lrz_blocksize = od->od_crblocksize;
2023 lr->lrz_ibshift = ztest_random_ibshift();
2024 lr->lrz_bonustype = DMU_OT_UINT64_OTHER;
2025 lr->lrz_bonuslen = dmu_bonus_max();
2026 lr->lr_gen = od->od_crgen;
2027 lr->lr_crtime[0] = time(NULL);
2028
2033 } else {
2034 od->od_object = lr->lr_foid;
2035 od->od_type = od->od_crtype;
2036 od->od_blocksize = od->od_crblocksize;
2037 od->od_gen = od->od_crgen;
2038 ASSERT(od->od_object != 0);
2039 }
2040
2041 ztest_lr_free(lr, sizeof (*lr), od->od_name);
2042 }
2043
2044 return (missing);
2045 }
2046
2047 static int
2048 ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count)
2049 {
2050 int missing = 0;
2051 int error;
2052
2053 ASSERT(_mutex_held(&zd->zd_dirobj_lock));
2054
2055 od += count - 1;
2056
2057 for (int i = count - 1; i >= 0; i--, od--) {
2058 if (missing) {
2059 missing++;
2060 continue;
2061 }
2062
2063 /*
2064 * No object was found.
2065 */
2066 if (od->od_object == 0)
2067 continue;
2068
2069 lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
2070
2071 lr->lr_doid = od->od_dir;
2072
2073 if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) {
2179 ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
2180 {
2181 int err;
2182 ztest_block_tag_t wbt;
2183 dmu_object_info_t doi;
2184 enum ztest_io_type io_type;
2185 uint64_t blocksize;
2186 void *data;
2187
2188 VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0);
2189 blocksize = doi.doi_data_block_size;
2190 data = umem_alloc(blocksize, UMEM_NOFAIL);
2191
2192 /*
2193 * Pick an I/O type at random, biased toward writing block tags.
2194 */
2195 io_type = ztest_random(ZTEST_IO_TYPES);
2196 if (ztest_random(2) == 0)
2197 io_type = ZTEST_IO_WRITE_TAG;
2198
2199 (void) rw_rdlock(&zd->zd_zilog_lock);
2200
2201 switch (io_type) {
2202
2203 case ZTEST_IO_WRITE_TAG:
2204 ztest_bt_generate(&wbt, zd->zd_os, object, offset, 0, 0, 0);
2205 (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt);
2206 break;
2207
2208 case ZTEST_IO_WRITE_PATTERN:
2209 (void) memset(data, 'a' + (object + offset) % 5, blocksize);
2210 if (ztest_random(2) == 0) {
2211 /*
2212 * Induce fletcher2 collisions to ensure that
2213 * zio_ddt_collision() detects and resolves them
2214 * when using fletcher2-verify for deduplication.
2215 */
2216 ((uint64_t *)data)[0] ^= 1ULL << 63;
2217 ((uint64_t *)data)[4] ^= 1ULL << 63;
2218 }
2219 (void) ztest_write(zd, object, offset, blocksize, data);
2220 break;
2221
2222 case ZTEST_IO_WRITE_ZEROES:
2223 bzero(data, blocksize);
2224 (void) ztest_write(zd, object, offset, blocksize, data);
2225 break;
2226
2227 case ZTEST_IO_TRUNCATE:
2228 (void) ztest_truncate(zd, object, offset, blocksize);
2229 break;
2230
2231 case ZTEST_IO_SETATTR:
2232 (void) ztest_setattr(zd, object);
2233 break;
2234
2235 case ZTEST_IO_REWRITE:
2236 (void) rw_rdlock(&ztest_name_lock);
2237 err = ztest_dsl_prop_set_uint64(zd->zd_name,
2238 ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa),
2239 B_FALSE);
2240 VERIFY(err == 0 || err == ENOSPC);
2241 err = ztest_dsl_prop_set_uint64(zd->zd_name,
2242 ZFS_PROP_COMPRESSION,
2243 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION),
2244 B_FALSE);
2245 VERIFY(err == 0 || err == ENOSPC);
2246 (void) rw_unlock(&ztest_name_lock);
2247
2248 VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data,
2249 DMU_READ_NO_PREFETCH));
2250
2251 (void) ztest_write(zd, object, offset, blocksize, data);
2252 break;
2253 }
2254
2255 (void) rw_unlock(&zd->zd_zilog_lock);
2256
2257 umem_free(data, blocksize);
2258 }
2259
2260 /*
2261 * Initialize an object description template.
2262 */
2263 static void
2264 ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index,
2265 dmu_object_type_t type, uint64_t blocksize, uint64_t gen)
2266 {
2267 od->od_dir = ZTEST_DIROBJ;
2268 od->od_object = 0;
2269
2270 od->od_crtype = type;
2271 od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize();
2272 od->od_crgen = gen;
2273
2274 od->od_type = DMU_OT_NONE;
2275 od->od_blocksize = 0;
2276 od->od_gen = 0;
2277
2278 (void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]",
2279 tag, (int64_t)id, index);
2280 }
2281
2282 /*
2283 * Lookup or create the objects for a test using the od template.
2284 * If the objects do not all exist, or if 'remove' is specified,
2285 * remove any existing objects and create new ones. Otherwise,
2286 * use the existing objects.
2287 */
2288 static int
2289 ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove)
2290 {
2291 int count = size / sizeof (*od);
2292 int rv = 0;
2293
2294 VERIFY(mutex_lock(&zd->zd_dirobj_lock) == 0);
2295 if ((ztest_lookup(zd, od, count) != 0 || remove) &&
2296 (ztest_remove(zd, od, count) != 0 ||
2297 ztest_create(zd, od, count) != 0))
2298 rv = -1;
2299 zd->zd_od = od;
2300 VERIFY(mutex_unlock(&zd->zd_dirobj_lock) == 0);
2301
2302 return (rv);
2303 }
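
/*
 * Typical setup for a test, as in ztest_dmu_object_alloc_free() below:
 *
 *	ztest_od_t od[4];
 *
 *	for (int b = 0; b < 4; b++)
 *		ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0);
 *	if (ztest_object_init(zd, od, sizeof (od), B_TRUE) != 0)
 *		return;
 */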
2304
2305 /* ARGSUSED */
2306 void
2307 ztest_zil_commit(ztest_ds_t *zd, uint64_t id)
2308 {
2309 zilog_t *zilog = zd->zd_zilog;
2310
2311 (void) rw_rdlock(&zd->zd_zilog_lock);
2312
2313 zil_commit(zilog, ztest_random(ZTEST_OBJECTS));
2314
2315 /*
2316 * Remember the committed values in zd->zd_shared, which is in
2317 * parent/child shared memory. If we die, the next iteration of ztest_run()
2318 * will verify that the log really does contain this record.
2319 */
2320 mutex_enter(&zilog->zl_lock);
2321 ASSERT(zd->zd_shared != NULL);
2322 ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq);
2323 zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq;
2324 mutex_exit(&zilog->zl_lock);
2325
2326 (void) rw_unlock(&zd->zd_zilog_lock);
2327 }
2328
2329 /*
2330 * This function simulates the operations that occur during a mount/unmount
2331 * cycle. We hold the dataset across these operations in an
2332 * attempt to expose any implicit assumptions about ZIL management.
2333 */
2334 /* ARGSUSED */
2335 void
2336 ztest_zil_remount(ztest_ds_t *zd, uint64_t id)
2337 {
2338 objset_t *os = zd->zd_os;
2339
2340 /*
2341 * We grab the zd_dirobj_lock to ensure that no other thread is
2342 * updating the zil (i.e. adding in-memory log records) and the
2343 * zd_zilog_lock to block any I/O.
2344 */
2345 VERIFY0(mutex_lock(&zd->zd_dirobj_lock));
2346 (void) rw_wrlock(&zd->zd_zilog_lock);
2347
2348 /* zfsvfs_teardown() */
2349 zil_close(zd->zd_zilog);
2350
2351 /* zfsvfs_setup() */
2352 VERIFY(zil_open(os, ztest_get_data) == zd->zd_zilog);
2353 zil_replay(os, zd, ztest_replay_vector);
2354
2355 (void) rw_unlock(&zd->zd_zilog_lock);
2356 VERIFY(mutex_unlock(&zd->zd_dirobj_lock) == 0);
2357 }
2358
2359 /*
2360 * Verify that we can't destroy an active pool, create an existing pool,
2361 * or create a pool with a bad vdev spec.
2362 */
2363 /* ARGSUSED */
2364 void
2365 ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id)
2366 {
2367 ztest_shared_opts_t *zo = &ztest_opts;
2368 spa_t *spa;
2369 nvlist_t *nvroot;
2370
2371 /*
2372 * Attempt to create using a bad file.
2373 */
2374 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1);
2375 VERIFY3U(ENOENT, ==,
2376 spa_create("ztest_bad_file", nvroot, NULL, NULL));
2377 nvlist_free(nvroot);
2378
2379 /*
2380 * Attempt to create using a bad mirror.
2381 */
2382 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 2, 1);
2383 VERIFY3U(ENOENT, ==,
2384 spa_create("ztest_bad_mirror", nvroot, NULL, NULL));
2385 nvlist_free(nvroot);
2386
2387 /*
2388 * Attempt to create an existing pool. It shouldn't matter
2389 * what's in the nvroot; we should fail with EEXIST.
2390 */
2391 (void) rw_rdlock(&ztest_name_lock);
2392 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1);
2393 VERIFY3U(EEXIST, ==, spa_create(zo->zo_pool, nvroot, NULL, NULL));
2394 nvlist_free(nvroot);
2395 VERIFY3U(0, ==, spa_open(zo->zo_pool, &spa, FTAG));
2396 VERIFY3U(EBUSY, ==, spa_destroy(zo->zo_pool));
2397 spa_close(spa, FTAG);
2398
2399 (void) rw_unlock(&ztest_name_lock);
2400 }
2401
2402 /* ARGSUSED */
2403 void
2404 ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id)
2405 {
2406 spa_t *spa;
2407 uint64_t initial_version = SPA_VERSION_INITIAL;
2408 uint64_t version, newversion;
2409 nvlist_t *nvroot, *props;
2410 char *name;
2411
2412 VERIFY0(mutex_lock(&ztest_vdev_lock));
2413 name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool);
2414
2415 /*
2416 * Clean up from previous runs.
2417 */
2418 (void) spa_destroy(name);
2419
2420 nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0,
2421 0, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1);
2422
2423 /*
2424 * If we're configuring a RAIDZ device then make sure that the
2425 * initial version is capable of supporting that feature.
2426 */
2427 switch (ztest_opts.zo_raidz_parity) {
2428 case 0:
2429 case 1:
2430 initial_version = SPA_VERSION_INITIAL;
2431 break;
2432 case 2:
2451 VERIFY0(spa_create(name, nvroot, props, NULL));
2452 fnvlist_free(nvroot);
2453 fnvlist_free(props);
2454
2455 VERIFY0(spa_open(name, &spa, FTAG));
2456 VERIFY3U(spa_version(spa), ==, version);
2457 newversion = ztest_random_spa_version(version + 1);
2458
2459 if (ztest_opts.zo_verbose >= 4) {
2460 (void) printf("upgrading spa version from %llu to %llu\n",
2461 (u_longlong_t)version, (u_longlong_t)newversion);
2462 }
2463
2464 spa_upgrade(spa, newversion);
2465 VERIFY3U(spa_version(spa), >, version);
2466 VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config,
2467 zpool_prop_to_name(ZPOOL_PROP_VERSION)));
2468 spa_close(spa, FTAG);
2469
2470 strfree(name);
2471 VERIFY0(mutex_unlock(&ztest_vdev_lock));
2472 }
2473
2474 static vdev_t *
2475 vdev_lookup_by_path(vdev_t *vd, const char *path)
2476 {
2477 vdev_t *mvd;
2478
2479 if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
2480 return (vd);
2481
2482 for (int c = 0; c < vd->vdev_children; c++)
2483 if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
2484 NULL)
2485 return (mvd);
2486
2487 return (NULL);
2488 }
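
/*
 * Used, for example, by ztest_vdev_attach_detach() to map a generated
 * device path back to its leaf vdev:
 *
 *	newvd = vdev_lookup_by_path(rvd, newpath);
 */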
2489
2490 /*
2491 * Find the first available hole which can be used as a top-level vdev.
2504 if (cvd->vdev_ishole)
2505 break;
2506 }
2507 return (c);
2508 }
2509
2510 /*
2511 * Verify that vdev_add() works as expected.
2512 */
2513 /* ARGSUSED */
2514 void
2515 ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
2516 {
2517 ztest_shared_t *zs = ztest_shared;
2518 spa_t *spa = ztest_spa;
2519 uint64_t leaves;
2520 uint64_t guid;
2521 nvlist_t *nvroot;
2522 int error;
2523
2524 VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
2525 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz;
2526
2527 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2528
2529 ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves;
2530
2531 /*
2532 * If we have slogs then remove them 1/4 of the time.
2533 */
2534 if (spa_has_slogs(spa) && ztest_random(4) == 0) {
2535 /*
2536 * Grab the guid from the head of the log class rotor.
2537 */
2538 guid = spa_log_class(spa)->mc_rotor->mg_vd->vdev_guid;
2539
2540 spa_config_exit(spa, SCL_VDEV, FTAG);
2541
2542 /*
2543 * We have to grab the ztest_name_lock as writer to
2544 * prevent a race between removing a slog (dmu_objset_find)
2545 * and destroying a dataset. Removing the slog will
2546 * grab a reference on the dataset which may cause
2547 * dmu_objset_destroy() to fail with EBUSY thus
2548 * leaving the dataset in an inconsistent state.
2549 */
2550 VERIFY(rw_wrlock(&ztest_name_lock) == 0);
2551 error = spa_vdev_remove(spa, guid, B_FALSE);
2552 VERIFY(rw_unlock(&ztest_name_lock) == 0);
2553
2554 if (error && error != EEXIST)
2555 fatal(0, "spa_vdev_remove() = %d", error);
2556 } else {
2557 spa_config_exit(spa, SCL_VDEV, FTAG);
2558
2559 /*
2560 * Make 1/4 of the devices be log devices.
2561 */
2562 nvroot = make_vdev_root(NULL, NULL, NULL,
2563 ztest_opts.zo_vdev_size, 0,
2564 ztest_random(4) == 0, ztest_opts.zo_raidz,
2565 zs->zs_mirrors, 1);
2566
2567 error = spa_vdev_add(spa, nvroot);
2568 nvlist_free(nvroot);
2569
2570 if (error == ENOSPC)
2571 ztest_record_enospc("spa_vdev_add");
2572 else if (error != 0)
2573 fatal(0, "spa_vdev_add() = %d", error);
2574 }
2575
2576 VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
2577 }
2578
2579 /*
2580 * Verify that adding/removing aux devices (l2arc, hot spare) works as expected.
2581 */
2582 /* ARGSUSED */
2583 void
2584 ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
2585 {
2586 ztest_shared_t *zs = ztest_shared;
2587 spa_t *spa = ztest_spa;
2588 vdev_t *rvd = spa->spa_root_vdev;
2589 spa_aux_vdev_t *sav;
2590 char *aux;
2591 uint64_t guid = 0;
2592 int error;
2593
2594 if (ztest_random(2) == 0) {
2595 sav = &spa->spa_spares;
2596 aux = ZPOOL_CONFIG_SPARES;
2597 } else {
2598 sav = &spa->spa_l2cache;
2599 aux = ZPOOL_CONFIG_L2CACHE;
2600 }
2601
2602 VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
2603
2604 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2605
2606 if (sav->sav_count != 0 && ztest_random(4) == 0) {
2607 /*
2608 * Pick a random device to remove.
2609 */
2610 guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid;
2611 } else {
2612 /*
2613 * Find an unused device we can add.
2614 */
2615 zs->zs_vdev_aux = 0;
2616 for (;;) {
2617 char path[MAXPATHLEN];
2618 int c;
2619 (void) snprintf(path, sizeof (path), ztest_aux_template,
2620 ztest_opts.zo_dir, ztest_opts.zo_pool, aux,
2621 zs->zs_vdev_aux);
2622 for (c = 0; c < sav->sav_count; c++)
2639 nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL,
2640 (ztest_opts.zo_vdev_size * 5) / 4, 0, 0, 0, 0, 1);
2641 error = spa_vdev_add(spa, nvroot);
2642 if (error != 0)
2643 fatal(0, "spa_vdev_add(%p) = %d", nvroot, error);
2644 nvlist_free(nvroot);
2645 } else {
2646 /*
2647 * Remove an existing device. Sometimes dirty its
2648 * vdev state first to make sure we handle removal
2649 * of devices that have pending state changes.
2650 */
2651 if (ztest_random(2) == 0)
2652 (void) vdev_online(spa, guid, 0, NULL);
2653
2654 error = spa_vdev_remove(spa, guid, B_FALSE);
2655 if (error != 0 && error != EBUSY)
2656 fatal(0, "spa_vdev_remove(%llu) = %d", guid, error);
2657 }
2658
2659 VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
2660 }
2661
2662 /*
2663 * Split a pool if it has mirror top-level vdevs (tlvdevs).
2664 */
2665 /* ARGSUSED */
2666 void
2667 ztest_split_pool(ztest_ds_t *zd, uint64_t id)
2668 {
2669 ztest_shared_t *zs = ztest_shared;
2670 spa_t *spa = ztest_spa;
2671 vdev_t *rvd = spa->spa_root_vdev;
2672 nvlist_t *tree, **child, *config, *split, **schild;
2673 uint_t c, children, schildren = 0, lastlogid = 0;
2674 int error = 0;
2675
2676 VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
2677
2678 /* ensure we have a usable config; mirrors of raidz aren't supported */
2679 if (zs->zs_mirrors < 3 || ztest_opts.zo_raidz > 1) {
2680 VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
2681 return;
2682 }
2683
2684 /* clean up the old pool, if any */
2685 (void) spa_destroy("splitp");
2686
2687 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2688
2689 /* generate a config from the existing config */
2690 mutex_enter(&spa->spa_props_lock);
2691 VERIFY(nvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE,
2692 &tree) == 0);
2693 mutex_exit(&spa->spa_props_lock);
2694
2695 VERIFY(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child,
2696 &children) == 0);
2697
2698 schild = malloc(rvd->vdev_children * sizeof (nvlist_t *));
2699 for (c = 0; c < children; c++) {
2700 vdev_t *tvd = rvd->vdev_child[c];
2719 VERIFY(nvlist_dup(mchild[0], &schild[schildren++], 0) == 0);
2720 }
2721
2722 /* OK, create a config that can be used to split */
2723 VERIFY(nvlist_alloc(&split, NV_UNIQUE_NAME, 0) == 0);
2724 VERIFY(nvlist_add_string(split, ZPOOL_CONFIG_TYPE,
2725 VDEV_TYPE_ROOT) == 0);
2726 VERIFY(nvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, schild,
2727 lastlogid != 0 ? lastlogid : schildren) == 0);
2728
2729 VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0);
2730 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split) == 0);
2731
2732 for (c = 0; c < schildren; c++)
2733 nvlist_free(schild[c]);
2734 free(schild);
2735 nvlist_free(split);
2736
2737 spa_config_exit(spa, SCL_VDEV, FTAG);
2738
2739 (void) rw_wrlock(&ztest_name_lock);
2740 error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE);
2741 (void) rw_unlock(&ztest_name_lock);
2742
2743 nvlist_free(config);
2744
2745 if (error == 0) {
2746 (void) printf("successful split - results:\n");
2747 mutex_enter(&spa_namespace_lock);
2748 show_pool_stats(spa);
2749 show_pool_stats(spa_lookup("splitp"));
2750 mutex_exit(&spa_namespace_lock);
2751 ++zs->zs_splits;
2752 --zs->zs_mirrors;
2753 }
2754 VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
2755
2756 }
2757
2758 /*
2759 * Verify that we can attach and detach devices.
2760 */
2761 /* ARGSUSED */
2762 void
2763 ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
2764 {
2765 ztest_shared_t *zs = ztest_shared;
2766 spa_t *spa = ztest_spa;
2767 spa_aux_vdev_t *sav = &spa->spa_spares;
2768 vdev_t *rvd = spa->spa_root_vdev;
2769 vdev_t *oldvd, *newvd, *pvd;
2770 nvlist_t *root;
2771 uint64_t leaves;
2772 uint64_t leaf, top;
2773 uint64_t ashift = ztest_get_ashift();
2774 uint64_t oldguid, pguid;
2775 uint64_t oldsize, newsize;
2776 char oldpath[MAXPATHLEN], newpath[MAXPATHLEN];
2777 int replacing;
2778 int oldvd_has_siblings = B_FALSE;
2779 int newvd_is_spare = B_FALSE;
2780 int oldvd_is_log;
2781 int error, expected_error;
2782
2783 VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
2784 leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
2785
2786 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2787
2788 /*
2789 * Decide whether to do an attach or a replace.
2790 */
2791 replacing = ztest_random(2);
2792
2793 /*
2794 * Pick a random top-level vdev.
2795 */
2796 top = ztest_random_vdev_top(spa, B_TRUE);
2797
2798 /*
2799 * Pick a random leaf within it.
2800 */
2801 leaf = ztest_random(leaves);
2802
2803 /*
2824 ASSERT(oldvd->vdev_children >= 2);
2825 oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)];
2826 }
2827
2828 oldguid = oldvd->vdev_guid;
2829 oldsize = vdev_get_min_asize(oldvd);
2830 oldvd_is_log = oldvd->vdev_top->vdev_islog;
2831 (void) strcpy(oldpath, oldvd->vdev_path);
2832 pvd = oldvd->vdev_parent;
2833 pguid = pvd->vdev_guid;
2834
2835 /*
2836 * If oldvd has siblings, then half of the time, detach it.
2837 */
2838 if (oldvd_has_siblings && ztest_random(2) == 0) {
2839 spa_config_exit(spa, SCL_VDEV, FTAG);
2840 error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE);
2841 if (error != 0 && error != ENODEV && error != EBUSY &&
2842 error != ENOTSUP)
2843 fatal(0, "detach (%s) returned %d", oldpath, error);
2844 VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
2845 return;
2846 }
2847
2848 /*
2849 * For the new vdev, choose with equal probability between the two
2850 * standard paths (ending in either 'a' or 'b') or a random hot spare.
2851 */
2852 if (sav->sav_count != 0 && ztest_random(3) == 0) {
2853 newvd = sav->sav_vdevs[ztest_random(sav->sav_count)];
2854 newvd_is_spare = B_TRUE;
2855 (void) strcpy(newpath, newvd->vdev_path);
2856 } else {
2857 (void) snprintf(newpath, sizeof (newpath), ztest_dev_template,
2858 ztest_opts.zo_dir, ztest_opts.zo_pool,
2859 top * leaves + leaf);
2860 if (ztest_random(2) == 0)
2861 newpath[strlen(newpath) - 1] = 'b';
2862 newvd = vdev_lookup_by_path(rvd, newpath);
2863 }
2864
2918 * fail with ENODEV, or fail with EOVERFLOW.
2919 */
2920 if (expected_error == ENOTSUP &&
2921 (error == 0 || error == ENODEV || error == EOVERFLOW))
2922 expected_error = error;
2923
2924 /*
2925 * If someone grew the LUN, the replacement may be too small.
2926 */
2927 if (error == EOVERFLOW || error == EBUSY)
2928 expected_error = error;
2929
2930 /* XXX workaround 6690467 */
2931 if (error != expected_error && expected_error != EBUSY) {
2932 fatal(0, "attach (%s %llu, %s %llu, %d) "
2933 "returned %d, expected %d",
2934 oldpath, oldsize, newpath,
2935 newsize, replacing, error, expected_error);
2936 }
2937
2938 VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
2939 }
2940
2941 /*
2942 * Callback function which expands the physical size of the vdev.
2943 */
2944 vdev_t *
2945 grow_vdev(vdev_t *vd, void *arg)
2946 {
2947 spa_t *spa = vd->vdev_spa;
2948 size_t *newsize = arg;
2949 size_t fsize;
2950 int fd;
2951
2952 ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
2953 ASSERT(vd->vdev_ops->vdev_op_leaf);
2954
2955 if ((fd = open(vd->vdev_path, O_RDWR)) == -1)
2956 return (vd);
2957
2958 fsize = lseek(fd, 0, SEEK_END);
3046 return (cvd);
3047 }
3048 return (NULL);
3049 }
3050
3051 /*
3052 * Verify that dynamic LUN growth works as expected.
3053 */
3054 /* ARGSUSED */
3055 void
3056 ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
3057 {
3058 spa_t *spa = ztest_spa;
3059 vdev_t *vd, *tvd;
3060 metaslab_class_t *mc;
3061 metaslab_group_t *mg;
3062 size_t psize, newsize;
3063 uint64_t top;
3064 uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count;
3065
3066 VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
3067 spa_config_enter(spa, SCL_STATE, spa, RW_READER);
3068
3069 top = ztest_random_vdev_top(spa, B_TRUE);
3070
3071 tvd = spa->spa_root_vdev->vdev_child[top];
3072 mg = tvd->vdev_mg;
3073 mc = mg->mg_class;
3074 old_ms_count = tvd->vdev_ms_count;
3075 old_class_space = metaslab_class_get_space(mc);
3076
3077 /*
3078 * Determine the size of the first leaf vdev associated with
3079 * our top-level device.
3080 */
3081 vd = vdev_walk_tree(tvd, NULL, NULL);
3082 ASSERT3P(vd, !=, NULL);
3083 ASSERT(vd->vdev_ops->vdev_op_leaf);
3084
3085 psize = vd->vdev_psize;
3086
3087 /*
3088 * We only try to expand the vdev if it's healthy, less than 4x its
3089 * original size, and it has a valid psize.
3090 */
3091 if (tvd->vdev_state != VDEV_STATE_HEALTHY ||
3092 psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) {
3093 spa_config_exit(spa, SCL_STATE, spa);
3094 VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
3095 return;
3096 }
3097 ASSERT(psize > 0);
3098 newsize = psize + psize / 8;
3099 ASSERT3U(newsize, >, psize);
3100
3101 if (ztest_opts.zo_verbose >= 6) {
3102 (void) printf("Expanding LUN %s from %lu to %lu\n",
3103 vd->vdev_path, (ulong_t)psize, (ulong_t)newsize);
3104 }
3105
3106 /*
3107 * Growing the vdev is a two-step process:
3108 * 1) expand the physical size (i.e. relabel)
3109 * 2) online the vdev to create the new metaslabs
3110 */
3111 if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL ||
3112 vdev_walk_tree(tvd, online_vdev, NULL) != NULL ||
3113 tvd->vdev_state != VDEV_STATE_HEALTHY) {
3114 if (ztest_opts.zo_verbose >= 5) {
3115 (void) printf("Could not expand LUN because "
3116 "the vdev configuration changed.\n");
3117 }
3118 spa_config_exit(spa, SCL_STATE, spa);
3119 VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
3120 return;
3121 }
3122
3123 spa_config_exit(spa, SCL_STATE, spa);
3124
3125 /*
3126 * Expanding the LUN will update the config asynchronously,
3127 * thus we must wait for the async thread to complete any
3128 * pending tasks before proceeding.
3129 */
3130 for (;;) {
3131 boolean_t done;
3132 mutex_enter(&spa->spa_async_lock);
3133 done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks);
3134 mutex_exit(&spa->spa_async_lock);
3135 if (done)
3136 break;
3137 txg_wait_synced(spa_get_dsl(spa), 0);
3138 (void) poll(NULL, 0, 100);
3139 }
3140
3141 spa_config_enter(spa, SCL_STATE, spa, RW_READER);
3142
3143 tvd = spa->spa_root_vdev->vdev_child[top];
3144 new_ms_count = tvd->vdev_ms_count;
3145 new_class_space = metaslab_class_get_space(mc);
3146
3147 if (tvd->vdev_mg != mg || mg->mg_class != mc) {
3148 if (ztest_opts.zo_verbose >= 5) {
3149 (void) printf("Could not verify LUN expansion due to "
3150 "intervening vdev offline or remove.\n");
3151 }
3152 spa_config_exit(spa, SCL_STATE, spa);
3153 VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
3154 return;
3155 }
3156
3157 /*
3158 * Make sure we were able to grow the vdev.
3159 */
3160 if (new_ms_count <= old_ms_count)
3161 fatal(0, "LUN expansion failed: ms_count %llu <= %llu\n",
3162 new_ms_count, old_ms_count);
3163
3164 /*
3165 * Make sure we were able to grow the pool.
3166 */
3167 if (new_class_space <= old_class_space)
3168 fatal(0, "LUN expansion failed: class_space %llu <= %llu\n",
3169 new_class_space, old_class_space);
3170
3171 if (ztest_opts.zo_verbose >= 5) {
3172 char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ];
3173
3174 nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf));
3175 nicenum(new_class_space, newnumbuf, sizeof (newnumbuf));
3176 (void) printf("%s grew from %s to %s\n",
3177 spa->spa_name, oldnumbuf, newnumbuf);
3178 }
3179
3180 spa_config_exit(spa, SCL_STATE, spa);
3181 VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
3182 }
3183
3184 /*
3185 * Verify that dmu_objset_{create,destroy,open,close} work as expected.
3186 */
3187 /* ARGSUSED */
3188 static void
3189 ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
3190 {
3191 /*
3192 * Create the objects common to all ztest datasets.
3193 */
3194 VERIFY(zap_create_claim(os, ZTEST_DIROBJ,
3195 DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
3196 }
3197
3198 static int
3199 ztest_dataset_create(char *dsname)
3200 {
3201 uint64_t zilset = ztest_random(100);
3275 (void) snprintf(snapname, sizeof (snapname), "%s@%llu", osname,
3276 (u_longlong_t)id);
3277
3278 error = dsl_destroy_snapshot(snapname, B_FALSE);
3279 if (error != 0 && error != ENOENT)
3280 fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error);
3281 return (B_TRUE);
3282 }
3283
3284 /* ARGSUSED */
3285 void
3286 ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id)
3287 {
3288 ztest_ds_t zdtmp;
3289 int iters;
3290 int error;
3291 objset_t *os, *os2;
3292 char name[ZFS_MAX_DATASET_NAME_LEN];
3293 zilog_t *zilog;
3294
3295 (void) rw_rdlock(&ztest_name_lock);
3296
3297 (void) snprintf(name, sizeof (name), "%s/temp_%llu",
3298 ztest_opts.zo_pool, (u_longlong_t)id);
3299
3300 /*
3301 * If this dataset exists from a previous run, process its replay log
3302 * half of the time. If we don't replay it, then dmu_objset_destroy()
3303 * (invoked from ztest_objset_destroy_cb()) should just throw it away.
3304 */
3305 if (ztest_random(2) == 0 &&
3306 dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os) == 0) {
3307 ztest_zd_init(&zdtmp, NULL, os);
3308 zil_replay(os, &zdtmp, ztest_replay_vector);
3309 ztest_zd_fini(&zdtmp);
3310 dmu_objset_disown(os, FTAG);
3311 }
3312
3313 /*
3314 * There may be an old instance of the dataset we're about to
3315 * create lying around from a previous run. If so, destroy it
3316 * and all of its snapshots.
3317 */
3318 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
3319 DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
3320
3321 /*
3322 * Verify that the destroyed dataset is no longer in the namespace.
3323 */
3324 VERIFY3U(ENOENT, ==, dmu_objset_own(name, DMU_OST_OTHER, B_TRUE,
3325 FTAG, &os));
3326
3327 /*
3328 * Verify that we can create a new dataset.
3329 */
3330 error = ztest_dataset_create(name);
3331 if (error) {
3332 if (error == ENOSPC) {
3333 ztest_record_enospc(FTAG);
3334 (void) rw_unlock(&ztest_name_lock);
3335 return;
3336 }
3337 fatal(0, "dmu_objset_create(%s) = %d", name, error);
3338 }
3339
3340 VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os));
3341
3342 ztest_zd_init(&zdtmp, NULL, os);
3343
3344 /*
3345 * Open the intent log for it.
3346 */
3347 zilog = zil_open(os, ztest_get_data);
3348
3349 /*
3350 * Put some objects in there, do a little I/O to them,
3351 * and randomly take a couple of snapshots along the way.
3352 */
3353 iters = ztest_random(5);
3354 for (int i = 0; i < iters; i++) {
3362 */
3363 VERIFY3U(EEXIST, ==,
3364 dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL));
3365
3366 /*
3367 * Verify that we can hold an objset that is also owned.
3368 */
3369 VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os2));
3370 dmu_objset_rele(os2, FTAG);
3371
3372 /*
3373 * Verify that we cannot own an objset that is already owned.
3374 */
3375 VERIFY3U(EBUSY, ==,
3376 dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os2));
3377
3378 zil_close(zilog);
3379 dmu_objset_disown(os, FTAG);
3380 ztest_zd_fini(&zdtmp);
3381
3382 (void) rw_unlock(&ztest_name_lock);
3383 }
3384
3385 /*
3386 * Verify that dmu_snapshot_{create,destroy,open,close} work as expected.
3387 */
3388 void
3389 ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id)
3390 {
3391 (void) rw_rdlock(&ztest_name_lock);
3392 (void) ztest_snapshot_destroy(zd->zd_name, id);
3393 (void) ztest_snapshot_create(zd->zd_name, id);
3394 (void) rw_unlock(&ztest_name_lock);
3395 }
3396
3397 /*
3398 * Cleanup non-standard snapshots and clones.
3399 */
3400 void
3401 ztest_dsl_dataset_cleanup(char *osname, uint64_t id)
3402 {
3403 char snap1name[ZFS_MAX_DATASET_NAME_LEN];
3404 char clone1name[ZFS_MAX_DATASET_NAME_LEN];
3405 char snap2name[ZFS_MAX_DATASET_NAME_LEN];
3406 char clone2name[ZFS_MAX_DATASET_NAME_LEN];
3407 char snap3name[ZFS_MAX_DATASET_NAME_LEN];
3408 int error;
3409
3410 (void) snprintf(snap1name, sizeof (snap1name),
3411 "%s@s1_%llu", osname, id);
3412 (void) snprintf(clone1name, sizeof (clone1name),
3413 "%s/c1_%llu", osname, id);
3414 (void) snprintf(snap2name, sizeof (snap2name),
3433 error = dsl_destroy_snapshot(snap1name, B_FALSE);
3434 if (error && error != ENOENT)
3435 fatal(0, "dsl_destroy_snapshot(%s) = %d", snap1name, error);
3436 }
3437
3438 /*
3439 * Verify that dsl_dataset_promote() handles EBUSY.
3440 */
3441 void
3442 ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id)
3443 {
3444 objset_t *os;
3445 char snap1name[ZFS_MAX_DATASET_NAME_LEN];
3446 char clone1name[ZFS_MAX_DATASET_NAME_LEN];
3447 char snap2name[ZFS_MAX_DATASET_NAME_LEN];
3448 char clone2name[ZFS_MAX_DATASET_NAME_LEN];
3449 char snap3name[ZFS_MAX_DATASET_NAME_LEN];
3450 char *osname = zd->zd_name;
3451 int error;
3452
3453 (void) rw_rdlock(&ztest_name_lock);
3454
3455 ztest_dsl_dataset_cleanup(osname, id);
3456
3457 (void) snprintf(snap1name, sizeof (snap1name),
3458 "%s@s1_%llu", osname, id);
3459 (void) snprintf(clone1name, sizeof (clone1name),
3460 "%s/c1_%llu", osname, id);
3461 (void) snprintf(snap2name, sizeof (snap2name),
3462 "%s@s2_%llu", clone1name, id);
3463 (void) snprintf(clone2name, sizeof (clone2name),
3464 "%s/c2_%llu", osname, id);
3465 (void) snprintf(snap3name, sizeof (snap3name),
3466 "%s@s3_%llu", clone1name, id);
3467
3468 error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1);
3469 if (error && error != EEXIST) {
3470 if (error == ENOSPC) {
3471 ztest_record_enospc(FTAG);
3472 goto out;
3473 }
3510 fatal(0, "dmu_objset_create(%s) = %d", clone2name, error);
3511 }
3512
3513 error = dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, FTAG, &os);
3514 if (error)
3515 fatal(0, "dmu_objset_own(%s) = %d", snap2name, error);
3516 error = dsl_dataset_promote(clone2name, NULL);
3517 if (error == ENOSPC) {
3518 dmu_objset_disown(os, FTAG);
3519 ztest_record_enospc(FTAG);
3520 goto out;
3521 }
3522 if (error != EBUSY)
3523 fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name,
3524 error);
3525 dmu_objset_disown(os, FTAG);
3526
3527 out:
3528 ztest_dsl_dataset_cleanup(osname, id);
3529
3530 (void) rw_unlock(&ztest_name_lock);
3531 }
3532
3533 /*
3534 * Verify that dmu_object_{alloc,free} work as expected.
3535 */
3536 void
3537 ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id)
3538 {
3539 ztest_od_t od[4];
3540 int batchsize = sizeof (od) / sizeof (od[0]);
3541
3542 for (int b = 0; b < batchsize; b++)
3543 ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0);
3544
3545 /*
3546 * Destroy the previous batch of objects, create a new batch,
3547 * and do some I/O on the new objects.
3548 */
3549 if (ztest_object_init(zd, od, sizeof (od), B_TRUE) != 0)
3550 return;
4444
4445 if (error == ECANCELED) {
4446 ASSERT0(data->zcd_txg);
4447 ASSERT(!data->zcd_added);
4448
4449 /*
4450 * The private callback data should be destroyed here, but
4451 * since we are going to check the zcd_called field after
4452 * dmu_tx_abort(), we will destroy it there.
4453 */
4454 return;
4455 }
4456
4457 /* Was this callback added to the global callback list? */
4458 if (!data->zcd_added)
4459 goto out;
4460
4461 ASSERT3U(data->zcd_txg, !=, 0);
4462
4463 /* Remove our callback from the list */
4464 (void) mutex_lock(&zcl.zcl_callbacks_lock);
4465 list_remove(&zcl.zcl_callbacks, data);
4466 (void) mutex_unlock(&zcl.zcl_callbacks_lock);
4467
4468 out:
4469 umem_free(data, sizeof (ztest_cb_data_t));
4470 }
4471
4472 /* Allocate and initialize callback data structure */
4473 static ztest_cb_data_t *
4474 ztest_create_cb_data(objset_t *os, uint64_t txg)
4475 {
4476 ztest_cb_data_t *cb_data;
4477
4478 cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL);
4479
4480 cb_data->zcd_txg = txg;
4481 cb_data->zcd_spa = dmu_objset_spa(os);
4482
4483 return (cb_data);
4484 }
4485
4486 /*
4548 }
4549
4550 return;
4551 }
4552
4553 cb_data[2] = ztest_create_cb_data(os, txg);
4554 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]);
4555
4556 /*
4557 * Read existing data to make sure there isn't a future leak.
4558 */
4559 VERIFY(0 == dmu_read(os, od[0].od_object, 0, sizeof (uint64_t),
4560 &old_txg, DMU_READ_PREFETCH));
4561
4562 if (old_txg > txg)
4563 fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64,
4564 old_txg, txg);
4565
4566 dmu_write(os, od[0].od_object, 0, sizeof (uint64_t), &txg, tx);
4567
4568 (void) mutex_lock(&zcl.zcl_callbacks_lock);
4569
4570 /*
4571 * Since commit callbacks don't have any ordering requirement and since
4572 * it is theoretically possible for a commit callback to be called
4573 * after an arbitrary amount of time has elapsed since its txg has been
4574 * synced, it is difficult to reliably determine whether a commit
4575 * callback hasn't been called due to high load or due to a flawed
4576 * implementation.
4577 *
4578 * In practice, we will assume that if after a certain number of txgs a
4579 * commit callback hasn't been called, then most likely there's an
4580 * implementation bug..
4581 */
4582 tmp_cb = list_head(&zcl.zcl_callbacks);
4583 if (tmp_cb != NULL &&
4584 (txg - ZTEST_COMMIT_CALLBACK_THRESH) > tmp_cb->zcd_txg) {
4585 fatal(0, "Commit callback threshold exceeded, oldest txg: %"
4586 PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg);
4587 }
4588
4595 * (from other objsets) may have sneaked in.
4596 */
4597 tmp_cb = list_tail(&zcl.zcl_callbacks);
4598 while (tmp_cb != NULL && tmp_cb->zcd_txg > txg)
4599 tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb);
4600
4601 /* Add the 3 callbacks to the list */
4602 for (i = 0; i < 3; i++) {
4603 if (tmp_cb == NULL)
4604 list_insert_head(&zcl.zcl_callbacks, cb_data[i]);
4605 else
4606 list_insert_after(&zcl.zcl_callbacks, tmp_cb,
4607 cb_data[i]);
4608
4609 cb_data[i]->zcd_added = B_TRUE;
4610 VERIFY(!cb_data[i]->zcd_called);
4611
4612 tmp_cb = cb_data[i];
4613 }
4614
4615 (void) mutex_unlock(&zcl.zcl_callbacks_lock);
4616
4617 dmu_tx_commit(tx);
4618 }
4619
4620 /* ARGSUSED */
4621 void
4622 ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id)
4623 {
4624 zfs_prop_t proplist[] = {
4625 ZFS_PROP_CHECKSUM,
4626 ZFS_PROP_COMPRESSION,
4627 ZFS_PROP_COPIES,
4628 ZFS_PROP_DEDUP
4629 };
4630
4631 (void) rw_rdlock(&ztest_name_lock);
4632
4633 for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++)
4634 (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p],
4635 ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2));
4636
4637 (void) rw_unlock(&ztest_name_lock);
4638 }
4639
4640 /* ARGSUSED */
4641 void
4642 ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id)
4643 {
4644 nvlist_t *props = NULL;
4645
4646 (void) rw_rdlock(&ztest_name_lock);
4647
4648 (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_DEDUPDITTO,
4649 ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN));
4650
4651 VERIFY0(spa_prop_get(ztest_spa, &props));
4652
4653 if (ztest_opts.zo_verbose >= 6)
4654 dump_nvlist(props, 4);
4655
4656 nvlist_free(props);
4657
4658 (void) rw_unlock(&ztest_name_lock);
4659 }
4660
4661 static int
4662 user_release_one(const char *snapname, const char *holdname)
4663 {
4664 nvlist_t *snaps, *holds;
4665 int error;
4666
4667 snaps = fnvlist_alloc();
4668 holds = fnvlist_alloc();
4669 fnvlist_add_boolean(holds, holdname);
4670 fnvlist_add_nvlist(snaps, snapname, holds);
4671 fnvlist_free(holds);
4672 error = dsl_dataset_user_release(snaps, NULL);
4673 fnvlist_free(snaps);
4674 return (error);
4675 }
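
/*
 * Example from the hold/release test below, releasing a user hold that
 * may or may not still exist:
 *
 *	error = user_release_one(fullname, tag);
 *	if (error != ESRCH && error != ENOENT)
 *		ASSERT0(error);
 */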
4676
4677 /*
4678 * Test snapshot hold/release and deferred destroy.
4679 */
4680 void
4681 ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id)
4682 {
4683 int error;
4684 objset_t *os = zd->zd_os;
4685 objset_t *origin;
4686 char snapname[100];
4687 char fullname[100];
4688 char clonename[100];
4689 char tag[100];
4690 char osname[ZFS_MAX_DATASET_NAME_LEN];
4691 nvlist_t *holds;
4692
4693 (void) rw_rdlock(&ztest_name_lock);
4694
4695 dmu_objset_name(os, osname);
4696
4697 (void) snprintf(snapname, sizeof (snapname), "sh1_%llu", id);
4698 (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname);
4699 (void) snprintf(clonename, sizeof (clonename),
4700 "%s/ch1_%llu", osname, id);
4701 (void) snprintf(tag, sizeof (tag), "tag_%llu", id);
4702
4703 /*
4704 * Clean up from any previous run.
4705 */
4706 error = dsl_destroy_head(clonename);
4707 if (error != ENOENT)
4708 ASSERT0(error);
4709 error = user_release_one(fullname, tag);
4710 if (error != ESRCH && error != ENOENT)
4711 ASSERT0(error);
4712 error = dsl_destroy_snapshot(fullname, B_FALSE);
4713 if (error != ENOENT)
4778
4779 error = dsl_destroy_snapshot(fullname, B_FALSE);
4780 if (error != EBUSY) {
4781 fatal(0, "dsl_destroy_snapshot(%s, B_FALSE) = %d",
4782 fullname, error);
4783 }
4784
4785 error = dsl_destroy_snapshot(fullname, B_TRUE);
4786 if (error) {
4787 fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d",
4788 fullname, error);
4789 }
4790
4791 error = user_release_one(fullname, tag);
4792 if (error)
4793 fatal(0, "user_release_one(%s, %s) = %d", fullname, tag, error);
4794
4795 VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT);
4796
4797 out:
4798 (void) rw_unlock(&ztest_name_lock);
4799 }
4800
4801 /*
4802 * Inject random faults into the on-disk data.
4803 */
4804 /* ARGSUSED */
4805 void
4806 ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
4807 {
4808 ztest_shared_t *zs = ztest_shared;
4809 spa_t *spa = ztest_spa;
4810 int fd;
4811 uint64_t offset;
4812 uint64_t leaves;
4813 uint64_t bad = 0x1990c0ffeedecade;
4814 uint64_t top, leaf;
4815 char path0[MAXPATHLEN];
4816 char pathrand[MAXPATHLEN];
4817 size_t fsize;
4818 int bshift = SPA_MAXBLOCKSHIFT + 2;
4819 int iters = 1000;
4820 int maxfaults;
4821 int mirror_save;
4822 vdev_t *vd0 = NULL;
4823 uint64_t guid0 = 0;
4824 boolean_t islog = B_FALSE;
4825
4826 VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
4827 maxfaults = MAXFAULTS();
4828 leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
4829 mirror_save = zs->zs_mirrors;
4830 VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
4831
4832 ASSERT(leaves >= 1);
4833
4834 /*
4835 * Grab the name lock as reader. There are some operations
4836 * which don't like to have their vdevs changed while
4837 * they are in progress (e.g. spa_change_guid). Those
4838 * operations will have grabbed the name lock as writer.
4839 */
4840 (void) rw_rdlock(&ztest_name_lock);
4841
4842 /*
4843 * We need SCL_STATE here because we're going to look at vd0->vdev_tsd.
4844 */
4845 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4846
4847 if (ztest_random(2) == 0) {
4848 /*
4849 * Inject errors on a normal data device or slog device.
4850 */
4851 top = ztest_random_vdev_top(spa, B_TRUE);
4852 leaf = ztest_random(leaves) + zs->zs_splits;
4853
4854 /*
4855 * Generate paths to the first leaf in this top-level vdev,
4856 * and to the random leaf we selected. We'll induce transient
4857 * write failures and random online/offline activity on leaf 0,
4858 * and we'll write random garbage to the randomly chosen leaf.
4859 */
4860 (void) snprintf(path0, sizeof (path0), ztest_dev_template,
4889 vdev_file_t *vf = vd0->vdev_tsd;
4890
4891 if (vf != NULL && ztest_random(3) == 0) {
4892 (void) close(vf->vf_vnode->v_fd);
4893 vf->vf_vnode->v_fd = -1;
4894 } else if (ztest_random(2) == 0) {
4895 vd0->vdev_cant_read = B_TRUE;
4896 } else {
4897 vd0->vdev_cant_write = B_TRUE;
4898 }
4899 guid0 = vd0->vdev_guid;
4900 }
4901 } else {
4902 /*
4903 * Inject errors on an l2cache device.
4904 */
4905 spa_aux_vdev_t *sav = &spa->spa_l2cache;
4906
4907 if (sav->sav_count == 0) {
4908 spa_config_exit(spa, SCL_STATE, FTAG);
4909 (void) rw_unlock(&ztest_name_lock);
4910 return;
4911 }
4912 vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)];
4913 guid0 = vd0->vdev_guid;
4914 (void) strcpy(path0, vd0->vdev_path);
4915 (void) strcpy(pathrand, vd0->vdev_path);
4916
4917 leaf = 0;
4918 leaves = 1;
4919 maxfaults = INT_MAX; /* no limit on cache devices */
4920 }
4921
4922 spa_config_exit(spa, SCL_STATE, FTAG);
4923 (void) rw_unlock(&ztest_name_lock);
4924
4925 /*
4926 * If we can tolerate two or more faults, or we're dealing
4927 * with a slog, randomly online/offline vd0.
4928 */
4929 if ((maxfaults >= 2 || islog) && guid0 != 0) {
4930 if (ztest_random(10) < 6) {
4931 int flags = (ztest_random(2) == 0 ?
4932 ZFS_OFFLINE_TEMPORARY : 0);
4933
4934 /*
4935 * We have to grab the ztest_name_lock as writer to
4936 * prevent a race between offlining a slog and
4937 * destroying a dataset. Offlining the slog will
4938 * grab a reference on the dataset which may cause
4939 * dmu_objset_destroy() to fail with EBUSY thus
4940 * leaving the dataset in an inconsistent state.
4941 */
4942 if (islog)
4943 (void) rw_wrlock(&ztest_name_lock);
4944
4945 VERIFY(vdev_offline(spa, guid0, flags) != EBUSY);
4946
4947 if (islog)
4948 (void) rw_unlock(&ztest_name_lock);
4949 } else {
4950 /*
4951 * Ideally we would like to be able to randomly
4952 * call vdev_[on|off]line without holding locks
4953 * to force unpredictable failures, but the side
4954 * effects of vdev_[on|off]line prevent us from
4955 * doing so. We grab the ztest_vdev_lock here to
4956 * prevent a race between injection testing and
4957 * aux_vdev removal.
4958 */
4959 VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
4960 (void) vdev_online(spa, guid0, 0, NULL);
4961 VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
4962 }
4963 }
4964
4965 if (maxfaults == 0)
4966 return;
4967
4968 /*
4969 * We have at least single-fault tolerance, so inject data corruption.
4970 */
4971 fd = open(pathrand, O_RDWR);
4972
4973 if (fd == -1) /* we hit a gap in the device namespace */
4974 return;
4975
4976 fsize = lseek(fd, 0, SEEK_END);
4977
4978 while (--iters != 0) {
4979 /*
4980 * The offset must be chosen carefully to ensure that
4981 * we do not inject a given logical block with errors
5013 * because we also damage (parts of) the other side of
5014 * the mirror/raidz.
5015 *
5016 * Additionally, we will always have both an even and an
5017 * odd label, so that we can handle crashes in the
5018 * middle of vdev_config_sync().
5019 */
5020 if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE)
5021 continue;
5022
5023 /*
5024 * The two end labels are stored at the "end" of the disk, but
5025 * the end of the disk (vdev_psize) is aligned to
5026 * sizeof (vdev_label_t).
5027 */
5028 uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t));
5029 if ((leaf & 1) == 1 &&
5030 offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE)
5031 continue;
5032
5033 VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
5034 if (mirror_save != zs->zs_mirrors) {
5035 VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
5036 (void) close(fd);
5037 return;
5038 }
5039
5040 if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad))
5041 fatal(1, "can't inject bad word at 0x%llx in %s",
5042 offset, pathrand);
5043
5044 VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
5045
5046 if (ztest_opts.zo_verbose >= 7)
5047 (void) printf("injected bad word into %s,"
5048 " offset 0x%llx\n", pathrand, (u_longlong_t)offset);
5049 }
5050
5051 (void) close(fd);
5052 }
5053
5054 /*
5055 * Verify that DDT repair works as expected.
5056 */
5057 void
5058 ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
5059 {
5060 ztest_shared_t *zs = ztest_shared;
5061 spa_t *spa = ztest_spa;
5062 objset_t *os = zd->zd_os;
5063 ztest_od_t od[1];
5064 uint64_t object, blocksize, txg, pattern, psize;
5065 enum zio_checksum checksum = spa_dedup_checksum(spa);
5066 dmu_buf_t *db;
5067 dmu_tx_t *tx;
5068 abd_t *abd;
5069 blkptr_t blk;
5070 int copies = 2 * ZIO_DEDUPDITTO_MIN;
5071
5072 blocksize = ztest_random_blocksize();
5073 blocksize = MIN(blocksize, 2048); /* because we write so many */
5074
5075 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
5076
5077 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
5078 return;
5079
5080 /*
5081 * Take the name lock as writer to prevent anyone else from changing
5082 * the pool and dataset properties we need to maintain during this test.
5083 */
5084 (void) rw_wrlock(&ztest_name_lock);
5085
5086 if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP, checksum,
5087 B_FALSE) != 0 ||
5088 ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1,
5089 B_FALSE) != 0) {
5090 (void) rw_unlock(&ztest_name_lock);
5091 return;
5092 }
5093
5094 dmu_objset_stats_t dds;
5095 dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
5096 dmu_objset_fast_stat(os, &dds);
5097 dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
5098
5099 object = od[0].od_object;
5100 blocksize = od[0].od_blocksize;
5101 pattern = zs->zs_guid ^ dds.dds_guid;
5102
5103 ASSERT(object != 0);
5104
5105 tx = dmu_tx_create(os);
5106 dmu_tx_hold_write(tx, object, 0, copies * blocksize);
5107 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
5108 if (txg == 0) {
5109 (void) rw_unlock(&ztest_name_lock);
5110 return;
5111 }
5112
5113 /*
5114 * Write all the copies of our block.
5115 */
5116 for (int i = 0; i < copies; i++) {
5117 uint64_t offset = i * blocksize;
5118 int error = dmu_buf_hold(os, object, offset, FTAG, &db,
5119 DMU_READ_NO_PREFETCH);
5120 if (error != 0) {
5121 fatal(0, "dmu_buf_hold(%p, %llu, %llu) = %u",
5122 os, (u_longlong_t)object, (u_longlong_t)offset, error);
5123 }
5124 ASSERT(db->db_offset == offset);
5125 ASSERT(db->db_size == blocksize);
5126 ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) ||
5127 ztest_pattern_match(db->db_data, db->db_size, 0ULL));
5128 dmu_buf_will_fill(db, tx);
5129 ztest_pattern_set(db->db_data, db->db_size, pattern);
5137 * Find out what block we got.
5138 */
5139 VERIFY0(dmu_buf_hold(os, object, 0, FTAG, &db,
5140 DMU_READ_NO_PREFETCH));
5141 blk = *((dmu_buf_impl_t *)db)->db_blkptr;
5142 dmu_buf_rele(db, FTAG);
5143
5144 /*
5145 * Damage the block. Dedup-ditto will save us when we read it later.
5146 */
5147 psize = BP_GET_PSIZE(&blk);
5148 abd = abd_alloc_linear(psize, B_TRUE);
5149 ztest_pattern_set(abd_to_buf(abd), psize, ~pattern);
5150
5151 (void) zio_wait(zio_rewrite(NULL, spa, 0, &blk,
5152 abd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
5153 ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL));
5154
5155 abd_free(abd);
5156
5157 (void) rw_unlock(&ztest_name_lock);
5158 }
5159
5160 /*
5161 * Scrub the pool.
5162 */
5163 /* ARGSUSED */
5164 void
5165 ztest_scrub(ztest_ds_t *zd, uint64_t id)
5166 {
5167 spa_t *spa = ztest_spa;
5168
5169 (void) spa_scan(spa, POOL_SCAN_SCRUB);
5170 (void) poll(NULL, 0, 100); /* wait a moment, then force a restart */
5171 (void) spa_scan(spa, POOL_SCAN_SCRUB);
5172 }
5173
5174 /*
5175 * Change the guid for the pool.
5176 */
5177 /* ARGSUSED */
5178 void
5179 ztest_reguid(ztest_ds_t *zd, uint64_t id)
5180 {
5181 spa_t *spa = ztest_spa;
5182 uint64_t orig, load;
5183 int error;
5184
5185 orig = spa_guid(spa);
5186 load = spa_load_guid(spa);
5187
5188 (void) rw_wrlock(&ztest_name_lock);
5189 error = spa_change_guid(spa);
5190 (void) rw_unlock(&ztest_name_lock);
5191
5192 if (error != 0)
5193 return;
5194
5195 if (ztest_opts.zo_verbose >= 4) {
5196 (void) printf("Changed guid old %llu -> %llu\n",
5197 (u_longlong_t)orig, (u_longlong_t)spa_guid(spa));
5198 }
5199
5200 VERIFY3U(orig, !=, spa_guid(spa));
5201 VERIFY3U(load, ==, spa_load_guid(spa));
5202 }
5203
5204 /*
5205 * Rename the pool to a different name and then rename it back.
5206 */
5207 /* ARGSUSED */
5208 void
5209 ztest_spa_rename(ztest_ds_t *zd, uint64_t id)
5210 {
5211 char *oldname, *newname;
5212 spa_t *spa;
5213
5214 (void) rw_wrlock(&ztest_name_lock);
5215
5216 oldname = ztest_opts.zo_pool;
5217 newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL);
5218 (void) strcpy(newname, oldname);
5219 (void) strcat(newname, "_tmp");
5220
5221 /*
5222 * Do the rename
5223 */
5224 VERIFY3U(0, ==, spa_rename(oldname, newname));
5225
5226 /*
5227 * Try to open it under the old name, which shouldn't exist
5228 */
5229 VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG));
5230
5231 /*
5232 * Open it under the new name and make sure it's still the same spa_t.
5233 */
5234 VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));
5235
5236 ASSERT(spa == ztest_spa);
5237 spa_close(spa, FTAG);
5238
5239 /*
5240 * Rename it back to the original
5241 */
5242 VERIFY3U(0, ==, spa_rename(newname, oldname));
5243
5244 /*
5245 * Make sure it can still be opened
5246 */
5247 VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));
5248
5249 ASSERT(spa == ztest_spa);
5250 spa_close(spa, FTAG);
5251
5252 umem_free(newname, strlen(newname) + 1);
5253
5254 (void) rw_unlock(&ztest_name_lock);
5255 }
5256
5257 /*
5258 * Verify pool integrity by running zdb.
5259 */
5260 static void
5261 ztest_run_zdb(char *pool)
5262 {
5263 int status;
5264 char zdb[MAXPATHLEN + MAXNAMELEN + 20];
5265 char zbuf[1024];
5266 char *bin;
5267 char *ztest;
5268 char *isa;
5269 int isalen;
5270 FILE *fp;
5271
5272 (void) realpath(getexecname(), zdb);
5273
5274 /* zdb lives in /usr/sbin, while ztest lives in /usr/bin */
5589 * That's because zap_count() returns the open-context value,
5590 * while dmu_objset_space() returns the rootbp fill count.
5591 */
5592 VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs));
5593 dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch);
5594 ASSERT3U(dirobjs + 1, ==, usedobjs);
5595 }
5596
5597 static int
5598 ztest_dataset_open(int d)
5599 {
5600 ztest_ds_t *zd = &ztest_ds[d];
5601 uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq;
5602 objset_t *os;
5603 zilog_t *zilog;
5604 char name[ZFS_MAX_DATASET_NAME_LEN];
5605 int error;
5606
5607 ztest_dataset_name(name, ztest_opts.zo_pool, d);
5608
5609 (void) rw_rdlock(&ztest_name_lock);
5610
5611 error = ztest_dataset_create(name);
5612 if (error == ENOSPC) {
5613 (void) rw_unlock(&ztest_name_lock);
5614 ztest_record_enospc(FTAG);
5615 return (error);
5616 }
5617 ASSERT(error == 0 || error == EEXIST);
5618
5619 VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, zd, &os));
5620 (void) rw_unlock(&ztest_name_lock);
5621
5622 ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os);
5623
5624 zilog = zd->zd_zilog;
5625
5626 if (zilog->zl_header->zh_claim_lr_seq != 0 &&
5627 zilog->zl_header->zh_claim_lr_seq < committed_seq)
5628 fatal(0, "missing log records: claimed %llu < committed %llu",
5629 zilog->zl_header->zh_claim_lr_seq, committed_seq);
5630
5631 ztest_dataset_dirobj_verify(zd);
5632
5633 zil_replay(os, zd, ztest_replay_vector);
5634
5635 ztest_dataset_dirobj_verify(zd);
5636
5637 if (ztest_opts.zo_verbose >= 6)
5638 (void) printf("%s replay %llu blocks, %llu records, seq %llu\n",
5639 zd->zd_name,
5640 (u_longlong_t)zilog->zl_parse_blk_count,
5651 return (0);
5652 }
5653
5654 static void
5655 ztest_dataset_close(int d)
5656 {
5657 ztest_ds_t *zd = &ztest_ds[d];
5658
5659 zil_close(zd->zd_zilog);
5660 dmu_objset_disown(zd->zd_os, zd);
5661
5662 ztest_zd_fini(zd);
5663 }
5664
5665 /*
5666 * Kick off threads to run tests on all datasets in parallel.
5667 */
5668 static void
5669 ztest_run(ztest_shared_t *zs)
5670 {
5671 thread_t *tid;
5672 spa_t *spa;
5673 objset_t *os;
5674 thread_t resume_tid;
5675 int error;
5676
5677 ztest_exiting = B_FALSE;
5678
5679 /*
5680 * Initialize parent/child shared state.
5681 */
5682 VERIFY(_mutex_init(&ztest_vdev_lock, USYNC_THREAD, NULL) == 0);
5683 VERIFY(rwlock_init(&ztest_name_lock, USYNC_THREAD, NULL) == 0);
5684
5685 zs->zs_thread_start = gethrtime();
5686 zs->zs_thread_stop =
5687 zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC;
5688 zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop);
5689 zs->zs_thread_kill = zs->zs_thread_stop;
5690 if (ztest_random(100) < ztest_opts.zo_killrate) {
5691 zs->zs_thread_kill -=
5692 ztest_random(ztest_opts.zo_passtime * NANOSEC);
5693 }
5694
5695 (void) _mutex_init(&zcl.zcl_callbacks_lock, USYNC_THREAD, NULL);
5696
5697 list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t),
5698 offsetof(ztest_cb_data_t, zcd_node));
5699
5700 /*
5701 * Open our pool.
5702 */
5703 kernel_init(FREAD | FWRITE);
5704 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG));
5705 spa->spa_debug = B_TRUE;
5706 metaslab_preload_limit = ztest_random(20) + 1;
5707 ztest_spa = spa;
5708
5709 dmu_objset_stats_t dds;
5710 VERIFY0(dmu_objset_own(ztest_opts.zo_pool,
5711 DMU_OST_ANY, B_TRUE, FTAG, &os));
5712 dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
5713 dmu_objset_fast_stat(os, &dds);
5714 dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
5715 zs->zs_guid = dds.dds_guid;
5716 dmu_objset_disown(os, FTAG);
5717
5718 spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN;
5719
5720 /*
5721 * We don't expect the pool to suspend unless maxfaults == 0,
5722 * in which case ztest_fault_inject() temporarily takes away
5723 * the only valid replica.
5724 */
5725 if (MAXFAULTS() == 0)
5726 spa->spa_failmode = ZIO_FAILURE_MODE_WAIT;
5727 else
5728 spa->spa_failmode = ZIO_FAILURE_MODE_PANIC;
5729
5730 /*
5731 * Create a thread to periodically resume suspended I/O.
5732 */
5733 VERIFY(thr_create(0, 0, ztest_resume_thread, spa, THR_BOUND,
5734 &resume_tid) == 0);
5735
5736 /*
5737 * Create a deadman thread to abort() if we hang.
5738 */
5739 VERIFY(thr_create(0, 0, ztest_deadman_thread, zs, THR_BOUND,
5740 NULL) == 0);
5741
5742 /*
5743 * Verify that we can safely inquire about any object,
5744 * whether it's allocated or not. To make it interesting,
5745 * we probe a 5-wide window around each power of two.
5746 * This hits all edge cases, including zero and the max.
5747 */
5748 for (int t = 0; t < 64; t++) {
5749 for (int d = -5; d <= 5; d++) {
5750 error = dmu_object_info(spa->spa_meta_objset,
5751 (1ULL << t) + d, NULL);
5752 ASSERT(error == 0 || error == ENOENT ||
5753 error == EINVAL);
5754 }
5755 }
5756
5757 /*
5758 * If we got any ENOSPC errors on the previous run, destroy something.
5759 */
5760 if (zs->zs_enospc_count != 0) {
5761 int d = ztest_random(ztest_opts.zo_datasets);
5762 ztest_dataset_destroy(d);
5763 }
5764 zs->zs_enospc_count = 0;
5765
5766 tid = umem_zalloc(ztest_opts.zo_threads * sizeof (thread_t),
5767 UMEM_NOFAIL);
5768
5769 if (ztest_opts.zo_verbose >= 4)
5770 (void) printf("starting main threads...\n");
5771
5772 /*
5773 * Kick off all the tests that run in parallel.
5774 */
5775 for (int t = 0; t < ztest_opts.zo_threads; t++) {
5776 if (t < ztest_opts.zo_datasets &&
5777 ztest_dataset_open(t) != 0)
5778 return;
5779 VERIFY(thr_create(0, 0, ztest_thread, (void *)(uintptr_t)t,
5780 THR_BOUND, &tid[t]) == 0);
5781 }
5782
5783 /*
5784 * Wait for all of the tests to complete. We go in reverse order
5785 * so we don't close datasets while threads are still using them.
5786 */
5787 for (int t = ztest_opts.zo_threads - 1; t >= 0; t--) {
5788 VERIFY(thr_join(tid[t], NULL, NULL) == 0);
5789 if (t < ztest_opts.zo_datasets)
5790 ztest_dataset_close(t);
5791 }
5792
5793 txg_wait_synced(spa_get_dsl(spa), 0);
5794
5795 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
5796 zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));
5797 zfs_dbgmsg_print(FTAG);
5798
5799 umem_free(tid, ztest_opts.zo_threads * sizeof (thread_t));
5800
5801 /* Kill the resume thread */
5802 ztest_exiting = B_TRUE;
5803 VERIFY(thr_join(resume_tid, NULL, NULL) == 0);
5804 ztest_resume(spa);
5805
5806 /*
5807 * Right before closing the pool, kick off a bunch of async I/O;
5808 * spa_close() should wait for it to complete.
5809 */
5810 for (uint64_t object = 1; object < 50; object++) {
5811 dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20,
5812 ZIO_PRIORITY_SYNC_READ);
5813 }
5814
5815 spa_close(spa, FTAG);
5816
5817 /*
5818 * Verify that we can loop over all pools.
5819 */
5820 mutex_enter(&spa_namespace_lock);
5821 for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa))
5822 if (ztest_opts.zo_verbose > 3)
5823 (void) printf("spa_next: found %s\n", spa_name(spa));
5824 mutex_exit(&spa_namespace_lock);
5825
5826 /*
5827 * Verify that we can export the pool and reimport it under a
5828 * different name.
5829 */
5830 if (ztest_random(2) == 0) {
5831 char name[ZFS_MAX_DATASET_NAME_LEN];
5832 (void) snprintf(name, sizeof (name), "%s_import",
5833 ztest_opts.zo_pool);
5834 ztest_spa_import_export(ztest_opts.zo_pool, name);
5835 ztest_spa_import_export(name, ztest_opts.zo_pool);
5836 }
5837
5838 kernel_fini();
5839
5840 list_destroy(&zcl.zcl_callbacks);
5841
5842 (void) _mutex_destroy(&zcl.zcl_callbacks_lock);
5843
5844 (void) rwlock_destroy(&ztest_name_lock);
5845 (void) _mutex_destroy(&ztest_vdev_lock);
5846 }
5847
5848 static void
5849 ztest_freeze(void)
5850 {
5851 ztest_ds_t *zd = &ztest_ds[0];
5852 spa_t *spa;
5853 int numloops = 0;
5854
5855 if (ztest_opts.zo_verbose >= 3)
5856 (void) printf("testing spa_freeze()...\n");
5857
5858 kernel_init(FREAD | FWRITE);
5859 VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
5860 VERIFY3U(0, ==, ztest_dataset_open(0));
5861 spa->spa_debug = B_TRUE;
5862 ztest_spa = spa;
5863
5864 /*
5865 * Force the first log block to be transactionally allocated.
5969 nvlist_t *props;
5970
5971 VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
5972 if (ztest_random(2) == 0)
5973 return (props);
5974 VERIFY(nvlist_add_uint64(props, "autoreplace", 1) == 0);
5975
5976 return (props);
5977 }
5978
5979 /*
5980 * Create a storage pool with the given name and initial vdev size.
5981 * Then test spa_freeze() functionality.
5982 */
5983 static void
5984 ztest_init(ztest_shared_t *zs)
5985 {
5986 spa_t *spa;
5987 nvlist_t *nvroot, *props;
5988
5989 VERIFY(_mutex_init(&ztest_vdev_lock, USYNC_THREAD, NULL) == 0);
5990 VERIFY(rwlock_init(&ztest_name_lock, USYNC_THREAD, NULL) == 0);
5991
5992 kernel_init(FREAD | FWRITE);
5993
5994 /*
5995 * Create the storage pool.
5996 */
5997 (void) spa_destroy(ztest_opts.zo_pool);
5998 ztest_shared->zs_vdev_next_leaf = 0;
5999 zs->zs_splits = 0;
6000 zs->zs_mirrors = ztest_opts.zo_mirrors;
6001 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
6002 0, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
6003 props = make_random_props();
6004 for (int i = 0; i < SPA_FEATURES; i++) {
6005 char buf[1024];
6006 (void) snprintf(buf, sizeof (buf), "feature@%s",
6007 spa_feature_table[i].fi_uname);
6008 VERIFY3U(0, ==, nvlist_add_uint64(props, buf, 0));
6009 }
6010 VERIFY3U(0, ==, spa_create(ztest_opts.zo_pool, nvroot, props, NULL));
6011 nvlist_free(nvroot);
6012 nvlist_free(props);
6013
6014 VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
6015 zs->zs_metaslab_sz =
6016 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
6017
6018 spa_close(spa, FTAG);
6019
6020 kernel_fini();
6021
6022 ztest_run_zdb(ztest_opts.zo_pool);
6023
6024 ztest_freeze();
6025
6026 ztest_run_zdb(ztest_opts.zo_pool);
6027
6028 (void) rwlock_destroy(&ztest_name_lock);
6029 (void) _mutex_destroy(&ztest_vdev_lock);
6030 }
6031
6032 static void
6033 setup_data_fd(void)
6034 {
6035 static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX";
6036
6037 ztest_fd_data = mkstemp(ztest_name_data);
6038 ASSERT3S(ztest_fd_data, >=, 0);
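	/*
	 * Unlinking the name right away leaves an anonymous temp file:
	 * the open descriptor keeps it alive, and it disappears
	 * automatically when the fd is closed or the process exits.
	 */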
6039 (void) unlink(ztest_name_data);
6040 }
6041
6043 static int
6044 shared_data_size(ztest_shared_hdr_t *hdr)
6045 {
6046 int size;
6047
6048 size = hdr->zh_hdr_size;
6049 size += hdr->zh_opts_size;
304 typedef struct ztest_shared_callstate {
305 uint64_t zc_count; /* per-pass count */
306 uint64_t zc_time; /* per-pass time */
307 uint64_t zc_next; /* next time to call this function */
308 } ztest_shared_callstate_t;
375 { ztest_reguid, 1, &zopt_rarely },
376 { ztest_spa_rename, 1, &zopt_rarely },
377 { ztest_scrub, 1, &zopt_rarely },
378 { ztest_spa_upgrade, 1, &zopt_rarely },
379 { ztest_dsl_dataset_promote_busy, 1, &zopt_rarely },
380 { ztest_vdev_attach_detach, 1, &zopt_sometimes },
381 { ztest_vdev_LUN_growth, 1, &zopt_rarely },
382 { ztest_vdev_add_remove, 1,
383 &ztest_opts.zo_vdevtime },
384 { ztest_vdev_aux_add_remove, 1,
385 &ztest_opts.zo_vdevtime },
386 };
387
388 #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t))
389
390 /*
391 * The following struct is used to hold a list of uncalled commit callbacks.
392 * The callbacks are ordered by txg number.
393 */
394 typedef struct ztest_cb_list {
395 kmutex_t zcl_callbacks_lock;
396 list_t zcl_callbacks;
397 } ztest_cb_list_t;
398
399 /*
400 * Stuff we need to share writably between parent and child.
401 */
402 typedef struct ztest_shared {
403 boolean_t zs_do_init;
404 hrtime_t zs_proc_start;
405 hrtime_t zs_proc_stop;
406 hrtime_t zs_thread_start;
407 hrtime_t zs_thread_stop;
408 hrtime_t zs_thread_kill;
409 uint64_t zs_enospc_count;
410 uint64_t zs_vdev_next_leaf;
411 uint64_t zs_vdev_aux;
412 uint64_t zs_alloc;
413 uint64_t zs_space;
414 uint64_t zs_splits;
415 uint64_t zs_mirrors;
416 uint64_t zs_metaslab_sz;
417 uint64_t zs_metaslab_df_alloc_threshold;
418 uint64_t zs_guid;
419 } ztest_shared_t;
420
421 #define ID_PARALLEL -1ULL
422
423 static char ztest_dev_template[] = "%s/%s.%llua";
424 static char ztest_aux_template[] = "%s/%s.%s.%llu";
425 ztest_shared_t *ztest_shared;
426
427 static spa_t *ztest_spa = NULL;
428 static ztest_ds_t *ztest_ds;
429
430 static kmutex_t ztest_vdev_lock;
431
432 /*
433 * The ztest_name_lock protects the pool and dataset namespace used by
434 * the individual tests. To modify the namespace, consumers must grab
435 * this lock as writer. Grabbing the lock as reader will ensure that the
436 * namespace does not change while the lock is held.
437 */
438 static krwlock_t ztest_name_lock;
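
/*
 * Usage sketch (mirrors the tests below): writers wrap namespace
 * changes, readers wrap anything that depends on stable names.
 *
 *	rw_enter(&ztest_name_lock, RW_WRITER);
 *	... create, rename, or destroy datasets ...
 *	rw_exit(&ztest_name_lock);
 */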
439
440 static boolean_t ztest_dump_core = B_TRUE;
441 static boolean_t ztest_exiting;
442
443 /* Global commit callback list */
444 static ztest_cb_list_t zcl;
445
446 enum ztest_object {
447 ZTEST_META_DNODE = 0,
448 ZTEST_DIROBJ,
449 ZTEST_OBJECTS
450 };
451
452 static void usage(boolean_t) __NORETURN;
453
454 /*
455 * These libumem hooks provide a reasonable set of defaults for the allocator's
456 * debugging facilities.
457 */
458 const char *
1074 VERIFY(nvlist_add_uint64(props, zpool_prop_to_name(prop), value) == 0);
1075
1076 error = spa_prop_set(spa, props);
1077
1078 nvlist_free(props);
1079
1080 if (error == ENOSPC) {
1081 ztest_record_enospc(FTAG);
1082 return (error);
1083 }
1084 ASSERT0(error);
1085
1086 return (error);
1087 }
1088
1089 static void
1090 ztest_rll_init(rll_t *rll)
1091 {
1092 rll->rll_writer = NULL;
1093 rll->rll_readers = 0;
1094 mutex_init(&rll->rll_lock, NULL, USYNC_THREAD, NULL);
1095 cv_init(&rll->rll_cv, NULL, USYNC_THREAD, NULL);
1096 }
1097
1098 static void
1099 ztest_rll_destroy(rll_t *rll)
1100 {
1101 ASSERT(rll->rll_writer == NULL);
1102 ASSERT(rll->rll_readers == 0);
1103 mutex_destroy(&rll->rll_lock);
1104 cv_destroy(&rll->rll_cv);
1105 }
1106
1107 static void
1108 ztest_rll_lock(rll_t *rll, rl_type_t type)
1109 {
1110 mutex_enter(&rll->rll_lock);
1111
1112 if (type == RL_READER) {
1113 while (rll->rll_writer != NULL)
1114 cv_wait(&rll->rll_cv, &rll->rll_lock);
1115 rll->rll_readers++;
1116 } else {
1117 while (rll->rll_writer != NULL || rll->rll_readers)
1118 cv_wait(&rll->rll_cv, &rll->rll_lock);
1119 rll->rll_writer = curthread;
1120 }
1121
1122 mutex_exit(&rll->rll_lock);
1123 }
1124
1125 static void
1126 ztest_rll_unlock(rll_t *rll)
1127 {
1128 mutex_enter(&rll->rll_lock);
1129
1130 if (rll->rll_writer) {
1131 ASSERT(rll->rll_readers == 0);
1132 rll->rll_writer = NULL;
1133 } else {
1134 ASSERT(rll->rll_readers != 0);
1135 ASSERT(rll->rll_writer == NULL);
1136 rll->rll_readers--;
1137 }
1138
1139 if (rll->rll_writer == NULL && rll->rll_readers == 0)
1140 cv_broadcast(&rll->rll_cv);
1141
1142 mutex_exit(&rll->rll_lock);
1143 }
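
/*
 * Minimal usage sketch for the rll primitive above (hypothetical
 * helper, not part of any test): readers may overlap one another,
 * a writer excludes everyone, and each lock pairs with an unlock.
 */
static void
ztest_rll_usage_sketch(rll_t *rll)
{
	ztest_rll_lock(rll, RL_READER);		/* shared */
	ztest_rll_unlock(rll);

	ztest_rll_lock(rll, RL_WRITER);		/* exclusive */
	ztest_rll_unlock(rll);
}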
1144
1145 static void
1146 ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type)
1147 {
1148 rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
1149
1150 ztest_rll_lock(rll, type);
1151 }
1152
1153 static void
1154 ztest_object_unlock(ztest_ds_t *zd, uint64_t object)
1155 {
1156 rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];
1157
1158 ztest_rll_unlock(rll);
1159 }
1160
1161 static rl_t *
1162 ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
1181 ztest_range_unlock(rl_t *rl)
1182 {
1183 rll_t *rll = rl->rl_lock;
1184
1185 ztest_rll_unlock(rll);
1186
1187 umem_free(rl, sizeof (*rl));
1188 }
1189
1190 static void
1191 ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os)
1192 {
1193 zd->zd_os = os;
1194 zd->zd_zilog = dmu_objset_zil(os);
1195 zd->zd_shared = szd;
1196 dmu_objset_name(os, zd->zd_name);
1197
1198 if (zd->zd_shared != NULL)
1199 zd->zd_shared->zd_seq = 0;
1200
1201 rw_init(&zd->zd_zilog_lock, NULL, USYNC_THREAD, NULL);
1202 mutex_init(&zd->zd_dirobj_lock, NULL, USYNC_THREAD, NULL);
1203
1204 for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++)
1205 ztest_rll_init(&zd->zd_object_lock[l]);
1206
1207 for (int l = 0; l < ZTEST_RANGE_LOCKS; l++)
1208 ztest_rll_init(&zd->zd_range_lock[l]);
1209 }
1210
1211 static void
1212 ztest_zd_fini(ztest_ds_t *zd)
1213 {
1214 mutex_destroy(&zd->zd_dirobj_lock);
1215
1216 for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++)
1217 ztest_rll_destroy(&zd->zd_object_lock[l]);
1218
1219 for (int l = 0; l < ZTEST_RANGE_LOCKS; l++)
1220 ztest_rll_destroy(&zd->zd_range_lock[l]);
1221 }
1222
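/*
 * One call in ten assigns with TXG_NOWAIT so that the no-wait error
 * handling in ztest_tx_assign() below is exercised as well.
 */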
1223 #define TXG_MIGHTWAIT (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT)
1224
1225 static uint64_t
1226 ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag)
1227 {
1228 uint64_t txg;
1229 int error;
1230
1231 /*
1232 * Attempt to assign tx to some transaction group.
1233 */
1234 error = dmu_tx_assign(tx, txg_how);
1949 return (lr);
1950 }
1951
1952 void
1953 ztest_lr_free(void *lr, size_t lrsize, char *name)
1954 {
1955 size_t namesize = name ? strlen(name) + 1 : 0;
1956
1957 umem_free(lr, lrsize + namesize);
1958 }
1959
1960 /*
1961 * Lookup a bunch of objects. Returns the number of objects not found.
1962 */
1963 static int
1964 ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count)
1965 {
1966 int missing = 0;
1967 int error;
1968
1969 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock));
1970
1971 for (int i = 0; i < count; i++, od++) {
1972 od->od_object = 0;
1973 error = zap_lookup(zd->zd_os, od->od_dir, od->od_name,
1974 sizeof (uint64_t), 1, &od->od_object);
1975 if (error) {
1976 ASSERT(error == ENOENT);
1977 ASSERT(od->od_object == 0);
1978 missing++;
1979 } else {
1980 dmu_buf_t *db;
1981 ztest_block_tag_t *bbt;
1982 dmu_object_info_t doi;
1983
1984 ASSERT(od->od_object != 0);
1985 ASSERT(missing == 0); /* there should be no gaps */
1986
1987 ztest_object_lock(zd, od->od_object, RL_READER);
1988 VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os,
1989 od->od_object, FTAG, &db));
1990 dmu_object_info_from_db(db, &doi);
1991 bbt = ztest_bt_bonus(db);
1992 ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
1993 od->od_type = doi.doi_type;
1994 od->od_blocksize = doi.doi_data_block_size;
1995 od->od_gen = bbt->bt_gen;
1996 dmu_buf_rele(db, FTAG);
1997 ztest_object_unlock(zd, od->od_object);
1998 }
1999 }
2000
2001 return (missing);
2002 }
2003
2004 static int
2005 ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count)
2006 {
2007 int missing = 0;
2008
2009 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock));
2010
2011 for (int i = 0; i < count; i++, od++) {
2012 if (missing) {
2013 od->od_object = 0;
2014 missing++;
2015 continue;
2016 }
2017
2018 lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
2019
2020 lr->lr_doid = od->od_dir;
2021 lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */
2022 lr->lrz_type = od->od_crtype;
2023 lr->lrz_blocksize = od->od_crblocksize;
2024 lr->lrz_ibshift = ztest_random_ibshift();
2025 lr->lrz_bonustype = DMU_OT_UINT64_OTHER;
2026 lr->lrz_bonuslen = dmu_bonus_max();
2027 lr->lr_gen = od->od_crgen;
2028 lr->lr_crtime[0] = time(NULL);
2029
2034 } else {
2035 od->od_object = lr->lr_foid;
2036 od->od_type = od->od_crtype;
2037 od->od_blocksize = od->od_crblocksize;
2038 od->od_gen = od->od_crgen;
2039 ASSERT(od->od_object != 0);
2040 }
2041
2042 ztest_lr_free(lr, sizeof (*lr), od->od_name);
2043 }
2044
2045 return (missing);
2046 }
2047
2048 static int
2049 ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count)
2050 {
2051 int missing = 0;
2052 int error;
2053
2054 ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock));
2055
2056 od += count - 1;
2057
2058 for (int i = count - 1; i >= 0; i--, od--) {
2059 if (missing) {
2060 missing++;
2061 continue;
2062 }
2063
2064 /*
2065 * No object was found.
2066 */
2067 if (od->od_object == 0)
2068 continue;
2069
2070 lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);
2071
2072 lr->lr_doid = od->od_dir;
2073
2074 if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) {
2180 ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
2181 {
2182 int err;
2183 ztest_block_tag_t wbt;
2184 dmu_object_info_t doi;
2185 enum ztest_io_type io_type;
2186 uint64_t blocksize;
2187 void *data;
2188
2189 VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0);
2190 blocksize = doi.doi_data_block_size;
2191 data = umem_alloc(blocksize, UMEM_NOFAIL);
2192
2193 /*
2194 * Pick an i/o type at random, biased toward writing block tags.
2195 */
2196 io_type = ztest_random(ZTEST_IO_TYPES);
2197 if (ztest_random(2) == 0)
2198 io_type = ZTEST_IO_WRITE_TAG;
2199
2200 rw_enter(&zd->zd_zilog_lock, RW_READER);
2201
2202 switch (io_type) {
2203
2204 case ZTEST_IO_WRITE_TAG:
2205 ztest_bt_generate(&wbt, zd->zd_os, object, offset, 0, 0, 0);
2206 (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt);
2207 break;
2208
2209 case ZTEST_IO_WRITE_PATTERN:
2210 (void) memset(data, 'a' + (object + offset) % 5, blocksize);
2211 if (ztest_random(2) == 0) {
2212 /*
2213 * Induce fletcher2 collisions to ensure that
2214 * zio_ddt_collision() detects and resolves them
2215 * when using fletcher2-verify for deduplication.
2216 */
2217 ((uint64_t *)data)[0] ^= 1ULL << 63;
2218 ((uint64_t *)data)[4] ^= 1ULL << 63;
2219 }
2220 (void) ztest_write(zd, object, offset, blocksize, data);
2221 break;
2222
2223 case ZTEST_IO_WRITE_ZEROES:
2224 bzero(data, blocksize);
2225 (void) ztest_write(zd, object, offset, blocksize, data);
2226 break;
2227
2228 case ZTEST_IO_TRUNCATE:
2229 (void) ztest_truncate(zd, object, offset, blocksize);
2230 break;
2231
2232 case ZTEST_IO_SETATTR:
2233 (void) ztest_setattr(zd, object);
2234 break;
2235
2236 case ZTEST_IO_REWRITE:
2237 rw_enter(&ztest_name_lock, RW_READER);
2238 err = ztest_dsl_prop_set_uint64(zd->zd_name,
2239 ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa),
2240 B_FALSE);
2241 VERIFY(err == 0 || err == ENOSPC);
2242 err = ztest_dsl_prop_set_uint64(zd->zd_name,
2243 ZFS_PROP_COMPRESSION,
2244 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION),
2245 B_FALSE);
2246 VERIFY(err == 0 || err == ENOSPC);
2247 rw_exit(&ztest_name_lock);
2248
2249 VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data,
2250 DMU_READ_NO_PREFETCH));
2251
2252 (void) ztest_write(zd, object, offset, blocksize, data);
2253 break;
2254 }
2255
2256 rw_exit(&zd->zd_zilog_lock);
2257
2258 umem_free(data, blocksize);
2259 }
2260
2261 /*
2262 * Initialize an object description template.
2263 */
2264 static void
2265 ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index,
2266 dmu_object_type_t type, uint64_t blocksize, uint64_t gen)
2267 {
2268 od->od_dir = ZTEST_DIROBJ;
2269 od->od_object = 0;
2270
2271 od->od_crtype = type;
2272 od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize();
2273 od->od_crgen = gen;
2274
2275 od->od_type = DMU_OT_NONE;
2276 od->od_blocksize = 0;
2277 od->od_gen = 0;
2278
2279 (void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]",
2280 tag, (int64_t)id, index);
2281 }
2282
2283 /*
2284 * Lookup or create the objects for a test using the od template.
2285 * If the objects do not all exist, or if 'remove' is specified,
2286 * remove any existing objects and create new ones. Otherwise,
2287 * use the existing objects.
2288 */
2289 static int
2290 ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove)
2291 {
2292 int count = size / sizeof (*od);
2293 int rv = 0;
2294
2295 mutex_enter(&zd->zd_dirobj_lock);
2296 if ((ztest_lookup(zd, od, count) != 0 || remove) &&
2297 (ztest_remove(zd, od, count) != 0 ||
2298 ztest_create(zd, od, count) != 0))
2299 rv = -1;
2300 zd->zd_od = od;
2301 mutex_exit(&zd->zd_dirobj_lock);
2302
2303 return (rv);
2304 }
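
/*
 * Typical caller pattern (a sketch mirroring the tests below, not a
 * new test): fill in one template, then look up or (re)create the
 * backing object before doing any I/O against it.
 */
static void
ztest_od_usage_sketch(ztest_ds_t *zd, uint64_t id)
{
	ztest_od_t od[1];

	ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
	if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
		return;	/* lookup/create failed, e.g. ENOSPC */

	/* od[0].od_object now names a valid object in zd->zd_os. */
}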
2305
2306 /* ARGSUSED */
2307 void
2308 ztest_zil_commit(ztest_ds_t *zd, uint64_t id)
2309 {
2310 zilog_t *zilog = zd->zd_zilog;
2311
2312 rw_enter(&zd->zd_zilog_lock, RW_READER);
2313
2314 zil_commit(zilog, ztest_random(ZTEST_OBJECTS));
2315
2316 /*
2317 * Remember the committed values in zd, which is in parent/child
2318 * shared memory. If we die, the next iteration of ztest_run()
2319 * will verify that the log really does contain this record.
2320 */
2321 mutex_enter(&zilog->zl_lock);
2322 ASSERT(zd->zd_shared != NULL);
2323 ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq);
2324 zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq;
2325 mutex_exit(&zilog->zl_lock);
2326
2327 rw_exit(&zd->zd_zilog_lock);
2328 }
2329
2330 /*
2331 * This function is designed to simulate the operations that occur during a
2332 * mount/unmount operation. We hold the dataset across these operations in an
2333 * attempt to expose any implicit assumptions about ZIL management.
2334 */
2335 /* ARGSUSED */
2336 void
2337 ztest_zil_remount(ztest_ds_t *zd, uint64_t id)
2338 {
2339 objset_t *os = zd->zd_os;
2340
2341 /*
2342 * We grab the zd_dirobj_lock to ensure that no other thread is
2343 * updating the zil (i.e. adding in-memory log records) and the
2344 * zd_zilog_lock to block any I/O.
2345 */
2346 mutex_enter(&zd->zd_dirobj_lock);
2347 rw_enter(&zd->zd_zilog_lock, RW_WRITER);
2348
2349 /* zfsvfs_teardown() */
2350 zil_close(zd->zd_zilog);
2351
2352 /* zfsvfs_setup() */
2353 VERIFY(zil_open(os, ztest_get_data) == zd->zd_zilog);
2354 zil_replay(os, zd, ztest_replay_vector);
2355
2356 rw_exit(&zd->zd_zilog_lock);
2357 mutex_exit(&zd->zd_dirobj_lock);
2358 }
2359
2360 /*
2361 * Verify that we can't destroy an active pool, create an existing pool,
2362 * or create a pool with a bad vdev spec.
2363 */
2364 /* ARGSUSED */
2365 void
2366 ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id)
2367 {
2368 ztest_shared_opts_t *zo = &ztest_opts;
2369 spa_t *spa;
2370 nvlist_t *nvroot;
2371
2372 /*
2373 * Attempt to create using a bad file.
2374 */
2375 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1);
2376 VERIFY3U(ENOENT, ==,
2377 spa_create("ztest_bad_file", nvroot, NULL, NULL));
2378 nvlist_free(nvroot);
2379
2380 /*
2381 * Attempt to create using a bad mirror.
2382 */
2383 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 2, 1);
2384 VERIFY3U(ENOENT, ==,
2385 spa_create("ztest_bad_mirror", nvroot, NULL, NULL));
2386 nvlist_free(nvroot);
2387
2388 /*
2389 * Attempt to create an existing pool. It shouldn't matter
2390 * what's in the nvroot; we should fail with EEXIST.
2391 */
2392 rw_enter(&ztest_name_lock, RW_READER);
2393 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1);
2394 VERIFY3U(EEXIST, ==, spa_create(zo->zo_pool, nvroot, NULL, NULL));
2395 nvlist_free(nvroot);
2396 VERIFY3U(0, ==, spa_open(zo->zo_pool, &spa, FTAG));
2397 VERIFY3U(EBUSY, ==, spa_destroy(zo->zo_pool));
2398 spa_close(spa, FTAG);
2399
2400 rw_exit(&ztest_name_lock);
2401 }
2402
2403 /* ARGSUSED */
2404 void
2405 ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id)
2406 {
2407 spa_t *spa;
2408 uint64_t initial_version = SPA_VERSION_INITIAL;
2409 uint64_t version, newversion;
2410 nvlist_t *nvroot, *props;
2411 char *name;
2412
2413 mutex_enter(&ztest_vdev_lock);
2414 name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool);
2415
2416 /*
2417 * Clean up from previous runs.
2418 */
2419 (void) spa_destroy(name);
2420
2421 nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0,
2422 0, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1);
2423
2424 /*
2425 * If we're configuring a RAIDZ device then make sure that the
2426 * initial version is capable of supporting that feature.
2427 */
2428 switch (ztest_opts.zo_raidz_parity) {
2429 case 0:
2430 case 1:
2431 initial_version = SPA_VERSION_INITIAL;
2432 break;
2433 case 2:
2452 VERIFY0(spa_create(name, nvroot, props, NULL));
2453 fnvlist_free(nvroot);
2454 fnvlist_free(props);
2455
2456 VERIFY0(spa_open(name, &spa, FTAG));
2457 VERIFY3U(spa_version(spa), ==, version);
2458 newversion = ztest_random_spa_version(version + 1);
2459
2460 if (ztest_opts.zo_verbose >= 4) {
2461 (void) printf("upgrading spa version from %llu to %llu\n",
2462 (u_longlong_t)version, (u_longlong_t)newversion);
2463 }
2464
2465 spa_upgrade(spa, newversion);
2466 VERIFY3U(spa_version(spa), >, version);
2467 VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config,
2468 zpool_prop_to_name(ZPOOL_PROP_VERSION)));
2469 spa_close(spa, FTAG);
2470
2471 strfree(name);
2472 mutex_exit(&ztest_vdev_lock);
2473 }
2474
2475 static vdev_t *
2476 vdev_lookup_by_path(vdev_t *vd, const char *path)
2477 {
2478 vdev_t *mvd;
2479
2480 if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
2481 return (vd);
2482
2483 for (int c = 0; c < vd->vdev_children; c++)
2484 if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
2485 NULL)
2486 return (mvd);
2487
2488 return (NULL);
2489 }
2490
2491 /*
2492 * Find the first available hole which can be used as a top-level.
2505 if (cvd->vdev_ishole)
2506 break;
2507 }
2508 return (c);
2509 }
2510
2511 /*
2512 * Verify that vdev_add() works as expected.
2513 */
2514 /* ARGSUSED */
2515 void
2516 ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
2517 {
2518 ztest_shared_t *zs = ztest_shared;
2519 spa_t *spa = ztest_spa;
2520 uint64_t leaves;
2521 uint64_t guid;
2522 nvlist_t *nvroot;
2523 int error;
2524
2525 mutex_enter(&ztest_vdev_lock);
2526 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz;
2527
2528 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2529
2530 ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves;
2531
2532 /*
2533 * If we have slogs then remove them 1/4 of the time.
2534 */
2535 if (spa_has_slogs(spa) && ztest_random(4) == 0) {
2536 /*
2537 * Grab the guid from the head of the log class rotor.
2538 */
2539 guid = spa_log_class(spa)->mc_rotor->mg_vd->vdev_guid;
2540
2541 spa_config_exit(spa, SCL_VDEV, FTAG);
2542
2543 /*
2544 * We have to grab the ztest_name_lock as writer to
2545 * prevent a race between removing a slog (dmu_objset_find)
2546 * and destroying a dataset. Removing the slog will
2547 * grab a reference on the dataset which may cause
2548 * dmu_objset_destroy() to fail with EBUSY thus
2549 * leaving the dataset in an inconsistent state.
2550 */
2551 rw_enter(&ztest_name_lock, RW_WRITER);
2552 error = spa_vdev_remove(spa, guid, B_FALSE);
2553 rw_exit(&ztest_name_lock);
2554
2555 if (error && error != EEXIST)
2556 fatal(0, "spa_vdev_remove() = %d", error);
2557 } else {
2558 spa_config_exit(spa, SCL_VDEV, FTAG);
2559
2560 /*
2561 * Make 1/4 of the devices be log devices.
2562 */
2563 nvroot = make_vdev_root(NULL, NULL, NULL,
2564 ztest_opts.zo_vdev_size, 0,
2565 ztest_random(4) == 0, ztest_opts.zo_raidz,
2566 zs->zs_mirrors, 1);
2567
2568 error = spa_vdev_add(spa, nvroot);
2569 nvlist_free(nvroot);
2570
2571 if (error == ENOSPC)
2572 ztest_record_enospc("spa_vdev_add");
2573 else if (error != 0)
2574 fatal(0, "spa_vdev_add() = %d", error);
2575 }
2576
2577 mutex_exit(&ztest_vdev_lock);
2578 }
2579
2580 /*
2581 * Verify that adding/removing aux devices (l2arc, hot spare) works as expected.
2582 */
2583 /* ARGSUSED */
2584 void
2585 ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
2586 {
2587 ztest_shared_t *zs = ztest_shared;
2588 spa_t *spa = ztest_spa;
2589 vdev_t *rvd = spa->spa_root_vdev;
2590 spa_aux_vdev_t *sav;
2591 char *aux;
2592 uint64_t guid = 0;
2593 int error;
2594
2595 if (ztest_random(2) == 0) {
2596 sav = &spa->spa_spares;
2597 aux = ZPOOL_CONFIG_SPARES;
2598 } else {
2599 sav = &spa->spa_l2cache;
2600 aux = ZPOOL_CONFIG_L2CACHE;
2601 }
2602
2603 mutex_enter(&ztest_vdev_lock);
2604
2605 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2606
2607 if (sav->sav_count != 0 && ztest_random(4) == 0) {
2608 /*
2609 * Pick a random device to remove.
2610 */
2611 guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid;
2612 } else {
2613 /*
2614 * Find an unused device we can add.
2615 */
2616 zs->zs_vdev_aux = 0;
2617 for (;;) {
2618 char path[MAXPATHLEN];
2619 int c;
2620 (void) snprintf(path, sizeof (path), ztest_aux_template,
2621 ztest_opts.zo_dir, ztest_opts.zo_pool, aux,
2622 zs->zs_vdev_aux);
2623 for (c = 0; c < sav->sav_count; c++)
2640 nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL,
2641 (ztest_opts.zo_vdev_size * 5) / 4, 0, 0, 0, 0, 1);
2642 error = spa_vdev_add(spa, nvroot);
2643 if (error != 0)
2644 fatal(0, "spa_vdev_add(%p) = %d", nvroot, error);
2645 nvlist_free(nvroot);
2646 } else {
2647 /*
2648 * Remove an existing device. Sometimes, dirty its
2649 * vdev state first to make sure we handle removal
2650 * of devices that have pending state changes.
2651 */
2652 if (ztest_random(2) == 0)
2653 (void) vdev_online(spa, guid, 0, NULL);
2654
2655 error = spa_vdev_remove(spa, guid, B_FALSE);
2656 if (error != 0 && error != EBUSY)
2657 fatal(0, "spa_vdev_remove(%llu) = %d", guid, error);
2658 }
2659
2660 mutex_exit(&ztest_vdev_lock);
2661 }
2662
2663 /*
2664 * Split a pool if it has mirror top-level vdevs (tlvdevs).
2665 */
2666 /* ARGSUSED */
2667 void
2668 ztest_split_pool(ztest_ds_t *zd, uint64_t id)
2669 {
2670 ztest_shared_t *zs = ztest_shared;
2671 spa_t *spa = ztest_spa;
2672 vdev_t *rvd = spa->spa_root_vdev;
2673 nvlist_t *tree, **child, *config, *split, **schild;
2674 uint_t c, children, schildren = 0, lastlogid = 0;
2675 int error = 0;
2676
2677 mutex_enter(&ztest_vdev_lock);
2678
2679 /* ensure we have a usable config; mirrors of raidz aren't supported */
2680 if (zs->zs_mirrors < 3 || ztest_opts.zo_raidz > 1) {
2681 mutex_exit(&ztest_vdev_lock);
2682 return;
2683 }
2684
2685 /* clean up the old pool, if any */
2686 (void) spa_destroy("splitp");
2687
2688 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2689
2690 /* generate a config from the existing config */
2691 mutex_enter(&spa->spa_props_lock);
2692 VERIFY(nvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE,
2693 &tree) == 0);
2694 mutex_exit(&spa->spa_props_lock);
2695
2696 VERIFY(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child,
2697 &children) == 0);
2698
2699 schild = malloc(rvd->vdev_children * sizeof (nvlist_t *));
2700 for (c = 0; c < children; c++) {
2701 vdev_t *tvd = rvd->vdev_child[c];
2720 VERIFY(nvlist_dup(mchild[0], &schild[schildren++], 0) == 0);
2721 }
2722
2723 /* OK, create a config that can be used to split */
2724 VERIFY(nvlist_alloc(&split, NV_UNIQUE_NAME, 0) == 0);
2725 VERIFY(nvlist_add_string(split, ZPOOL_CONFIG_TYPE,
2726 VDEV_TYPE_ROOT) == 0);
2727 VERIFY(nvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, schild,
2728 lastlogid != 0 ? lastlogid : schildren) == 0);
2729
2730 VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0);
2731 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split) == 0);
2732
2733 for (c = 0; c < schildren; c++)
2734 nvlist_free(schild[c]);
2735 free(schild);
2736 nvlist_free(split);
2737
2738 spa_config_exit(spa, SCL_VDEV, FTAG);
2739
2740 rw_enter(&ztest_name_lock, RW_WRITER);
2741 error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE);
2742 rw_exit(&ztest_name_lock);
2743
2744 nvlist_free(config);
2745
2746 if (error == 0) {
2747 (void) printf("successful split - results:\n");
2748 mutex_enter(&spa_namespace_lock);
2749 show_pool_stats(spa);
2750 show_pool_stats(spa_lookup("splitp"));
2751 mutex_exit(&spa_namespace_lock);
2752 ++zs->zs_splits;
2753 --zs->zs_mirrors;
2754 }
2755 mutex_exit(&ztest_vdev_lock);
2757 }
2758
2759 /*
2760 * Verify that we can attach and detach devices.
2761 */
2762 /* ARGSUSED */
2763 void
2764 ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
2765 {
2766 ztest_shared_t *zs = ztest_shared;
2767 spa_t *spa = ztest_spa;
2768 spa_aux_vdev_t *sav = &spa->spa_spares;
2769 vdev_t *rvd = spa->spa_root_vdev;
2770 vdev_t *oldvd, *newvd, *pvd;
2771 nvlist_t *root;
2772 uint64_t leaves;
2773 uint64_t leaf, top;
2774 uint64_t ashift = ztest_get_ashift();
2775 uint64_t oldguid, pguid;
2776 uint64_t oldsize, newsize;
2777 char oldpath[MAXPATHLEN], newpath[MAXPATHLEN];
2778 int replacing;
2779 int oldvd_has_siblings = B_FALSE;
2780 int newvd_is_spare = B_FALSE;
2781 int oldvd_is_log;
2782 int error, expected_error;
2783
2784 mutex_enter(&ztest_vdev_lock);
2785 leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
2786
2787 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2788
2789 /*
2790 * Decide whether to do an attach or a replace.
2791 */
2792 replacing = ztest_random(2);
2793
2794 /*
2795 * Pick a random top-level vdev.
2796 */
2797 top = ztest_random_vdev_top(spa, B_TRUE);
2798
2799 /*
2800 * Pick a random leaf within it.
2801 */
2802 leaf = ztest_random(leaves);
2803
2804 /*
2825 ASSERT(oldvd->vdev_children >= 2);
2826 oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)];
2827 }
2828
2829 oldguid = oldvd->vdev_guid;
2830 oldsize = vdev_get_min_asize(oldvd);
2831 oldvd_is_log = oldvd->vdev_top->vdev_islog;
2832 (void) strcpy(oldpath, oldvd->vdev_path);
2833 pvd = oldvd->vdev_parent;
2834 pguid = pvd->vdev_guid;
2835
2836 /*
2837 * If oldvd has siblings, then half of the time, detach it.
2838 */
2839 if (oldvd_has_siblings && ztest_random(2) == 0) {
2840 spa_config_exit(spa, SCL_VDEV, FTAG);
2841 error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE);
2842 if (error != 0 && error != ENODEV && error != EBUSY &&
2843 error != ENOTSUP)
2844 fatal(0, "detach (%s) returned %d", oldpath, error);
2845 mutex_exit(&ztest_vdev_lock);
2846 return;
2847 }
2848
2849 /*
2850 * For the new vdev, choose with equal probability between the two
2851 * standard paths (ending in either 'a' or 'b') or a random hot spare.
2852 */
2853 if (sav->sav_count != 0 && ztest_random(3) == 0) {
2854 newvd = sav->sav_vdevs[ztest_random(sav->sav_count)];
2855 newvd_is_spare = B_TRUE;
2856 (void) strcpy(newpath, newvd->vdev_path);
2857 } else {
2858 (void) snprintf(newpath, sizeof (newpath), ztest_dev_template,
2859 ztest_opts.zo_dir, ztest_opts.zo_pool,
2860 top * leaves + leaf);
2861 if (ztest_random(2) == 0)
2862 newpath[strlen(newpath) - 1] = 'b';
2863 newvd = vdev_lookup_by_path(rvd, newpath);
2864 }
2865
2919 * fail with ENODEV, or fail with EOVERFLOW.
2920 */
2921 if (expected_error == ENOTSUP &&
2922 (error == 0 || error == ENODEV || error == EOVERFLOW))
2923 expected_error = error;
2924
2925 /*
2926 * If someone grew the LUN, the replacement may be too small.
2927 */
2928 if (error == EOVERFLOW || error == EBUSY)
2929 expected_error = error;
2930
2931 /* XXX workaround 6690467 */
2932 if (error != expected_error && expected_error != EBUSY) {
2933 fatal(0, "attach (%s %llu, %s %llu, %d) "
2934 "returned %d, expected %d",
2935 oldpath, oldsize, newpath,
2936 newsize, replacing, error, expected_error);
2937 }
2938
2939 mutex_exit(&ztest_vdev_lock);
2940 }
2941
2942 /*
2943 * Callback function which expands the physical size of the vdev.
2944 */
2945 vdev_t *
2946 grow_vdev(vdev_t *vd, void *arg)
2947 {
2948 spa_t *spa = vd->vdev_spa;
2949 size_t *newsize = arg;
2950 size_t fsize;
2951 int fd;
2952
2953 ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
2954 ASSERT(vd->vdev_ops->vdev_op_leaf);
2955
2956 if ((fd = open(vd->vdev_path, O_RDWR)) == -1)
2957 return (vd);
2958
2959 fsize = lseek(fd, 0, SEEK_END);
3047 return (cvd);
3048 }
3049 return (NULL);
3050 }
3051
3052 /*
3053 * Verify that dynamic LUN growth works as expected.
3054 */
3055 /* ARGSUSED */
3056 void
3057 ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
3058 {
3059 spa_t *spa = ztest_spa;
3060 vdev_t *vd, *tvd;
3061 metaslab_class_t *mc;
3062 metaslab_group_t *mg;
3063 size_t psize, newsize;
3064 uint64_t top;
3065 uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count;
3066
3067 mutex_enter(&ztest_vdev_lock);
3068 spa_config_enter(spa, SCL_STATE, spa, RW_READER);
3069
3070 top = ztest_random_vdev_top(spa, B_TRUE);
3071
3072 tvd = spa->spa_root_vdev->vdev_child[top];
3073 mg = tvd->vdev_mg;
3074 mc = mg->mg_class;
3075 old_ms_count = tvd->vdev_ms_count;
3076 old_class_space = metaslab_class_get_space(mc);
3077
3078 /*
3079 * Determine the size of the first leaf vdev associated with
3080 * our top-level device.
3081 */
3082 vd = vdev_walk_tree(tvd, NULL, NULL);
3083 ASSERT3P(vd, !=, NULL);
3084 ASSERT(vd->vdev_ops->vdev_op_leaf);
3085
3086 psize = vd->vdev_psize;
3087
3088 /*
3089 * We only try to expand the vdev if it's healthy, less than 4x its
3090 * original size, and it has a valid psize.
3091 */
3092 if (tvd->vdev_state != VDEV_STATE_HEALTHY ||
3093 psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) {
3094 spa_config_exit(spa, SCL_STATE, spa);
3095 mutex_exit(&ztest_vdev_lock);
3096 return;
3097 }
3098 ASSERT(psize > 0);
3099 newsize = psize + psize / 8;
3100 ASSERT3U(newsize, >, psize);
3101
3102 if (ztest_opts.zo_verbose >= 6) {
3103 (void) printf("Expanding LUN %s from %lu to %lu\n",
3104 vd->vdev_path, (ulong_t)psize, (ulong_t)newsize);
3105 }
3106
3107 /*
3108 * Growing the vdev is a two step process:
3109 * 1). expand the physical size (i.e. relabel)
3110 * 2). online the vdev to create the new metaslabs
3111 */
3112 if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL ||
3113 vdev_walk_tree(tvd, online_vdev, NULL) != NULL ||
3114 tvd->vdev_state != VDEV_STATE_HEALTHY) {
3115 if (ztest_opts.zo_verbose >= 5) {
3116 (void) printf("Could not expand LUN because "
3117 "the vdev configuration changed.\n");
3118 }
3119 spa_config_exit(spa, SCL_STATE, spa);
3120 mutex_exit(&ztest_vdev_lock);
3121 return;
3122 }
3123
3124 spa_config_exit(spa, SCL_STATE, spa);
3125
3126 /*
3127 * Expanding the LUN will update the config asynchronously,
3128 * thus we must wait for the async thread to complete any
3129 * pending tasks before proceeding.
3130 */
3131 for (;;) {
3132 boolean_t done;
3133 mutex_enter(&spa->spa_async_lock);
3134 done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks);
3135 mutex_exit(&spa->spa_async_lock);
3136 if (done)
3137 break;
3138 txg_wait_synced(spa_get_dsl(spa), 0);
3139 (void) poll(NULL, 0, 100);
3140 }
3141
3142 spa_config_enter(spa, SCL_STATE, spa, RW_READER);
3143
3144 tvd = spa->spa_root_vdev->vdev_child[top];
3145 new_ms_count = tvd->vdev_ms_count;
3146 new_class_space = metaslab_class_get_space(mc);
3147
3148 if (tvd->vdev_mg != mg || mg->mg_class != mc) {
3149 if (ztest_opts.zo_verbose >= 5) {
3150 (void) printf("Could not verify LUN expansion due to "
3151 "intervening vdev offline or remove.\n");
3152 }
3153 spa_config_exit(spa, SCL_STATE, spa);
3154 mutex_exit(&ztest_vdev_lock);
3155 return;
3156 }
3157
3158 /*
3159 * Make sure we were able to grow the vdev.
3160 */
3161 if (new_ms_count <= old_ms_count)
3162 fatal(0, "LUN expansion failed: ms_count %llu <= %llu\n",
3163 old_ms_count, new_ms_count);
3164
3165 /*
3166 * Make sure we were able to grow the pool.
3167 */
3168 if (new_class_space <= old_class_space)
3169 fatal(0, "LUN expansion failed: class_space %llu <= %llu\n",
3170 old_class_space, new_class_space);
3171
3172 if (ztest_opts.zo_verbose >= 5) {
3173 char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ];
3174
3175 nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf));
3176 nicenum(new_class_space, newnumbuf, sizeof (newnumbuf));
3177 (void) printf("%s grew from %s to %s\n",
3178 spa->spa_name, oldnumbuf, newnumbuf);
3179 }
3180
3181 spa_config_exit(spa, SCL_STATE, spa);
3182 mutex_exit(&ztest_vdev_lock);
3183 }
3184
3185 /*
3186 * Verify that dmu_objset_{create,destroy,open,close} work as expected.
3187 */
3188 /* ARGSUSED */
3189 static void
3190 ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
3191 {
3192 /*
3193 * Create the objects common to all ztest datasets.
3194 */
3195 VERIFY(zap_create_claim(os, ZTEST_DIROBJ,
3196 DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
3197 }
3198
3199 static int
3200 ztest_dataset_create(char *dsname)
3201 {
3202 uint64_t zilset = ztest_random(100);
3276 (void) snprintf(snapname, sizeof (snapname), "%s@%llu", osname,
3277 (u_longlong_t)id);
3278
3279 error = dsl_destroy_snapshot(snapname, B_FALSE);
3280 if (error != 0 && error != ENOENT)
3281 fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error);
3282 return (B_TRUE);
3283 }
3284
3285 /* ARGSUSED */
3286 void
3287 ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id)
3288 {
3289 ztest_ds_t zdtmp;
3290 int iters;
3291 int error;
3292 objset_t *os, *os2;
3293 char name[ZFS_MAX_DATASET_NAME_LEN];
3294 zilog_t *zilog;
3295
3296 rw_enter(&ztest_name_lock, RW_READER);
3297
3298 (void) snprintf(name, sizeof (name), "%s/temp_%llu",
3299 ztest_opts.zo_pool, (u_longlong_t)id);
3300
3301 /*
3302 * If this dataset exists from a previous run, process its replay log
3303 * half of the time. If we don't replay it, then dmu_objset_destroy()
3304 * (invoked from ztest_objset_destroy_cb()) should just throw it away.
3305 */
3306 if (ztest_random(2) == 0 &&
3307 dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os) == 0) {
3308 ztest_zd_init(&zdtmp, NULL, os);
3309 zil_replay(os, &zdtmp, ztest_replay_vector);
3310 ztest_zd_fini(&zdtmp);
3311 dmu_objset_disown(os, FTAG);
3312 }
3313
3314 /*
3315 * There may be an old instance of the dataset we're about to
3316 * create lying around from a previous run. If so, destroy it
3317 * and all of its snapshots.
3318 */
3319 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
3320 DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
3321
3322 /*
3323 * Verify that the destroyed dataset is no longer in the namespace.
3324 */
3325 VERIFY3U(ENOENT, ==, dmu_objset_own(name, DMU_OST_OTHER, B_TRUE,
3326 FTAG, &os));
3327
3328 /*
3329 * Verify that we can create a new dataset.
3330 */
3331 error = ztest_dataset_create(name);
3332 if (error) {
3333 if (error == ENOSPC) {
3334 ztest_record_enospc(FTAG);
3335 rw_exit(&ztest_name_lock);
3336 return;
3337 }
3338 fatal(0, "dmu_objset_create(%s) = %d", name, error);
3339 }
3340
3341 VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os));
3342
3343 ztest_zd_init(&zdtmp, NULL, os);
3344
3345 /*
3346 * Open the intent log for it.
3347 */
3348 zilog = zil_open(os, ztest_get_data);
3349
3350 /*
3351 * Put some objects in there, do a little I/O to them,
3352 * and randomly take a couple of snapshots along the way.
3353 */
3354 iters = ztest_random(5);
3355 for (int i = 0; i < iters; i++) {
3363 */
3364 VERIFY3U(EEXIST, ==,
3365 dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL));
3366
3367 /*
3368 * Verify that we can hold an objset that is also owned.
3369 */
3370 VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os2));
3371 dmu_objset_rele(os2, FTAG);
3372
3373 /*
3374 * Verify that we cannot own an objset that is already owned.
3375 */
3376 VERIFY3U(EBUSY, ==,
3377 dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os2));
3378
3379 zil_close(zilog);
3380 dmu_objset_disown(os, FTAG);
3381 ztest_zd_fini(&zdtmp);
3382
3383 rw_exit(&ztest_name_lock);
3384 }
3385
3386 /*
3387 * Verify that dmu_snapshot_{create,destroy,open,close} work as expected.
3388 */
3389 void
3390 ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id)
3391 {
3392 rw_enter(&ztest_name_lock, RW_READER);
3393 (void) ztest_snapshot_destroy(zd->zd_name, id);
3394 (void) ztest_snapshot_create(zd->zd_name, id);
3395 rw_exit(&ztest_name_lock);
3396 }
3397
3398 /*
3399 * Cleanup non-standard snapshots and clones.
3400 */
3401 void
3402 ztest_dsl_dataset_cleanup(char *osname, uint64_t id)
3403 {
3404 char snap1name[ZFS_MAX_DATASET_NAME_LEN];
3405 char clone1name[ZFS_MAX_DATASET_NAME_LEN];
3406 char snap2name[ZFS_MAX_DATASET_NAME_LEN];
3407 char clone2name[ZFS_MAX_DATASET_NAME_LEN];
3408 char snap3name[ZFS_MAX_DATASET_NAME_LEN];
3409 int error;
3410
3411 (void) snprintf(snap1name, sizeof (snap1name),
3412 "%s@s1_%llu", osname, id);
3413 (void) snprintf(clone1name, sizeof (clone1name),
3414 "%s/c1_%llu", osname, id);
3415 (void) snprintf(snap2name, sizeof (snap2name),
3434 error = dsl_destroy_snapshot(snap1name, B_FALSE);
3435 if (error && error != ENOENT)
3436 fatal(0, "dsl_destroy_snapshot(%s) = %d", snap1name, error);
3437 }
3438
3439 /*
3440 * Verify dsl_dataset_promote handles EBUSY
3441 */
3442 void
3443 ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id)
3444 {
3445 objset_t *os;
3446 char snap1name[ZFS_MAX_DATASET_NAME_LEN];
3447 char clone1name[ZFS_MAX_DATASET_NAME_LEN];
3448 char snap2name[ZFS_MAX_DATASET_NAME_LEN];
3449 char clone2name[ZFS_MAX_DATASET_NAME_LEN];
3450 char snap3name[ZFS_MAX_DATASET_NAME_LEN];
3451 char *osname = zd->zd_name;
3452 int error;
3453
3454 rw_enter(&ztest_name_lock, RW_READER);
3455
3456 ztest_dsl_dataset_cleanup(osname, id);
3457
3458 (void) snprintf(snap1name, sizeof (snap1name),
3459 "%s@s1_%llu", osname, id);
3460 (void) snprintf(clone1name, sizeof (clone1name),
3461 "%s/c1_%llu", osname, id);
3462 (void) snprintf(snap2name, sizeof (snap2name),
3463 "%s@s2_%llu", clone1name, id);
3464 (void) snprintf(clone2name, sizeof (clone2name),
3465 "%s/c2_%llu", osname, id);
3466 (void) snprintf(snap3name, sizeof (snap3name),
3467 "%s@s3_%llu", clone1name, id);
3468
3469 error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1);
3470 if (error && error != EEXIST) {
3471 if (error == ENOSPC) {
3472 ztest_record_enospc(FTAG);
3473 goto out;
3474 }
3511 fatal(0, "dmu_objset_create(%s) = %d", clone2name, error);
3512 }
3513
3514 error = dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, FTAG, &os);
3515 if (error)
3516 fatal(0, "dmu_objset_own(%s) = %d", snap2name, error);
3517 error = dsl_dataset_promote(clone2name, NULL);
3518 if (error == ENOSPC) {
3519 dmu_objset_disown(os, FTAG);
3520 ztest_record_enospc(FTAG);
3521 goto out;
3522 }
3523 if (error != EBUSY)
3524 fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name,
3525 error);
3526 dmu_objset_disown(os, FTAG);
3527
3528 out:
3529 ztest_dsl_dataset_cleanup(osname, id);
3530
3531 rw_exit(&ztest_name_lock);
3532 }
3533
3534 /*
3535 * Verify that dmu_object_{alloc,free} work as expected.
3536 */
3537 void
3538 ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id)
3539 {
3540 ztest_od_t od[4];
3541 int batchsize = sizeof (od) / sizeof (od[0]);
3542
3543 for (int b = 0; b < batchsize; b++)
3544 ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0);
3545
3546 /*
3547 * Destroy the previous batch of objects, create a new batch,
3548 * and do some I/O on the new objects.
3549 */
3550 if (ztest_object_init(zd, od, sizeof (od), B_TRUE) != 0)
3551 return;
4445
4446 if (error == ECANCELED) {
4447 ASSERT0(data->zcd_txg);
4448 ASSERT(!data->zcd_added);
4449
4450 /*
4451 * The private callback data should be destroyed here, but
4452 * since we are going to check the zcd_called field after
4453 * dmu_tx_abort(), we will destroy it there.
4454 */
4455 return;
4456 }
4457
4458 /* Was this callback added to the global callback list? */
4459 if (!data->zcd_added)
4460 goto out;
4461
4462 ASSERT3U(data->zcd_txg, !=, 0);
4463
4464 /* Remove our callback from the list */
4465 mutex_enter(&zcl.zcl_callbacks_lock);
4466 list_remove(&zcl.zcl_callbacks, data);
4467 mutex_exit(&zcl.zcl_callbacks_lock);
4468
4469 out:
4470 umem_free(data, sizeof (ztest_cb_data_t));
4471 }
4472
4473 /* Allocate and initialize callback data structure */
4474 static ztest_cb_data_t *
4475 ztest_create_cb_data(objset_t *os, uint64_t txg)
4476 {
4477 ztest_cb_data_t *cb_data;
4478
4479 cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL);
4480
4481 cb_data->zcd_txg = txg;
4482 cb_data->zcd_spa = dmu_objset_spa(os);
4483
4484 return (cb_data);
4485 }
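/*
 * Minimal usage sketch for a single commit callback, assuming the
 * caller has already created the tx and assigned it to "txg": allocate
 * the per-callback data and register ztest_commit_callback() on the tx.
 * The callback fires after the txg syncs (or with ECANCELED on abort).
 */
static void
ztest_register_cb_sketch(objset_t *os, dmu_tx_t *tx, uint64_t txg)
{
	ztest_cb_data_t *cbd = ztest_create_cb_data(os, txg);

	dmu_tx_callback_register(tx, ztest_commit_callback, cbd);
}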
4486
4487 /*
4549 }
4550
4551 return;
4552 }
4553
4554 cb_data[2] = ztest_create_cb_data(os, txg);
4555 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]);
4556
4557 /*
4558 * Read existing data to make sure there isn't a future leak.
4559 */
4560 VERIFY0(dmu_read(os, od[0].od_object, 0, sizeof (uint64_t),
4561 &old_txg, DMU_READ_PREFETCH));
4562
4563 if (old_txg > txg)
4564 fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64,
4565 old_txg, txg);
4566
4567 dmu_write(os, od[0].od_object, 0, sizeof (uint64_t), &txg, tx);
4568
4569 mutex_enter(&zcl.zcl_callbacks_lock);
4570
4571 /*
4572 * Since commit callbacks don't have any ordering requirement and since
4573 * it is theoretically possible for a commit callback to be called
4574 * after an arbitrary amount of time has elapsed since its txg has been
4575 * synced, it is difficult to reliably determine whether a commit
4576 * callback hasn't been called due to high load or due to a flawed
4577 * implementation.
4578 *
4579 * In practice, we will assume that if after a certain number of txgs a
4580 * commit callback hasn't been called, then most likely there's an
4581 * implementation bug.
4582 */
4583 tmp_cb = list_head(&zcl.zcl_callbacks);
4584 if (tmp_cb != NULL &&
4585 (txg - ZTEST_COMMIT_CALLBACK_THRESH) > tmp_cb->zcd_txg) {
4586 fatal(0, "Commit callback threshold exceeded, oldest txg: %"
4587 PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg);
4588 }
4589
4590 /*
4591 * Let's find the place to insert our callbacks.
4592 *
4593 * Even though the list is ordered by txg, it is possible for the
4594 * insertion point to not be the end because our txg may already be
4595 * quiescing at this point and other callbacks in the open txg
4596 * (from other objsets) may have sneaked in.
4597 */
4598 tmp_cb = list_tail(&zcl.zcl_callbacks);
4599 while (tmp_cb != NULL && tmp_cb->zcd_txg > txg)
4600 tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb);
4601
4602 /* Add the 3 callbacks to the list */
4603 for (i = 0; i < 3; i++) {
4604 if (tmp_cb == NULL)
4605 list_insert_head(&zcl.zcl_callbacks, cb_data[i]);
4606 else
4607 list_insert_after(&zcl.zcl_callbacks, tmp_cb,
4608 cb_data[i]);
4609
4610 cb_data[i]->zcd_added = B_TRUE;
4611 VERIFY(!cb_data[i]->zcd_called);
4612
4613 tmp_cb = cb_data[i];
4614 }
4615
4616 mutex_exit(&zcl.zcl_callbacks_lock);
4617
4618 dmu_tx_commit(tx);
4619 }
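/*
 * The txg-ordered insert above, distilled into a standalone sketch:
 * walk backward from the tail until an entry with zcd_txg <= txg is
 * found, then insert after it (or at the head if none). Assumes the
 * caller holds zcl.zcl_callbacks_lock.
 */
static void
ztest_cb_list_insert_sketch(list_t *cbs, ztest_cb_data_t *cbd, uint64_t txg)
{
	ztest_cb_data_t *prev = list_tail(cbs);

	while (prev != NULL && prev->zcd_txg > txg)
		prev = list_prev(cbs, prev);

	if (prev == NULL)
		list_insert_head(cbs, cbd);
	else
		list_insert_after(cbs, prev, cbd);

	cbd->zcd_added = B_TRUE;
}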
4620
4621 /* ARGSUSED */
4622 void
4623 ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id)
4624 {
4625 zfs_prop_t proplist[] = {
4626 ZFS_PROP_CHECKSUM,
4627 ZFS_PROP_COMPRESSION,
4628 ZFS_PROP_COPIES,
4629 ZFS_PROP_DEDUP
4630 };
4631
4632 rw_enter(&ztest_name_lock, RW_READER);
4633
4634 for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++)
4635 (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p],
4636 ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2));
4637
4638 rw_exit(&ztest_name_lock);
4639 }
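/*
 * One-property sketch of the loop above: set a random legal value for
 * a single dsl property, where the final ztest_random(2) argument is
 * the inherit flag taken by ztest_dsl_prop_set_uint64() in this
 * vintage of the code. Assumes the caller holds ztest_name_lock as
 * reader, as ztest_dsl_prop_get_set() does.
 */
static void
ztest_set_one_dsl_prop_sketch(ztest_ds_t *zd, zfs_prop_t prop)
{
	(void) ztest_dsl_prop_set_uint64(zd->zd_name, prop,
	    ztest_random_dsl_prop(prop), (int)ztest_random(2));
}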
4640
4641 /* ARGSUSED */
4642 void
4643 ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id)
4644 {
4645 nvlist_t *props = NULL;
4646
4647 rw_enter(&ztest_name_lock, RW_READER);
4648
4649 (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_DEDUPDITTO,
4650 ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN));
4651
4652 VERIFY0(spa_prop_get(ztest_spa, &props));
4653
4654 if (ztest_opts.zo_verbose >= 6)
4655 dump_nvlist(props, 4);
4656
4657 nvlist_free(props);
4658
4659 rw_exit(&ztest_name_lock);
4660 }
4661
4662 static int
4663 user_release_one(const char *snapname, const char *holdname)
4664 {
4665 nvlist_t *snaps, *holds;
4666 int error;
4667
4668 snaps = fnvlist_alloc();
4669 holds = fnvlist_alloc();
4670 fnvlist_add_boolean(holds, holdname);
4671 fnvlist_add_nvlist(snaps, snapname, holds);
4672 fnvlist_free(holds);
4673 error = dsl_dataset_user_release(snaps, NULL);
4674 fnvlist_free(snaps);
4675 return (error);
4676 }
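/*
 * Sketch of the matching hold helper, assuming dsl_dataset_user_hold()
 * accepts an nvlist mapping snapshot names to hold tags (the mirror
 * image of dsl_dataset_user_release() above); illustrative only, not
 * part of the original file.
 */
static int
user_hold_one(const char *snapname, const char *holdname)
{
	nvlist_t *holds;
	int error;

	holds = fnvlist_alloc();
	fnvlist_add_string(holds, snapname, holdname);
	error = dsl_dataset_user_hold(holds, 0, NULL);
	fnvlist_free(holds);
	return (error);
}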
4677
4678 /*
4679 * Test snapshot hold/release and deferred destroy.
4680 */
4681 void
4682 ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id)
4683 {
4684 int error;
4685 objset_t *os = zd->zd_os;
4686 objset_t *origin;
4687 char snapname[100];
4688 char fullname[100];
4689 char clonename[100];
4690 char tag[100];
4691 char osname[ZFS_MAX_DATASET_NAME_LEN];
4692 nvlist_t *holds;
4693
4694 rw_enter(&ztest_name_lock, RW_READER);
4695
4696 dmu_objset_name(os, osname);
4697
4698 (void) snprintf(snapname, sizeof (snapname), "sh1_%llu", id);
4699 (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname);
4700 (void) snprintf(clonename, sizeof (clonename),
4701 "%s/ch1_%llu", osname, id);
4702 (void) snprintf(tag, sizeof (tag), "tag_%llu", id);
4703
4704 /*
4705 * Clean up from any previous run.
4706 */
4707 error = dsl_destroy_head(clonename);
4708 if (error != ENOENT)
4709 ASSERT0(error);
4710 error = user_release_one(fullname, tag);
4711 if (error != ESRCH && error != ENOENT)
4712 ASSERT0(error);
4713 error = dsl_destroy_snapshot(fullname, B_FALSE);
4714 if (error != ENOENT)
4779
4780 error = dsl_destroy_snapshot(fullname, B_FALSE);
4781 if (error != EBUSY) {
4782 fatal(0, "dsl_destroy_snapshot(%s, B_FALSE) = %d",
4783 fullname, error);
4784 }
4785
4786 error = dsl_destroy_snapshot(fullname, B_TRUE);
4787 if (error) {
4788 fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d",
4789 fullname, error);
4790 }
4791
4792 error = user_release_one(fullname, tag);
4793 if (error)
4794 fatal(0, "user_release_one(%s, %s) = %d", fullname, tag, error);
4795
4796 VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT);
4797
4798 out:
4799 rw_exit(&ztest_name_lock);
4800 }
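/*
 * Condensed restatement of the deferred-destroy contract exercised at
 * the end of the test above, assuming "snapname" is currently held
 * under "tag": synchronous destroy fails with EBUSY, deferred destroy
 * succeeds, and the snapshot vanishes once the hold is released.
 */
static void
ztest_deferred_destroy_sketch(const char *snapname, const char *tag)
{
	objset_t *os;

	VERIFY3U(EBUSY, ==, dsl_destroy_snapshot(snapname, B_FALSE));
	VERIFY0(dsl_destroy_snapshot(snapname, B_TRUE));
	VERIFY0(user_release_one(snapname, tag));
	VERIFY3U(ENOENT, ==, dmu_objset_hold(snapname, FTAG, &os));
}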
4801
4802 /*
4803 * Inject random faults into the on-disk data.
4804 */
4805 /* ARGSUSED */
4806 void
4807 ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
4808 {
4809 ztest_shared_t *zs = ztest_shared;
4810 spa_t *spa = ztest_spa;
4811 int fd;
4812 uint64_t offset;
4813 uint64_t leaves;
4814 uint64_t bad = 0x1990c0ffeedecade;
4815 uint64_t top, leaf;
4816 char path0[MAXPATHLEN];
4817 char pathrand[MAXPATHLEN];
4818 size_t fsize;
4819 int bshift = SPA_MAXBLOCKSHIFT + 2;
4820 int iters = 1000;
4821 int maxfaults;
4822 int mirror_save;
4823 vdev_t *vd0 = NULL;
4824 uint64_t guid0 = 0;
4825 boolean_t islog = B_FALSE;
4826
4827 mutex_enter(&ztest_vdev_lock);
4828 maxfaults = MAXFAULTS();
4829 leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
4830 mirror_save = zs->zs_mirrors;
4831 mutex_exit(&ztest_vdev_lock);
4832
4833 ASSERT(leaves >= 1);
4834
4835 /*
4836 * Grab the name lock as reader. There are some operations
4837 * which don't like to have their vdevs changed while
4838 * they are in progress (e.g. spa_change_guid). Those
4839 * operations will have grabbed the name lock as writer.
4840 */
4841 rw_enter(&ztest_name_lock, RW_READER);
4842
4843 /*
4844 * We need SCL_STATE here because we're going to look at vd0->vdev_tsd.
4845 */
4846 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4847
4848 if (ztest_random(2) == 0) {
4849 /*
4850 * Inject errors on a normal data device or slog device.
4851 */
4852 top = ztest_random_vdev_top(spa, B_TRUE);
4853 leaf = ztest_random(leaves) + zs->zs_splits;
4854
4855 /*
4856 * Generate paths to the first leaf in this top-level vdev,
4857 * and to the random leaf we selected. We'll induce transient
4858 * write failures and random online/offline activity on leaf 0,
4859 * and we'll write random garbage to the randomly chosen leaf.
4860 */
4861 (void) snprintf(path0, sizeof (path0), ztest_dev_template,
4890 vdev_file_t *vf = vd0->vdev_tsd;
4891
4892 if (vf != NULL && ztest_random(3) == 0) {
4893 (void) close(vf->vf_vnode->v_fd);
4894 vf->vf_vnode->v_fd = -1;
4895 } else if (ztest_random(2) == 0) {
4896 vd0->vdev_cant_read = B_TRUE;
4897 } else {
4898 vd0->vdev_cant_write = B_TRUE;
4899 }
4900 guid0 = vd0->vdev_guid;
4901 }
4902 } else {
4903 /*
4904 * Inject errors on an l2cache device.
4905 */
4906 spa_aux_vdev_t *sav = &spa->spa_l2cache;
4907
4908 if (sav->sav_count == 0) {
4909 spa_config_exit(spa, SCL_STATE, FTAG);
4910 rw_exit(&ztest_name_lock);
4911 return;
4912 }
4913 vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)];
4914 guid0 = vd0->vdev_guid;
4915 (void) strcpy(path0, vd0->vdev_path);
4916 (void) strcpy(pathrand, vd0->vdev_path);
4917
4918 leaf = 0;
4919 leaves = 1;
4920 maxfaults = INT_MAX; /* no limit on cache devices */
4921 }
4922
4923 spa_config_exit(spa, SCL_STATE, FTAG);
4924 rw_exit(&ztest_name_lock);
4925
4926 /*
4927 * If we can tolerate two or more faults, or we're dealing
4928 * with a slog, randomly online/offline vd0.
4929 */
4930 if ((maxfaults >= 2 || islog) && guid0 != 0) {
4931 if (ztest_random(10) < 6) {
4932 int flags = (ztest_random(2) == 0 ?
4933 ZFS_OFFLINE_TEMPORARY : 0);
4934
4935 /*
4936 * We have to grab the ztest_name_lock as writer to
4937 * prevent a race between offlining a slog and
4938 * destroying a dataset. Offlining the slog will
4939 * grab a reference on the dataset which may cause
4940 * dmu_objset_destroy() to fail with EBUSY thus
4941 * leaving the dataset in an inconsistent state.
4942 */
4943 if (islog)
4944 rw_enter(&ztest_name_lock, RW_WRITER);
4945
4946 VERIFY(vdev_offline(spa, guid0, flags) != EBUSY);
4947
4948 if (islog)
4949 rw_exit(&ztest_name_lock);
4950 } else {
4951 /*
4952 * Ideally we would like to be able to randomly
4953 * call vdev_[on|off]line without holding locks
4954 * to force unpredictable failures but the side
4955 * effects of vdev_[on|off]line prevent us from
4956 * doing so. We grab the ztest_vdev_lock here to
4957 * prevent a race between injection testing and
4958 * aux_vdev removal.
4959 */
4960 mutex_enter(&ztest_vdev_lock);
4961 (void) vdev_online(spa, guid0, 0, NULL);
4962 mutex_exit(&ztest_vdev_lock);
4963 }
4964 }
4965
4966 if (maxfaults == 0)
4967 return;
4968
4969 /*
4970 * We have at least single-fault tolerance, so inject data corruption.
4971 */
4972 fd = open(pathrand, O_RDWR);
4973
4974 if (fd == -1) /* we hit a gap in the device namespace */
4975 return;
4976
4977 fsize = lseek(fd, 0, SEEK_END);
4978
4979 while (--iters != 0) {
4980 /*
4981 * The offset must be chosen carefully to ensure that
4982 * we do not inject a given logical block with errors
5014 * because we also damage (parts of) the other side of
5015 * the mirror/raidz.
5016 *
5017 * Additionally, we will always have both an even and an
5018 * odd label, so that we can handle crashes in the
5019 * middle of vdev_config_sync().
5020 */
5021 if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE)
5022 continue;
5023
5024 /*
5025 * The two end labels are stored at the "end" of the disk, but
5026 * the end of the disk (vdev_psize) is aligned to
5027 * sizeof (vdev_label_t).
5028 */
5029 uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t));
5030 if ((leaf & 1) == 1 &&
5031 offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE)
5032 continue;
5033
5034 mutex_enter(&ztest_vdev_lock);
5035 if (mirror_save != zs->zs_mirrors) {
5036 mutex_exit(&ztest_vdev_lock);
5037 (void) close(fd);
5038 return;
5039 }
5040
5041 if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad))
5042 fatal(1, "can't inject bad word at 0x%llx in %s",
5043 offset, pathrand);
5044
5045 mutex_exit(&ztest_vdev_lock);
5046
5047 if (ztest_opts.zo_verbose >= 7)
5048 (void) printf("injected bad word into %s,"
5049 " offset 0x%llx\n", pathrand, (u_longlong_t)offset);
5050 }
5051
5052 (void) close(fd);
5053 }
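/*
 * Standalone restatement of the label-avoidance math above: returns
 * B_TRUE when writing "len" bytes at "offset" could touch a vdev label
 * on a device of raw size "fsize". Sketch only; the real loop also
 * alternates which end it protects based on the leaf number.
 */
static boolean_t
ztest_offset_hits_label_sketch(uint64_t offset, uint64_t len, size_t fsize)
{
	/* vdev_psize is aligned down to a multiple of the label size */
	uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t));

	if (offset < VDEV_LABEL_START_SIZE)
		return (B_TRUE);
	if (offset + len > psize - VDEV_LABEL_END_SIZE)
		return (B_TRUE);
	return (B_FALSE);
}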
5054
5055 /*
5056 * Verify that DDT repair works as expected.
5057 */
5058 void
5059 ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
5060 {
5061 ztest_shared_t *zs = ztest_shared;
5062 spa_t *spa = ztest_spa;
5063 objset_t *os = zd->zd_os;
5064 ztest_od_t od[1];
5065 uint64_t object, blocksize, txg, pattern, psize;
5066 enum zio_checksum checksum = spa_dedup_checksum(spa);
5067 dmu_buf_t *db;
5068 dmu_tx_t *tx;
5069 abd_t *abd;
5070 blkptr_t blk;
5071 int copies = 2 * ZIO_DEDUPDITTO_MIN;
5072
5073 blocksize = ztest_random_blocksize();
5074 blocksize = MIN(blocksize, 2048); /* because we write so many */
5075
5076 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
5077
5078 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
5079 return;
5080
5081 /*
5082 * Take the name lock as writer to prevent anyone else from changing
5083 * the pool and dataset properties we need to maintain during this test.
5084 */
5085 rw_enter(&ztest_name_lock, RW_WRITER);
5086
5087 if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP, checksum,
5088 B_FALSE) != 0 ||
5089 ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1,
5090 B_FALSE) != 0) {
5091 rw_exit(&ztest_name_lock);
5092 return;
5093 }
5094
5095 dmu_objset_stats_t dds;
5096 dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
5097 dmu_objset_fast_stat(os, &dds);
5098 dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
5099
5100 object = od[0].od_object;
5101 blocksize = od[0].od_blocksize;
5102 pattern = zs->zs_guid ^ dds.dds_guid;
5103
5104 ASSERT(object != 0);
5105
5106 tx = dmu_tx_create(os);
5107 dmu_tx_hold_write(tx, object, 0, copies * blocksize);
5108 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
5109 if (txg == 0) {
5110 rw_exit(&ztest_name_lock);
5111 return;
5112 }
5113
5114 /*
5115 * Write all the copies of our block.
5116 */
5117 for (int i = 0; i < copies; i++) {
5118 uint64_t offset = i * blocksize;
5119 int error = dmu_buf_hold(os, object, offset, FTAG, &db,
5120 DMU_READ_NO_PREFETCH);
5121 if (error != 0) {
5122 fatal(0, "dmu_buf_hold(%p, %llu, %llu) = %u",
5123 (void *)os, (u_longlong_t)object, (u_longlong_t)offset, error);
5124 }
5125 ASSERT(db->db_offset == offset);
5126 ASSERT(db->db_size == blocksize);
5127 ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) ||
5128 ztest_pattern_match(db->db_data, db->db_size, 0ULL));
5129 dmu_buf_will_fill(db, tx);
5130 ztest_pattern_set(db->db_data, db->db_size, pattern);
5131 dmu_buf_rele(db, FTAG);
5132 }
5133
5134 dmu_tx_commit(tx);
5135 txg_wait_synced(spa_get_dsl(spa), 0);
5136
5137 /*
5138 * Find out what block we got.
5139 */
5140 VERIFY0(dmu_buf_hold(os, object, 0, FTAG, &db,
5141 DMU_READ_NO_PREFETCH));
5142 blk = *((dmu_buf_impl_t *)db)->db_blkptr;
5143 dmu_buf_rele(db, FTAG);
5144
5145 /*
5146 * Damage the block. Dedup-ditto will save us when we read it later.
5147 */
5148 psize = BP_GET_PSIZE(&blk);
5149 abd = abd_alloc_linear(psize, B_TRUE);
5150 ztest_pattern_set(abd_to_buf(abd), psize, ~pattern);
5151
5152 (void) zio_wait(zio_rewrite(NULL, spa, 0, &blk,
5153 abd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
5154 ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL));
5155
5156 abd_free(abd);
5157
5158 rw_exit(&ztest_name_lock);
5159 }
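/*
 * Hedged sketch of the pattern helpers used above (the real
 * ztest_pattern_set()/ztest_pattern_match() live earlier in this
 * file): fill a buffer with a repeated 64-bit word, assuming the size
 * is a multiple of sizeof (uint64_t).
 */
static void
ztest_pattern_fill_sketch(void *buf, uint64_t size, uint64_t value)
{
	uint64_t *ip = buf;
	uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);

	while (ip < ip_end)
		*ip++ = value;
}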
5160
5161 /*
5162 * Scrub the pool.
5163 */
5164 /* ARGSUSED */
5165 void
5166 ztest_scrub(ztest_ds_t *zd, uint64_t id)
5167 {
5168 spa_t *spa = ztest_spa;
5169
5170 (void) spa_scan(spa, POOL_SCAN_SCRUB);
5171 (void) poll(NULL, 0, 100); /* wait a moment, then force a restart */
5172 (void) spa_scan(spa, POOL_SCAN_SCRUB);
5173 }
5174
5175 /*
5176 * Change the guid for the pool.
5177 */
5178 /* ARGSUSED */
5179 void
5180 ztest_reguid(ztest_ds_t *zd, uint64_t id)
5181 {
5182 spa_t *spa = ztest_spa;
5183 uint64_t orig, load;
5184 int error;
5185
5186 orig = spa_guid(spa);
5187 load = spa_load_guid(spa);
5188
5189 rw_enter(&ztest_name_lock, RW_WRITER);
5190 error = spa_change_guid(spa);
5191 rw_exit(&ztest_name_lock);
5192
5193 if (error != 0)
5194 return;
5195
5196 if (ztest_opts.zo_verbose >= 4) {
5197 (void) printf("Changed guid old %llu -> %llu\n",
5198 (u_longlong_t)orig, (u_longlong_t)spa_guid(spa));
5199 }
5200
5201 VERIFY3U(orig, !=, spa_guid(spa));
5202 VERIFY3U(load, ==, spa_load_guid(spa));
5203 }
5204
5205 /*
5206 * Rename the pool to a different name and then rename it back.
5207 */
5208 /* ARGSUSED */
5209 void
5210 ztest_spa_rename(ztest_ds_t *zd, uint64_t id)
5211 {
5212 char *oldname, *newname;
5213 spa_t *spa;
5214
5215 rw_enter(&ztest_name_lock, RW_WRITER);
5216
5217 oldname = ztest_opts.zo_pool;
5218 newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL);
5219 (void) strcpy(newname, oldname);
5220 (void) strcat(newname, "_tmp");
5221
5222 /*
5223 * Do the rename
5224 */
5225 VERIFY3U(0, ==, spa_rename(oldname, newname));
5226
5227 /*
5228 * Try to open it under the old name, which shouldn't exist
5229 */
5230 VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG));
5231
5232 /*
5233 * Open it under the new name and make sure it's still the same spa_t.
5234 */
5235 VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));
5236
5237 ASSERT(spa == ztest_spa);
5238 spa_close(spa, FTAG);
5239
5240 /*
5241 * Rename it back to the original
5242 */
5243 VERIFY3U(0, ==, spa_rename(newname, oldname));
5244
5245 /*
5246 * Make sure it can still be opened
5247 */
5248 VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));
5249
5250 ASSERT(spa == ztest_spa);
5251 spa_close(spa, FTAG);
5252
5253 umem_free(newname, strlen(newname) + 1);
5254
5255 rw_exit(&ztest_name_lock);
5256 }
5257
5258 /*
5259 * Verify pool integrity by running zdb.
5260 */
5261 static void
5262 ztest_run_zdb(char *pool)
5263 {
5264 int status;
5265 char zdb[MAXPATHLEN + MAXNAMELEN + 20];
5266 char zbuf[1024];
5267 char *bin;
5268 char *ztest;
5269 char *isa;
5270 int isalen;
5271 FILE *fp;
5272
5273 (void) realpath(getexecname(), zdb);
5274
5275 /* zdb lives in /usr/sbin, while ztest lives in /usr/bin */
5590 * That's because zap_count() returns the open-context value,
5591 * while dmu_objset_space() returns the rootbp fill count.
5592 */
5593 VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs));
5594 dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch);
5595 ASSERT3U(dirobjs + 1, ==, usedobjs);
5596 }
5597
5598 static int
5599 ztest_dataset_open(int d)
5600 {
5601 ztest_ds_t *zd = &ztest_ds[d];
5602 uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq;
5603 objset_t *os;
5604 zilog_t *zilog;
5605 char name[ZFS_MAX_DATASET_NAME_LEN];
5606 int error;
5607
5608 ztest_dataset_name(name, ztest_opts.zo_pool, d);
5609
5610 rw_enter(&ztest_name_lock, RW_READER);
5611
5612 error = ztest_dataset_create(name);
5613 if (error == ENOSPC) {
5614 rw_exit(&ztest_name_lock);
5615 ztest_record_enospc(FTAG);
5616 return (error);
5617 }
5618 ASSERT(error == 0 || error == EEXIST);
5619
5620 VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, zd, &os));
5621 rw_exit(&ztest_name_lock);
5622
5623 ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os);
5624
5625 zilog = zd->zd_zilog;
5626
5627 if (zilog->zl_header->zh_claim_lr_seq != 0 &&
5628 zilog->zl_header->zh_claim_lr_seq < committed_seq)
5629 fatal(0, "missing log records: claimed %llu < committed %llu",
5630 zilog->zl_header->zh_claim_lr_seq, committed_seq);
5631
5632 ztest_dataset_dirobj_verify(zd);
5633
5634 zil_replay(os, zd, ztest_replay_vector);
5635
5636 ztest_dataset_dirobj_verify(zd);
5637
5638 if (ztest_opts.zo_verbose >= 6)
5639 (void) printf("%s replay %llu blocks, %llu records, seq %llu\n",
5640 zd->zd_name,
5641 (u_longlong_t)zilog->zl_parse_blk_count,
5642 (u_longlong_t)zilog->zl_parse_lr_count,
5643 (u_longlong_t)zilog->zl_replaying_seq);
5644
5645 zilog = zil_open(os, ztest_get_data);
5646
5647 if (zilog->zl_replaying_seq != 0 &&
5648 zilog->zl_replaying_seq < committed_seq)
5649 fatal(0, "missing log records: replayed %llu < committed %llu",
5650 zilog->zl_replaying_seq, committed_seq);
5651
5652 return (0);
5653 }
5654
5655 static void
5656 ztest_dataset_close(int d)
5657 {
5658 ztest_ds_t *zd = &ztest_ds[d];
5659
5660 zil_close(zd->zd_zilog);
5661 dmu_objset_disown(zd->zd_os, zd);
5662
5663 ztest_zd_fini(zd);
5664 }
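/*
 * Usage sketch tying the two halves together: each worker dataset is
 * created/owned and its ZIL replayed by ztest_dataset_open(), and must
 * be closed with ztest_dataset_close() once its thread is joined. A
 * nonzero return means ENOSPC was already recorded, so just skip.
 */
static void
ztest_dataset_cycle_sketch(int d)
{
	if (ztest_dataset_open(d) != 0)
		return;
	ztest_dataset_close(d);
}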
5665
5666 /*
5667 * Kick off threads to run tests on all datasets in parallel.
5668 */
5669 static void
5670 ztest_run(ztest_shared_t *zs)
5671 {
5672 pthread_t *tid;
5673 spa_t *spa;
5674 objset_t *os;
5675 pthread_t resume_tid, deadman_tid;
5676 int error;
5677
5678 ztest_exiting = B_FALSE;
5679
5680 /*
5681 * Initialize parent/child shared state.
5682 */
5683 mutex_init(&ztest_vdev_lock, NULL, USYNC_THREAD, NULL);
5684 rw_init(&ztest_name_lock, NULL, USYNC_THREAD, NULL);
5685
5686 zs->zs_thread_start = gethrtime();
5687 zs->zs_thread_stop =
5688 zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC;
5689 zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop);
5690 zs->zs_thread_kill = zs->zs_thread_stop;
5691 if (ztest_random(100) < ztest_opts.zo_killrate) {
5692 zs->zs_thread_kill -=
5693 ztest_random(ztest_opts.zo_passtime * NANOSEC);
5694 }
5695
5696 mutex_init(&zcl.zcl_callbacks_lock, NULL, USYNC_THREAD, NULL);
5697
5698 list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t),
5699 offsetof(ztest_cb_data_t, zcd_node));
5700
5701 /*
5702 * Open our pool.
5703 */
5704 kernel_init(FREAD | FWRITE);
5705 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG));
5706 spa->spa_debug = B_TRUE;
5707 metaslab_preload_limit = ztest_random(20) + 1;
5708 ztest_spa = spa;
5709
5710 dmu_objset_stats_t dds;
5711 VERIFY0(dmu_objset_own(ztest_opts.zo_pool,
5712 DMU_OST_ANY, B_TRUE, FTAG, &os));
5713 dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
5714 dmu_objset_fast_stat(os, &dds);
5715 dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
5716 zs->zs_guid = dds.dds_guid;
5717 dmu_objset_disown(os, FTAG);
5718
5719 spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN;
5720
5721 /*
5722 * We don't expect the pool to suspend unless maxfaults == 0,
5723 * in which case ztest_fault_inject() temporarily takes away
5724 * the only valid replica.
5725 */
5726 if (MAXFAULTS() == 0)
5727 spa->spa_failmode = ZIO_FAILURE_MODE_WAIT;
5728 else
5729 spa->spa_failmode = ZIO_FAILURE_MODE_PANIC;
5730
5731 /*
5732 * Create a thread to periodically resume suspended I/O.
5733 */
5734 VERIFY(pthread_create(&resume_tid, NULL, ztest_resume_thread,
5735 spa) == 0);
5736
5737 /*
5738 * Create a deadman thread to abort() if we hang.
5739 */
5740 VERIFY(pthread_create(&deadman_tid, NULL, ztest_deadman_thread,
5741 zs) == 0);
5742
5743 /*
5744 * Verify that we can safely inquire about any object,
5745 * whether it's allocated or not. To make it interesting,
5746 * we probe a 5-wide window around each power of two.
5747 * This hits all edge cases, including zero and the max.
5748 */
5749 for (int t = 0; t < 64; t++) {
5750 for (int d = -5; d <= 5; d++) {
5751 error = dmu_object_info(spa->spa_meta_objset,
5752 (1ULL << t) + d, NULL);
5753 ASSERT(error == 0 || error == ENOENT ||
5754 error == EINVAL);
5755 }
5756 }
5757
5758 /*
5759 * If we got any ENOSPC errors on the previous run, destroy something.
5760 */
5761 if (zs->zs_enospc_count != 0) {
5762 int d = ztest_random(ztest_opts.zo_datasets);
5763 ztest_dataset_destroy(d);
5764 }
5765 zs->zs_enospc_count = 0;
5766
5767 tid = umem_zalloc(ztest_opts.zo_threads * sizeof (pthread_t),
5768 UMEM_NOFAIL);
5769
5770 if (ztest_opts.zo_verbose >= 4)
5771 (void) printf("starting main threads...\n");
5772
5773 /*
5774 * Kick off all the tests that run in parallel.
5775 */
5776 for (int t = 0; t < ztest_opts.zo_threads; t++) {
5777 if (t < ztest_opts.zo_datasets &&
5778 ztest_dataset_open(t) != 0)
5779 return;
5780 VERIFY(pthread_create(&tid[t], NULL, ztest_thread,
5781 (void *)(uintptr_t)t) == 0);
5782 }
5783
5784 /*
5785 * Wait for all of the tests to complete. We go in reverse order
5786 * so we don't close datasets while threads are still using them.
5787 */
5788 for (int t = ztest_opts.zo_threads - 1; t >= 0; t--) {
5789 VERIFY(pthread_join(tid[t], NULL) == 0);
5790 if (t < ztest_opts.zo_datasets)
5791 ztest_dataset_close(t);
5792 }
5793
5794 txg_wait_synced(spa_get_dsl(spa), 0);
5795
5796 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
5797 zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));
5798 zfs_dbgmsg_print(FTAG);
5799
5800 umem_free(tid, ztest_opts.zo_threads * sizeof (pthread_t));
5801
5802 /* Kill the resume thread */
5803 ztest_exiting = B_TRUE;
5804 VERIFY(pthread_join(resume_tid, NULL) == 0);
5805 ztest_resume(spa);
5806
5807 /*
5808 * Right before closing the pool, kick off a bunch of async I/O;
5809 * spa_close() should wait for it to complete.
5810 */
5811 for (uint64_t object = 1; object < 50; object++) {
5812 dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20,
5813 ZIO_PRIORITY_SYNC_READ);
5814 }
5815
5816 spa_close(spa, FTAG);
5817
5818 /*
5819 * Verify that we can loop over all pools.
5820 */
5821 mutex_enter(&spa_namespace_lock);
5822 for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa))
5823 if (ztest_opts.zo_verbose > 3)
5824 (void) printf("spa_next: found %s\n", spa_name(spa));
5825 mutex_exit(&spa_namespace_lock);
5826
5827 /*
5828 * Verify that we can export the pool and reimport it under a
5829 * different name.
5830 */
5831 if (ztest_random(2) == 0) {
5832 char name[ZFS_MAX_DATASET_NAME_LEN];
5833 (void) snprintf(name, sizeof (name), "%s_import",
5834 ztest_opts.zo_pool);
5835 ztest_spa_import_export(ztest_opts.zo_pool, name);
5836 ztest_spa_import_export(name, ztest_opts.zo_pool);
5837 }
5838
5839 kernel_fini();
5840
5841 list_destroy(&zcl.zcl_callbacks);
5842
5843 mutex_destroy(&zcl.zcl_callbacks_lock);
5844
5845 rw_destroy(&ztest_name_lock);
5846 mutex_destroy(&ztest_vdev_lock);
5847 }
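/*
 * Distilled worker-thread teardown from ztest_run(): join in reverse
 * creation order so the low-numbered threads, whose datasets may still
 * be in use by later threads, are the last to have their datasets
 * closed. Sketch only; "tid" and "n" correspond to the array and
 * thread count above.
 */
static void
ztest_join_reverse_sketch(pthread_t *tid, int n)
{
	for (int t = n - 1; t >= 0; t--)
		VERIFY0(pthread_join(tid[t], NULL));
}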
5848
5849 static void
5850 ztest_freeze(void)
5851 {
5852 ztest_ds_t *zd = &ztest_ds[0];
5853 spa_t *spa;
5854 int numloops = 0;
5855
5856 if (ztest_opts.zo_verbose >= 3)
5857 (void) printf("testing spa_freeze()...\n");
5858
5859 kernel_init(FREAD | FWRITE);
5860 VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
5861 VERIFY3U(0, ==, ztest_dataset_open(0));
5862 spa->spa_debug = B_TRUE;
5863 ztest_spa = spa;
5864
5865 /*
5866 * Force the first log block to be transactionally allocated.
5970 nvlist_t *props;
5971
5972 VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
5973 if (ztest_random(2) == 0)
5974 return (props);
5975 VERIFY(nvlist_add_uint64(props, "autoreplace", 1) == 0);
5976
5977 return (props);
5978 }
5979
5980 /*
5981 * Create a storage pool with the given name and initial vdev size.
5982 * Then test spa_freeze() functionality.
5983 */
5984 static void
5985 ztest_init(ztest_shared_t *zs)
5986 {
5987 spa_t *spa;
5988 nvlist_t *nvroot, *props;
5989
5990 mutex_init(&ztest_vdev_lock, NULL, USYNC_THREAD, NULL);
5991 rw_init(&ztest_name_lock, NULL, USYNC_THREAD, NULL);
5992
5993 kernel_init(FREAD | FWRITE);
5994
5995 /*
5996 * Create the storage pool.
5997 */
5998 (void) spa_destroy(ztest_opts.zo_pool);
5999 ztest_shared->zs_vdev_next_leaf = 0;
6000 zs->zs_splits = 0;
6001 zs->zs_mirrors = ztest_opts.zo_mirrors;
6002 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
6003 0, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
6004 props = make_random_props();
6005 for (int i = 0; i < SPA_FEATURES; i++) {
6006 char buf[1024];
6007 (void) snprintf(buf, sizeof (buf), "feature@%s",
6008 spa_feature_table[i].fi_uname);
6009 VERIFY3U(0, ==, nvlist_add_uint64(props, buf, 0));
6010 }
6011 VERIFY3U(0, ==, spa_create(ztest_opts.zo_pool, nvroot, props, NULL));
6012 nvlist_free(nvroot);
6013 nvlist_free(props);
6014
6015 VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
6016 zs->zs_metaslab_sz =
6017 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
6018
6019 spa_close(spa, FTAG);
6020
6021 kernel_fini();
6022
6023 ztest_run_zdb(ztest_opts.zo_pool);
6024
6025 ztest_freeze();
6026
6027 ztest_run_zdb(ztest_opts.zo_pool);
6028
6029 rw_destroy(&ztest_name_lock);
6030 mutex_destroy(&ztest_vdev_lock);
6031 }
6032
6033 static void
6034 setup_data_fd(void)
6035 {
6036 static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX";
6037
6038 ztest_fd_data = mkstemp(ztest_name_data);
6039 ASSERT3S(ztest_fd_data, >=, 0);
6040 (void) unlink(ztest_name_data);
6041 }
6042
6043
6044 static int
6045 shared_data_size(ztest_shared_hdr_t *hdr)
6046 {
6047 int size;
6048
6049 size = hdr->zh_hdr_size;
6050 size += hdr->zh_opts_size;
|