/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 */

/*
 * The objective of this program is to provide a DMU/ZAP/SPA stress test
 * that runs entirely in userland, is easy to use, and easy to extend.
 *
 * The overall design of the ztest program is as follows:
 *
 * (1) For each major functional area (e.g. adding vdevs to a pool,
 *     creating and destroying datasets, reading and writing objects, etc)
 *     we have a simple routine to test that functionality.  These
 *     individual routines do not have to do anything "stressful".
 *
 * (2) We turn these simple functionality tests into a stress test by
 *     running them all in parallel, with as many threads as desired,
 *     and spread across as many datasets, objects, and vdevs as desired.
 *
 * (3) While all this is happening, we inject faults into the pool to
 *     verify that self-healing data really works.
 *
 * (4) Every time we open a dataset, we change its checksum and compression
 *     functions.  Thus even individual objects vary from block to block
 *     in which checksum they use and whether they're compressed.
 *
 * (5) To verify that we never lose on-disk consistency after a crash,
 *     we run the entire test in a child of the main process.
 *     At random times, the child self-immolates with a SIGKILL.
 *     This is the software equivalent of pulling the power cord.
 *     The parent then runs the test again, using the existing
 *     storage pool, as many times as desired.  If backwards compatibility
 *     testing is enabled, ztest will sometimes run the "older" version
 *     of ztest after a SIGKILL.
 *
 * (6) To verify that we don't have future leaks or temporal incursions,
 *     many of the functional tests record the transaction group number
 *     as part of their data.  When reading old data, they verify that
 *     the transaction group number is less than the current, open txg.
 *     If you add a new test, please do this if applicable.
 *
 * When run with no arguments, ztest runs for about five minutes and
 * produces no output if successful.  To get a little bit of information,
 * specify -V.  To get more information, specify -VV, and so on.
 *
 * To turn this into an overnight stress test, use -T to specify run time.
 *
 * You can ask for more vdevs [-v], datasets [-d], or threads [-t]
 * to increase the pool capacity, fanout, and overall stress level.
 *
 * Use the -k option to set the desired frequency of kills.
 *
 * When ztest invokes itself it passes all relevant information through a
 * temporary file which is mmap-ed in the child process.  This allows shared
 * memory to survive the exec syscall.  The ztest_shared_hdr_t struct is always
 * stored at offset 0 of this file and contains information on the size and
 * number of shared structures in the file.  The information stored in this
 * file must remain backwards compatible with older versions of ztest so that
 * ztest can invoke them during backwards compatibility testing (-B).
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/dmu_objset.h>
#include <sys/poll.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/zio.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_file.h>
#include <sys/spa_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_scan.h>
#include <sys/zio_checksum.h>
#include <sys/refcount.h>
#include <sys/zfeature.h>
#include <sys/dsl_userhold.h>
#include <stdio.h>
#include <stdio_ext.h>
#include <stdlib.h>
#include <unistd.h>
#include <signal.h>
#include <umem.h>
#include <dlfcn.h>
#include <ctype.h>
#include <math.h>
#include <sys/fs/zfs.h>
#include <libnvpair.h>

static int ztest_fd_data = -1;
static int ztest_fd_rand = -1;

typedef struct ztest_shared_hdr {
	uint64_t	zh_hdr_size;
	uint64_t	zh_opts_size;
	uint64_t	zh_size;
	uint64_t	zh_stats_size;
	uint64_t	zh_stats_count;
	uint64_t	zh_ds_size;
	uint64_t	zh_ds_count;
} ztest_shared_hdr_t;
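
/*
 * A sketch of the shared-file layout implied by the fields above (the
 * header itself always sits at offset 0, per the comment at the top of
 * this file; the ordering of the later regions is inferred from the
 * field names and may not match the code that maps the file exactly):
 *
 *	[ ztest_shared_hdr_t			zh_hdr_size bytes    ]
 *	[ shared options			zh_opts_size bytes   ]
 *	[ shared state				zh_size bytes        ]
 *	[ zh_stats_count callstate records	zh_stats_size each   ]
 *	[ zh_ds_count per-dataset records	zh_ds_size each      ]
 */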

static ztest_shared_hdr_t *ztest_shared_hdr;

typedef struct ztest_shared_opts {
	char zo_pool[MAXNAMELEN];
	char zo_dir[MAXNAMELEN];
	char zo_alt_ztest[MAXNAMELEN];
	char zo_alt_libpath[MAXNAMELEN];
	uint64_t zo_vdevs;
	uint64_t zo_vdevtime;
	size_t zo_vdev_size;
	int zo_ashift;
	int zo_mirrors;
	int zo_raidz;
	int zo_raidz_parity;
	int zo_datasets;
	int zo_threads;
	uint64_t zo_passtime;
	uint64_t zo_killrate;
	int zo_verbose;
	int zo_init;
	uint64_t zo_time;
	uint64_t zo_maxloops;
	uint64_t zo_metaslab_gang_bang;
} ztest_shared_opts_t;

static const ztest_shared_opts_t ztest_opts_defaults = {
	.zo_pool = { 'z', 't', 'e', 's', 't', '\0' },
	.zo_dir = { '/', 't', 'm', 'p', '\0' },
	.zo_alt_ztest = { '\0' },
	.zo_alt_libpath = { '\0' },
	.zo_vdevs = 5,
	.zo_ashift = SPA_MINBLOCKSHIFT,
	.zo_mirrors = 2,
	.zo_raidz = 4,
	.zo_raidz_parity = 1,
	.zo_vdev_size = SPA_MINDEVSIZE,
	.zo_datasets = 7,
	.zo_threads = 23,
	.zo_passtime = 60,		/* 60 seconds */
	.zo_killrate = 70,		/* 70% kill rate */
	.zo_verbose = 0,
	.zo_init = 1,
	.zo_time = 300,			/* 5 minutes */
	.zo_maxloops = 50,		/* max loops during spa_freeze() */
	.zo_metaslab_gang_bang = 32 << 10
};

extern uint64_t metaslab_gang_bang;
extern uint64_t metaslab_df_alloc_threshold;

static ztest_shared_opts_t *ztest_shared_opts;
static ztest_shared_opts_t ztest_opts;

typedef struct ztest_shared_ds {
	uint64_t	zd_seq;
} ztest_shared_ds_t;

static ztest_shared_ds_t *ztest_shared_ds;
#define	ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d])

#define	BT_MAGIC	0x123456789abcdefULL
#define	MAXFAULTS() \
	(MAX(zs->zs_mirrors, 1) * (ztest_opts.zo_raidz_parity + 1) - 1)
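
/*
 * For example, assuming zs_mirrors tracks the -m mirror setting, the
 * defaults above (2-way mirrors of raidz1 vdevs) give
 * MAX(2, 1) * (1 + 1) - 1 = 3: up to three simultaneous injected faults
 * can be tolerated before a fourth could make data unrecoverable.
 */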

enum ztest_io_type {
	ZTEST_IO_WRITE_TAG,
	ZTEST_IO_WRITE_PATTERN,
	ZTEST_IO_WRITE_ZEROES,
	ZTEST_IO_TRUNCATE,
	ZTEST_IO_SETATTR,
	ZTEST_IO_REWRITE,
	ZTEST_IO_TYPES
};

typedef struct ztest_block_tag {
	uint64_t	bt_magic;
	uint64_t	bt_objset;
	uint64_t	bt_object;
	uint64_t	bt_offset;
	uint64_t	bt_gen;
	uint64_t	bt_txg;
	uint64_t	bt_crtxg;
} ztest_block_tag_t;

typedef struct bufwad {
	uint64_t	bw_index;
	uint64_t	bw_txg;
	uint64_t	bw_data;
} bufwad_t;

/*
 * XXX -- fix zfs range locks to be generic so we can use them here.
 */
typedef enum {
	RL_READER,
	RL_WRITER,
	RL_APPEND
} rl_type_t;

typedef struct rll {
	void		*rll_writer;
	int		rll_readers;
	mutex_t		rll_lock;
	cond_t		rll_cv;
} rll_t;

typedef struct rl {
	uint64_t	rl_object;
	uint64_t	rl_offset;
	uint64_t	rl_size;
	rll_t		*rl_lock;
} rl_t;

#define	ZTEST_RANGE_LOCKS	64
#define	ZTEST_OBJECT_LOCKS	64
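
/*
 * Objects and ranges are hashed into these fixed-size lock tables (see
 * ztest_object_lock() and ztest_range_lock() below), so two unrelated
 * objects can land on the same slot; that only costs concurrency,
 * never correctness.
 */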

/*
 * Object descriptor.  Used as a template for object lookup/create/remove.
 */
typedef struct ztest_od {
	uint64_t	od_dir;
	uint64_t	od_object;
	dmu_object_type_t od_type;
	dmu_object_type_t od_crtype;
	uint64_t	od_blocksize;
	uint64_t	od_crblocksize;
	uint64_t	od_gen;
	uint64_t	od_crgen;
	char		od_name[MAXNAMELEN];
} ztest_od_t;

/*
 * Per-dataset state.
 */
typedef struct ztest_ds {
	ztest_shared_ds_t *zd_shared;
	objset_t	*zd_os;
	rwlock_t	zd_zilog_lock;
	zilog_t		*zd_zilog;
	ztest_od_t	*zd_od;		/* debugging aid */
	char		zd_name[MAXNAMELEN];
	mutex_t		zd_dirobj_lock;
	rll_t		zd_object_lock[ZTEST_OBJECT_LOCKS];
	rll_t		zd_range_lock[ZTEST_RANGE_LOCKS];
} ztest_ds_t;

/*
 * Per-iteration state.
 */
typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id);

typedef struct ztest_info {
	ztest_func_t	*zi_func;	/* test function */
	uint64_t	zi_iters;	/* iterations per execution */
	uint64_t	*zi_interval;	/* execute every <interval> seconds */
} ztest_info_t;

typedef struct ztest_shared_callstate {
	uint64_t	zc_count;	/* per-pass count */
	uint64_t	zc_time;	/* per-pass time */
	uint64_t	zc_next;	/* next time to call this function */
} ztest_shared_callstate_t;

static ztest_shared_callstate_t *ztest_shared_callstate;
#define	ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c])

/*
 * Note: these aren't static because we want dladdr() to work.
 */
ztest_func_t ztest_dmu_read_write;
ztest_func_t ztest_dmu_write_parallel;
ztest_func_t ztest_dmu_object_alloc_free;
ztest_func_t ztest_dmu_commit_callbacks;
ztest_func_t ztest_zap;
ztest_func_t ztest_zap_parallel;
ztest_func_t ztest_zil_commit;
ztest_func_t ztest_zil_remount;
ztest_func_t ztest_dmu_read_write_zcopy;
ztest_func_t ztest_dmu_objset_create_destroy;
ztest_func_t ztest_dmu_prealloc;
ztest_func_t ztest_fzap;
ztest_func_t ztest_dmu_snapshot_create_destroy;
ztest_func_t ztest_dsl_prop_get_set;
ztest_func_t ztest_spa_prop_get_set;
ztest_func_t ztest_spa_create_destroy;
ztest_func_t ztest_fault_inject;
ztest_func_t ztest_ddt_repair;
ztest_func_t ztest_dmu_snapshot_hold;
ztest_func_t ztest_spa_rename;
ztest_func_t ztest_scrub;
ztest_func_t ztest_dsl_dataset_promote_busy;
ztest_func_t ztest_vdev_attach_detach;
ztest_func_t ztest_vdev_LUN_growth;
ztest_func_t ztest_vdev_add_remove;
ztest_func_t ztest_vdev_aux_add_remove;
ztest_func_t ztest_split_pool;
ztest_func_t ztest_reguid;
ztest_func_t ztest_spa_upgrade;

uint64_t zopt_always = 0ULL * NANOSEC;		/* all the time */
uint64_t zopt_incessant = 1ULL * NANOSEC / 10;	/* every 1/10 second */
uint64_t zopt_often = 1ULL * NANOSEC;		/* every second */
uint64_t zopt_sometimes = 10ULL * NANOSEC;	/* every 10 seconds */
uint64_t zopt_rarely = 60ULL * NANOSEC;		/* every 60 seconds */

ztest_info_t ztest_info[] = {
	{ ztest_dmu_read_write,			1,	&zopt_always	},
	{ ztest_dmu_write_parallel,		10,	&zopt_always	},
	{ ztest_dmu_object_alloc_free,		1,	&zopt_always	},
	{ ztest_dmu_commit_callbacks,		1,	&zopt_always	},
	{ ztest_zap,				30,	&zopt_always	},
	{ ztest_zap_parallel,			100,	&zopt_always	},
	{ ztest_split_pool,			1,	&zopt_always	},
	{ ztest_zil_commit,			1,	&zopt_incessant	},
	{ ztest_zil_remount,			1,	&zopt_sometimes	},
	{ ztest_dmu_read_write_zcopy,		1,	&zopt_often	},
	{ ztest_dmu_objset_create_destroy,	1,	&zopt_often	},
	{ ztest_dsl_prop_get_set,		1,	&zopt_often	},
	{ ztest_spa_prop_get_set,		1,	&zopt_sometimes	},
#if 0
	{ ztest_dmu_prealloc,			1,	&zopt_sometimes	},
#endif
	{ ztest_fzap,				1,	&zopt_sometimes	},
	{ ztest_dmu_snapshot_create_destroy,	1,	&zopt_sometimes	},
	{ ztest_spa_create_destroy,		1,	&zopt_sometimes	},
	{ ztest_fault_inject,			1,	&zopt_sometimes	},
	{ ztest_ddt_repair,			1,	&zopt_sometimes	},
	{ ztest_dmu_snapshot_hold,		1,	&zopt_sometimes	},
	{ ztest_reguid,				1,	&zopt_sometimes	},
	{ ztest_spa_rename,			1,	&zopt_rarely	},
	{ ztest_scrub,				1,	&zopt_rarely	},
	{ ztest_spa_upgrade,			1,	&zopt_rarely	},
	{ ztest_dsl_dataset_promote_busy,	1,	&zopt_rarely	},
	{ ztest_vdev_attach_detach,		1,	&zopt_sometimes	},
	{ ztest_vdev_LUN_growth,		1,	&zopt_rarely	},
	{ ztest_vdev_add_remove,		1,
	    &ztest_opts.zo_vdevtime				},
	{ ztest_vdev_aux_add_remove,		1,
	    &ztest_opts.zo_vdevtime				},
};

#define	ZTEST_FUNCS	(sizeof (ztest_info) / sizeof (ztest_info_t))

/*
 * The following struct is used to hold a list of uncalled commit callbacks.
 * The callbacks are ordered by txg number.
 */
typedef struct ztest_cb_list {
	mutex_t	zcl_callbacks_lock;
	list_t	zcl_callbacks;
} ztest_cb_list_t;

/*
 * Stuff we need to share writably between parent and child.
 */
typedef struct ztest_shared {
	boolean_t	zs_do_init;
	hrtime_t	zs_proc_start;
	hrtime_t	zs_proc_stop;
	hrtime_t	zs_thread_start;
	hrtime_t	zs_thread_stop;
	hrtime_t	zs_thread_kill;
	uint64_t	zs_enospc_count;
	uint64_t	zs_vdev_next_leaf;
	uint64_t	zs_vdev_aux;
	uint64_t	zs_alloc;
	uint64_t	zs_space;
	uint64_t	zs_splits;
	uint64_t	zs_mirrors;
	uint64_t	zs_metaslab_sz;
	uint64_t	zs_metaslab_df_alloc_threshold;
	uint64_t	zs_guid;
} ztest_shared_t;

#define	ID_PARALLEL	-1ULL

static char ztest_dev_template[] = "%s/%s.%llua";
static char ztest_aux_template[] = "%s/%s.%s.%llu";
ztest_shared_t *ztest_shared;

static spa_t *ztest_spa = NULL;
static ztest_ds_t *ztest_ds;

static mutex_t ztest_vdev_lock;

/*
 * The ztest_name_lock protects the pool and dataset namespace used by
 * the individual tests.  To modify the namespace, consumers must grab
 * this lock as writer.  Grabbing the lock as reader will ensure that the
 * namespace does not change while the lock is held.
 */
static rwlock_t ztest_name_lock;

static boolean_t ztest_dump_core = B_TRUE;
static boolean_t ztest_exiting;

/* Global commit callback list */
static ztest_cb_list_t zcl;

enum ztest_object {
	ZTEST_META_DNODE = 0,
	ZTEST_DIROBJ,
	ZTEST_OBJECTS
};

static void usage(boolean_t) __NORETURN;

/*
 * These libumem hooks provide a reasonable set of defaults for the allocator's
 * debugging facilities.
 */
const char *
_umem_debug_init(void)
{
	return ("default,verbose"); /* $UMEM_DEBUG setting */
}

const char *
_umem_logging_init(void)
{
	return ("fail,contents"); /* $UMEM_LOGGING setting */
}

#define	FATAL_MSG_SZ	1024

char *fatal_msg;

static void
fatal(int do_perror, char *message, ...)
{
	va_list args;
	int save_errno = errno;
	char buf[FATAL_MSG_SZ];

	(void) fflush(stdout);

	va_start(args, message);
	(void) sprintf(buf, "ztest: ");
	/* LINTED */
	(void) vsprintf(buf + strlen(buf), message, args);
	va_end(args);
	if (do_perror) {
		(void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf),
		    ": %s", strerror(save_errno));
	}
	(void) fprintf(stderr, "%s\n", buf);
	fatal_msg = buf;		/* to ease debugging */
	if (ztest_dump_core)
		abort();
	exit(3);
}

static int
str2shift(const char *buf)
{
	const char *ends = "BKMGTPEZ";
	int i;

	if (buf[0] == '\0')
		return (0);
	for (i = 0; i < strlen(ends); i++) {
		if (toupper(buf[0]) == ends[i])
			break;
	}
	if (i == strlen(ends)) {
		(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n",
		    buf);
		usage(B_FALSE);
	}
	if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) {
		return (10*i);
	}
	(void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf);
	usage(B_FALSE);
	/* NOTREACHED */
}
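
/*
 * Each suffix maps to a power-of-two shift by its position in "BKMGTPEZ"
 * times 10: str2shift("") == str2shift("B") == 0, str2shift("K") == 10,
 * str2shift("M") == 20, and so on; "KB" and lowercase "k" behave like "K".
 */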

static uint64_t
nicenumtoull(const char *buf)
{
	char *end;
	uint64_t val;

	val = strtoull(buf, &end, 0);
	if (end == buf) {
		(void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf);
		usage(B_FALSE);
	} else if (end[0] == '.') {
		double fval = strtod(buf, &end);
		fval *= pow(2, str2shift(end));
		if (fval > UINT64_MAX) {
			(void) fprintf(stderr, "ztest: value too large: %s\n",
			    buf);
			usage(B_FALSE);
		}
		val = (uint64_t)fval;
	} else {
		int shift = str2shift(end);
		if (shift >= 64 || (val << shift) >> shift != val) {
			(void) fprintf(stderr, "ztest: value too large: %s\n",
			    buf);
			usage(B_FALSE);
		}
		val <<= shift;
	}
	return (val);
}
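
/*
 * Examples: nicenumtoull("64K") == 64ULL << 10 == 65536, while a
 * fractional value such as "1.5G" takes the strtod() path above:
 * 1.5 * 2^30 == 1610612736.
 */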

static void
usage(boolean_t requested)
{
	const ztest_shared_opts_t *zo = &ztest_opts_defaults;

	char nice_vdev_size[10];
	char nice_gang_bang[10];
	FILE *fp = requested ? stdout : stderr;

	nicenum(zo->zo_vdev_size, nice_vdev_size);
	nicenum(zo->zo_metaslab_gang_bang, nice_gang_bang);

	(void) fprintf(fp, "Usage: %s\n"
	    "\t[-v vdevs (default: %llu)]\n"
	    "\t[-s size_of_each_vdev (default: %s)]\n"
	    "\t[-a alignment_shift (default: %d)] use 0 for random\n"
	    "\t[-m mirror_copies (default: %d)]\n"
	    "\t[-r raidz_disks (default: %d)]\n"
	    "\t[-R raidz_parity (default: %d)]\n"
	    "\t[-d datasets (default: %d)]\n"
	    "\t[-t threads (default: %d)]\n"
	    "\t[-g gang_block_threshold (default: %s)]\n"
	    "\t[-i init_count (default: %d)] initialize pool i times\n"
	    "\t[-k kill_percentage (default: %llu%%)]\n"
	    "\t[-p pool_name (default: %s)]\n"
	    "\t[-f dir (default: %s)] file directory for vdev files\n"
	    "\t[-V] verbose (use multiple times for ever more blather)\n"
	    "\t[-E] use existing pool instead of creating new one\n"
	    "\t[-T time (default: %llu sec)] total run time\n"
	    "\t[-F freezeloops (default: %llu)] max loops in spa_freeze()\n"
	    "\t[-P passtime (default: %llu sec)] time per pass\n"
	    "\t[-B alt_ztest (default: <none>)] alternate ztest path\n"
	    "\t[-h] (print help)\n"
	    "",
	    zo->zo_pool,
	    (u_longlong_t)zo->zo_vdevs,			/* -v */
	    nice_vdev_size,				/* -s */
	    zo->zo_ashift,				/* -a */
	    zo->zo_mirrors,				/* -m */
	    zo->zo_raidz,				/* -r */
	    zo->zo_raidz_parity,			/* -R */
	    zo->zo_datasets,				/* -d */
	    zo->zo_threads,				/* -t */
	    nice_gang_bang,				/* -g */
	    zo->zo_init,				/* -i */
	    (u_longlong_t)zo->zo_killrate,		/* -k */
	    zo->zo_pool,				/* -p */
	    zo->zo_dir,					/* -f */
	    (u_longlong_t)zo->zo_time,			/* -T */
	    (u_longlong_t)zo->zo_maxloops,		/* -F */
	    (u_longlong_t)zo->zo_passtime);
	exit(requested ? 0 : 1);
}

static void
process_options(int argc, char **argv)
{
	char *path;
	ztest_shared_opts_t *zo = &ztest_opts;

	int opt;
	uint64_t value;
	char altdir[MAXNAMELEN] = { 0 };

	bcopy(&ztest_opts_defaults, zo, sizeof (*zo));

	while ((opt = getopt(argc, argv,
	    "v:s:a:m:r:R:d:t:g:i:k:p:f:VET:P:hF:B:")) != EOF) {
		value = 0;
		switch (opt) {
		case 'v':
		case 's':
		case 'a':
		case 'm':
		case 'r':
		case 'R':
		case 'd':
		case 't':
		case 'g':
		case 'i':
		case 'k':
		case 'T':
		case 'P':
		case 'F':
			value = nicenumtoull(optarg);
		}
		switch (opt) {
		case 'v':
			zo->zo_vdevs = value;
			break;
		case 's':
			zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value);
			break;
		case 'a':
			zo->zo_ashift = value;
			break;
		case 'm':
			zo->zo_mirrors = value;
			break;
		case 'r':
			zo->zo_raidz = MAX(1, value);
			break;
		case 'R':
			zo->zo_raidz_parity = MIN(MAX(value, 1), 3);
			break;
		case 'd':
			zo->zo_datasets = MAX(1, value);
			break;
		case 't':
			zo->zo_threads = MAX(1, value);
			break;
		case 'g':
			zo->zo_metaslab_gang_bang = MAX(SPA_MINBLOCKSIZE << 1,
			    value);
			break;
		case 'i':
			zo->zo_init = value;
			break;
		case 'k':
			zo->zo_killrate = value;
			break;
		case 'p':
			(void) strlcpy(zo->zo_pool, optarg,
			    sizeof (zo->zo_pool));
			break;
		case 'f':
			path = realpath(optarg, NULL);
			if (path == NULL) {
				(void) fprintf(stderr, "error: %s: %s\n",
				    optarg, strerror(errno));
				usage(B_FALSE);
			} else {
				(void) strlcpy(zo->zo_dir, path,
				    sizeof (zo->zo_dir));
			}
			break;
		case 'V':
			zo->zo_verbose++;
			break;
		case 'E':
			zo->zo_init = 0;
			break;
		case 'T':
			zo->zo_time = value;
			break;
		case 'P':
			zo->zo_passtime = MAX(1, value);
			break;
		case 'F':
			zo->zo_maxloops = MAX(1, value);
			break;
		case 'B':
			(void) strlcpy(altdir, optarg, sizeof (altdir));
			break;
		case 'h':
			usage(B_TRUE);
			break;
		case '?':
		default:
			usage(B_FALSE);
			break;
		}
	}

	zo->zo_raidz_parity = MIN(zo->zo_raidz_parity, zo->zo_raidz - 1);

	zo->zo_vdevtime =
	    (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs :
	    UINT64_MAX >> 2);

	if (strlen(altdir) > 0) {
		char *cmd;
		char *realaltdir;
		char *bin;
		char *ztest;
		char *isa;
		int isalen;

		cmd = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
		realaltdir = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);

		VERIFY(NULL != realpath(getexecname(), cmd));
		if (0 != access(altdir, F_OK)) {
			ztest_dump_core = B_FALSE;
			fatal(B_TRUE, "invalid alternate ztest path: %s",
			    altdir);
		}
		VERIFY(NULL != realpath(altdir, realaltdir));

		/*
		 * 'cmd' should be of the form "<anything>/usr/bin/<isa>/ztest".
		 * We want to extract <isa> to determine if we should use
		 * 32 or 64 bit binaries.
		 */
		bin = strstr(cmd, "/usr/bin/");
		ztest = strstr(bin, "/ztest");
		isa = bin + 9;
		isalen = ztest - isa;
		(void) snprintf(zo->zo_alt_ztest, sizeof (zo->zo_alt_ztest),
		    "%s/usr/bin/%.*s/ztest", realaltdir, isalen, isa);
		(void) snprintf(zo->zo_alt_libpath, sizeof (zo->zo_alt_libpath),
		    "%s/usr/lib/%.*s", realaltdir, isalen, isa);

		if (0 != access(zo->zo_alt_ztest, X_OK)) {
			ztest_dump_core = B_FALSE;
			fatal(B_TRUE, "invalid alternate ztest: %s",
			    zo->zo_alt_ztest);
		} else if (0 != access(zo->zo_alt_libpath, X_OK)) {
			ztest_dump_core = B_FALSE;
			fatal(B_TRUE, "invalid alternate lib directory %s",
			    zo->zo_alt_libpath);
		}

		umem_free(cmd, MAXPATHLEN);
		umem_free(realaltdir, MAXPATHLEN);
	}
}

static void
ztest_kill(ztest_shared_t *zs)
{
	zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa));
	zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa));
	(void) kill(getpid(), SIGKILL);
}

static uint64_t
ztest_random(uint64_t range)
{
	uint64_t r;

	ASSERT3S(ztest_fd_rand, >=, 0);

	if (range == 0)
		return (0);

	if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r))
		fatal(1, "short read from /dev/urandom");

	return (r % range);
}

/* ARGSUSED */
static void
ztest_record_enospc(const char *s)
{
	ztest_shared->zs_enospc_count++;
}

static uint64_t
ztest_get_ashift(void)
{
	if (ztest_opts.zo_ashift == 0)
		return (SPA_MINBLOCKSHIFT + ztest_random(3));
	return (ztest_opts.zo_ashift);
}

static nvlist_t *
make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift)
{
	char pathbuf[MAXPATHLEN];
	uint64_t vdev;
	nvlist_t *file;

	if (ashift == 0)
		ashift = ztest_get_ashift();

	if (path == NULL) {
		path = pathbuf;

		if (aux != NULL) {
			vdev = ztest_shared->zs_vdev_aux;
			(void) snprintf(path, sizeof (pathbuf),
			    ztest_aux_template, ztest_opts.zo_dir,
			    pool == NULL ? ztest_opts.zo_pool : pool,
			    aux, vdev);
		} else {
			vdev = ztest_shared->zs_vdev_next_leaf++;
			(void) snprintf(path, sizeof (pathbuf),
			    ztest_dev_template, ztest_opts.zo_dir,
			    pool == NULL ? ztest_opts.zo_pool : pool, vdev);
		}
	}

	if (size != 0) {
		int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666);
		if (fd == -1)
			fatal(1, "can't open %s", path);
		if (ftruncate(fd, size) != 0)
			fatal(1, "can't ftruncate %s", path);
		(void) close(fd);
	}

	VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0);
	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0);
	VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path) == 0);
	VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0);

	return (file);
}

static nvlist_t *
make_vdev_raidz(char *path, char *aux, char *pool, size_t size,
    uint64_t ashift, int r)
{
	nvlist_t *raidz, **child;
	int c;

	if (r < 2)
		return (make_vdev_file(path, aux, pool, size, ashift));
	child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL);

	for (c = 0; c < r; c++)
		child[c] = make_vdev_file(path, aux, pool, size, ashift);

	VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0);
	VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_RAIDZ) == 0);
	VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY,
	    ztest_opts.zo_raidz_parity) == 0);
	VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN,
	    child, r) == 0);

	for (c = 0; c < r; c++)
		nvlist_free(child[c]);

	umem_free(child, r * sizeof (nvlist_t *));

	return (raidz);
}

static nvlist_t *
make_vdev_mirror(char *path, char *aux, char *pool, size_t size,
    uint64_t ashift, int r, int m)
{
	nvlist_t *mirror, **child;
	int c;

	if (m < 1)
		return (make_vdev_raidz(path, aux, pool, size, ashift, r));

	child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL);

	for (c = 0; c < m; c++)
		child[c] = make_vdev_raidz(path, aux, pool, size, ashift, r);

	VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0);
	VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_MIRROR) == 0);
	VERIFY(nvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN,
	    child, m) == 0);

	for (c = 0; c < m; c++)
		nvlist_free(child[c]);

	umem_free(child, m * sizeof (nvlist_t *));

	return (mirror);
}

static nvlist_t *
make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift,
    int log, int r, int m, int t)
{
	nvlist_t *root, **child;
	int c;

	ASSERT(t > 0);

	child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL);

	for (c = 0; c < t; c++) {
		child[c] = make_vdev_mirror(path, aux, pool, size, ashift,
		    r, m);
		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
		    log) == 0);
	}

	VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0);
	VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0);
	VERIFY(nvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN,
	    child, t) == 0);

	for (c = 0; c < t; c++)
		nvlist_free(child[c]);

	umem_free(child, t * sizeof (nvlist_t *));

	return (root);
}

/*
 * Find a random spa version.  Returns a random spa version in the
 * range [initial_version, SPA_VERSION_FEATURES].
 */
static uint64_t
ztest_random_spa_version(uint64_t initial_version)
{
	uint64_t version = initial_version;

	/*
	 * Note the "+ 2": the draw must extend one slot past
	 * SPA_VERSION_BEFORE_FEATURES so that SPA_VERSION_FEATURES
	 * remains reachable below.
	 */
	if (version <= SPA_VERSION_BEFORE_FEATURES) {
		version = version +
		    ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 2);
	}

	if (version > SPA_VERSION_BEFORE_FEATURES)
		version = SPA_VERSION_FEATURES;

	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
	return (version);
}
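
/*
 * For example, starting from a legacy initial_version the draw covers
 * every legacy version plus one extra slot just past
 * SPA_VERSION_BEFORE_FEATURES; that extra slot (and any initial_version
 * already past the legacy range) collapses to SPA_VERSION_FEATURES,
 * since no supported versions exist in between.
 */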

static int
ztest_random_blocksize(void)
{
	return (1 << (SPA_MINBLOCKSHIFT +
	    ztest_random(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)));
}

static int
ztest_random_ibshift(void)
{
	return (DN_MIN_INDBLKSHIFT +
	    ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1));
}

static uint64_t
ztest_random_vdev_top(spa_t *spa, boolean_t log_ok)
{
	uint64_t top;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	do {
		top = ztest_random(rvd->vdev_children);
		tvd = rvd->vdev_child[top];
	} while (tvd->vdev_ishole || (tvd->vdev_islog && !log_ok) ||
	    tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL);

	return (top);
}

static uint64_t
ztest_random_dsl_prop(zfs_prop_t prop)
{
	uint64_t value;

	do {
		value = zfs_prop_random_value(prop, ztest_random(-1ULL));
	} while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF);

	return (value);
}

static int
ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value,
    boolean_t inherit)
{
	const char *propname = zfs_prop_to_name(prop);
	const char *valname;
	char setpoint[MAXPATHLEN];
	uint64_t curval;
	int error;

	error = dsl_prop_set_int(osname, propname,
	    (inherit ? ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value);

	if (error == ENOSPC) {
		ztest_record_enospc(FTAG);
		return (error);
	}
	ASSERT0(error);

	VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint));

	if (ztest_opts.zo_verbose >= 6) {
		VERIFY(zfs_prop_index_to_string(prop, curval, &valname) == 0);
		(void) printf("%s %s = %s at '%s'\n",
		    osname, propname, valname, setpoint);
	}

	return (error);
}

static int
ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value)
{
	spa_t *spa = ztest_spa;
	nvlist_t *props = NULL;
	int error;

	VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
	VERIFY(nvlist_add_uint64(props, zpool_prop_to_name(prop), value) == 0);

	error = spa_prop_set(spa, props);

	nvlist_free(props);

	if (error == ENOSPC) {
		ztest_record_enospc(FTAG);
		return (error);
	}
	ASSERT0(error);

	return (error);
}

static void
ztest_rll_init(rll_t *rll)
{
	rll->rll_writer = NULL;
	rll->rll_readers = 0;
	VERIFY(_mutex_init(&rll->rll_lock, USYNC_THREAD, NULL) == 0);
	VERIFY(cond_init(&rll->rll_cv, USYNC_THREAD, NULL) == 0);
}

static void
ztest_rll_destroy(rll_t *rll)
{
	ASSERT(rll->rll_writer == NULL);
	ASSERT(rll->rll_readers == 0);
	VERIFY(_mutex_destroy(&rll->rll_lock) == 0);
	VERIFY(cond_destroy(&rll->rll_cv) == 0);
}

static void
ztest_rll_lock(rll_t *rll, rl_type_t type)
{
	VERIFY(mutex_lock(&rll->rll_lock) == 0);

	if (type == RL_READER) {
		while (rll->rll_writer != NULL)
			(void) cond_wait(&rll->rll_cv, &rll->rll_lock);
		rll->rll_readers++;
	} else {
		while (rll->rll_writer != NULL || rll->rll_readers)
			(void) cond_wait(&rll->rll_cv, &rll->rll_lock);
		rll->rll_writer = curthread;
	}

	VERIFY(mutex_unlock(&rll->rll_lock) == 0);
}

static void
ztest_rll_unlock(rll_t *rll)
{
	VERIFY(mutex_lock(&rll->rll_lock) == 0);

	if (rll->rll_writer) {
		ASSERT(rll->rll_readers == 0);
		rll->rll_writer = NULL;
	} else {
		ASSERT(rll->rll_readers != 0);
		ASSERT(rll->rll_writer == NULL);
		rll->rll_readers--;
	}

	if (rll->rll_writer == NULL && rll->rll_readers == 0)
		VERIFY(cond_broadcast(&rll->rll_cv) == 0);

	VERIFY(mutex_unlock(&rll->rll_lock) == 0);
}

static void
ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type)
{
	rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];

	ztest_rll_lock(rll, type);
}

static void
ztest_object_unlock(ztest_ds_t *zd, uint64_t object)
{
	rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)];

	ztest_rll_unlock(rll);
}

static rl_t *
ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset,
    uint64_t size, rl_type_t type)
{
	uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1));
	rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)];
	rl_t *rl;

	rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL);
	rl->rl_object = object;
	rl->rl_offset = offset;
	rl->rl_size = size;
	rl->rl_lock = rll;

	ztest_rll_lock(rll, type);

	return (rl);
}

static void
ztest_range_unlock(rl_t *rl)
{
	rll_t *rll = rl->rl_lock;

	ztest_rll_unlock(rll);

	umem_free(rl, sizeof (*rl));
}

static void
ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os)
{
	zd->zd_os = os;
	zd->zd_zilog = dmu_objset_zil(os);
	zd->zd_shared = szd;
	dmu_objset_name(os, zd->zd_name);

	if (zd->zd_shared != NULL)
		zd->zd_shared->zd_seq = 0;

	VERIFY(rwlock_init(&zd->zd_zilog_lock, USYNC_THREAD, NULL) == 0);
	VERIFY(_mutex_init(&zd->zd_dirobj_lock, USYNC_THREAD, NULL) == 0);

	for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++)
		ztest_rll_init(&zd->zd_object_lock[l]);

	for (int l = 0; l < ZTEST_RANGE_LOCKS; l++)
		ztest_rll_init(&zd->zd_range_lock[l]);
}

static void
ztest_zd_fini(ztest_ds_t *zd)
{
	VERIFY(_mutex_destroy(&zd->zd_dirobj_lock) == 0);

	for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++)
		ztest_rll_destroy(&zd->zd_object_lock[l]);

	for (int l = 0; l < ZTEST_RANGE_LOCKS; l++)
		ztest_rll_destroy(&zd->zd_range_lock[l]);
}

#define	TXG_MIGHTWAIT	(ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT)
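
/*
 * TXG_MIGHTWAIT makes roughly one in ten assignments use TXG_NOWAIT,
 * exercising the ERESTART handling in ztest_tx_assign() below.
 */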

static uint64_t
ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag)
{
	uint64_t txg;
	int error;

	/*
	 * Attempt to assign tx to some transaction group.
	 */
	error = dmu_tx_assign(tx, txg_how);
	if (error) {
		if (error == ERESTART) {
			ASSERT(txg_how == TXG_NOWAIT);
			dmu_tx_wait(tx);
		} else {
			ASSERT3U(error, ==, ENOSPC);
			ztest_record_enospc(tag);
		}
		dmu_tx_abort(tx);
		return (0);
	}
	txg = dmu_tx_get_txg(tx);
	ASSERT(txg != 0);
	return (txg);
}
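
/*
 * Callers follow the pattern used throughout this file: a return of 0
 * means the tx was aborted (ENOSPC, or ERESTART under TXG_NOWAIT) and
 * the operation should simply be skipped, e.g.:
 *
 *	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
 *	if (txg == 0)
 *		return (ENOSPC);
 */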

static void
ztest_pattern_set(void *buf, uint64_t size, uint64_t value)
{
	uint64_t *ip = buf;
	uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);

	while (ip < ip_end)
		*ip++ = value;
}

static boolean_t
ztest_pattern_match(void *buf, uint64_t size, uint64_t value)
{
	uint64_t *ip = buf;
	uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size);
	uint64_t diff = 0;

	while (ip < ip_end)
		diff |= (value - *ip++);

	return (diff == 0);
}
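
/*
 * Note the branch-free comparison above: any mismatching word leaves a
 * nonzero residue OR-ed into 'diff', so a single test at the end suffices.
 */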

static void
ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
    uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
{
	bt->bt_magic = BT_MAGIC;
	bt->bt_objset = dmu_objset_id(os);
	bt->bt_object = object;
	bt->bt_offset = offset;
	bt->bt_gen = gen;
	bt->bt_txg = txg;
	bt->bt_crtxg = crtxg;
}

static void
ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
    uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
{
	ASSERT(bt->bt_magic == BT_MAGIC);
	ASSERT(bt->bt_objset == dmu_objset_id(os));
	ASSERT(bt->bt_object == object);
	ASSERT(bt->bt_offset == offset);
	ASSERT(bt->bt_gen <= gen);
	ASSERT(bt->bt_txg <= txg);
	ASSERT(bt->bt_crtxg == crtxg);
}

static ztest_block_tag_t *
ztest_bt_bonus(dmu_buf_t *db)
{
	dmu_object_info_t doi;
	ztest_block_tag_t *bt;

	dmu_object_info_from_db(db, &doi);
	ASSERT3U(doi.doi_bonus_size, <=, db->db_size);
	ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt));
	bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt));

	return (bt);
}

/*
 * ZIL logging ops
 */

#define	lrz_type	lr_mode
#define	lrz_blocksize	lr_uid
#define	lrz_ibshift	lr_gid
#define	lrz_bonustype	lr_rdev
#define	lrz_bonuslen	lr_crtime[1]
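
/*
 * The defines above overlay ztest's create-time parameters onto
 * lr_create_t fields that a real filesystem would use for
 * mode/uid/gid/rdev/crtime but that ztest has no other use for, so a
 * TX_CREATE record can carry them without inventing a new record format.
 */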

static void
ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr)
{
	char *name = (void *)(lr + 1);		/* name follows lr */
	size_t namesize = strlen(name) + 1;
	itx_t *itx;

	if (zil_replaying(zd->zd_zilog, tx))
		return;

	itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize);
	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
	    sizeof (*lr) + namesize - sizeof (lr_t));

	zil_itx_assign(zd->zd_zilog, itx, tx);
}

static void
ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object)
{
	char *name = (void *)(lr + 1);		/* name follows lr */
	size_t namesize = strlen(name) + 1;
	itx_t *itx;

	if (zil_replaying(zd->zd_zilog, tx))
		return;

	itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize);
	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
	    sizeof (*lr) + namesize - sizeof (lr_t));

	itx->itx_oid = object;
	zil_itx_assign(zd->zd_zilog, itx, tx);
}

static void
ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
{
	itx_t *itx;
	itx_wr_state_t write_state = ztest_random(WR_NUM_STATES);

	if (zil_replaying(zd->zd_zilog, tx))
		return;

	if (lr->lr_length > ZIL_MAX_LOG_DATA)
		write_state = WR_INDIRECT;

	itx = zil_itx_create(TX_WRITE,
	    sizeof (*lr) + (write_state == WR_COPIED ? lr->lr_length : 0));

	if (write_state == WR_COPIED &&
	    dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length,
	    ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) {
		zil_itx_destroy(itx);
		itx = zil_itx_create(TX_WRITE, sizeof (*lr));
		write_state = WR_NEED_COPY;
	}
	itx->itx_private = zd;
	itx->itx_wr_state = write_state;
	itx->itx_sync = (ztest_random(8) == 0);
	itx->itx_sod += (write_state == WR_NEED_COPY ? lr->lr_length : 0);

	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
	    sizeof (*lr) - sizeof (lr_t));

	zil_itx_assign(zd->zd_zilog, itx, tx);
}

static void
ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr)
{
	itx_t *itx;

	if (zil_replaying(zd->zd_zilog, tx))
		return;

	itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
	    sizeof (*lr) - sizeof (lr_t));

	itx->itx_sync = B_FALSE;
	zil_itx_assign(zd->zd_zilog, itx, tx);
}

static void
ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr)
{
	itx_t *itx;

	if (zil_replaying(zd->zd_zilog, tx))
		return;

	itx = zil_itx_create(TX_SETATTR, sizeof (*lr));
	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
	    sizeof (*lr) - sizeof (lr_t));

	itx->itx_sync = B_FALSE;
	zil_itx_assign(zd->zd_zilog, itx, tx);
}

/*
 * ZIL replay ops
 */
static int
ztest_replay_create(ztest_ds_t *zd, lr_create_t *lr, boolean_t byteswap)
{
	char *name = (void *)(lr + 1);		/* name follows lr */
	objset_t *os = zd->zd_os;
	ztest_block_tag_t *bbt;
	dmu_buf_t *db;
	dmu_tx_t *tx;
	uint64_t txg;
	int error = 0;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	ASSERT(lr->lr_doid == ZTEST_DIROBJ);
	ASSERT(name[0] != '\0');

	tx = dmu_tx_create(os);

	dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name);

	if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
	} else {
		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
	}

	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
	if (txg == 0)
		return (ENOSPC);

	ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid);

	if (lr->lrz_type == DMU_OT_ZAP_OTHER) {
		if (lr->lr_foid == 0) {
			lr->lr_foid = zap_create(os,
			    lr->lrz_type, lr->lrz_bonustype,
			    lr->lrz_bonuslen, tx);
		} else {
			error = zap_create_claim(os, lr->lr_foid,
			    lr->lrz_type, lr->lrz_bonustype,
			    lr->lrz_bonuslen, tx);
		}
	} else {
		if (lr->lr_foid == 0) {
			lr->lr_foid = dmu_object_alloc(os,
			    lr->lrz_type, 0, lr->lrz_bonustype,
			    lr->lrz_bonuslen, tx);
		} else {
			error = dmu_object_claim(os, lr->lr_foid,
			    lr->lrz_type, 0, lr->lrz_bonustype,
			    lr->lrz_bonuslen, tx);
		}
	}

	if (error) {
		ASSERT3U(error, ==, EEXIST);
		ASSERT(zd->zd_zilog->zl_replay);
		dmu_tx_commit(tx);
		return (error);
	}

	ASSERT(lr->lr_foid != 0);

	if (lr->lrz_type != DMU_OT_ZAP_OTHER)
		VERIFY3U(0, ==, dmu_object_set_blocksize(os, lr->lr_foid,
		    lr->lrz_blocksize, lr->lrz_ibshift, tx));

	VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));
	bbt = ztest_bt_bonus(db);
	dmu_buf_will_dirty(db, tx);
	ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_gen, txg, txg);
	dmu_buf_rele(db, FTAG);

	VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1,
	    &lr->lr_foid, tx));

	(void) ztest_log_create(zd, tx, lr);

	dmu_tx_commit(tx);

	return (0);
}

static int
ztest_replay_remove(ztest_ds_t *zd, lr_remove_t *lr, boolean_t byteswap)
{
	char *name = (void *)(lr + 1);		/* name follows lr */
	objset_t *os = zd->zd_os;
	dmu_object_info_t doi;
	dmu_tx_t *tx;
	uint64_t object, txg;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	ASSERT(lr->lr_doid == ZTEST_DIROBJ);
	ASSERT(name[0] != '\0');

	VERIFY3U(0, ==,
	    zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object));
	ASSERT(object != 0);

	ztest_object_lock(zd, object, RL_WRITER);

	VERIFY3U(0, ==, dmu_object_info(os, object, &doi));

	tx = dmu_tx_create(os);

	dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name);
	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);

	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
	if (txg == 0) {
		ztest_object_unlock(zd, object);
		return (ENOSPC);
	}

	if (doi.doi_type == DMU_OT_ZAP_OTHER) {
		VERIFY3U(0, ==, zap_destroy(os, object, tx));
	} else {
		VERIFY3U(0, ==, dmu_object_free(os, object, tx));
	}

	VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx));

	(void) ztest_log_remove(zd, tx, lr, object);

	dmu_tx_commit(tx);

	ztest_object_unlock(zd, object);

	return (0);
}

static int
ztest_replay_write(ztest_ds_t *zd, lr_write_t *lr, boolean_t byteswap)
{
	objset_t *os = zd->zd_os;
	void *data = lr + 1;			/* data follows lr */
	uint64_t offset, length;
	ztest_block_tag_t *bt = data;
	ztest_block_tag_t *bbt;
	uint64_t gen, txg, lrtxg, crtxg;
	dmu_object_info_t doi;
	dmu_tx_t *tx;
	dmu_buf_t *db;
	arc_buf_t *abuf = NULL;
	rl_t *rl;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	offset = lr->lr_offset;
	length = lr->lr_length;

	/* If it's a dmu_sync() block, write the whole block */
	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
		if (length < blocksize) {
			offset -= offset % blocksize;
			length = blocksize;
		}
	}

	if (bt->bt_magic == BSWAP_64(BT_MAGIC))
		byteswap_uint64_array(bt, sizeof (*bt));

	if (bt->bt_magic != BT_MAGIC)
		bt = NULL;

	ztest_object_lock(zd, lr->lr_foid, RL_READER);
	rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER);

	VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));

	dmu_object_info_from_db(db, &doi);

	bbt = ztest_bt_bonus(db);
	ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
	gen = bbt->bt_gen;
	crtxg = bbt->bt_crtxg;
	lrtxg = lr->lr_common.lrc_txg;

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, lr->lr_foid, offset, length);

	if (ztest_random(8) == 0 && length == doi.doi_data_block_size &&
	    P2PHASE(offset, length) == 0)
		abuf = dmu_request_arcbuf(db, length);

	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
	if (txg == 0) {
		if (abuf != NULL)
			dmu_return_arcbuf(abuf);
		dmu_buf_rele(db, FTAG);
		ztest_range_unlock(rl);
		ztest_object_unlock(zd, lr->lr_foid);
		return (ENOSPC);
	}

	if (bt != NULL) {
		/*
		 * Usually, verify the old data before writing new data --
		 * but not always, because we also want to verify correct
		 * behavior when the data was not recently read into cache.
		 */
		ASSERT(offset % doi.doi_data_block_size == 0);
		if (ztest_random(4) != 0) {
			int prefetch = ztest_random(2) ?
			    DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH;
			ztest_block_tag_t rbt;

			VERIFY(dmu_read(os, lr->lr_foid, offset,
			    sizeof (rbt), &rbt, prefetch) == 0);
			if (rbt.bt_magic == BT_MAGIC) {
				ztest_bt_verify(&rbt, os, lr->lr_foid,
				    offset, gen, txg, crtxg);
			}
		}

		/*
		 * Writes can appear to be newer than the bonus buffer because
		 * the ztest_get_data() callback does a dmu_read() of the
		 * open-context data, which may be different than the data
		 * as it was when the write was generated.
		 */
		if (zd->zd_zilog->zl_replay) {
			ztest_bt_verify(bt, os, lr->lr_foid, offset,
			    MAX(gen, bt->bt_gen), MAX(txg, lrtxg),
			    bt->bt_crtxg);
		}

		/*
		 * Set the bt's gen/txg to the bonus buffer's gen/txg
		 * so that all of the usual ASSERTs will work.
		 */
		ztest_bt_generate(bt, os, lr->lr_foid, offset, gen, txg, crtxg);
	}

	if (abuf == NULL) {
		dmu_write(os, lr->lr_foid, offset, length, data, tx);
	} else {
		bcopy(data, abuf->b_data, length);
		dmu_assign_arcbuf(db, offset, abuf, tx);
	}

	(void) ztest_log_write(zd, tx, lr);

	dmu_buf_rele(db, FTAG);

	dmu_tx_commit(tx);

	ztest_range_unlock(rl);
	ztest_object_unlock(zd, lr->lr_foid);

	return (0);
}

static int
ztest_replay_truncate(ztest_ds_t *zd, lr_truncate_t *lr, boolean_t byteswap)
{
	objset_t *os = zd->zd_os;
	dmu_tx_t *tx;
	uint64_t txg;
	rl_t *rl;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	ztest_object_lock(zd, lr->lr_foid, RL_READER);
	rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length,
	    RL_WRITER);

	tx = dmu_tx_create(os);

	dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length);

	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
	if (txg == 0) {
		ztest_range_unlock(rl);
		ztest_object_unlock(zd, lr->lr_foid);
		return (ENOSPC);
	}

	VERIFY(dmu_free_range(os, lr->lr_foid, lr->lr_offset,
	    lr->lr_length, tx) == 0);

	(void) ztest_log_truncate(zd, tx, lr);

	dmu_tx_commit(tx);

	ztest_range_unlock(rl);
	ztest_object_unlock(zd, lr->lr_foid);

	return (0);
}

static int
ztest_replay_setattr(ztest_ds_t *zd, lr_setattr_t *lr, boolean_t byteswap)
{
	objset_t *os = zd->zd_os;
	dmu_tx_t *tx;
	dmu_buf_t *db;
	ztest_block_tag_t *bbt;
	uint64_t txg, lrtxg, crtxg;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	ztest_object_lock(zd, lr->lr_foid, RL_WRITER);

	VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db));

	tx = dmu_tx_create(os);
	dmu_tx_hold_bonus(tx, lr->lr_foid);

	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
	if (txg == 0) {
		dmu_buf_rele(db, FTAG);
		ztest_object_unlock(zd, lr->lr_foid);
		return (ENOSPC);
	}

	bbt = ztest_bt_bonus(db);
	ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
	crtxg = bbt->bt_crtxg;
	lrtxg = lr->lr_common.lrc_txg;

	if (zd->zd_zilog->zl_replay) {
		ASSERT(lr->lr_size != 0);
		ASSERT(lr->lr_mode != 0);
		ASSERT(lrtxg != 0);
	} else {
		/*
		 * Randomly change the size and increment the generation.
		 */
		lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) *
		    sizeof (*bbt);
		lr->lr_mode = bbt->bt_gen + 1;
		ASSERT(lrtxg == 0);
	}

	/*
	 * Verify that the current bonus buffer is not newer than our txg.
	 */
	ztest_bt_verify(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode,
	    MAX(txg, lrtxg), crtxg);

	dmu_buf_will_dirty(db, tx);

	ASSERT3U(lr->lr_size, >=, sizeof (*bbt));
	ASSERT3U(lr->lr_size, <=, db->db_size);
	VERIFY0(dmu_set_bonus(db, lr->lr_size, tx));
	bbt = ztest_bt_bonus(db);

	ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode, txg, crtxg);

	dmu_buf_rele(db, FTAG);

	(void) ztest_log_setattr(zd, tx, lr);

	dmu_tx_commit(tx);

	ztest_object_unlock(zd, lr->lr_foid);

	return (0);
}

zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
	NULL,			/* 0 no such transaction type */
	ztest_replay_create,	/* TX_CREATE */
	NULL,			/* TX_MKDIR */
	NULL,			/* TX_MKXATTR */
	NULL,			/* TX_SYMLINK */
	ztest_replay_remove,	/* TX_REMOVE */
	NULL,			/* TX_RMDIR */
	NULL,			/* TX_LINK */
	NULL,			/* TX_RENAME */
	ztest_replay_write,	/* TX_WRITE */
	ztest_replay_truncate,	/* TX_TRUNCATE */
	ztest_replay_setattr,	/* TX_SETATTR */
	NULL,			/* TX_ACL */
	NULL,			/* TX_CREATE_ACL */
	NULL,			/* TX_CREATE_ATTR */
	NULL,			/* TX_CREATE_ACL_ATTR */
	NULL,			/* TX_MKDIR_ACL */
	NULL,			/* TX_MKDIR_ATTR */
	NULL,			/* TX_MKDIR_ACL_ATTR */
	NULL,			/* TX_WRITE2 */
};

/*
 * ZIL get_data callbacks
 */

static void
ztest_get_done(zgd_t *zgd, int error)
{
	ztest_ds_t *zd = zgd->zgd_private;
	uint64_t object = zgd->zgd_rl->rl_object;

	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	ztest_range_unlock(zgd->zgd_rl);
	ztest_object_unlock(zd, object);

	if (error == 0 && zgd->zgd_bp)
		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);

	umem_free(zgd, sizeof (*zgd));
}

static int
ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	ztest_ds_t *zd = arg;
	objset_t *os = zd->zd_os;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	blkptr_t *bp = &lr->lr_blkptr;
	uint64_t txg = lr->lr_common.lrc_txg;
	uint64_t crtxg;
	dmu_object_info_t doi;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error;

	ztest_object_lock(zd, object, RL_READER);
	error = dmu_bonus_hold(os, object, FTAG, &db);
	if (error) {
		ztest_object_unlock(zd, object);
		return (error);
	}

	crtxg = ztest_bt_bonus(db)->bt_crtxg;

	if (crtxg == 0 || crtxg > txg) {
		dmu_buf_rele(db, FTAG);
		ztest_object_unlock(zd, object);
		return (ENOENT);
	}

	dmu_object_info_from_db(db, &doi);
	dmu_buf_rele(db, FTAG);
	db = NULL;

	zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL);
	zgd->zgd_zilog = zd->zd_zilog;
	zgd->zgd_private = zd;

	if (buf != NULL) {	/* immediate write */
		zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
		    RL_READER);

		error = dmu_read(os, object, offset, size, buf,
		    DMU_READ_NO_PREFETCH);
		ASSERT(error == 0);
	} else {
		size = doi.doi_data_block_size;
		if (ISP2(size)) {
			offset = P2ALIGN(offset, size);
		} else {
			ASSERT(offset < size);
			offset = 0;
		}

		zgd->zgd_rl = ztest_range_lock(zd, object, offset, size,
		    RL_READER);

		error = dmu_buf_hold(os, object, offset, zgd, &db,
		    DMU_READ_NO_PREFETCH);

		if (error == 0) {
			blkptr_t *obp = dmu_buf_get_blkptr(db);
			if (obp) {
				ASSERT(BP_IS_HOLE(bp));
				*bp = *obp;
			}

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    ztest_get_done, zgd);

			if (error == 0)
				return (0);
		}
	}

	ztest_get_done(zgd, error);

	return (error);
}

static void *
ztest_lr_alloc(size_t lrsize, char *name)
{
	char *lr;
	size_t namesize = name ? strlen(name) + 1 : 0;

	lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL);

	if (name)
		bcopy(name, lr + lrsize, namesize);

	return (lr);
}

void
ztest_lr_free(void *lr, size_t lrsize, char *name)
{
	size_t namesize = name ? strlen(name) + 1 : 0;

	umem_free(lr, lrsize + namesize);
}

/*
 * Lookup a bunch of objects.  Returns the number of objects not found.
 */
static int
ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count)
{
	int missing = 0;
	int error;

	ASSERT(_mutex_held(&zd->zd_dirobj_lock));

	for (int i = 0; i < count; i++, od++) {
		od->od_object = 0;
		error = zap_lookup(zd->zd_os, od->od_dir, od->od_name,
		    sizeof (uint64_t), 1, &od->od_object);
		if (error) {
			ASSERT(error == ENOENT);
			ASSERT(od->od_object == 0);
			missing++;
		} else {
			dmu_buf_t *db;
			ztest_block_tag_t *bbt;
			dmu_object_info_t doi;

			ASSERT(od->od_object != 0);
			ASSERT(missing == 0);	/* there should be no gaps */

			ztest_object_lock(zd, od->od_object, RL_READER);
			VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os,
			    od->od_object, FTAG, &db));
			dmu_object_info_from_db(db, &doi);
			bbt = ztest_bt_bonus(db);
			ASSERT3U(bbt->bt_magic, ==, BT_MAGIC);
			od->od_type = doi.doi_type;
			od->od_blocksize = doi.doi_data_block_size;
			od->od_gen = bbt->bt_gen;
			dmu_buf_rele(db, FTAG);
			ztest_object_unlock(zd, od->od_object);
		}
	}

	return (missing);
}

static int
ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count)
{
	int missing = 0;

	ASSERT(_mutex_held(&zd->zd_dirobj_lock));

	for (int i = 0; i < count; i++, od++) {
		if (missing) {
			od->od_object = 0;
			missing++;
			continue;
		}

		lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);

		lr->lr_doid = od->od_dir;
		lr->lr_foid = 0;	/* 0 to allocate, > 0 to claim */
		lr->lrz_type = od->od_crtype;
		lr->lrz_blocksize = od->od_crblocksize;
		lr->lrz_ibshift = ztest_random_ibshift();
		lr->lrz_bonustype = DMU_OT_UINT64_OTHER;
		lr->lrz_bonuslen = dmu_bonus_max();
		lr->lr_gen = od->od_crgen;
		lr->lr_crtime[0] = time(NULL);

		if (ztest_replay_create(zd, lr, B_FALSE) != 0) {
			ASSERT(missing == 0);
			od->od_object = 0;
			missing++;
		} else {
			od->od_object = lr->lr_foid;
			od->od_type = od->od_crtype;
			od->od_blocksize = od->od_crblocksize;
			od->od_gen = od->od_crgen;
			ASSERT(od->od_object != 0);
		}

		ztest_lr_free(lr, sizeof (*lr), od->od_name);
	}

	return (missing);
}

static int
ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count)
{
	int missing = 0;
	int error;

	ASSERT(_mutex_held(&zd->zd_dirobj_lock));

	od += count - 1;

	for (int i = count - 1; i >= 0; i--, od--) {
		if (missing) {
			missing++;
			continue;
		}

		/*
		 * No object was found.
		 */
		if (od->od_object == 0)
			continue;

		lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name);

		lr->lr_doid = od->od_dir;

		if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) {
			ASSERT3U(error, ==, ENOSPC);
			missing++;
		} else {
			od->od_object = 0;
		}
		ztest_lr_free(lr, sizeof (*lr), od->od_name);
	}

	return (missing);
}

static int
ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size,
    void *data)
{
	lr_write_t *lr;
	int error;

	lr = ztest_lr_alloc(sizeof (*lr) + size, NULL);

	lr->lr_foid = object;
	lr->lr_offset = offset;
	lr->lr_length = size;
	lr->lr_blkoff = 0;
	BP_ZERO(&lr->lr_blkptr);

	bcopy(data, lr + 1, size);

	error = ztest_replay_write(zd, lr, B_FALSE);

	ztest_lr_free(lr, sizeof (*lr) + size, NULL);

	return (error);
}

static int
ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
{
	lr_truncate_t *lr;
	int error;

	lr = ztest_lr_alloc(sizeof (*lr), NULL);

	lr->lr_foid = object;
	lr->lr_offset = offset;
	lr->lr_length = size;

	error = ztest_replay_truncate(zd, lr, B_FALSE);

	ztest_lr_free(lr, sizeof (*lr), NULL);

	return (error);
}

static int
ztest_setattr(ztest_ds_t *zd, uint64_t object)
{
	lr_setattr_t *lr;
	int error;

	lr = ztest_lr_alloc(sizeof (*lr), NULL);

	lr->lr_foid = object;
	lr->lr_size = 0;
	lr->lr_mode = 0;

	error = ztest_replay_setattr(zd, lr, B_FALSE);

	ztest_lr_free(lr, sizeof (*lr), NULL);

	return (error);
}

static void
ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size)
{
	objset_t *os = zd->zd_os;
	dmu_tx_t *tx;
	uint64_t txg;
	rl_t *rl;

	txg_wait_synced(dmu_objset_pool(os), 0);

	ztest_object_lock(zd, object, RL_READER);
	rl = ztest_range_lock(zd, object, offset, size, RL_WRITER);

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, object, offset, size);

	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);

	if (txg != 0) {
		dmu_prealloc(os, object, offset, size, tx);
		dmu_tx_commit(tx);
		txg_wait_synced(dmu_objset_pool(os), txg);
	} else {
		(void) dmu_free_long_range(os, object, offset, size);
	}

	ztest_range_unlock(rl);
	ztest_object_unlock(zd, object);
}
2134
2135 static void
2136 ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
2137 {
2138 int err;
2139 ztest_block_tag_t wbt;
2140 dmu_object_info_t doi;
2141 enum ztest_io_type io_type;
2142 uint64_t blocksize;
2143 void *data;
2144
2145 VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0);
2146 blocksize = doi.doi_data_block_size;
2147 data = umem_alloc(blocksize, UMEM_NOFAIL);
2148
2149 /*
2150 * Pick an i/o type at random, biased toward writing block tags.
2151 */
2152 io_type = ztest_random(ZTEST_IO_TYPES);
2153 if (ztest_random(2) == 0)
2154 io_type = ZTEST_IO_WRITE_TAG;
2155
2156 (void) rw_rdlock(&zd->zd_zilog_lock);
2157
2158 switch (io_type) {
2159
2160 case ZTEST_IO_WRITE_TAG:
2161 ztest_bt_generate(&wbt, zd->zd_os, object, offset, 0, 0, 0);
2162 (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt);
2163 break;
2164
2165 case ZTEST_IO_WRITE_PATTERN:
2166 (void) memset(data, 'a' + (object + offset) % 5, blocksize);
2167 if (ztest_random(2) == 0) {
2168 /*
2169 * Induce fletcher2 collisions to ensure that
2170 * zio_ddt_collision() detects and resolves them
2171 * when using fletcher2-verify for deduplication.
2172 */
2173 ((uint64_t *)data)[0] ^= 1ULL << 63;
2174 ((uint64_t *)data)[4] ^= 1ULL << 63;
2175 }
2176 (void) ztest_write(zd, object, offset, blocksize, data);
2177 break;
2178
2179 case ZTEST_IO_WRITE_ZEROES:
2180 bzero(data, blocksize);
2181 (void) ztest_write(zd, object, offset, blocksize, data);
2182 break;
2183
2184 case ZTEST_IO_TRUNCATE:
2185 (void) ztest_truncate(zd, object, offset, blocksize);
2186 break;
2187
2188 case ZTEST_IO_SETATTR:
2189 (void) ztest_setattr(zd, object);
2190 break;
2191
2192 case ZTEST_IO_REWRITE:
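		/*
		 * Set the dataset's checksum to the pool's dedup
		 * checksum and pick a random compression algorithm,
		 * then rewrite the block in place with its own
		 * contents.
		 */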
2193 (void) rw_rdlock(&ztest_name_lock);
2194 err = ztest_dsl_prop_set_uint64(zd->zd_name,
2195 ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa),
2196 B_FALSE);
2197 VERIFY(err == 0 || err == ENOSPC);
2198 err = ztest_dsl_prop_set_uint64(zd->zd_name,
2199 ZFS_PROP_COMPRESSION,
2200 ztest_random_dsl_prop(ZFS_PROP_COMPRESSION),
2201 B_FALSE);
2202 VERIFY(err == 0 || err == ENOSPC);
2203 (void) rw_unlock(&ztest_name_lock);
2204
2205 VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data,
2206 DMU_READ_NO_PREFETCH));
2207
2208 (void) ztest_write(zd, object, offset, blocksize, data);
2209 break;
2210 }
2211
2212 (void) rw_unlock(&zd->zd_zilog_lock);
2213
2214 umem_free(data, blocksize);
2215 }
2216
2217 /*
2218 * Initialize an object description template.
2219 */
2220 static void
2221 ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index,
2222 dmu_object_type_t type, uint64_t blocksize, uint64_t gen)
2223 {
2224 od->od_dir = ZTEST_DIROBJ;
2225 od->od_object = 0;
2226
2227 od->od_crtype = type;
2228 od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize();
2229 od->od_crgen = gen;
2230
2231 od->od_type = DMU_OT_NONE;
2232 od->od_blocksize = 0;
2233 od->od_gen = 0;
2234
	(void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]",
	    tag, (longlong_t)id, (u_longlong_t)index);
2237 }
2238
2239 /*
2240 * Lookup or create the objects for a test using the od template.
2241 * If the objects do not all exist, or if 'remove' is specified,
2242 * remove any existing objects and create new ones. Otherwise,
2243 * use the existing objects.
2244 */
2245 static int
2246 ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove)
2247 {
2248 int count = size / sizeof (*od);
2249 int rv = 0;
2250
2251 VERIFY(mutex_lock(&zd->zd_dirobj_lock) == 0);
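	/*
	 * If every object exists and removal wasn't requested, just use
	 * the existing objects. Otherwise remove and recreate them; a
	 * failed remove short-circuits the create, and either failure
	 * makes us return -1 so the caller can skip the test.
	 */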
2252 if ((ztest_lookup(zd, od, count) != 0 || remove) &&
2253 (ztest_remove(zd, od, count) != 0 ||
2254 ztest_create(zd, od, count) != 0))
2255 rv = -1;
2256 zd->zd_od = od;
2257 VERIFY(mutex_unlock(&zd->zd_dirobj_lock) == 0);
2258
2259 return (rv);
2260 }
2261
2262 /* ARGSUSED */
2263 void
2264 ztest_zil_commit(ztest_ds_t *zd, uint64_t id)
2265 {
2266 zilog_t *zilog = zd->zd_zilog;
2267
2268 (void) rw_rdlock(&zd->zd_zilog_lock);
2269
2270 zil_commit(zilog, ztest_random(ZTEST_OBJECTS));
2271
2272 /*
2273 * Remember the committed values in zd, which is in parent/child
2274 * shared memory. If we die, the next iteration of ztest_run()
2275 * will verify that the log really does contain this record.
2276 */
2277 mutex_enter(&zilog->zl_lock);
2278 ASSERT(zd->zd_shared != NULL);
2279 ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq);
2280 zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq;
2281 mutex_exit(&zilog->zl_lock);
2282
2283 (void) rw_unlock(&zd->zd_zilog_lock);
2284 }
2285
2286 /*
 * This function is designed to simulate the operations that occur during
 * a mount/unmount cycle. We hold the dataset across these operations in an
2289 * attempt to expose any implicit assumptions about ZIL management.
2290 */
2291 /* ARGSUSED */
2292 void
2293 ztest_zil_remount(ztest_ds_t *zd, uint64_t id)
2294 {
2295 objset_t *os = zd->zd_os;
2296
2297 /*
2298 * We grab the zd_dirobj_lock to ensure that no other thread is
2299 * updating the zil (i.e. adding in-memory log records) and the
2300 * zd_zilog_lock to block any I/O.
2301 */
2302 VERIFY0(mutex_lock(&zd->zd_dirobj_lock));
2303 (void) rw_wrlock(&zd->zd_zilog_lock);
2304
2305 /* zfsvfs_teardown() */
2306 zil_close(zd->zd_zilog);
2307
2308 /* zfsvfs_setup() */
2309 VERIFY(zil_open(os, ztest_get_data) == zd->zd_zilog);
2310 zil_replay(os, zd, ztest_replay_vector);
2311
2312 (void) rw_unlock(&zd->zd_zilog_lock);
2313 VERIFY(mutex_unlock(&zd->zd_dirobj_lock) == 0);
2314 }
2315
2316 /*
2317 * Verify that we can't destroy an active pool, create an existing pool,
2318 * or create a pool with a bad vdev spec.
2319 */
2320 /* ARGSUSED */
2321 void
2322 ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id)
2323 {
2324 ztest_shared_opts_t *zo = &ztest_opts;
2325 spa_t *spa;
2326 nvlist_t *nvroot;
2327
2328 /*
2329 * Attempt to create using a bad file.
2330 */
2331 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1);
2332 VERIFY3U(ENOENT, ==,
2333 spa_create("ztest_bad_file", nvroot, NULL, NULL));
2334 nvlist_free(nvroot);
2335
2336 /*
2337 * Attempt to create using a bad mirror.
2338 */
2339 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 2, 1);
2340 VERIFY3U(ENOENT, ==,
2341 spa_create("ztest_bad_mirror", nvroot, NULL, NULL));
2342 nvlist_free(nvroot);
2343
2344 /*
2345 * Attempt to create an existing pool. It shouldn't matter
2346 * what's in the nvroot; we should fail with EEXIST.
2347 */
2348 (void) rw_rdlock(&ztest_name_lock);
2349 nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1);
2350 VERIFY3U(EEXIST, ==, spa_create(zo->zo_pool, nvroot, NULL, NULL));
2351 nvlist_free(nvroot);
2352 VERIFY3U(0, ==, spa_open(zo->zo_pool, &spa, FTAG));
2353 VERIFY3U(EBUSY, ==, spa_destroy(zo->zo_pool));
2354 spa_close(spa, FTAG);
2355
2356 (void) rw_unlock(&ztest_name_lock);
2357 }
2358
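/*
 * Verify that a pool created at an older SPA version can be upgraded.
 */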
2359 /* ARGSUSED */
2360 void
2361 ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id)
2362 {
2363 spa_t *spa;
2364 uint64_t initial_version = SPA_VERSION_INITIAL;
2365 uint64_t version, newversion;
2366 nvlist_t *nvroot, *props;
2367 char *name;
2368
2369 VERIFY0(mutex_lock(&ztest_vdev_lock));
2370 name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool);
2371
2372 /*
2373 * Clean up from previous runs.
2374 */
2375 (void) spa_destroy(name);
2376
2377 nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0,
2378 0, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1);
2379
2380 /*
2381 * If we're configuring a RAIDZ device then make sure that the
	 * initial version is capable of supporting that feature.
2383 */
2384 switch (ztest_opts.zo_raidz_parity) {
2385 case 0:
2386 case 1:
2387 initial_version = SPA_VERSION_INITIAL;
2388 break;
2389 case 2:
2390 initial_version = SPA_VERSION_RAIDZ2;
2391 break;
2392 case 3:
2393 initial_version = SPA_VERSION_RAIDZ3;
2394 break;
2395 }
2396
2397 /*
2398 * Create a pool with a spa version that can be upgraded. Pick
2399 * a value between initial_version and SPA_VERSION_BEFORE_FEATURES.
2400 */
2401 do {
2402 version = ztest_random_spa_version(initial_version);
2403 } while (version > SPA_VERSION_BEFORE_FEATURES);
2404
2405 props = fnvlist_alloc();
2406 fnvlist_add_uint64(props,
2407 zpool_prop_to_name(ZPOOL_PROP_VERSION), version);
2408 VERIFY0(spa_create(name, nvroot, props, NULL));
2409 fnvlist_free(nvroot);
2410 fnvlist_free(props);
2411
2412 VERIFY0(spa_open(name, &spa, FTAG));
2413 VERIFY3U(spa_version(spa), ==, version);
2414 newversion = ztest_random_spa_version(version + 1);
2415
2416 if (ztest_opts.zo_verbose >= 4) {
2417 (void) printf("upgrading spa version from %llu to %llu\n",
2418 (u_longlong_t)version, (u_longlong_t)newversion);
2419 }
2420
2421 spa_upgrade(spa, newversion);
2422 VERIFY3U(spa_version(spa), >, version);
2423 VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config,
2424 zpool_prop_to_name(ZPOOL_PROP_VERSION)));
2425 spa_close(spa, FTAG);
2426
2427 strfree(name);
2428 VERIFY0(mutex_unlock(&ztest_vdev_lock));
2429 }
2430
2431 static vdev_t *
2432 vdev_lookup_by_path(vdev_t *vd, const char *path)
2433 {
2434 vdev_t *mvd;
2435
2436 if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
2437 return (vd);
2438
2439 for (int c = 0; c < vd->vdev_children; c++)
2440 if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
2441 NULL)
2442 return (mvd);
2443
2444 return (NULL);
2445 }
2446
2447 /*
 * Find the first available hole which can be used as a top-level vdev.
2449 */
2450 int
2451 find_vdev_hole(spa_t *spa)
2452 {
2453 vdev_t *rvd = spa->spa_root_vdev;
2454 int c;
2455
2456 ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER) == SCL_VDEV);
2457
2458 for (c = 0; c < rvd->vdev_children; c++) {
2459 vdev_t *cvd = rvd->vdev_child[c];
2460
2461 if (cvd->vdev_ishole)
2462 break;
2463 }
2464 return (c);
2465 }
2466
2467 /*
2468 * Verify that vdev_add() works as expected.
2469 */
2470 /* ARGSUSED */
2471 void
2472 ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
2473 {
2474 ztest_shared_t *zs = ztest_shared;
2475 spa_t *spa = ztest_spa;
2476 uint64_t leaves;
2477 uint64_t guid;
2478 nvlist_t *nvroot;
2479 int error;
2480
2481 VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
2482 leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz;
2483
2484 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2485
2486 ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves;
2487
2488 /*
2489 * If we have slogs then remove them 1/4 of the time.
2490 */
2491 if (spa_has_slogs(spa) && ztest_random(4) == 0) {
2492 /*
2493 * Grab the guid from the head of the log class rotor.
2494 */
2495 guid = spa_log_class(spa)->mc_rotor->mg_vd->vdev_guid;
2496
2497 spa_config_exit(spa, SCL_VDEV, FTAG);
2498
2499 /*
2500 * We have to grab the zs_name_lock as writer to
2501 * prevent a race between removing a slog (dmu_objset_find)
2502 * and destroying a dataset. Removing the slog will
2503 * grab a reference on the dataset which may cause
		 * dsl_destroy_head() to fail with EBUSY, thus
2505 * leaving the dataset in an inconsistent state.
2506 */
2507 VERIFY(rw_wrlock(&ztest_name_lock) == 0);
2508 error = spa_vdev_remove(spa, guid, B_FALSE);
2509 VERIFY(rw_unlock(&ztest_name_lock) == 0);
2510
2511 if (error && error != EEXIST)
2512 fatal(0, "spa_vdev_remove() = %d", error);
2513 } else {
2514 spa_config_exit(spa, SCL_VDEV, FTAG);
2515
2516 /*
2517 * Make 1/4 of the devices be log devices.
2518 */
2519 nvroot = make_vdev_root(NULL, NULL, NULL,
2520 ztest_opts.zo_vdev_size, 0,
2521 ztest_random(4) == 0, ztest_opts.zo_raidz,
2522 zs->zs_mirrors, 1);
2523
2524 error = spa_vdev_add(spa, nvroot);
2525 nvlist_free(nvroot);
2526
2527 if (error == ENOSPC)
2528 ztest_record_enospc("spa_vdev_add");
2529 else if (error != 0)
2530 fatal(0, "spa_vdev_add() = %d", error);
2531 }
2532
2533 VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
2534 }
2535
2536 /*
2537 * Verify that adding/removing aux devices (l2arc, hot spare) works as expected.
2538 */
2539 /* ARGSUSED */
2540 void
2541 ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
2542 {
2543 ztest_shared_t *zs = ztest_shared;
2544 spa_t *spa = ztest_spa;
2545 vdev_t *rvd = spa->spa_root_vdev;
2546 spa_aux_vdev_t *sav;
2547 char *aux;
2548 uint64_t guid = 0;
2549 int error;
2550
2551 if (ztest_random(2) == 0) {
2552 sav = &spa->spa_spares;
2553 aux = ZPOOL_CONFIG_SPARES;
2554 } else {
2555 sav = &spa->spa_l2cache;
2556 aux = ZPOOL_CONFIG_L2CACHE;
2557 }
2558
2559 VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
2560
2561 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2562
2563 if (sav->sav_count != 0 && ztest_random(4) == 0) {
2564 /*
2565 * Pick a random device to remove.
2566 */
2567 guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid;
2568 } else {
2569 /*
2570 * Find an unused device we can add.
2571 */
2572 zs->zs_vdev_aux = 0;
2573 for (;;) {
2574 char path[MAXPATHLEN];
2575 int c;
2576 (void) snprintf(path, sizeof (path), ztest_aux_template,
2577 ztest_opts.zo_dir, ztest_opts.zo_pool, aux,
2578 zs->zs_vdev_aux);
2579 for (c = 0; c < sav->sav_count; c++)
2580 if (strcmp(sav->sav_vdevs[c]->vdev_path,
2581 path) == 0)
2582 break;
2583 if (c == sav->sav_count &&
2584 vdev_lookup_by_path(rvd, path) == NULL)
2585 break;
2586 zs->zs_vdev_aux++;
2587 }
2588 }
2589
2590 spa_config_exit(spa, SCL_VDEV, FTAG);
2591
2592 if (guid == 0) {
2593 /*
2594 * Add a new device.
2595 */
2596 nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL,
2597 (ztest_opts.zo_vdev_size * 5) / 4, 0, 0, 0, 0, 1);
2598 error = spa_vdev_add(spa, nvroot);
2599 if (error != 0)
2600 fatal(0, "spa_vdev_add(%p) = %d", nvroot, error);
2601 nvlist_free(nvroot);
2602 } else {
2603 /*
2604 * Remove an existing device. Sometimes, dirty its
2605 * vdev state first to make sure we handle removal
2606 * of devices that have pending state changes.
2607 */
2608 if (ztest_random(2) == 0)
2609 (void) vdev_online(spa, guid, 0, NULL);
2610
2611 error = spa_vdev_remove(spa, guid, B_FALSE);
2612 if (error != 0 && error != EBUSY)
2613 fatal(0, "spa_vdev_remove(%llu) = %d", guid, error);
2614 }
2615
2616 VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
2617 }
2618
2619 /*
 * Split a pool if it has mirror top-level vdevs.
2621 */
2622 /* ARGSUSED */
2623 void
2624 ztest_split_pool(ztest_ds_t *zd, uint64_t id)
2625 {
2626 ztest_shared_t *zs = ztest_shared;
2627 spa_t *spa = ztest_spa;
2628 vdev_t *rvd = spa->spa_root_vdev;
2629 nvlist_t *tree, **child, *config, *split, **schild;
2630 uint_t c, children, schildren = 0, lastlogid = 0;
2631 int error = 0;
2632
2633 VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
2634
	/* ensure we have a usable config; mirrors of raidz aren't supported */
2636 if (zs->zs_mirrors < 3 || ztest_opts.zo_raidz > 1) {
2637 VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
2638 return;
2639 }
2640
2641 /* clean up the old pool, if any */
2642 (void) spa_destroy("splitp");
2643
2644 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2645
2646 /* generate a config from the existing config */
2647 mutex_enter(&spa->spa_props_lock);
2648 VERIFY(nvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE,
2649 &tree) == 0);
2650 mutex_exit(&spa->spa_props_lock);
2651
2652 VERIFY(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child,
2653 &children) == 0);
2654
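	/*
	 * Build the list of children for the split config: take the first
	 * side of each mirror, and turn log and hole tlvdevs into holes.
	 * lastlogid tracks where a trailing run of logs/holes begins so
	 * that it can be pruned from the split config below.
	 */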
2655 schild = malloc(rvd->vdev_children * sizeof (nvlist_t *));
2656 for (c = 0; c < children; c++) {
2657 vdev_t *tvd = rvd->vdev_child[c];
2658 nvlist_t **mchild;
2659 uint_t mchildren;
2660
2661 if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) {
2662 VERIFY(nvlist_alloc(&schild[schildren], NV_UNIQUE_NAME,
2663 0) == 0);
2664 VERIFY(nvlist_add_string(schild[schildren],
2665 ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) == 0);
2666 VERIFY(nvlist_add_uint64(schild[schildren],
2667 ZPOOL_CONFIG_IS_HOLE, 1) == 0);
2668 if (lastlogid == 0)
2669 lastlogid = schildren;
2670 ++schildren;
2671 continue;
2672 }
2673 lastlogid = 0;
2674 VERIFY(nvlist_lookup_nvlist_array(child[c],
2675 ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0);
2676 VERIFY(nvlist_dup(mchild[0], &schild[schildren++], 0) == 0);
2677 }
2678
2679 /* OK, create a config that can be used to split */
2680 VERIFY(nvlist_alloc(&split, NV_UNIQUE_NAME, 0) == 0);
2681 VERIFY(nvlist_add_string(split, ZPOOL_CONFIG_TYPE,
2682 VDEV_TYPE_ROOT) == 0);
2683 VERIFY(nvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, schild,
2684 lastlogid != 0 ? lastlogid : schildren) == 0);
2685
2686 VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0);
2687 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split) == 0);
2688
2689 for (c = 0; c < schildren; c++)
2690 nvlist_free(schild[c]);
2691 free(schild);
2692 nvlist_free(split);
2693
2694 spa_config_exit(spa, SCL_VDEV, FTAG);
2695
2696 (void) rw_wrlock(&ztest_name_lock);
2697 error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE);
2698 (void) rw_unlock(&ztest_name_lock);
2699
2700 nvlist_free(config);
2701
2702 if (error == 0) {
2703 (void) printf("successful split - results:\n");
2704 mutex_enter(&spa_namespace_lock);
2705 show_pool_stats(spa);
2706 show_pool_stats(spa_lookup("splitp"));
2707 mutex_exit(&spa_namespace_lock);
2708 ++zs->zs_splits;
2709 --zs->zs_mirrors;
2710 }
2711 VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
}
2714
2715 /*
2716 * Verify that we can attach and detach devices.
2717 */
2718 /* ARGSUSED */
2719 void
2720 ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
2721 {
2722 ztest_shared_t *zs = ztest_shared;
2723 spa_t *spa = ztest_spa;
2724 spa_aux_vdev_t *sav = &spa->spa_spares;
2725 vdev_t *rvd = spa->spa_root_vdev;
2726 vdev_t *oldvd, *newvd, *pvd;
2727 nvlist_t *root;
2728 uint64_t leaves;
2729 uint64_t leaf, top;
2730 uint64_t ashift = ztest_get_ashift();
2731 uint64_t oldguid, pguid;
2732 size_t oldsize, newsize;
2733 char oldpath[MAXPATHLEN], newpath[MAXPATHLEN];
2734 int replacing;
2735 int oldvd_has_siblings = B_FALSE;
2736 int newvd_is_spare = B_FALSE;
2737 int oldvd_is_log;
2738 int error, expected_error;
2739
2740 VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
2741 leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
2742
2743 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2744
2745 /*
2746 * Decide whether to do an attach or a replace.
2747 */
2748 replacing = ztest_random(2);
2749
2750 /*
2751 * Pick a random top-level vdev.
2752 */
2753 top = ztest_random_vdev_top(spa, B_TRUE);
2754
2755 /*
2756 * Pick a random leaf within it.
2757 */
2758 leaf = ztest_random(leaves);
2759
2760 /*
2761 * Locate this vdev.
2762 */
2763 oldvd = rvd->vdev_child[top];
2764 if (zs->zs_mirrors >= 1) {
2765 ASSERT(oldvd->vdev_ops == &vdev_mirror_ops);
2766 ASSERT(oldvd->vdev_children >= zs->zs_mirrors);
2767 oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raidz];
2768 }
2769 if (ztest_opts.zo_raidz > 1) {
2770 ASSERT(oldvd->vdev_ops == &vdev_raidz_ops);
2771 ASSERT(oldvd->vdev_children == ztest_opts.zo_raidz);
2772 oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raidz];
2773 }
2774
2775 /*
2776 * If we're already doing an attach or replace, oldvd may be a
2777 * mirror vdev -- in which case, pick a random child.
2778 */
2779 while (oldvd->vdev_children != 0) {
2780 oldvd_has_siblings = B_TRUE;
2781 ASSERT(oldvd->vdev_children >= 2);
2782 oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)];
2783 }
2784
2785 oldguid = oldvd->vdev_guid;
2786 oldsize = vdev_get_min_asize(oldvd);
2787 oldvd_is_log = oldvd->vdev_top->vdev_islog;
2788 (void) strcpy(oldpath, oldvd->vdev_path);
2789 pvd = oldvd->vdev_parent;
2790 pguid = pvd->vdev_guid;
2791
2792 /*
2793 * If oldvd has siblings, then half of the time, detach it.
2794 */
2795 if (oldvd_has_siblings && ztest_random(2) == 0) {
2796 spa_config_exit(spa, SCL_VDEV, FTAG);
2797 error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE);
2798 if (error != 0 && error != ENODEV && error != EBUSY &&
2799 error != ENOTSUP)
2800 fatal(0, "detach (%s) returned %d", oldpath, error);
2801 VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
2802 return;
2803 }
2804
2805 /*
2806 * For the new vdev, choose with equal probability between the two
2807 * standard paths (ending in either 'a' or 'b') or a random hot spare.
2808 */
2809 if (sav->sav_count != 0 && ztest_random(3) == 0) {
2810 newvd = sav->sav_vdevs[ztest_random(sav->sav_count)];
2811 newvd_is_spare = B_TRUE;
2812 (void) strcpy(newpath, newvd->vdev_path);
2813 } else {
2814 (void) snprintf(newpath, sizeof (newpath), ztest_dev_template,
2815 ztest_opts.zo_dir, ztest_opts.zo_pool,
2816 top * leaves + leaf);
2817 if (ztest_random(2) == 0)
2818 newpath[strlen(newpath) - 1] = 'b';
2819 newvd = vdev_lookup_by_path(rvd, newpath);
2820 }
2821
2822 if (newvd) {
2823 newsize = vdev_get_min_asize(newvd);
2824 } else {
2825 /*
2826 * Make newsize a little bigger or smaller than oldsize.
2827 * If it's smaller, the attach should fail.
2828 * If it's larger, and we're doing a replace,
2829 * we should get dynamic LUN growth when we're done.
2830 */
2831 newsize = 10 * oldsize / (9 + ztest_random(3));
2832 }
2833
2834 /*
2835 * If pvd is not a mirror or root, the attach should fail with ENOTSUP,
2836 * unless it's a replace; in that case any non-replacing parent is OK.
2837 *
2838 * If newvd is already part of the pool, it should fail with EBUSY.
2839 *
2840 * If newvd is too small, it should fail with EOVERFLOW.
2841 */
2842 if (pvd->vdev_ops != &vdev_mirror_ops &&
2843 pvd->vdev_ops != &vdev_root_ops && (!replacing ||
2844 pvd->vdev_ops == &vdev_replacing_ops ||
2845 pvd->vdev_ops == &vdev_spare_ops))
2846 expected_error = ENOTSUP;
2847 else if (newvd_is_spare && (!replacing || oldvd_is_log))
2848 expected_error = ENOTSUP;
2849 else if (newvd == oldvd)
2850 expected_error = replacing ? 0 : EBUSY;
2851 else if (vdev_lookup_by_path(rvd, newpath) != NULL)
2852 expected_error = EBUSY;
2853 else if (newsize < oldsize)
2854 expected_error = EOVERFLOW;
2855 else if (ashift > oldvd->vdev_top->vdev_ashift)
2856 expected_error = EDOM;
2857 else
2858 expected_error = 0;
2859
2860 spa_config_exit(spa, SCL_VDEV, FTAG);
2861
2862 /*
2863 * Build the nvlist describing newpath.
2864 */
2865 root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0,
2866 ashift, 0, 0, 0, 1);
2867
2868 error = spa_vdev_attach(spa, oldguid, root, replacing);
2869
2870 nvlist_free(root);
2871
2872 /*
2873 * If our parent was the replacing vdev, but the replace completed,
2874 * then instead of failing with ENOTSUP we may either succeed,
2875 * fail with ENODEV, or fail with EOVERFLOW.
2876 */
2877 if (expected_error == ENOTSUP &&
2878 (error == 0 || error == ENODEV || error == EOVERFLOW))
2879 expected_error = error;
2880
2881 /*
2882 * If someone grew the LUN, the replacement may be too small.
2883 */
2884 if (error == EOVERFLOW || error == EBUSY)
2885 expected_error = error;
2886
2887 /* XXX workaround 6690467 */
2888 if (error != expected_error && expected_error != EBUSY) {
2889 fatal(0, "attach (%s %llu, %s %llu, %d) "
2890 "returned %d, expected %d",
2891 oldpath, (longlong_t)oldsize, newpath,
2892 (longlong_t)newsize, replacing, error, expected_error);
2893 }
2894
2895 VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
2896 }
2897
2898 /*
2899 * Callback function which expands the physical size of the vdev.
2900 */
2901 vdev_t *
2902 grow_vdev(vdev_t *vd, void *arg)
2903 {
2904 spa_t *spa = vd->vdev_spa;
2905 size_t *newsize = arg;
2906 size_t fsize;
2907 int fd;
2908
2909 ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
2910 ASSERT(vd->vdev_ops->vdev_op_leaf);
2911
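	/*
	 * Per the vdev_walk_tree() contract, returning the vdev
	 * (non-NULL) aborts the walk; returning NULL means success,
	 * keep walking.
	 */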
2912 if ((fd = open(vd->vdev_path, O_RDWR)) == -1)
2913 return (vd);
2914
2915 fsize = lseek(fd, 0, SEEK_END);
2916 (void) ftruncate(fd, *newsize);
2917
2918 if (ztest_opts.zo_verbose >= 6) {
2919 (void) printf("%s grew from %lu to %lu bytes\n",
2920 vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize);
2921 }
2922 (void) close(fd);
2923 return (NULL);
2924 }
2925
2926 /*
2927 * Callback function which expands a given vdev by calling vdev_online().
2928 */
2929 /* ARGSUSED */
2930 vdev_t *
2931 online_vdev(vdev_t *vd, void *arg)
2932 {
2933 spa_t *spa = vd->vdev_spa;
2934 vdev_t *tvd = vd->vdev_top;
2935 uint64_t guid = vd->vdev_guid;
2936 uint64_t generation = spa->spa_config_generation + 1;
2937 vdev_state_t newstate = VDEV_STATE_UNKNOWN;
2938 int error;
2939
2940 ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
2941 ASSERT(vd->vdev_ops->vdev_op_leaf);
2942
2943 /* Calling vdev_online will initialize the new metaslabs */
2944 spa_config_exit(spa, SCL_STATE, spa);
2945 error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate);
2946 spa_config_enter(spa, SCL_STATE, spa, RW_READER);
2947
2948 /*
2949 * If vdev_online returned an error or the underlying vdev_open
	 * failed, then we abort the expand. The only way to know that
2951 * vdev_open fails is by checking the returned newstate.
2952 */
2953 if (error || newstate != VDEV_STATE_HEALTHY) {
2954 if (ztest_opts.zo_verbose >= 5) {
2955 (void) printf("Unable to expand vdev, state %llu, "
2956 "error %d\n", (u_longlong_t)newstate, error);
2957 }
2958 return (vd);
2959 }
2960 ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY);
2961
2962 /*
2963 * Since we dropped the lock we need to ensure that we're
	 * still talking to the original vdev. It's possible that this
	 * vdev was detached or replaced while we were trying to
	 * online it.
2967 */
2968 if (generation != spa->spa_config_generation) {
2969 if (ztest_opts.zo_verbose >= 5) {
2970 (void) printf("vdev configuration has changed, "
2971 "guid %llu, state %llu, expected gen %llu, "
2972 "got gen %llu\n",
2973 (u_longlong_t)guid,
2974 (u_longlong_t)tvd->vdev_state,
2975 (u_longlong_t)generation,
2976 (u_longlong_t)spa->spa_config_generation);
2977 }
2978 return (vd);
2979 }
2980 return (NULL);
2981 }
2982
2983 /*
2984 * Traverse the vdev tree calling the supplied function.
2985 * We continue to walk the tree until we either have walked all
2986 * children or we receive a non-NULL return from the callback.
 * If a NULL callback is passed, then we just return the first
2988 * leaf vdev we encounter.
2989 */
2990 vdev_t *
2991 vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg)
2992 {
2993 if (vd->vdev_ops->vdev_op_leaf) {
2994 if (func == NULL)
2995 return (vd);
2996 else
2997 return (func(vd, arg));
2998 }
2999
3000 for (uint_t c = 0; c < vd->vdev_children; c++) {
3001 vdev_t *cvd = vd->vdev_child[c];
3002 if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL)
3003 return (cvd);
3004 }
3005 return (NULL);
3006 }
3007
3008 /*
3009 * Verify that dynamic LUN growth works as expected.
3010 */
3011 /* ARGSUSED */
3012 void
3013 ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id)
3014 {
3015 spa_t *spa = ztest_spa;
3016 vdev_t *vd, *tvd;
3017 metaslab_class_t *mc;
3018 metaslab_group_t *mg;
3019 size_t psize, newsize;
3020 uint64_t top;
3021 uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count;
3022
3023 VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
3024 spa_config_enter(spa, SCL_STATE, spa, RW_READER);
3025
3026 top = ztest_random_vdev_top(spa, B_TRUE);
3027
3028 tvd = spa->spa_root_vdev->vdev_child[top];
3029 mg = tvd->vdev_mg;
3030 mc = mg->mg_class;
3031 old_ms_count = tvd->vdev_ms_count;
3032 old_class_space = metaslab_class_get_space(mc);
3033
3034 /*
3035 * Determine the size of the first leaf vdev associated with
3036 * our top-level device.
3037 */
3038 vd = vdev_walk_tree(tvd, NULL, NULL);
3039 ASSERT3P(vd, !=, NULL);
3040 ASSERT(vd->vdev_ops->vdev_op_leaf);
3041
3042 psize = vd->vdev_psize;
3043
3044 /*
	 * We only try to expand the vdev if it's healthy, has a valid
	 * psize, and is less than 4x its original size.
3047 */
3048 if (tvd->vdev_state != VDEV_STATE_HEALTHY ||
3049 psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) {
3050 spa_config_exit(spa, SCL_STATE, spa);
3051 VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
3052 return;
3053 }
3054 ASSERT(psize > 0);
3055 newsize = psize + psize / 8;
3056 ASSERT3U(newsize, >, psize);
3057
3058 if (ztest_opts.zo_verbose >= 6) {
3059 (void) printf("Expanding LUN %s from %lu to %lu\n",
3060 vd->vdev_path, (ulong_t)psize, (ulong_t)newsize);
3061 }
3062
3063 /*
	 * Growing the vdev is a two-step process:
	 *	1) expand the physical size (i.e. relabel)
	 *	2) online the vdev to create the new metaslabs
3067 */
3068 if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL ||
3069 vdev_walk_tree(tvd, online_vdev, NULL) != NULL ||
3070 tvd->vdev_state != VDEV_STATE_HEALTHY) {
3071 if (ztest_opts.zo_verbose >= 5) {
3072 (void) printf("Could not expand LUN because "
3073 "the vdev configuration changed.\n");
3074 }
3075 spa_config_exit(spa, SCL_STATE, spa);
3076 VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
3077 return;
3078 }
3079
3080 spa_config_exit(spa, SCL_STATE, spa);
3081
3082 /*
3083 * Expanding the LUN will update the config asynchronously,
3084 * thus we must wait for the async thread to complete any
3085 * pending tasks before proceeding.
3086 */
3087 for (;;) {
3088 boolean_t done;
3089 mutex_enter(&spa->spa_async_lock);
3090 done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks);
3091 mutex_exit(&spa->spa_async_lock);
3092 if (done)
3093 break;
3094 txg_wait_synced(spa_get_dsl(spa), 0);
3095 (void) poll(NULL, 0, 100);
3096 }
3097
3098 spa_config_enter(spa, SCL_STATE, spa, RW_READER);
3099
3100 tvd = spa->spa_root_vdev->vdev_child[top];
3101 new_ms_count = tvd->vdev_ms_count;
3102 new_class_space = metaslab_class_get_space(mc);
3103
3104 if (tvd->vdev_mg != mg || mg->mg_class != mc) {
3105 if (ztest_opts.zo_verbose >= 5) {
3106 (void) printf("Could not verify LUN expansion due to "
3107 "intervening vdev offline or remove.\n");
3108 }
3109 spa_config_exit(spa, SCL_STATE, spa);
3110 VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
3111 return;
3112 }
3113
3114 /*
3115 * Make sure we were able to grow the vdev.
3116 */
3117 if (new_ms_count <= old_ms_count)
		fatal(0, "LUN expansion failed: ms_count %llu <= %llu\n",
		    (u_longlong_t)new_ms_count, (u_longlong_t)old_ms_count);
3120
3121 /*
3122 * Make sure we were able to grow the pool.
3123 */
3124 if (new_class_space <= old_class_space)
		fatal(0, "LUN expansion failed: class_space %llu <= %llu\n",
		    (u_longlong_t)new_class_space,
		    (u_longlong_t)old_class_space);
3127
3128 if (ztest_opts.zo_verbose >= 5) {
3129 char oldnumbuf[6], newnumbuf[6];
3130
3131 nicenum(old_class_space, oldnumbuf);
3132 nicenum(new_class_space, newnumbuf);
3133 (void) printf("%s grew from %s to %s\n",
3134 spa->spa_name, oldnumbuf, newnumbuf);
3135 }
3136
3137 spa_config_exit(spa, SCL_STATE, spa);
3138 VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
3139 }
3140
3141 /*
3142 * Verify that dmu_objset_{create,destroy,open,close} work as expected.
3143 */
3144 /* ARGSUSED */
3145 static void
3146 ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
3147 {
3148 /*
3149 * Create the objects common to all ztest datasets.
3150 */
3151 VERIFY(zap_create_claim(os, ZTEST_DIROBJ,
3152 DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0);
3153 }
3154
3155 static int
3156 ztest_dataset_create(char *dsname)
3157 {
3158 uint64_t zilset = ztest_random(100);
3159 int err = dmu_objset_create(dsname, DMU_OST_OTHER, 0,
3160 ztest_objset_create_cb, NULL);
3161
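	/*
	 * Make roughly 20% of the datasets sync=always to exercise
	 * the synchronous I/O code paths.
	 */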
3162 if (err || zilset < 80)
3163 return (err);
3164
3165 if (ztest_opts.zo_verbose >= 6)
3166 (void) printf("Setting dataset %s to sync always\n", dsname);
3167 return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC,
3168 ZFS_SYNC_ALWAYS, B_FALSE));
3169 }
3170
3171 /* ARGSUSED */
3172 static int
3173 ztest_objset_destroy_cb(const char *name, void *arg)
3174 {
3175 objset_t *os;
3176 dmu_object_info_t doi;
3177 int error;
3178
3179 /*
3180 * Verify that the dataset contains a directory object.
3181 */
3182 VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, FTAG, &os));
3183 error = dmu_object_info(os, ZTEST_DIROBJ, &doi);
3184 if (error != ENOENT) {
3185 /* We could have crashed in the middle of destroying it */
3186 ASSERT0(error);
3187 ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER);
3188 ASSERT3S(doi.doi_physical_blocks_512, >=, 0);
3189 }
3190 dmu_objset_disown(os, FTAG);
3191
3192 /*
3193 * Destroy the dataset.
3194 */
3195 if (strchr(name, '@') != NULL) {
3196 VERIFY0(dsl_destroy_snapshot(name, B_FALSE));
3197 } else {
3198 VERIFY0(dsl_destroy_head(name));
3199 }
3200 return (0);
3201 }
3202
3203 static boolean_t
3204 ztest_snapshot_create(char *osname, uint64_t id)
3205 {
3206 char snapname[MAXNAMELEN];
3207 int error;
3208
3209 (void) snprintf(snapname, sizeof (snapname), "%llu", (u_longlong_t)id);
3210
3211 error = dmu_objset_snapshot_one(osname, snapname);
3212 if (error == ENOSPC) {
3213 ztest_record_enospc(FTAG);
3214 return (B_FALSE);
3215 }
3216 if (error != 0 && error != EEXIST) {
3217 fatal(0, "ztest_snapshot_create(%s@%s) = %d", osname,
3218 snapname, error);
3219 }
3220 return (B_TRUE);
3221 }
3222
3223 static boolean_t
3224 ztest_snapshot_destroy(char *osname, uint64_t id)
3225 {
3226 char snapname[MAXNAMELEN];
3227 int error;
3228
3229 (void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname,
3230 (u_longlong_t)id);
3231
3232 error = dsl_destroy_snapshot(snapname, B_FALSE);
3233 if (error != 0 && error != ENOENT)
3234 fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error);
3235 return (B_TRUE);
3236 }
3237
3238 /* ARGSUSED */
3239 void
3240 ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id)
3241 {
3242 ztest_ds_t zdtmp;
3243 int iters;
3244 int error;
3245 objset_t *os, *os2;
3246 char name[MAXNAMELEN];
3247 zilog_t *zilog;
3248
3249 (void) rw_rdlock(&ztest_name_lock);
3250
3251 (void) snprintf(name, MAXNAMELEN, "%s/temp_%llu",
3252 ztest_opts.zo_pool, (u_longlong_t)id);
3253
3254 /*
3255 * If this dataset exists from a previous run, process its replay log
	 * half of the time. If we don't replay it, then dsl_destroy_head()
3257 * (invoked from ztest_objset_destroy_cb()) should just throw it away.
3258 */
3259 if (ztest_random(2) == 0 &&
3260 dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os) == 0) {
3261 ztest_zd_init(&zdtmp, NULL, os);
3262 zil_replay(os, &zdtmp, ztest_replay_vector);
3263 ztest_zd_fini(&zdtmp);
3264 dmu_objset_disown(os, FTAG);
3265 }
3266
3267 /*
3268 * There may be an old instance of the dataset we're about to
3269 * create lying around from a previous run. If so, destroy it
3270 * and all of its snapshots.
3271 */
3272 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
3273 DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
3274
3275 /*
3276 * Verify that the destroyed dataset is no longer in the namespace.
3277 */
3278 VERIFY3U(ENOENT, ==, dmu_objset_own(name, DMU_OST_OTHER, B_TRUE,
3279 FTAG, &os));
3280
3281 /*
3282 * Verify that we can create a new dataset.
3283 */
3284 error = ztest_dataset_create(name);
3285 if (error) {
3286 if (error == ENOSPC) {
3287 ztest_record_enospc(FTAG);
3288 (void) rw_unlock(&ztest_name_lock);
3289 return;
3290 }
3291 fatal(0, "dmu_objset_create(%s) = %d", name, error);
3292 }
3293
3294 VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os));
3295
3296 ztest_zd_init(&zdtmp, NULL, os);
3297
3298 /*
3299 * Open the intent log for it.
3300 */
3301 zilog = zil_open(os, ztest_get_data);
3302
3303 /*
3304 * Put some objects in there, do a little I/O to them,
3305 * and randomly take a couple of snapshots along the way.
3306 */
3307 iters = ztest_random(5);
3308 for (int i = 0; i < iters; i++) {
3309 ztest_dmu_object_alloc_free(&zdtmp, id);
3310 if (ztest_random(iters) == 0)
3311 (void) ztest_snapshot_create(name, i);
3312 }
3313
3314 /*
3315 * Verify that we cannot create an existing dataset.
3316 */
3317 VERIFY3U(EEXIST, ==,
3318 dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL));
3319
3320 /*
3321 * Verify that we can hold an objset that is also owned.
3322 */
3323 VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os2));
3324 dmu_objset_rele(os2, FTAG);
3325
3326 /*
3327 * Verify that we cannot own an objset that is already owned.
3328 */
3329 VERIFY3U(EBUSY, ==,
3330 dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os2));
3331
3332 zil_close(zilog);
3333 dmu_objset_disown(os, FTAG);
3334 ztest_zd_fini(&zdtmp);
3335
3336 (void) rw_unlock(&ztest_name_lock);
3337 }
3338
3339 /*
3340 * Verify that dmu_snapshot_{create,destroy,open,close} work as expected.
3341 */
3342 void
3343 ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id)
3344 {
3345 (void) rw_rdlock(&ztest_name_lock);
3346 (void) ztest_snapshot_destroy(zd->zd_name, id);
3347 (void) ztest_snapshot_create(zd->zd_name, id);
3348 (void) rw_unlock(&ztest_name_lock);
3349 }
3350
3351 /*
3352 * Cleanup non-standard snapshots and clones.
3353 */
3354 void
3355 ztest_dsl_dataset_cleanup(char *osname, uint64_t id)
3356 {
3357 char snap1name[MAXNAMELEN];
3358 char clone1name[MAXNAMELEN];
3359 char snap2name[MAXNAMELEN];
3360 char clone2name[MAXNAMELEN];
3361 char snap3name[MAXNAMELEN];
3362 int error;
3363
	(void) snprintf(snap1name, MAXNAMELEN, "%s@s1_%llu",
	    osname, (u_longlong_t)id);
	(void) snprintf(clone1name, MAXNAMELEN, "%s/c1_%llu",
	    osname, (u_longlong_t)id);
	(void) snprintf(snap2name, MAXNAMELEN, "%s@s2_%llu",
	    clone1name, (u_longlong_t)id);
	(void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu",
	    osname, (u_longlong_t)id);
	(void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu",
	    clone1name, (u_longlong_t)id);
3369
3370 error = dsl_destroy_head(clone2name);
3371 if (error && error != ENOENT)
3372 fatal(0, "dsl_destroy_head(%s) = %d", clone2name, error);
3373 error = dsl_destroy_snapshot(snap3name, B_FALSE);
3374 if (error && error != ENOENT)
3375 fatal(0, "dsl_destroy_snapshot(%s) = %d", snap3name, error);
3376 error = dsl_destroy_snapshot(snap2name, B_FALSE);
3377 if (error && error != ENOENT)
3378 fatal(0, "dsl_destroy_snapshot(%s) = %d", snap2name, error);
3379 error = dsl_destroy_head(clone1name);
3380 if (error && error != ENOENT)
3381 fatal(0, "dsl_destroy_head(%s) = %d", clone1name, error);
3382 error = dsl_destroy_snapshot(snap1name, B_FALSE);
3383 if (error && error != ENOENT)
3384 fatal(0, "dsl_destroy_snapshot(%s) = %d", snap1name, error);
3385 }
3386
3387 /*
3388 * Verify dsl_dataset_promote handles EBUSY
3389 */
3390 void
3391 ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id)
3392 {
3393 objset_t *os;
3394 char snap1name[MAXNAMELEN];
3395 char clone1name[MAXNAMELEN];
3396 char snap2name[MAXNAMELEN];
3397 char clone2name[MAXNAMELEN];
3398 char snap3name[MAXNAMELEN];
3399 char *osname = zd->zd_name;
3400 int error;
3401
3402 (void) rw_rdlock(&ztest_name_lock);
3403
3404 ztest_dsl_dataset_cleanup(osname, id);
3405
	(void) snprintf(snap1name, MAXNAMELEN, "%s@s1_%llu",
	    osname, (u_longlong_t)id);
	(void) snprintf(clone1name, MAXNAMELEN, "%s/c1_%llu",
	    osname, (u_longlong_t)id);
	(void) snprintf(snap2name, MAXNAMELEN, "%s@s2_%llu",
	    clone1name, (u_longlong_t)id);
	(void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu",
	    osname, (u_longlong_t)id);
	(void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu",
	    clone1name, (u_longlong_t)id);
3411
3412 error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1);
3413 if (error && error != EEXIST) {
3414 if (error == ENOSPC) {
3415 ztest_record_enospc(FTAG);
3416 goto out;
3417 }
		fatal(0, "dmu_objset_snapshot_one(%s) = %d", snap1name, error);
3419 }
3420
3421 error = dmu_objset_clone(clone1name, snap1name);
3422 if (error) {
3423 if (error == ENOSPC) {
3424 ztest_record_enospc(FTAG);
3425 goto out;
3426 }
		fatal(0, "dmu_objset_clone(%s) = %d", clone1name, error);
3428 }
3429
3430 error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1);
3431 if (error && error != EEXIST) {
3432 if (error == ENOSPC) {
3433 ztest_record_enospc(FTAG);
3434 goto out;
3435 }
		fatal(0, "dmu_objset_snapshot_one(%s) = %d", snap2name, error);
3437 }
3438
3439 error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1);
3440 if (error && error != EEXIST) {
3441 if (error == ENOSPC) {
3442 ztest_record_enospc(FTAG);
3443 goto out;
3444 }
		fatal(0, "dmu_objset_snapshot_one(%s) = %d", snap3name, error);
3446 }
3447
3448 error = dmu_objset_clone(clone2name, snap3name);
3449 if (error) {
3450 if (error == ENOSPC) {
3451 ztest_record_enospc(FTAG);
3452 goto out;
3453 }
		fatal(0, "dmu_objset_clone(%s) = %d", clone2name, error);
3455 }
3456
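	/*
	 * Promoting clone2 would transfer clone1's snapshots up to its
	 * origin (snap3), including snap2. With snap2 owned, the
	 * promote should fail with EBUSY.
	 */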
3457 error = dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, FTAG, &os);
3458 if (error)
3459 fatal(0, "dmu_objset_own(%s) = %d", snap2name, error);
3460 error = dsl_dataset_promote(clone2name, NULL);
3461 if (error != EBUSY)
3462 fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name,
3463 error);
3464 dmu_objset_disown(os, FTAG);
3465
3466 out:
3467 ztest_dsl_dataset_cleanup(osname, id);
3468
3469 (void) rw_unlock(&ztest_name_lock);
3470 }
3471
3472 /*
3473 * Verify that dmu_object_{alloc,free} work as expected.
3474 */
3475 void
3476 ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id)
3477 {
3478 ztest_od_t od[4];
3479 int batchsize = sizeof (od) / sizeof (od[0]);
3480
3481 for (int b = 0; b < batchsize; b++)
3482 ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0);
3483
3484 /*
3485 * Destroy the previous batch of objects, create a new batch,
3486 * and do some I/O on the new objects.
3487 */
3488 if (ztest_object_init(zd, od, sizeof (od), B_TRUE) != 0)
3489 return;
3490
3491 while (ztest_random(4 * batchsize) != 0)
3492 ztest_io(zd, od[ztest_random(batchsize)].od_object,
3493 ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
3494 }
3495
3496 /*
3497 * Verify that dmu_{read,write} work as expected.
3498 */
3499 void
3500 ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
3501 {
3502 objset_t *os = zd->zd_os;
3503 ztest_od_t od[2];
3504 dmu_tx_t *tx;
3505 int i, freeit, error;
3506 uint64_t n, s, txg;
3507 bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT;
3508 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize;
3509 uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t);
3510 uint64_t regions = 997;
3511 uint64_t stride = 123456789ULL;
3512 uint64_t width = 40;
3513 int free_percent = 5;
3514
3515 /*
3516 * This test uses two objects, packobj and bigobj, that are always
3517 * updated together (i.e. in the same tx) so that their contents are
3518 * in sync and can be compared. Their contents relate to each other
3519 * in a simple way: packobj is a dense array of 'bufwad' structures,
3520 * while bigobj is a sparse array of the same bufwads. Specifically,
3521 * for any index n, there are three bufwads that should be identical:
3522 *
3523 * packobj, at offset n * sizeof (bufwad_t)
3524 * bigobj, at the head of the nth chunk
3525 * bigobj, at the tail of the nth chunk
3526 *
3527 * The chunk size is arbitrary. It doesn't have to be a power of two,
3528 * and it doesn't have any relation to the object blocksize.
3529 * The only requirement is that it can hold at least two bufwads.
3530 *
3531 * Normally, we write the bufwad to each of these locations.
3532 * However, free_percent of the time we instead write zeroes to
3533 * packobj and perform a dmu_free_range() on bigobj. By comparing
3534 * bigobj to packobj, we can verify that the DMU is correctly
3535 * tracking which parts of an object are allocated and free,
3536 * and that the contents of the allocated blocks are correct.
3537 */
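	/*
	 * In other words, for any index n the invariant is (a sketch
	 * of the check, not code that runs here):
	 *
	 *	bufwad_t w, h, t;
	 *	dmu_read(os, packobj, n * sizeof (w), sizeof (w), &w,
	 *	    DMU_READ_PREFETCH);
	 *	dmu_read(os, bigobj, n * chunksize, sizeof (h), &h,
	 *	    DMU_READ_PREFETCH);
	 *	dmu_read(os, bigobj, (n + 1) * chunksize - sizeof (t),
	 *	    sizeof (t), &t, DMU_READ_PREFETCH);
	 *	ASSERT(bcmp(&w, &h, sizeof (w)) == 0);
	 *	ASSERT(bcmp(&w, &t, sizeof (w)) == 0);
	 */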
3538
3539 /*
3540 * Read the directory info. If it's the first time, set things up.
3541 */
3542 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, chunksize);
3543 ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize);
3544
3545 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
3546 return;
3547
3548 bigobj = od[0].od_object;
3549 packobj = od[1].od_object;
3550 chunksize = od[0].od_gen;
3551 ASSERT(chunksize == od[1].od_gen);
3552
3553 /*
3554 * Prefetch a random chunk of the big object.
3555 * Our aim here is to get some async reads in flight
3556 * for blocks that we may free below; the DMU should
3557 * handle this race correctly.
3558 */
3559 n = ztest_random(regions) * stride + ztest_random(width);
3560 s = 1 + ztest_random(2 * width - 1);
3561 dmu_prefetch(os, bigobj, n * chunksize, s * chunksize);
3562
3563 /*
3564 * Pick a random index and compute the offsets into packobj and bigobj.
3565 */
3566 n = ztest_random(regions) * stride + ztest_random(width);
3567 s = 1 + ztest_random(width - 1);
3568
3569 packoff = n * sizeof (bufwad_t);
3570 packsize = s * sizeof (bufwad_t);
3571
3572 bigoff = n * chunksize;
3573 bigsize = s * chunksize;
3574
3575 packbuf = umem_alloc(packsize, UMEM_NOFAIL);
3576 bigbuf = umem_alloc(bigsize, UMEM_NOFAIL);
3577
3578 /*
3579 * free_percent of the time, free a range of bigobj rather than
3580 * overwriting it.
3581 */
3582 freeit = (ztest_random(100) < free_percent);
3583
3584 /*
3585 * Read the current contents of our objects.
3586 */
3587 error = dmu_read(os, packobj, packoff, packsize, packbuf,
3588 DMU_READ_PREFETCH);
3589 ASSERT0(error);
3590 error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf,
3591 DMU_READ_PREFETCH);
3592 ASSERT0(error);
3593
3594 /*
3595 * Get a tx for the mods to both packobj and bigobj.
3596 */
3597 tx = dmu_tx_create(os);
3598
3599 dmu_tx_hold_write(tx, packobj, packoff, packsize);
3600
3601 if (freeit)
3602 dmu_tx_hold_free(tx, bigobj, bigoff, bigsize);
3603 else
3604 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);
3605
3606 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
3607 if (txg == 0) {
3608 umem_free(packbuf, packsize);
3609 umem_free(bigbuf, bigsize);
3610 return;
3611 }
3612
3613 dmu_object_set_checksum(os, bigobj,
3614 (enum zio_checksum)ztest_random_dsl_prop(ZFS_PROP_CHECKSUM), tx);
3615
3616 dmu_object_set_compress(os, bigobj,
3617 (enum zio_compress)ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), tx);
3618
3619 /*
3620 * For each index from n to n + s, verify that the existing bufwad
3621 * in packobj matches the bufwads at the head and tail of the
3622 * corresponding chunk in bigobj. Then update all three bufwads
3623 * with the new values we want to write out.
3624 */
3625 for (i = 0; i < s; i++) {
3626 /* LINTED */
3627 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
3628 /* LINTED */
3629 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize);
3630 /* LINTED */
3631 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1;
3632
3633 ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
3634 ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
3635
3636 if (pack->bw_txg > txg)
3637 fatal(0, "future leak: got %llx, open txg is %llx",
3638 pack->bw_txg, txg);
3639
3640 if (pack->bw_data != 0 && pack->bw_index != n + i)
3641 fatal(0, "wrong index: got %llx, wanted %llx+%llx",
3642 pack->bw_index, n, i);
3643
3644 if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0)
3645 fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH);
3646
3647 if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0)
3648 fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT);
3649
3650 if (freeit) {
3651 bzero(pack, sizeof (bufwad_t));
3652 } else {
3653 pack->bw_index = n + i;
3654 pack->bw_txg = txg;
3655 pack->bw_data = 1 + ztest_random(-2ULL);
3656 }
3657 *bigH = *pack;
3658 *bigT = *pack;
3659 }
3660
3661 /*
3662 * We've verified all the old bufwads, and made new ones.
3663 * Now write them out.
3664 */
3665 dmu_write(os, packobj, packoff, packsize, packbuf, tx);
3666
3667 if (freeit) {
3668 if (ztest_opts.zo_verbose >= 7) {
3669 (void) printf("freeing offset %llx size %llx"
3670 " txg %llx\n",
3671 (u_longlong_t)bigoff,
3672 (u_longlong_t)bigsize,
3673 (u_longlong_t)txg);
3674 }
3675 VERIFY(0 == dmu_free_range(os, bigobj, bigoff, bigsize, tx));
3676 } else {
3677 if (ztest_opts.zo_verbose >= 7) {
3678 (void) printf("writing offset %llx size %llx"
3679 " txg %llx\n",
3680 (u_longlong_t)bigoff,
3681 (u_longlong_t)bigsize,
3682 (u_longlong_t)txg);
3683 }
3684 dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx);
3685 }
3686
3687 dmu_tx_commit(tx);
3688
3689 /*
3690 * Sanity check the stuff we just wrote.
3691 */
3692 {
3693 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
3694 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
3695
3696 VERIFY(0 == dmu_read(os, packobj, packoff,
3697 packsize, packcheck, DMU_READ_PREFETCH));
3698 VERIFY(0 == dmu_read(os, bigobj, bigoff,
3699 bigsize, bigcheck, DMU_READ_PREFETCH));
3700
3701 ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
3702 ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
3703
3704 umem_free(packcheck, packsize);
3705 umem_free(bigcheck, bigsize);
3706 }
3707
3708 umem_free(packbuf, packsize);
3709 umem_free(bigbuf, bigsize);
3710 }
3711
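/*
 * Helper for the zcopy test below: verify the existing bufwads and fill
 * in the new values, mirroring the inline loop in ztest_dmu_read_write().
 */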
3712 void
3713 compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf,
3714 uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg)
3715 {
3716 uint64_t i;
3717 bufwad_t *pack;
3718 bufwad_t *bigH;
3719 bufwad_t *bigT;
3720
3721 /*
3722 * For each index from n to n + s, verify that the existing bufwad
3723 * in packobj matches the bufwads at the head and tail of the
3724 * corresponding chunk in bigobj. Then update all three bufwads
3725 * with the new values we want to write out.
3726 */
3727 for (i = 0; i < s; i++) {
3728 /* LINTED */
3729 pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t));
3730 /* LINTED */
3731 bigH = (bufwad_t *)((char *)bigbuf + i * chunksize);
3732 /* LINTED */
3733 bigT = (bufwad_t *)((char *)bigH + chunksize) - 1;
3734
3735 ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize);
3736 ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize);
3737
3738 if (pack->bw_txg > txg)
3739 fatal(0, "future leak: got %llx, open txg is %llx",
3740 pack->bw_txg, txg);
3741
3742 if (pack->bw_data != 0 && pack->bw_index != n + i)
3743 fatal(0, "wrong index: got %llx, wanted %llx+%llx",
3744 pack->bw_index, n, i);
3745
3746 if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0)
3747 fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH);
3748
3749 if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0)
3750 fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT);
3751
3752 pack->bw_index = n + i;
3753 pack->bw_txg = txg;
3754 pack->bw_data = 1 + ztest_random(-2ULL);
3755
3756 *bigH = *pack;
3757 *bigT = *pack;
3758 }
3759 }
3760
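/*
 * Verify that zero-copy writes via dmu_assign_arcbuf() work as expected.
 */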
3761 void
3762 ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
3763 {
3764 objset_t *os = zd->zd_os;
3765 ztest_od_t od[2];
3766 dmu_tx_t *tx;
3767 uint64_t i;
3768 int error;
3769 uint64_t n, s, txg;
3770 bufwad_t *packbuf, *bigbuf;
3771 uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize;
3772 uint64_t blocksize = ztest_random_blocksize();
3773 uint64_t chunksize = blocksize;
3774 uint64_t regions = 997;
3775 uint64_t stride = 123456789ULL;
3776 uint64_t width = 9;
3777 dmu_buf_t *bonus_db;
3778 arc_buf_t **bigbuf_arcbufs;
3779 dmu_object_info_t doi;
3780
3781 /*
3782 * This test uses two objects, packobj and bigobj, that are always
3783 * updated together (i.e. in the same tx) so that their contents are
3784 * in sync and can be compared. Their contents relate to each other
3785 * in a simple way: packobj is a dense array of 'bufwad' structures,
3786 * while bigobj is a sparse array of the same bufwads. Specifically,
3787 * for any index n, there are three bufwads that should be identical:
3788 *
3789 * packobj, at offset n * sizeof (bufwad_t)
3790 * bigobj, at the head of the nth chunk
3791 * bigobj, at the tail of the nth chunk
3792 *
3793 * The chunk size is set equal to bigobj block size so that
3794 * dmu_assign_arcbuf() can be tested for object updates.
3795 */
3796
3797 /*
3798 * Read the directory info. If it's the first time, set things up.
3799 */
3800 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
3801 ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize);
3802
3803 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
3804 return;
3805
3806 bigobj = od[0].od_object;
3807 packobj = od[1].od_object;
3808 blocksize = od[0].od_blocksize;
3809 chunksize = blocksize;
3810 ASSERT(chunksize == od[1].od_gen);
3811
3812 VERIFY(dmu_object_info(os, bigobj, &doi) == 0);
3813 VERIFY(ISP2(doi.doi_data_block_size));
3814 VERIFY(chunksize == doi.doi_data_block_size);
3815 VERIFY(chunksize >= 2 * sizeof (bufwad_t));
3816
3817 /*
3818 * Pick a random index and compute the offsets into packobj and bigobj.
3819 */
3820 n = ztest_random(regions) * stride + ztest_random(width);
3821 s = 1 + ztest_random(width - 1);
3822
3823 packoff = n * sizeof (bufwad_t);
3824 packsize = s * sizeof (bufwad_t);
3825
3826 bigoff = n * chunksize;
3827 bigsize = s * chunksize;
3828
3829 packbuf = umem_zalloc(packsize, UMEM_NOFAIL);
3830 bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL);
3831
3832 VERIFY3U(0, ==, dmu_bonus_hold(os, bigobj, FTAG, &bonus_db));
3833
3834 bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL);
3835
3836 /*
	 * Iteration 0 tests zcopy for DB_UNCACHED dbufs.
	 * Iteration 1 tests zcopy to already referenced dbufs.
	 * Iteration 2 tests zcopy to a dirty dbuf in the same txg.
	 * Iteration 3 tests zcopy to a dbuf dirtied in a previous txg.
	 * Iteration 4 tests zcopy when the dbuf is no longer dirty.
	 * Iteration 5 tests zcopy when it can't be done.
	 * Iteration 6 does one more zcopy write.
3844 */
3845 for (i = 0; i < 7; i++) {
3846 uint64_t j;
3847 uint64_t off;
3848
3849 /*
3850 * In iteration 5 (i == 5) use arcbufs
3851 * that don't match bigobj blksz to test
3852 * dmu_assign_arcbuf() when it can't directly
3853 * assign an arcbuf to a dbuf.
3854 */
3855 for (j = 0; j < s; j++) {
3856 if (i != 5) {
3857 bigbuf_arcbufs[j] =
3858 dmu_request_arcbuf(bonus_db, chunksize);
3859 } else {
3860 bigbuf_arcbufs[2 * j] =
3861 dmu_request_arcbuf(bonus_db, chunksize / 2);
3862 bigbuf_arcbufs[2 * j + 1] =
3863 dmu_request_arcbuf(bonus_db, chunksize / 2);
3864 }
3865 }
3866
3867 /*
3868 * Get a tx for the mods to both packobj and bigobj.
3869 */
3870 tx = dmu_tx_create(os);
3871
3872 dmu_tx_hold_write(tx, packobj, packoff, packsize);
3873 dmu_tx_hold_write(tx, bigobj, bigoff, bigsize);
3874
3875 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
3876 if (txg == 0) {
3877 umem_free(packbuf, packsize);
3878 umem_free(bigbuf, bigsize);
3879 for (j = 0; j < s; j++) {
3880 if (i != 5) {
3881 dmu_return_arcbuf(bigbuf_arcbufs[j]);
3882 } else {
3883 dmu_return_arcbuf(
3884 bigbuf_arcbufs[2 * j]);
3885 dmu_return_arcbuf(
3886 bigbuf_arcbufs[2 * j + 1]);
3887 }
3888 }
3889 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
3890 dmu_buf_rele(bonus_db, FTAG);
3891 return;
3892 }
3893
3894 /*
3895 * 50% of the time don't read objects in the 1st iteration to
		 * test dmu_assign_arcbuf() for the case when there are no
3897 * existing dbufs for the specified offsets.
3898 */
3899 if (i != 0 || ztest_random(2) != 0) {
3900 error = dmu_read(os, packobj, packoff,
3901 packsize, packbuf, DMU_READ_PREFETCH);
3902 ASSERT0(error);
3903 error = dmu_read(os, bigobj, bigoff, bigsize,
3904 bigbuf, DMU_READ_PREFETCH);
3905 ASSERT0(error);
3906 }
3907 compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize,
3908 n, chunksize, txg);
3909
3910 /*
3911 * We've verified all the old bufwads, and made new ones.
3912 * Now write them out.
3913 */
3914 dmu_write(os, packobj, packoff, packsize, packbuf, tx);
3915 if (ztest_opts.zo_verbose >= 7) {
3916 (void) printf("writing offset %llx size %llx"
3917 " txg %llx\n",
3918 (u_longlong_t)bigoff,
3919 (u_longlong_t)bigsize,
3920 (u_longlong_t)txg);
3921 }
3922 for (off = bigoff, j = 0; j < s; j++, off += chunksize) {
3923 dmu_buf_t *dbt;
3924 if (i != 5) {
3925 bcopy((caddr_t)bigbuf + (off - bigoff),
3926 bigbuf_arcbufs[j]->b_data, chunksize);
3927 } else {
3928 bcopy((caddr_t)bigbuf + (off - bigoff),
3929 bigbuf_arcbufs[2 * j]->b_data,
3930 chunksize / 2);
3931 bcopy((caddr_t)bigbuf + (off - bigoff) +
3932 chunksize / 2,
3933 bigbuf_arcbufs[2 * j + 1]->b_data,
3934 chunksize / 2);
3935 }
3936
if (i == 1) {
/*
 * Iteration 1 assigns to already-referenced dbufs:
 * hold the target dbuf across dmu_assign_arcbuf().
 */
VERIFY(dmu_buf_hold(os, bigobj, off,
FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0);
}
3941 if (i != 5) {
3942 dmu_assign_arcbuf(bonus_db, off,
3943 bigbuf_arcbufs[j], tx);
3944 } else {
3945 dmu_assign_arcbuf(bonus_db, off,
3946 bigbuf_arcbufs[2 * j], tx);
3947 dmu_assign_arcbuf(bonus_db,
3948 off + chunksize / 2,
3949 bigbuf_arcbufs[2 * j + 1], tx);
3950 }
3951 if (i == 1) {
3952 dmu_buf_rele(dbt, FTAG);
3953 }
3954 }
3955 dmu_tx_commit(tx);
3956
3957 /*
3958 * Sanity check the stuff we just wrote.
3959 */
3960 {
3961 void *packcheck = umem_alloc(packsize, UMEM_NOFAIL);
3962 void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL);
3963
3964 VERIFY(0 == dmu_read(os, packobj, packoff,
3965 packsize, packcheck, DMU_READ_PREFETCH));
3966 VERIFY(0 == dmu_read(os, bigobj, bigoff,
3967 bigsize, bigcheck, DMU_READ_PREFETCH));
3968
3969 ASSERT(bcmp(packbuf, packcheck, packsize) == 0);
3970 ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0);
3971
3972 umem_free(packcheck, packsize);
3973 umem_free(bigcheck, bigsize);
3974 }
3975 if (i == 2) {
3976 txg_wait_open(dmu_objset_pool(os), 0);
3977 } else if (i == 3) {
3978 txg_wait_synced(dmu_objset_pool(os), 0);
3979 }
3980 }
3981
3982 dmu_buf_rele(bonus_db, FTAG);
3983 umem_free(packbuf, packsize);
3984 umem_free(bigbuf, bigsize);
3985 umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *));
3986 }
3987
3988 /* ARGSUSED */
3989 void
3990 ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id)
3991 {
3992 ztest_od_t od[1];
3993 uint64_t offset = (1ULL << (ztest_random(20) + 43)) +
3994 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
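/*
 * The offset is a random power of two between 2^43 and 2^62, plus a
 * random range-lock slot shifted to maximum block alignment, so that
 * threads contend on a small set of ranges within a sparse object.
 */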
3995
3996 /*
3997 * Have multiple threads write to large offsets in an object
3998 * to verify that parallel writes to an object -- even to the
3999 * same blocks within the object -- doesn't cause any trouble.
4000 */
4001 ztest_od_init(&od[0], ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
4002
4003 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
4004 return;
4005
4006 while (ztest_random(10) != 0)
4007 ztest_io(zd, od[0].od_object, offset);
4008 }
4009
4010 void
4011 ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id)
4012 {
4013 ztest_od_t od[1];
4014 uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) +
4015 (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
4016 uint64_t count = ztest_random(20) + 1;
4017 uint64_t blocksize = ztest_random_blocksize();
4018 void *data;
4019
4020 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
4021
4022 if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
4023 return;
4024
4025 if (ztest_truncate(zd, od[0].od_object, offset, count * blocksize) != 0)
4026 return;
4027
4028 ztest_prealloc(zd, od[0].od_object, offset, count * blocksize);
4029
4030 data = umem_zalloc(blocksize, UMEM_NOFAIL);
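/*
 * The loop below writes zeroed blocks at random offsets within the
 * preallocated region, interleaving random I/O, until a random stop
 * or an error (e.g. ENOSPC) ends it early.
 */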
4031
4032 while (ztest_random(count) != 0) {
4033 uint64_t randoff = offset + (ztest_random(count) * blocksize);
4034 if (ztest_write(zd, od[0].od_object, randoff, blocksize,
4035 data) != 0)
4036 break;
4037 while (ztest_random(4) != 0)
4038 ztest_io(zd, od[0].od_object, randoff);
4039 }
4040
4041 umem_free(data, blocksize);
4042 }
4043
4044 /*
4045 * Verify that zap_{create,destroy,add,remove,update} work as expected.
4046 */
4047 #define ZTEST_ZAP_MIN_INTS 1
4048 #define ZTEST_ZAP_MAX_INTS 4
4049 #define ZTEST_ZAP_MAX_PROPS 1000
4050
4051 void
4052 ztest_zap(ztest_ds_t *zd, uint64_t id)
4053 {
4054 objset_t *os = zd->zd_os;
4055 ztest_od_t od[1];
4056 uint64_t object;
4057 uint64_t txg, last_txg;
4058 uint64_t value[ZTEST_ZAP_MAX_INTS];
4059 uint64_t zl_ints, zl_intsize, prop;
4060 int i, ints;
4061 dmu_tx_t *tx;
4062 char propname[100], txgname[100];
4063 int error;
4064 char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" };
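/* These two names are known to collide in the ZAP hash function. */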
4065
4066 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0);
4067
4068 if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
4069 return;
4070
4071 object = od[0].od_object;
4072
4073 /*
4074 * Generate a known hash collision, and verify that
* we can look up and remove both entries.
4076 */
4077 tx = dmu_tx_create(os);
4078 dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
4079 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
4080 if (txg == 0)
4081 return;
4082 for (i = 0; i < 2; i++) {
4083 value[i] = i;
4084 VERIFY3U(0, ==, zap_add(os, object, hc[i], sizeof (uint64_t),
4085 1, &value[i], tx));
4086 }
4087 for (i = 0; i < 2; i++) {
4088 VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i],
4089 sizeof (uint64_t), 1, &value[i], tx));
4090 VERIFY3U(0, ==,
4091 zap_length(os, object, hc[i], &zl_intsize, &zl_ints));
4092 ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
4093 ASSERT3U(zl_ints, ==, 1);
4094 }
4095 for (i = 0; i < 2; i++) {
4096 VERIFY3U(0, ==, zap_remove(os, object, hc[i], tx));
4097 }
4098 dmu_tx_commit(tx);
4099
4100 /*
* Generate a bunch of random entries.
4102 */
4103 ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS);
4104
4105 prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
4106 (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
4107 (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
4108 bzero(value, sizeof (value));
4109 last_txg = 0;
4110
4111 /*
4112 * If these zap entries already exist, validate their contents.
4113 */
4114 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
4115 if (error == 0) {
4116 ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
4117 ASSERT3U(zl_ints, ==, 1);
4118
4119 VERIFY(zap_lookup(os, object, txgname, zl_intsize,
4120 zl_ints, &last_txg) == 0);
4121
4122 VERIFY(zap_length(os, object, propname, &zl_intsize,
4123 &zl_ints) == 0);
4124
4125 ASSERT3U(zl_intsize, ==, sizeof (uint64_t));
4126 ASSERT3U(zl_ints, ==, ints);
4127
4128 VERIFY(zap_lookup(os, object, propname, zl_intsize,
4129 zl_ints, value) == 0);
4130
4131 for (i = 0; i < ints; i++) {
4132 ASSERT3U(value[i], ==, last_txg + object + i);
4133 }
4134 } else {
4135 ASSERT3U(error, ==, ENOENT);
4136 }
4137
4138 /*
4139 * Atomically update two entries in our zap object.
4140 * The first is named txg_%llu, and contains the txg
4141 * in which the property was last updated. The second
4142 * is named prop_%llu, and the nth element of its value
4143 * should be txg + object + n.
4144 */
4145 tx = dmu_tx_create(os);
4146 dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
4147 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
4148 if (txg == 0)
4149 return;
4150
4151 if (last_txg > txg)
4152 fatal(0, "zap future leak: old %llu new %llu", last_txg, txg);
4153
4154 for (i = 0; i < ints; i++)
4155 value[i] = txg + object + i;
4156
4157 VERIFY3U(0, ==, zap_update(os, object, txgname, sizeof (uint64_t),
4158 1, &txg, tx));
4159 VERIFY3U(0, ==, zap_update(os, object, propname, sizeof (uint64_t),
4160 ints, value, tx));
4161
4162 dmu_tx_commit(tx);
4163
4164 /*
4165 * Remove a random pair of entries.
4166 */
4167 prop = ztest_random(ZTEST_ZAP_MAX_PROPS);
4168 (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop);
4169 (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop);
4170
4171 error = zap_length(os, object, txgname, &zl_intsize, &zl_ints);
4172
4173 if (error == ENOENT)
4174 return;
4175
4176 ASSERT0(error);
4177
4178 tx = dmu_tx_create(os);
4179 dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
4180 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
4181 if (txg == 0)
4182 return;
4183 VERIFY3U(0, ==, zap_remove(os, object, txgname, tx));
4184 VERIFY3U(0, ==, zap_remove(os, object, propname, tx));
4185 dmu_tx_commit(tx);
4186 }
4187
4188 /*
* Test the upgrade of a microzap to a fatzap.
4190 */
4191 void
4192 ztest_fzap(ztest_ds_t *zd, uint64_t id)
4193 {
4194 objset_t *os = zd->zd_os;
4195 ztest_od_t od[1];
4196 uint64_t object, txg;
4197
4198 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0);
4199
4200 if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0)
4201 return;
4202
4203 object = od[0].od_object;
4204
4205 /*
4206 * Add entries to this ZAP and make sure it spills over
4207 * and gets upgraded to a fatzap. Also, since we are adding
4208 * 2050 entries we should see ptrtbl growth and leaf-block split.
4209 */
4210 for (int i = 0; i < 2050; i++) {
4211 char name[MAXNAMELEN];
4212 uint64_t value = i;
4213 dmu_tx_t *tx;
4214 int error;
4215
4216 (void) snprintf(name, sizeof (name), "fzap-%llu-%llu",
4217 id, value);
4218
4219 tx = dmu_tx_create(os);
4220 dmu_tx_hold_zap(tx, object, B_TRUE, name);
4221 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
4222 if (txg == 0)
4223 return;
4224 error = zap_add(os, object, name, sizeof (uint64_t), 1,
4225 &value, tx);
4226 ASSERT(error == 0 || error == EEXIST);
4227 dmu_tx_commit(tx);
4228 }
4229 }
4230
4231 /* ARGSUSED */
4232 void
4233 ztest_zap_parallel(ztest_ds_t *zd, uint64_t id)
4234 {
4235 objset_t *os = zd->zd_os;
4236 ztest_od_t od[1];
4237 uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc;
4238 dmu_tx_t *tx;
4239 int i, namelen, error;
4240 int micro = ztest_random(2);
4241 char name[20], string_value[20];
4242 void *data;
4243
4244 ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0);
4245
4246 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
4247 return;
4248
4249 object = od[0].od_object;
4250
4251 /*
4252 * Generate a random name of the form 'xxx.....' where each
4253 * x is a random printable character and the dots are dots.
4254 * There are 94 such characters, and the name length goes from
4255 * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names.
4256 */
4257 namelen = ztest_random(sizeof (name) - 5) + 5 + 1;
4258
4259 for (i = 0; i < 3; i++)
4260 name[i] = '!' + ztest_random('~' - '!' + 1);
4261 for (; i < namelen - 1; i++)
4262 name[i] = '.';
4263 name[i] = '\0';
4264
/*
 * Store either a single uint64 (the txg) or the name itself as a
 * byte array; only single-uint64 values are microzap-compatible,
 * so the 'micro' object always gets the uint64 form.
 */
if ((namelen & 1) || micro) {
wsize = sizeof (txg);
wc = 1;
data = &txg;
} else {
wsize = 1;
wc = namelen;
data = string_value;
}
4274
4275 count = -1ULL;
4276 VERIFY0(zap_count(os, object, &count));
4277 ASSERT(count != -1ULL);
4278
4279 /*
4280 * Select an operation: length, lookup, add, update, remove.
4281 */
4282 i = ztest_random(5);
4283
4284 if (i >= 2) {
4285 tx = dmu_tx_create(os);
4286 dmu_tx_hold_zap(tx, object, B_TRUE, NULL);
4287 txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG);
4288 if (txg == 0)
4289 return;
4290 bcopy(name, string_value, namelen);
4291 } else {
4292 tx = NULL;
4293 txg = 0;
4294 bzero(string_value, namelen);
4295 }
4296
4297 switch (i) {
4298
4299 case 0:
4300 error = zap_length(os, object, name, &zl_wsize, &zl_wc);
4301 if (error == 0) {
4302 ASSERT3U(wsize, ==, zl_wsize);
4303 ASSERT3U(wc, ==, zl_wc);
4304 } else {
4305 ASSERT3U(error, ==, ENOENT);
4306 }
4307 break;
4308
4309 case 1:
4310 error = zap_lookup(os, object, name, wsize, wc, data);
4311 if (error == 0) {
4312 if (data == string_value &&
4313 bcmp(name, data, namelen) != 0)
4314 fatal(0, "name '%s' != val '%s' len %d",
4315 name, data, namelen);
4316 } else {
4317 ASSERT3U(error, ==, ENOENT);
4318 }
4319 break;
4320
4321 case 2:
4322 error = zap_add(os, object, name, wsize, wc, data, tx);
4323 ASSERT(error == 0 || error == EEXIST);
4324 break;
4325
4326 case 3:
4327 VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0);
4328 break;
4329
4330 case 4:
4331 error = zap_remove(os, object, name, tx);
4332 ASSERT(error == 0 || error == ENOENT);
4333 break;
4334 }
4335
4336 if (tx != NULL)
4337 dmu_tx_commit(tx);
4338 }
4339
4340 /*
4341 * Commit callback data.
4342 */
4343 typedef struct ztest_cb_data {
4344 list_node_t zcd_node;
4345 uint64_t zcd_txg;
4346 int zcd_expected_err;
4347 boolean_t zcd_added;
4348 boolean_t zcd_called;
4349 spa_t *zcd_spa;
4350 } ztest_cb_data_t;
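
/*
 * Lifecycle, as exercised below: allocated by ztest_create_cb_data(),
 * registered via dmu_tx_callback_register(), added to the global
 * zcl.zcl_callbacks list once its txg is known (zcd_added), and freed
 * by ztest_commit_callback(), or by the caller after dmu_tx_abort()
 * has invoked the callback with ECANCELED.
 */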
4351
4352 /* This is the actual commit callback function */
4353 static void
4354 ztest_commit_callback(void *arg, int error)
4355 {
4356 ztest_cb_data_t *data = arg;
4357 uint64_t synced_txg;
4358
4359 VERIFY(data != NULL);
4360 VERIFY3S(data->zcd_expected_err, ==, error);
4361 VERIFY(!data->zcd_called);
4362
4363 synced_txg = spa_last_synced_txg(data->zcd_spa);
4364 if (data->zcd_txg > synced_txg)
4365 fatal(0, "commit callback of txg %" PRIu64 " called prematurely"
4366 ", last synced txg = %" PRIu64 "\n", data->zcd_txg,
4367 synced_txg);
4368
4369 data->zcd_called = B_TRUE;
4370
4371 if (error == ECANCELED) {
4372 ASSERT0(data->zcd_txg);
4373 ASSERT(!data->zcd_added);
4374
4375 /*
4376 * The private callback data should be destroyed here, but
4377 * since we are going to check the zcd_called field after
4378 * dmu_tx_abort(), we will destroy it there.
4379 */
4380 return;
4381 }
4382
4383 /* Was this callback added to the global callback list? */
4384 if (!data->zcd_added)
4385 goto out;
4386
4387 ASSERT3U(data->zcd_txg, !=, 0);
4388
4389 /* Remove our callback from the list */
4390 (void) mutex_lock(&zcl.zcl_callbacks_lock);
4391 list_remove(&zcl.zcl_callbacks, data);
4392 (void) mutex_unlock(&zcl.zcl_callbacks_lock);
4393
4394 out:
4395 umem_free(data, sizeof (ztest_cb_data_t));
4396 }
4397
4398 /* Allocate and initialize callback data structure */
4399 static ztest_cb_data_t *
4400 ztest_create_cb_data(objset_t *os, uint64_t txg)
4401 {
4402 ztest_cb_data_t *cb_data;
4403
4404 cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL);
4405
4406 cb_data->zcd_txg = txg;
4407 cb_data->zcd_spa = dmu_objset_spa(os);
4408
4409 return (cb_data);
4410 }
4411
4412 /*
4413 * If a number of txgs equal to this threshold have been created after a commit
4414 * callback has been registered but not called, then we assume there is an
4415 * implementation bug.
4416 */
4417 #define ZTEST_COMMIT_CALLBACK_THRESH (TXG_CONCURRENT_STATES + 2)
4418
4419 /*
4420 * Commit callback test.
4421 */
4422 void
4423 ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id)
4424 {
4425 objset_t *os = zd->zd_os;
4426 ztest_od_t od[1];
4427 dmu_tx_t *tx;
4428 ztest_cb_data_t *cb_data[3], *tmp_cb;
4429 uint64_t old_txg, txg;
int i, error = 0;
4431
4432 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
4433
4434 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
4435 return;
4436
4437 tx = dmu_tx_create(os);
4438
4439 cb_data[0] = ztest_create_cb_data(os, 0);
4440 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]);
4441
4442 dmu_tx_hold_write(tx, od[0].od_object, 0, sizeof (uint64_t));
4443
4444 /* Every once in a while, abort the transaction on purpose */
4445 if (ztest_random(100) == 0)
4446 error = -1;
4447
4448 if (!error)
4449 error = dmu_tx_assign(tx, TXG_NOWAIT);
4450
4451 txg = error ? 0 : dmu_tx_get_txg(tx);
4452
4453 cb_data[0]->zcd_txg = txg;
4454 cb_data[1] = ztest_create_cb_data(os, txg);
4455 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]);
4456
4457 if (error) {
/*
 * It's not a strict requirement for the registered
 * callbacks to be called from inside dmu_tx_abort(),
 * but that's what the current implementation does,
 * so we check for it.
 */
4464 for (i = 0; i < 2; i++) {
4465 cb_data[i]->zcd_expected_err = ECANCELED;
4466 VERIFY(!cb_data[i]->zcd_called);
4467 }
4468
4469 dmu_tx_abort(tx);
4470
4471 for (i = 0; i < 2; i++) {
4472 VERIFY(cb_data[i]->zcd_called);
4473 umem_free(cb_data[i], sizeof (ztest_cb_data_t));
4474 }
4475
4476 return;
4477 }
4478
4479 cb_data[2] = ztest_create_cb_data(os, txg);
4480 dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]);
4481
4482 /*
4483 * Read existing data to make sure there isn't a future leak.
4484 */
4485 VERIFY(0 == dmu_read(os, od[0].od_object, 0, sizeof (uint64_t),
4486 &old_txg, DMU_READ_PREFETCH));
4487
4488 if (old_txg > txg)
4489 fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64,
4490 old_txg, txg);
4491
4492 dmu_write(os, od[0].od_object, 0, sizeof (uint64_t), &txg, tx);
4493
4494 (void) mutex_lock(&zcl.zcl_callbacks_lock);
4495
4496 /*
4497 * Since commit callbacks don't have any ordering requirement and since
4498 * it is theoretically possible for a commit callback to be called
4499 * after an arbitrary amount of time has elapsed since its txg has been
4500 * synced, it is difficult to reliably determine whether a commit
4501 * callback hasn't been called due to high load or due to a flawed
4502 * implementation.
4503 *
4504 * In practice, we will assume that if after a certain number of txgs a
4505 * commit callback hasn't been called, then most likely there's an
* implementation bug.
4507 */
4508 tmp_cb = list_head(&zcl.zcl_callbacks);
4509 if (tmp_cb != NULL &&
4510 tmp_cb->zcd_txg > txg - ZTEST_COMMIT_CALLBACK_THRESH) {
4511 fatal(0, "Commit callback threshold exceeded, oldest txg: %"
4512 PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg);
4513 }
4514
4515 /*
4516 * Let's find the place to insert our callbacks.
4517 *
4518 * Even though the list is ordered by txg, it is possible for the
4519 * insertion point to not be the end because our txg may already be
4520 * quiescing at this point and other callbacks in the open txg
4521 * (from other objsets) may have sneaked in.
4522 */
4523 tmp_cb = list_tail(&zcl.zcl_callbacks);
4524 while (tmp_cb != NULL && tmp_cb->zcd_txg > txg)
4525 tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb);
4526
4527 /* Add the 3 callbacks to the list */
4528 for (i = 0; i < 3; i++) {
4529 if (tmp_cb == NULL)
4530 list_insert_head(&zcl.zcl_callbacks, cb_data[i]);
4531 else
4532 list_insert_after(&zcl.zcl_callbacks, tmp_cb,
4533 cb_data[i]);
4534
4535 cb_data[i]->zcd_added = B_TRUE;
4536 VERIFY(!cb_data[i]->zcd_called);
4537
4538 tmp_cb = cb_data[i];
4539 }
4540
4541 (void) mutex_unlock(&zcl.zcl_callbacks_lock);
4542
4543 dmu_tx_commit(tx);
4544 }
4545
4546 /* ARGSUSED */
4547 void
4548 ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id)
4549 {
4550 zfs_prop_t proplist[] = {
4551 ZFS_PROP_CHECKSUM,
4552 ZFS_PROP_COMPRESSION,
4553 ZFS_PROP_COPIES,
4554 ZFS_PROP_DEDUP
4555 };
4556
4557 (void) rw_rdlock(&ztest_name_lock);
4558
4559 for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++)
4560 (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p],
4561 ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2));
4562
4563 (void) rw_unlock(&ztest_name_lock);
4564 }
4565
4566 /* ARGSUSED */
4567 void
4568 ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id)
4569 {
4570 nvlist_t *props = NULL;
4571
4572 (void) rw_rdlock(&ztest_name_lock);
4573
4574 (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_DEDUPDITTO,
4575 ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN));
4576
4577 VERIFY0(spa_prop_get(ztest_spa, &props));
4578
4579 if (ztest_opts.zo_verbose >= 6)
4580 dump_nvlist(props, 4);
4581
4582 nvlist_free(props);
4583
4584 (void) rw_unlock(&ztest_name_lock);
4585 }
4586
4587 static int
4588 user_release_one(const char *snapname, const char *holdname)
4589 {
4590 nvlist_t *snaps, *holds;
4591 int error;
4592
4593 snaps = fnvlist_alloc();
4594 holds = fnvlist_alloc();
4595 fnvlist_add_boolean(holds, holdname);
4596 fnvlist_add_nvlist(snaps, snapname, holds);
4597 fnvlist_free(holds);
4598 error = dsl_dataset_user_release(snaps, NULL);
4599 fnvlist_free(snaps);
4600 return (error);
4601 }
4602
4603 /*
4604 * Test snapshot hold/release and deferred destroy.
4605 */
4606 void
4607 ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id)
4608 {
4609 int error;
4610 objset_t *os = zd->zd_os;
4611 objset_t *origin;
4612 char snapname[100];
4613 char fullname[100];
4614 char clonename[100];
4615 char tag[100];
4616 char osname[MAXNAMELEN];
4617 nvlist_t *holds;
4618
4619 (void) rw_rdlock(&ztest_name_lock);
4620
4621 dmu_objset_name(os, osname);
4622
4623 (void) snprintf(snapname, sizeof (snapname), "sh1_%llu", id);
4624 (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname);
4625 (void) snprintf(clonename, sizeof (clonename),
4626 "%s/ch1_%llu", osname, id);
4627 (void) snprintf(tag, sizeof (tag), "tag_%llu", id);
4628
4629 /*
4630 * Clean up from any previous run.
4631 */
4632 error = dsl_destroy_head(clonename);
4633 if (error != ENOENT)
4634 ASSERT0(error);
4635 error = user_release_one(fullname, tag);
4636 if (error != ESRCH && error != ENOENT)
4637 ASSERT0(error);
4638 error = dsl_destroy_snapshot(fullname, B_FALSE);
4639 if (error != ENOENT)
4640 ASSERT0(error);
4641
4642 /*
4643 * Create snapshot, clone it, mark snap for deferred destroy,
4644 * destroy clone, verify snap was also destroyed.
4645 */
4646 error = dmu_objset_snapshot_one(osname, snapname);
4647 if (error) {
4648 if (error == ENOSPC) {
4649 ztest_record_enospc("dmu_objset_snapshot");
4650 goto out;
4651 }
4652 fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
4653 }
4654
4655 error = dmu_objset_clone(clonename, fullname);
4656 if (error) {
4657 if (error == ENOSPC) {
4658 ztest_record_enospc("dmu_objset_clone");
4659 goto out;
4660 }
4661 fatal(0, "dmu_objset_clone(%s) = %d", clonename, error);
4662 }
4663
4664 error = dsl_destroy_snapshot(fullname, B_TRUE);
4665 if (error) {
4666 fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d",
4667 fullname, error);
4668 }
4669
4670 error = dsl_destroy_head(clonename);
4671 if (error)
4672 fatal(0, "dsl_destroy_head(%s) = %d", clonename, error);
4673
4674 error = dmu_objset_hold(fullname, FTAG, &origin);
4675 if (error != ENOENT)
4676 fatal(0, "dmu_objset_hold(%s) = %d", fullname, error);
4677
4678 /*
4679 * Create snapshot, add temporary hold, verify that we can't
4680 * destroy a held snapshot, mark for deferred destroy,
4681 * release hold, verify snapshot was destroyed.
4682 */
4683 error = dmu_objset_snapshot_one(osname, snapname);
4684 if (error) {
4685 if (error == ENOSPC) {
4686 ztest_record_enospc("dmu_objset_snapshot");
4687 goto out;
4688 }
4689 fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error);
4690 }
4691
4692 holds = fnvlist_alloc();
4693 fnvlist_add_string(holds, fullname, tag);
4694 error = dsl_dataset_user_hold(holds, 0, NULL);
4695 fnvlist_free(holds);
4696
4697 if (error)
fatal(0, "dsl_dataset_user_hold(%s, %s)", fullname, tag);
4699
4700 error = dsl_destroy_snapshot(fullname, B_FALSE);
4701 if (error != EBUSY) {
4702 fatal(0, "dsl_destroy_snapshot(%s, B_FALSE) = %d",
4703 fullname, error);
4704 }
4705
4706 error = dsl_destroy_snapshot(fullname, B_TRUE);
4707 if (error) {
4708 fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d",
4709 fullname, error);
4710 }
4711
4712 error = user_release_one(fullname, tag);
4713 if (error)
fatal(0, "user_release_one(%s, %s)", fullname, tag);
4715
4716 VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT);
4717
4718 out:
4719 (void) rw_unlock(&ztest_name_lock);
4720 }
4721
4722 /*
4723 * Inject random faults into the on-disk data.
4724 */
4725 /* ARGSUSED */
4726 void
4727 ztest_fault_inject(ztest_ds_t *zd, uint64_t id)
4728 {
4729 ztest_shared_t *zs = ztest_shared;
4730 spa_t *spa = ztest_spa;
4731 int fd;
4732 uint64_t offset;
4733 uint64_t leaves;
4734 uint64_t bad = 0x1990c0ffeedecade;
4735 uint64_t top, leaf;
4736 char path0[MAXPATHLEN];
4737 char pathrand[MAXPATHLEN];
4738 size_t fsize;
4739 int bshift = SPA_MAXBLOCKSHIFT + 2; /* don't scrog all labels */
4740 int iters = 1000;
4741 int maxfaults;
4742 int mirror_save;
4743 vdev_t *vd0 = NULL;
4744 uint64_t guid0 = 0;
4745 boolean_t islog = B_FALSE;
4746
4747 VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
4748 maxfaults = MAXFAULTS();
4749 leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz;
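/* Leaves per top-level vdev: mirror ways times raidz width. */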
4750 mirror_save = zs->zs_mirrors;
4751 VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
4752
4753 ASSERT(leaves >= 1);
4754
4755 /*
4756 * We need SCL_STATE here because we're going to look at vd0->vdev_tsd.
4757 */
4758 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4759
4760 if (ztest_random(2) == 0) {
4761 /*
4762 * Inject errors on a normal data device or slog device.
4763 */
4764 top = ztest_random_vdev_top(spa, B_TRUE);
4765 leaf = ztest_random(leaves) + zs->zs_splits;
4766
4767 /*
4768 * Generate paths to the first leaf in this top-level vdev,
4769 * and to the random leaf we selected. We'll induce transient
4770 * write failures and random online/offline activity on leaf 0,
4771 * and we'll write random garbage to the randomly chosen leaf.
4772 */
4773 (void) snprintf(path0, sizeof (path0), ztest_dev_template,
4774 ztest_opts.zo_dir, ztest_opts.zo_pool,
4775 top * leaves + zs->zs_splits);
4776 (void) snprintf(pathrand, sizeof (pathrand), ztest_dev_template,
4777 ztest_opts.zo_dir, ztest_opts.zo_pool,
4778 top * leaves + leaf);
4779
4780 vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0);
4781 if (vd0 != NULL && vd0->vdev_top->vdev_islog)
4782 islog = B_TRUE;
4783
4784 if (vd0 != NULL && maxfaults != 1) {
4785 /*
4786 * Make vd0 explicitly claim to be unreadable,
4787 * or unwriteable, or reach behind its back
4788 * and close the underlying fd. We can do this if
4789 * maxfaults == 0 because we'll fail and reexecute,
4790 * and we can do it if maxfaults >= 2 because we'll
4791 * have enough redundancy. If maxfaults == 1, the
4792 * combination of this with injection of random data
4793 * corruption below exceeds the pool's fault tolerance.
4794 */
4795 vdev_file_t *vf = vd0->vdev_tsd;
4796
4797 if (vf != NULL && ztest_random(3) == 0) {
4798 (void) close(vf->vf_vnode->v_fd);
4799 vf->vf_vnode->v_fd = -1;
4800 } else if (ztest_random(2) == 0) {
4801 vd0->vdev_cant_read = B_TRUE;
4802 } else {
4803 vd0->vdev_cant_write = B_TRUE;
4804 }
4805 guid0 = vd0->vdev_guid;
4806 }
4807 } else {
4808 /*
4809 * Inject errors on an l2cache device.
4810 */
4811 spa_aux_vdev_t *sav = &spa->spa_l2cache;
4812
4813 if (sav->sav_count == 0) {
4814 spa_config_exit(spa, SCL_STATE, FTAG);
4815 return;
4816 }
4817 vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)];
4818 guid0 = vd0->vdev_guid;
4819 (void) strcpy(path0, vd0->vdev_path);
4820 (void) strcpy(pathrand, vd0->vdev_path);
4821
4822 leaf = 0;
4823 leaves = 1;
4824 maxfaults = INT_MAX; /* no limit on cache devices */
4825 }
4826
4827 spa_config_exit(spa, SCL_STATE, FTAG);
4828
4829 /*
4830 * If we can tolerate two or more faults, or we're dealing
4831 * with a slog, randomly online/offline vd0.
4832 */
4833 if ((maxfaults >= 2 || islog) && guid0 != 0) {
4834 if (ztest_random(10) < 6) {
4835 int flags = (ztest_random(2) == 0 ?
4836 ZFS_OFFLINE_TEMPORARY : 0);
4837
4838 /*
* We have to grab the ztest_name_lock as writer to
* prevent a race between offlining a slog and
* destroying a dataset. Offlining the slog will
* grab a reference on the dataset, which may cause
* dmu_objset_destroy() to fail with EBUSY, thus
* leaving the dataset in an inconsistent state.
4845 */
4846 if (islog)
4847 (void) rw_wrlock(&ztest_name_lock);
4848
4849 VERIFY(vdev_offline(spa, guid0, flags) != EBUSY);
4850
4851 if (islog)
4852 (void) rw_unlock(&ztest_name_lock);
4853 } else {
4854 /*
4855 * Ideally we would like to be able to randomly
4856 * call vdev_[on|off]line without holding locks
4857 * to force unpredictable failures but the side
4858 * effects of vdev_[on|off]line prevent us from
4859 * doing so. We grab the ztest_vdev_lock here to
4860 * prevent a race between injection testing and
4861 * aux_vdev removal.
4862 */
4863 VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
4864 (void) vdev_online(spa, guid0, 0, NULL);
4865 VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
4866 }
4867 }
4868
4869 if (maxfaults == 0)
4870 return;
4871
4872 /*
4873 * We have at least single-fault tolerance, so inject data corruption.
4874 */
4875 fd = open(pathrand, O_RDWR);
4876
4877 if (fd == -1) /* we hit a gap in the device namespace */
4878 return;
4879
4880 fsize = lseek(fd, 0, SEEK_END);
4881
4882 while (--iters != 0) {
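/*
 * Pick an 8-byte-aligned offset within this leaf's
 * (leaf << bshift) slot of a randomly chosen (leaves << bshift)
 * stripe; confining injections to one leaf's slots avoids
 * damaging the same logical block on two different leaves.
 */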
4883 offset = ztest_random(fsize / (leaves << bshift)) *
4884 (leaves << bshift) + (leaf << bshift) +
4885 (ztest_random(1ULL << (bshift - 1)) & -8ULL);
4886
4887 if (offset >= fsize)
4888 continue;
4889
4890 VERIFY(mutex_lock(&ztest_vdev_lock) == 0);
4891 if (mirror_save != zs->zs_mirrors) {
4892 VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
4893 (void) close(fd);
4894 return;
4895 }
4896
4897 if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad))
4898 fatal(1, "can't inject bad word at 0x%llx in %s",
4899 offset, pathrand);
4900
4901 VERIFY(mutex_unlock(&ztest_vdev_lock) == 0);
4902
4903 if (ztest_opts.zo_verbose >= 7)
4904 (void) printf("injected bad word into %s,"
4905 " offset 0x%llx\n", pathrand, (u_longlong_t)offset);
4906 }
4907
4908 (void) close(fd);
4909 }
4910
4911 /*
4912 * Verify that DDT repair works as expected.
4913 */
4914 void
4915 ztest_ddt_repair(ztest_ds_t *zd, uint64_t id)
4916 {
4917 ztest_shared_t *zs = ztest_shared;
4918 spa_t *spa = ztest_spa;
4919 objset_t *os = zd->zd_os;
4920 ztest_od_t od[1];
4921 uint64_t object, blocksize, txg, pattern, psize;
4922 enum zio_checksum checksum = spa_dedup_checksum(spa);
4923 dmu_buf_t *db;
4924 dmu_tx_t *tx;
4925 void *buf;
4926 blkptr_t blk;
4927 int copies = 2 * ZIO_DEDUPDITTO_MIN;
4928
4929 blocksize = ztest_random_blocksize();
4930 blocksize = MIN(blocksize, 2048); /* because we write so many */
4931
4932 ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0);
4933
4934 if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0)
4935 return;
4936
4937 /*
4938 * Take the name lock as writer to prevent anyone else from changing
* the pool and dataset properties we need to maintain during this test.
4940 */
4941 (void) rw_wrlock(&ztest_name_lock);
4942
4943 if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP, checksum,
4944 B_FALSE) != 0 ||
4945 ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1,
4946 B_FALSE) != 0) {
4947 (void) rw_unlock(&ztest_name_lock);
4948 return;
4949 }
4950
4951 object = od[0].od_object;
4952 blocksize = od[0].od_blocksize;
4953 pattern = zs->zs_guid ^ dmu_objset_fsid_guid(os);
4954
4955 ASSERT(object != 0);
4956
4957 tx = dmu_tx_create(os);
4958 dmu_tx_hold_write(tx, object, 0, copies * blocksize);
4959 txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
4960 if (txg == 0) {
4961 (void) rw_unlock(&ztest_name_lock);
4962 return;
4963 }
4964
4965 /*
4966 * Write all the copies of our block.
4967 */
4968 for (int i = 0; i < copies; i++) {
4969 uint64_t offset = i * blocksize;
4970 int error = dmu_buf_hold(os, object, offset, FTAG, &db,
4971 DMU_READ_NO_PREFETCH);
4972 if (error != 0) {
fatal(B_FALSE, "dmu_buf_hold(%p, %llu, %llu) = %u",
os, (u_longlong_t)object, (u_longlong_t)offset, error);
4975 }
4976 ASSERT(db->db_offset == offset);
4977 ASSERT(db->db_size == blocksize);
4978 ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) ||
4979 ztest_pattern_match(db->db_data, db->db_size, 0ULL));
4980 dmu_buf_will_fill(db, tx);
4981 ztest_pattern_set(db->db_data, db->db_size, pattern);
4982 dmu_buf_rele(db, FTAG);
4983 }
4984
4985 dmu_tx_commit(tx);
4986 txg_wait_synced(spa_get_dsl(spa), txg);
4987
4988 /*
4989 * Find out what block we got.
4990 */
4991 VERIFY0(dmu_buf_hold(os, object, 0, FTAG, &db,
4992 DMU_READ_NO_PREFETCH));
4993 blk = *((dmu_buf_impl_t *)db)->db_blkptr;
4994 dmu_buf_rele(db, FTAG);
4995
4996 /*
4997 * Damage the block. Dedup-ditto will save us when we read it later.
4998 */
4999 psize = BP_GET_PSIZE(&blk);
5000 buf = zio_buf_alloc(psize);
5001 ztest_pattern_set(buf, psize, ~pattern);
5002
5003 (void) zio_wait(zio_rewrite(NULL, spa, 0, &blk,
5004 buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
5005 ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL));
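
/*
 * ZIO_FLAG_INDUCE_DAMAGE overwrites the block in place without
 * updating its checksum, so a later read should see a checksum
 * error and self-heal from one of the extra DDT copies.
 */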
5006
5007 zio_buf_free(buf, psize);
5008
5009 (void) rw_unlock(&ztest_name_lock);
5010 }
5011
5012 /*
5013 * Scrub the pool.
5014 */
5015 /* ARGSUSED */
5016 void
5017 ztest_scrub(ztest_ds_t *zd, uint64_t id)
5018 {
5019 spa_t *spa = ztest_spa;
5020
5021 (void) spa_scan(spa, POOL_SCAN_SCRUB);
5022 (void) poll(NULL, 0, 100); /* wait a moment, then force a restart */
5023 (void) spa_scan(spa, POOL_SCAN_SCRUB);
5024 }
5025
5026 /*
5027 * Change the guid for the pool.
5028 */
5029 /* ARGSUSED */
5030 void
5031 ztest_reguid(ztest_ds_t *zd, uint64_t id)
5032 {
5033 spa_t *spa = ztest_spa;
5034 uint64_t orig, load;
5035 int error;
5036
5037 orig = spa_guid(spa);
5038 load = spa_load_guid(spa);
5039
5040 (void) rw_wrlock(&ztest_name_lock);
5041 error = spa_change_guid(spa);
5042 (void) rw_unlock(&ztest_name_lock);
5043
5044 if (error != 0)
5045 return;
5046
5047 if (ztest_opts.zo_verbose >= 4) {
5048 (void) printf("Changed guid old %llu -> %llu\n",
5049 (u_longlong_t)orig, (u_longlong_t)spa_guid(spa));
5050 }
5051
5052 VERIFY3U(orig, !=, spa_guid(spa));
5053 VERIFY3U(load, ==, spa_load_guid(spa));
5054 }
5055
5056 /*
5057 * Rename the pool to a different name and then rename it back.
5058 */
5059 /* ARGSUSED */
5060 void
5061 ztest_spa_rename(ztest_ds_t *zd, uint64_t id)
5062 {
5063 char *oldname, *newname;
5064 spa_t *spa;
5065
5066 (void) rw_wrlock(&ztest_name_lock);
5067
5068 oldname = ztest_opts.zo_pool;
5069 newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL);
5070 (void) strcpy(newname, oldname);
5071 (void) strcat(newname, "_tmp");
5072
5073 /*
5074 * Do the rename
5075 */
5076 VERIFY3U(0, ==, spa_rename(oldname, newname));
5077
5078 /*
5079 * Try to open it under the old name, which shouldn't exist
5080 */
5081 VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG));
5082
5083 /*
5084 * Open it under the new name and make sure it's still the same spa_t.
5085 */
5086 VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));
5087
5088 ASSERT(spa == ztest_spa);
5089 spa_close(spa, FTAG);
5090
5091 /*
5092 * Rename it back to the original
5093 */
5094 VERIFY3U(0, ==, spa_rename(newname, oldname));
5095
5096 /*
5097 * Make sure it can still be opened
5098 */
5099 VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));
5100
5101 ASSERT(spa == ztest_spa);
5102 spa_close(spa, FTAG);
5103
5104 umem_free(newname, strlen(newname) + 1);
5105
5106 (void) rw_unlock(&ztest_name_lock);
5107 }
5108
5109 /*
5110 * Verify pool integrity by running zdb.
5111 */
5112 static void
5113 ztest_run_zdb(char *pool)
5114 {
5115 int status;
5116 char zdb[MAXPATHLEN + MAXNAMELEN + 20];
5117 char zbuf[1024];
5118 char *bin;
5119 char *ztest;
5120 char *isa;
5121 int isalen;
5122 FILE *fp;
5123
5124 (void) realpath(getexecname(), zdb);
5125
5126 /* zdb lives in /usr/sbin, while ztest lives in /usr/bin */
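/* Preserve any ISA subdir: /usr/bin/<isa>/ztest -> /usr/sbin/<isa>/zdb. */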
5127 bin = strstr(zdb, "/usr/bin/");
5128 ztest = strstr(bin, "/ztest");
5129 isa = bin + 8;
5130 isalen = ztest - isa;
5131 isa = strdup(isa);
5132 /* LINTED */
5133 (void) sprintf(bin,
5134 "/usr/sbin%.*s/zdb -bcc%s%s -U %s %s",
5135 isalen,
5136 isa,
5137 ztest_opts.zo_verbose >= 3 ? "s" : "",
5138 ztest_opts.zo_verbose >= 4 ? "v" : "",
5139 spa_config_path,
5140 pool);
5141 free(isa);
5142
5143 if (ztest_opts.zo_verbose >= 5)
5144 (void) printf("Executing %s\n", strstr(zdb, "zdb "));
5145
5146 fp = popen(zdb, "r");
5147
5148 while (fgets(zbuf, sizeof (zbuf), fp) != NULL)
5149 if (ztest_opts.zo_verbose >= 3)
5150 (void) printf("%s", zbuf);
5151
5152 status = pclose(fp);
5153
5154 if (status == 0)
5155 return;
5156
5157 ztest_dump_core = 0;
5158 if (WIFEXITED(status))
5159 fatal(0, "'%s' exit code %d", zdb, WEXITSTATUS(status));
5160 else
5161 fatal(0, "'%s' died with signal %d", zdb, WTERMSIG(status));
5162 }
5163
5164 static void
5165 ztest_walk_pool_directory(char *header)
5166 {
5167 spa_t *spa = NULL;
5168
5169 if (ztest_opts.zo_verbose >= 6)
5170 (void) printf("%s\n", header);
5171
5172 mutex_enter(&spa_namespace_lock);
5173 while ((spa = spa_next(spa)) != NULL)
5174 if (ztest_opts.zo_verbose >= 6)
5175 (void) printf("\t%s\n", spa_name(spa));
5176 mutex_exit(&spa_namespace_lock);
5177 }
5178
5179 static void
5180 ztest_spa_import_export(char *oldname, char *newname)
5181 {
5182 nvlist_t *config, *newconfig;
5183 uint64_t pool_guid;
5184 spa_t *spa;
5185 int error;
5186
5187 if (ztest_opts.zo_verbose >= 4) {
5188 (void) printf("import/export: old = %s, new = %s\n",
5189 oldname, newname);
5190 }
5191
5192 /*
5193 * Clean up from previous runs.
5194 */
5195 (void) spa_destroy(newname);
5196
5197 /*
5198 * Get the pool's configuration and guid.
5199 */
5200 VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG));
5201
5202 /*
5203 * Kick off a scrub to tickle scrub/export races.
5204 */
5205 if (ztest_random(2) == 0)
5206 (void) spa_scan(spa, POOL_SCAN_SCRUB);
5207
5208 pool_guid = spa_guid(spa);
5209 spa_close(spa, FTAG);
5210
5211 ztest_walk_pool_directory("pools before export");
5212
5213 /*
5214 * Export it.
5215 */
5216 VERIFY3U(0, ==, spa_export(oldname, &config, B_FALSE, B_FALSE));
5217
5218 ztest_walk_pool_directory("pools after export");
5219
5220 /*
5221 * Try to import it.
5222 */
5223 newconfig = spa_tryimport(config);
5224 ASSERT(newconfig != NULL);
5225 nvlist_free(newconfig);
5226
5227 /*
5228 * Import it under the new name.
5229 */
5230 error = spa_import(newname, config, NULL, 0);
5231 if (error != 0) {
5232 dump_nvlist(config, 0);
5233 fatal(B_FALSE, "couldn't import pool %s as %s: error %u",
5234 oldname, newname, error);
5235 }
5236
5237 ztest_walk_pool_directory("pools after import");
5238
5239 /*
5240 * Try to import it again -- should fail with EEXIST.
5241 */
5242 VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0));
5243
5244 /*
5245 * Try to import it under a different name -- should fail with EEXIST.
5246 */
5247 VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0));
5248
5249 /*
5250 * Verify that the pool is no longer visible under the old name.
5251 */
5252 VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG));
5253
5254 /*
5255 * Verify that we can open and close the pool using the new name.
5256 */
5257 VERIFY3U(0, ==, spa_open(newname, &spa, FTAG));
5258 ASSERT(pool_guid == spa_guid(spa));
5259 spa_close(spa, FTAG);
5260
5261 nvlist_free(config);
5262 }
5263
5264 static void
5265 ztest_resume(spa_t *spa)
5266 {
5267 if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6)
5268 (void) printf("resuming from suspended state\n");
5269 spa_vdev_state_enter(spa, SCL_NONE);
5270 vdev_clear(spa, NULL);
5271 (void) spa_vdev_state_exit(spa, NULL, 0);
5272 (void) zio_resume(spa);
5273 }
5274
5275 static void *
5276 ztest_resume_thread(void *arg)
5277 {
5278 spa_t *spa = arg;
5279
5280 while (!ztest_exiting) {
5281 if (spa_suspended(spa))
5282 ztest_resume(spa);
5283 (void) poll(NULL, 0, 100);
5284 }
5285 return (NULL);
5286 }
5287
5288 static void *
5289 ztest_deadman_thread(void *arg)
5290 {
5291 ztest_shared_t *zs = arg;
5292 int grace = 300;
5293 hrtime_t delta;
5294
5295 delta = (zs->zs_thread_stop - zs->zs_thread_start) / NANOSEC + grace;
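/*
 * delta is the full test window plus the grace period, in seconds;
 * if the process is still alive after sleeping that long, it hung.
 */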
5296
5297 (void) poll(NULL, 0, (int)(1000 * delta));
5298
5299 fatal(0, "failed to complete within %d seconds of deadline", grace);
5300
5301 return (NULL);
5302 }
5303
5304 static void
5305 ztest_execute(int test, ztest_info_t *zi, uint64_t id)
5306 {
5307 ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets];
5308 ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test);
5309 hrtime_t functime = gethrtime();
5310
5311 for (int i = 0; i < zi->zi_iters; i++)
5312 zi->zi_func(zd, id);
5313
5314 functime = gethrtime() - functime;
5315
5316 atomic_add_64(&zc->zc_count, 1);
5317 atomic_add_64(&zc->zc_time, functime);
5318
5319 if (ztest_opts.zo_verbose >= 4) {
5320 Dl_info dli;
5321 (void) dladdr((void *)zi->zi_func, &dli);
5322 (void) printf("%6.2f sec in %s\n",
5323 (double)functime / NANOSEC, dli.dli_sname);
5324 }
5325 }
5326
5327 static void *
5328 ztest_thread(void *arg)
5329 {
5330 int rand;
5331 uint64_t id = (uintptr_t)arg;
5332 ztest_shared_t *zs = ztest_shared;
5333 uint64_t call_next;
5334 hrtime_t now;
5335 ztest_info_t *zi;
5336 ztest_shared_callstate_t *zc;
5337
5338 while ((now = gethrtime()) < zs->zs_thread_stop) {
5339 /*
5340 * See if it's time to force a crash.
5341 */
5342 if (now > zs->zs_thread_kill)
5343 ztest_kill(zs);
5344
5345 /*
5346 * If we're getting ENOSPC with some regularity, stop.
5347 */
5348 if (zs->zs_enospc_count > 10)
5349 break;
5350
5351 /*
5352 * Pick a random function to execute.
5353 */
5354 rand = ztest_random(ZTEST_FUNCS);
5355 zi = &ztest_info[rand];
5356 zc = ZTEST_GET_SHARED_CALLSTATE(rand);
5357 call_next = zc->zc_next;
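/*
 * zc_next is the earliest time this function may run again.
 * Threads race to claim the slot with a CAS; the winner
 * advances zc_next by a random interval (mean zi_interval[0])
 * and runs the test.
 */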
5358
5359 if (now >= call_next &&
5360 atomic_cas_64(&zc->zc_next, call_next, call_next +
5361 ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) {
5362 ztest_execute(rand, zi, id);
5363 }
5364 }
5365
5366 return (NULL);
5367 }
5368
5369 static void
5370 ztest_dataset_name(char *dsname, char *pool, int d)
5371 {
5372 (void) snprintf(dsname, MAXNAMELEN, "%s/ds_%d", pool, d);
5373 }
5374
5375 static void
5376 ztest_dataset_destroy(int d)
5377 {
5378 char name[MAXNAMELEN];
5379
5380 ztest_dataset_name(name, ztest_opts.zo_pool, d);
5381
5382 if (ztest_opts.zo_verbose >= 3)
5383 (void) printf("Destroying %s to free up space\n", name);
5384
5385 /*
* Clean up any non-standard clones and snapshots. In general,
* ztest thread t operates on dataset (t % zo_datasets),
5388 * so there may be more than one thing to clean up.
5389 */
5390 for (int t = d; t < ztest_opts.zo_threads;
5391 t += ztest_opts.zo_datasets) {
5392 ztest_dsl_dataset_cleanup(name, t);
5393 }
5394
5395 (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL,
5396 DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
5397 }
5398
5399 static void
5400 ztest_dataset_dirobj_verify(ztest_ds_t *zd)
5401 {
5402 uint64_t usedobjs, dirobjs, scratch;
5403
5404 /*
5405 * ZTEST_DIROBJ is the object directory for the entire dataset.
5406 * Therefore, the number of objects in use should equal the
5407 * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself.
5408 * If not, we have an object leak.
5409 *
5410 * Note that we can only check this in ztest_dataset_open(),
5411 * when the open-context and syncing-context values agree.
5412 * That's because zap_count() returns the open-context value,
5413 * while dmu_objset_space() returns the rootbp fill count.
5414 */
5415 VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs));
5416 dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch);
5417 ASSERT3U(dirobjs + 1, ==, usedobjs);
5418 }
5419
5420 static int
5421 ztest_dataset_open(int d)
5422 {
5423 ztest_ds_t *zd = &ztest_ds[d];
5424 uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq;
5425 objset_t *os;
5426 zilog_t *zilog;
5427 char name[MAXNAMELEN];
5428 int error;
5429
5430 ztest_dataset_name(name, ztest_opts.zo_pool, d);
5431
5432 (void) rw_rdlock(&ztest_name_lock);
5433
5434 error = ztest_dataset_create(name);
5435 if (error == ENOSPC) {
5436 (void) rw_unlock(&ztest_name_lock);
5437 ztest_record_enospc(FTAG);
5438 return (error);
5439 }
5440 ASSERT(error == 0 || error == EEXIST);
5441
5442 VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, zd, &os));
5443 (void) rw_unlock(&ztest_name_lock);
5444
5445 ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os);
5446
5447 zilog = zd->zd_zilog;
5448
5449 if (zilog->zl_header->zh_claim_lr_seq != 0 &&
5450 zilog->zl_header->zh_claim_lr_seq < committed_seq)
5451 fatal(0, "missing log records: claimed %llu < committed %llu",
5452 zilog->zl_header->zh_claim_lr_seq, committed_seq);
5453
5454 ztest_dataset_dirobj_verify(zd);
5455
5456 zil_replay(os, zd, ztest_replay_vector);
5457
5458 ztest_dataset_dirobj_verify(zd);
5459
5460 if (ztest_opts.zo_verbose >= 6)
5461 (void) printf("%s replay %llu blocks, %llu records, seq %llu\n",
5462 zd->zd_name,
5463 (u_longlong_t)zilog->zl_parse_blk_count,
5464 (u_longlong_t)zilog->zl_parse_lr_count,
5465 (u_longlong_t)zilog->zl_replaying_seq);
5466
5467 zilog = zil_open(os, ztest_get_data);
5468
5469 if (zilog->zl_replaying_seq != 0 &&
5470 zilog->zl_replaying_seq < committed_seq)
5471 fatal(0, "missing log records: replayed %llu < committed %llu",
5472 zilog->zl_replaying_seq, committed_seq);
5473
5474 return (0);
5475 }
5476
5477 static void
5478 ztest_dataset_close(int d)
5479 {
5480 ztest_ds_t *zd = &ztest_ds[d];
5481
5482 zil_close(zd->zd_zilog);
5483 dmu_objset_disown(zd->zd_os, zd);
5484
5485 ztest_zd_fini(zd);
5486 }
5487
5488 /*
5489 * Kick off threads to run tests on all datasets in parallel.
5490 */
5491 static void
5492 ztest_run(ztest_shared_t *zs)
5493 {
5494 thread_t *tid;
5495 spa_t *spa;
5496 objset_t *os;
5497 thread_t resume_tid;
5498 int error;
5499
5500 ztest_exiting = B_FALSE;
5501
5502 /*
5503 * Initialize parent/child shared state.
5504 */
5505 VERIFY(_mutex_init(&ztest_vdev_lock, USYNC_THREAD, NULL) == 0);
5506 VERIFY(rwlock_init(&ztest_name_lock, USYNC_THREAD, NULL) == 0);
5507
5508 zs->zs_thread_start = gethrtime();
5509 zs->zs_thread_stop =
5510 zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC;
5511 zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop);
5512 zs->zs_thread_kill = zs->zs_thread_stop;
5513 if (ztest_random(100) < ztest_opts.zo_killrate) {
5514 zs->zs_thread_kill -=
5515 ztest_random(ztest_opts.zo_passtime * NANOSEC);
5516 }
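/*
 * With probability zo_killrate, the kill time lands at a random
 * point within the pass; otherwise it equals the stop time and
 * never fires.
 */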
5517
5518 (void) _mutex_init(&zcl.zcl_callbacks_lock, USYNC_THREAD, NULL);
5519
5520 list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t),
5521 offsetof(ztest_cb_data_t, zcd_node));
5522
5523 /*
5524 * Open our pool.
5525 */
5526 kernel_init(FREAD | FWRITE);
5527 VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG));
5528 spa->spa_debug = B_TRUE;
5529 ztest_spa = spa;
5530
5531 VERIFY0(dmu_objset_own(ztest_opts.zo_pool,
5532 DMU_OST_ANY, B_TRUE, FTAG, &os));
5533 zs->zs_guid = dmu_objset_fsid_guid(os);
5534 dmu_objset_disown(os, FTAG);
5535
5536 spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN;
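/*
 * This threshold should be reached by the 2 * ZIO_DEDUPDITTO_MIN
 * identical blocks that ztest_ddt_repair() writes, producing the
 * extra DDT copies it heals from.
 */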
5537
5538 /*
5539 * We don't expect the pool to suspend unless maxfaults == 0,
5540 * in which case ztest_fault_inject() temporarily takes away
5541 * the only valid replica.
5542 */
5543 if (MAXFAULTS() == 0)
5544 spa->spa_failmode = ZIO_FAILURE_MODE_WAIT;
5545 else
5546 spa->spa_failmode = ZIO_FAILURE_MODE_PANIC;
5547
5548 /*
5549 * Create a thread to periodically resume suspended I/O.
5550 */
5551 VERIFY(thr_create(0, 0, ztest_resume_thread, spa, THR_BOUND,
5552 &resume_tid) == 0);
5553
5554 /*
5555 * Create a deadman thread to abort() if we hang.
5556 */
5557 VERIFY(thr_create(0, 0, ztest_deadman_thread, zs, THR_BOUND,
5558 NULL) == 0);
5559
5560 /*
* Verify that we can safely inquire about any object,
* whether it's allocated or not. To make it interesting,
* we probe a window of five on either side of each power of two.
5564 * This hits all edge cases, including zero and the max.
5565 */
5566 for (int t = 0; t < 64; t++) {
5567 for (int d = -5; d <= 5; d++) {
5568 error = dmu_object_info(spa->spa_meta_objset,
5569 (1ULL << t) + d, NULL);
5570 ASSERT(error == 0 || error == ENOENT ||
5571 error == EINVAL);
5572 }
5573 }
5574
5575 /*
5576 * If we got any ENOSPC errors on the previous run, destroy something.
5577 */
5578 if (zs->zs_enospc_count != 0) {
5579 int d = ztest_random(ztest_opts.zo_datasets);
5580 ztest_dataset_destroy(d);
5581 }
5582 zs->zs_enospc_count = 0;
5583
5584 tid = umem_zalloc(ztest_opts.zo_threads * sizeof (thread_t),
5585 UMEM_NOFAIL);
5586
5587 if (ztest_opts.zo_verbose >= 4)
5588 (void) printf("starting main threads...\n");
5589
5590 /*
5591 * Kick off all the tests that run in parallel.
5592 */
5593 for (int t = 0; t < ztest_opts.zo_threads; t++) {
5594 if (t < ztest_opts.zo_datasets &&
5595 ztest_dataset_open(t) != 0)
5596 return;
5597 VERIFY(thr_create(0, 0, ztest_thread, (void *)(uintptr_t)t,
5598 THR_BOUND, &tid[t]) == 0);
5599 }
5600
5601 /*
5602 * Wait for all of the tests to complete. We go in reverse order
5603 * so we don't close datasets while threads are still using them.
5604 */
5605 for (int t = ztest_opts.zo_threads - 1; t >= 0; t--) {
5606 VERIFY(thr_join(tid[t], NULL, NULL) == 0);
5607 if (t < ztest_opts.zo_datasets)
5608 ztest_dataset_close(t);
5609 }
5610
5611 txg_wait_synced(spa_get_dsl(spa), 0);
5612
5613 zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
5614 zs->zs_space = metaslab_class_get_space(spa_normal_class(spa));
5615
5616 umem_free(tid, ztest_opts.zo_threads * sizeof (thread_t));
5617
5618 /* Kill the resume thread */
5619 ztest_exiting = B_TRUE;
5620 VERIFY(thr_join(resume_tid, NULL, NULL) == 0);
5621 ztest_resume(spa);
5622
5623 /*
5624 * Right before closing the pool, kick off a bunch of async I/O;
5625 * spa_close() should wait for it to complete.
5626 */
5627 for (uint64_t object = 1; object < 50; object++)
5628 dmu_prefetch(spa->spa_meta_objset, object, 0, 1ULL << 20);
5629
5630 spa_close(spa, FTAG);
5631
5632 /*
5633 * Verify that we can loop over all pools.
5634 */
5635 mutex_enter(&spa_namespace_lock);
5636 for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa))
5637 if (ztest_opts.zo_verbose > 3)
5638 (void) printf("spa_next: found %s\n", spa_name(spa));
5639 mutex_exit(&spa_namespace_lock);
5640
5641 /*
5642 * Verify that we can export the pool and reimport it under a
5643 * different name.
5644 */
5645 if (ztest_random(2) == 0) {
5646 char name[MAXNAMELEN];
5647 (void) snprintf(name, MAXNAMELEN, "%s_import",
5648 ztest_opts.zo_pool);
5649 ztest_spa_import_export(ztest_opts.zo_pool, name);
5650 ztest_spa_import_export(name, ztest_opts.zo_pool);
5651 }
5652
5653 kernel_fini();
5654
5655 list_destroy(&zcl.zcl_callbacks);
5656
5657 (void) _mutex_destroy(&zcl.zcl_callbacks_lock);
5658
5659 (void) rwlock_destroy(&ztest_name_lock);
5660 (void) _mutex_destroy(&ztest_vdev_lock);
5661 }
5662
5663 static void
5664 ztest_freeze(void)
5665 {
5666 ztest_ds_t *zd = &ztest_ds[0];
5667 spa_t *spa;
5668 int numloops = 0;
5669
5670 if (ztest_opts.zo_verbose >= 3)
5671 (void) printf("testing spa_freeze()...\n");
5672
5673 kernel_init(FREAD | FWRITE);
5674 VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
5675 VERIFY3U(0, ==, ztest_dataset_open(0));
5676 spa->spa_debug = B_TRUE;
5677 ztest_spa = spa;
5678
5679 /*
5680 * Force the first log block to be transactionally allocated.
5681 * We have to do this before we freeze the pool -- otherwise
5682 * the log chain won't be anchored.
5683 */
5684 while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) {
5685 ztest_dmu_object_alloc_free(zd, 0);
5686 zil_commit(zd->zd_zilog, 0);
5687 }
5688
5689 txg_wait_synced(spa_get_dsl(spa), 0);
5690
5691 /*
5692 * Freeze the pool. This stops spa_sync() from doing anything,
5693 * so that the only way to record changes from now on is the ZIL.
5694 */
5695 spa_freeze(spa);
5696
5697 /*
5698 * Run tests that generate log records but don't alter the pool config
5699 * or depend on DSL sync tasks (snapshots, objset create/destroy, etc).
5700 * We do a txg_wait_synced() after each iteration to force the txg
5701 * to increase well beyond the last synced value in the uberblock.
5702 * The ZIL should be OK with that.
5703 */
5704 while (ztest_random(10) != 0 &&
5705 numloops++ < ztest_opts.zo_maxloops) {
5706 ztest_dmu_write_parallel(zd, 0);
5707 ztest_dmu_object_alloc_free(zd, 0);
5708 txg_wait_synced(spa_get_dsl(spa), 0);
5709 }
5710
5711 /*
5712 * Commit all of the changes we just generated.
5713 */
5714 zil_commit(zd->zd_zilog, 0);
5715 txg_wait_synced(spa_get_dsl(spa), 0);
5716
5717 /*
5718 * Close our dataset and close the pool.
5719 */
5720 ztest_dataset_close(0);
5721 spa_close(spa, FTAG);
5722 kernel_fini();
5723
5724 /*
5725 * Open and close the pool and dataset to induce log replay.
5726 */
5727 kernel_init(FREAD | FWRITE);
5728 VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
5729 ASSERT(spa_freeze_txg(spa) == UINT64_MAX);
5730 VERIFY3U(0, ==, ztest_dataset_open(0));
5731 ztest_dataset_close(0);
5732
5733 spa->spa_debug = B_TRUE;
5734 ztest_spa = spa;
5735 txg_wait_synced(spa_get_dsl(spa), 0);
5736 ztest_reguid(NULL, 0);
5737
5738 spa_close(spa, FTAG);
5739 kernel_fini();
5740 }
5741
5742 void
5743 print_time(hrtime_t t, char *timebuf)
5744 {
5745 hrtime_t s = t / NANOSEC;
5746 hrtime_t m = s / 60;
5747 hrtime_t h = m / 60;
5748 hrtime_t d = h / 24;
5749
5750 s -= m * 60;
5751 m -= h * 60;
5752 h -= d * 24;
5753
5754 timebuf[0] = '\0';
5755
5756 if (d)
5757 (void) sprintf(timebuf,
5758 "%llud%02lluh%02llum%02llus", d, h, m, s);
5759 else if (h)
5760 (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s);
5761 else if (m)
5762 (void) sprintf(timebuf, "%llum%02llus", m, s);
5763 else
5764 (void) sprintf(timebuf, "%llus", s);
5765 }
5766
5767 static nvlist_t *
make_random_props(void)
5769 {
5770 nvlist_t *props;
5771
5772 VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
5773 if (ztest_random(2) == 0)
5774 return (props);
5775 VERIFY(nvlist_add_uint64(props, "autoreplace", 1) == 0);
5776
5777 return (props);
5778 }
5779
5780 /*
5781 * Create a storage pool with the given name and initial vdev size.
5782 * Then test spa_freeze() functionality.
5783 */
5784 static void
5785 ztest_init(ztest_shared_t *zs)
5786 {
5787 spa_t *spa;
5788 nvlist_t *nvroot, *props;
5789
5790 VERIFY(_mutex_init(&ztest_vdev_lock, USYNC_THREAD, NULL) == 0);
5791 VERIFY(rwlock_init(&ztest_name_lock, USYNC_THREAD, NULL) == 0);
5792
5793 kernel_init(FREAD | FWRITE);
5794
5795 /*
5796 * Create the storage pool.
5797 */
5798 (void) spa_destroy(ztest_opts.zo_pool);
5799 ztest_shared->zs_vdev_next_leaf = 0;
5800 zs->zs_splits = 0;
5801 zs->zs_mirrors = ztest_opts.zo_mirrors;
5802 nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
5803 0, ztest_opts.zo_raidz, zs->zs_mirrors, 1);
5804 props = make_random_props();
5805 for (int i = 0; i < SPA_FEATURES; i++) {
5806 char buf[1024];
5807 (void) snprintf(buf, sizeof (buf), "feature@%s",
5808 spa_feature_table[i].fi_uname);
5809 VERIFY3U(0, ==, nvlist_add_uint64(props, buf, 0));
5810 }
5811 VERIFY3U(0, ==, spa_create(ztest_opts.zo_pool, nvroot, props, NULL));
5812 nvlist_free(nvroot);
5813
5814 VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
5815 zs->zs_metaslab_sz =
5816 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift;
5817
5818 spa_close(spa, FTAG);
5819
5820 kernel_fini();
5821
5822 ztest_run_zdb(ztest_opts.zo_pool);
5823
5824 ztest_freeze();
5825
5826 ztest_run_zdb(ztest_opts.zo_pool);
5827
5828 (void) rwlock_destroy(&ztest_name_lock);
5829 (void) _mutex_destroy(&ztest_vdev_lock);
5830 }
5831
5832 static void
5833 setup_data_fd(void)
5834 {
5835 static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX";
5836
5837 ztest_fd_data = mkstemp(ztest_name_data);
5838 ASSERT3S(ztest_fd_data, >=, 0);
5839 (void) unlink(ztest_name_data);
5840 }
5841
5842
5843 static int
5844 shared_data_size(ztest_shared_hdr_t *hdr)
5845 {
5846 int size;
5847
5848 size = hdr->zh_hdr_size;
5849 size += hdr->zh_opts_size;
5850 size += hdr->zh_size;
5851 size += hdr->zh_stats_size * hdr->zh_stats_count;
5852 size += hdr->zh_ds_size * hdr->zh_ds_count;
5853
5854 return (size);
5855 }
5856
5857 static void
5858 setup_hdr(void)
5859 {
5860 int size;
5861 ztest_shared_hdr_t *hdr;
5862
5863 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()),
5864 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0);
5865 ASSERT(hdr != MAP_FAILED);
5866
5867 VERIFY3U(0, ==, ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t)));
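
/*
 * Size the file in two steps: first just the header, then, once the
 * component sizes are recorded in it, the full shared-data size.
 */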
5868
5869 hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t);
5870 hdr->zh_opts_size = sizeof (ztest_shared_opts_t);
5871 hdr->zh_size = sizeof (ztest_shared_t);
5872 hdr->zh_stats_size = sizeof (ztest_shared_callstate_t);
5873 hdr->zh_stats_count = ZTEST_FUNCS;
5874 hdr->zh_ds_size = sizeof (ztest_shared_ds_t);
5875 hdr->zh_ds_count = ztest_opts.zo_datasets;
5876
5877 size = shared_data_size(hdr);
5878 VERIFY3U(0, ==, ftruncate(ztest_fd_data, size));
5879
5880 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize()));
5881 }
5882
5883 static void
5884 setup_data(void)
5885 {
5886 int size, offset;
5887 ztest_shared_hdr_t *hdr;
5888 uint8_t *buf;
5889
5890 hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()),
5891 PROT_READ, MAP_SHARED, ztest_fd_data, 0);
5892 ASSERT(hdr != MAP_FAILED);
5893
5894 size = shared_data_size(hdr);
5895
5896 (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize()));
5897 hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()),
5898 PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0);
5899 ASSERT(hdr != MAP_FAILED);
5900 buf = (uint8_t *)hdr;
5901
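
/*
 * Carve the mapping into its regions in the same order used by
 * shared_data_size(): header, options, shared state, per-function
 * call state, per-dataset state.
 */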
5902 offset = hdr->zh_hdr_size;
5903 ztest_shared_opts = (void *)&buf[offset];
5904 offset += hdr->zh_opts_size;
5905 ztest_shared = (void *)&buf[offset];
5906 offset += hdr->zh_size;
5907 ztest_shared_callstate = (void *)&buf[offset];
5908 offset += hdr->zh_stats_size * hdr->zh_stats_count;
5909 ztest_shared_ds = (void *)&buf[offset];
5910 }
5911
static boolean_t
exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp)
{
	pid_t pid;
	int status;
	char *cmdbuf = NULL;

	pid = fork();

	if (cmd == NULL) {
		cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL);
		(void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN);
		cmd = cmdbuf;
	}

	if (pid == -1)
		fatal(1, "fork failed");

	if (pid == 0) {	/* child */
		char *emptyargv[2] = { cmd, NULL };
		char fd_data_str[12];

		/* Limit the child to 1024 open files. */
		struct rlimit rl = { 1024, 1024 };
		(void) setrlimit(RLIMIT_NOFILE, &rl);

		/* The child reopens the random device in main(). */
		(void) close(ztest_fd_rand);
		VERIFY3U(11, >=,
		    snprintf(fd_data_str, 12, "%d", ztest_fd_data));
		VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1));

		(void) enable_extended_FILE_stdio(-1, -1);
		if (libpath != NULL)
			VERIFY0(setenv("LD_LIBRARY_PATH", libpath, 1));
		(void) execv(cmd, emptyargv);
		ztest_dump_core = B_FALSE;
		fatal(B_TRUE, "exec failed: %s", cmd);
	}

	if (cmdbuf != NULL) {
		umem_free(cmdbuf, MAXPATHLEN);
		cmd = NULL;
	}

	while (waitpid(pid, &status, 0) != pid)
		continue;
	if (statusp != NULL)
		*statusp = status;

	if (WIFEXITED(status)) {
		if (WEXITSTATUS(status) != 0) {
			(void) fprintf(stderr, "child exited with code %d\n",
			    WEXITSTATUS(status));
			exit(2);
		}
		return (B_FALSE);
	} else if (WIFSIGNALED(status)) {
		if (!ignorekill || WTERMSIG(status) != SIGKILL) {
			(void) fprintf(stderr, "child died with signal %d\n",
			    WTERMSIG(status));
			exit(3);
		}
		return (B_TRUE);
	} else {
		(void) fprintf(stderr, "something strange happened to child\n");
		exit(4);
		/* NOTREACHED */
	}
}

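/*
 * Pool initialization pass, run in a child process: remove any stale
 * zpool.cache, then build the storage pool from scratch, zo_init
 * times in total.
 */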
static void
ztest_run_init(void)
{
	ztest_shared_t *zs = ztest_shared;

	ASSERT(ztest_opts.zo_init != 0);

	/*
	 * Blow away any existing copy of zpool.cache.
	 */
	(void) remove(spa_config_path);

	/*
	 * Create and initialize our storage pool.
	 */
	for (int i = 1; i <= ztest_opts.zo_init; i++) {
		bzero(zs, sizeof (ztest_shared_t));
		if (ztest_opts.zo_verbose >= 3 &&
		    ztest_opts.zo_init != 1) {
			(void) printf("ztest_init(), pass %d\n", i);
		}
		ztest_init(zs);
	}
}

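/*
 * The top-level driver.  A first invocation parses options, creates
 * the shared data segment, and then re-executes ztest in child
 * processes: once for pool initialization, then repeatedly for the
 * workload until the requested run time expires.  A re-invocation
 * (identified by ZTEST_FD_DATA in the environment) attaches to the
 * existing segment and runs the init pass or the workload directly.
 */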
int
main(int argc, char **argv)
{
	int kills = 0;
	int iters = 0;
	int older = 0;
	int newer = 0;
	ztest_shared_t *zs;
	ztest_info_t *zi;
	ztest_shared_callstate_t *zc;
	char timebuf[100];
	char numbuf[6];
	spa_t *spa;
	char *cmd;
	boolean_t hasalt;
	char *fd_data_str = getenv("ZTEST_FD_DATA");

	(void) setvbuf(stdout, NULL, _IOLBF, 0);

	dprintf_setup(&argc, argv);

	ztest_fd_rand = open("/dev/urandom", O_RDONLY);
	ASSERT3S(ztest_fd_rand, >=, 0);

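	/*
	 * ZTEST_FD_DATA in the environment means we were re-executed by
	 * a driver process and should attach to its shared data segment;
	 * otherwise we are the driver and must create it ourselves.
	 */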
	if (!fd_data_str) {
		process_options(argc, argv);

		setup_data_fd();
		setup_hdr();
		setup_data();
		bcopy(&ztest_opts, ztest_shared_opts,
		    sizeof (*ztest_shared_opts));
	} else {
		ztest_fd_data = atoi(fd_data_str);
		setup_data();
		bcopy(ztest_shared_opts, &ztest_opts, sizeof (ztest_opts));
	}
	ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count);

	/* Override location of zpool.cache */
	VERIFY3U(asprintf((char **)&spa_config_path, "%s/zpool.cache",
	    ztest_opts.zo_dir), !=, -1);

	ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t),
	    UMEM_NOFAIL);
	zs = ztest_shared;

	if (fd_data_str) {
		/*
		 * Child mode: adopt the tunables the driver chose, then
		 * run either the one-time init pass or the workload.
		 */
		metaslab_gang_bang = ztest_opts.zo_metaslab_gang_bang;
		metaslab_df_alloc_threshold =
		    zs->zs_metaslab_df_alloc_threshold;

		if (zs->zs_do_init)
			ztest_run_init();
		else
			ztest_run(zs);
		exit(0);
	}

	hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0);

	if (ztest_opts.zo_verbose >= 1) {
		(void) printf("%llu vdevs, %d datasets, %d threads,"
		    " %llu seconds...\n",
		    (u_longlong_t)ztest_opts.zo_vdevs,
		    ztest_opts.zo_datasets,
		    ztest_opts.zo_threads,
		    (u_longlong_t)ztest_opts.zo_time);
	}

	cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL);
	(void) strlcpy(cmd, getexecname(), MAXNAMELEN);

	zs->zs_do_init = B_TRUE;
	if (hasalt) {
		if (ztest_opts.zo_verbose >= 1) {
			(void) printf("Executing older ztest for "
			    "initialization: %s\n", ztest_opts.zo_alt_ztest);
		}
		VERIFY(!exec_child(ztest_opts.zo_alt_ztest,
		    ztest_opts.zo_alt_libpath, B_FALSE, NULL));
	} else {
		VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL));
	}
	zs->zs_do_init = B_FALSE;

	zs->zs_proc_start = gethrtime();
	zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC;

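	/*
	 * Schedule the first call to each test function.  A function
	 * whose minimum interval I fits in the run gets a start time
	 * drawn uniformly from [start, start + 2I] (averaging one call
	 * per interval); one that cannot fit is disabled by pushing
	 * zc_next out to UINT64_MAX.
	 */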
	for (int f = 0; f < ZTEST_FUNCS; f++) {
		zi = &ztest_info[f];
		zc = ZTEST_GET_SHARED_CALLSTATE(f);
		if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop)
			zc->zc_next = UINT64_MAX;
		else
			zc->zc_next = zs->zs_proc_start +
			    ztest_random(2 * zi->zi_interval[0] + 1);
	}

	/*
	 * Run the tests in a loop.  These tests include fault injection
	 * to verify that self-healing data works, and forced crashes
	 * to verify that we never lose on-disk consistency.
	 */
	while (gethrtime() < zs->zs_proc_stop) {
		int status;
		boolean_t killed;

		/*
		 * Initialize the workload counters for each function.
		 */
		for (int f = 0; f < ZTEST_FUNCS; f++) {
			zc = ZTEST_GET_SHARED_CALLSTATE(f);
			zc->zc_count = 0;
			zc->zc_time = 0;
		}

		/* Randomize the metaslab allocation switch size. */
		zs->zs_metaslab_df_alloc_threshold =
		    ztest_random(zs->zs_metaslab_sz / 4) + 1;

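		/*
		 * Choose the binary for this pass: with no alternate
		 * ztest we always run ourselves; otherwise flip a coin
		 * between the newer (current) and older binaries to
		 * exercise cross-version on-disk compatibility.
		 */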
		if (!hasalt || ztest_random(2) == 0) {
			if (hasalt && ztest_opts.zo_verbose >= 1) {
				(void) printf("Executing newer ztest: %s\n",
				    cmd);
			}
			newer++;
			killed = exec_child(cmd, NULL, B_TRUE, &status);
		} else {
			if (hasalt && ztest_opts.zo_verbose >= 1) {
				(void) printf("Executing older ztest: %s\n",
				    ztest_opts.zo_alt_ztest);
			}
			older++;
			killed = exec_child(ztest_opts.zo_alt_ztest,
			    ztest_opts.zo_alt_libpath, B_TRUE, &status);
		}

		if (killed)
			kills++;
		iters++;

		if (ztest_opts.zo_verbose >= 1) {
			hrtime_t now = gethrtime();

			now = MIN(now, zs->zs_proc_stop);
			print_time(zs->zs_proc_stop - now, timebuf);
			nicenum(zs->zs_space, numbuf);

			(void) printf("Pass %3d, %8s, %3llu ENOSPC, "
			    "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n",
			    iters,
			    WIFEXITED(status) ? "Complete" : "SIGKILL",
			    (u_longlong_t)zs->zs_enospc_count,
			    100.0 * zs->zs_alloc / zs->zs_space,
			    numbuf,
			    100.0 * (now - zs->zs_proc_start) /
			    (ztest_opts.zo_time * NANOSEC), timebuf);
		}

		if (ztest_opts.zo_verbose >= 2) {
			(void) printf("\nWorkload summary:\n\n");
			(void) printf("%7s %9s %s\n",
			    "Calls", "Time", "Function");
			(void) printf("%7s %9s %s\n",
			    "-----", "----", "--------");
			for (int f = 0; f < ZTEST_FUNCS; f++) {
				Dl_info dli;

				zi = &ztest_info[f];
				zc = ZTEST_GET_SHARED_CALLSTATE(f);
				print_time(zc->zc_time, timebuf);
				(void) dladdr((void *)zi->zi_func, &dli);
				(void) printf("%7llu %9s %s\n",
				    (u_longlong_t)zc->zc_count, timebuf,
				    dli.dli_sname);
			}
			(void) printf("\n");
		}

		/*
		 * It's possible that we killed a child during a rename test,
		 * in which case we'll have a '<pool>_tmp' pool lying around
		 * instead of the pool we expect.  Do a blind rename in case
		 * this happened.
		 */
		kernel_init(FREAD);
		if (spa_open(ztest_opts.zo_pool, &spa, FTAG) == 0) {
			spa_close(spa, FTAG);
		} else {
			char tmpname[MAXNAMELEN];
			kernel_fini();
			kernel_init(FREAD | FWRITE);
			(void) snprintf(tmpname, sizeof (tmpname), "%s_tmp",
			    ztest_opts.zo_pool);
			(void) spa_rename(tmpname, ztest_opts.zo_pool);
		}
		kernel_fini();

		ztest_run_zdb(ztest_opts.zo_pool);
	}

	if (ztest_opts.zo_verbose >= 1) {
		if (hasalt) {
			(void) printf("%d runs of older ztest: %s\n", older,
			    ztest_opts.zo_alt_ztest);
			(void) printf("%d runs of newer ztest: %s\n", newer,
			    cmd);
		}
		(void) printf("%d killed, %d completed, %.0f%% kill rate\n",
		    kills, iters - kills, (100.0 * kills) / MAX(1, iters));
	}

	umem_free(cmd, MAXNAMELEN);

	return (0);
}