49 * requirement. In the event of a panic or power fail then those log
50 * records (transactions) are replayed.
51 *
52 * There is one ZIL per file system. Its on-disk (pool) format consists
53 * of 3 parts:
54 *
55 * - ZIL header
56 * - ZIL blocks
57 * - ZIL records
58 *
59 * A log record holds a system call transaction. Log blocks can
60 * hold many log records and the blocks are chained together.
61 * Each ZIL block contains a block pointer (blkptr_t) to the next
62 * ZIL block in the chain. The ZIL header points to the first
63 * block in the chain. Note there is not a fixed place in the pool
64 * to hold blocks. They are dynamically allocated and freed as
 * needed from the blocks available.
66 */
67
68 /*
69 * This global ZIL switch affects all pools
70 */
71 int zil_replay_disable = 0; /* disable intent logging replay */
72
73 /*
74 * Tunable parameter for debugging or performance analysis. Setting
75 * zfs_nocacheflush will cause corruption on power loss if a volatile
76 * out-of-order write cache is enabled.
77 */
78 boolean_t zfs_nocacheflush = B_FALSE;
79
80 static kmem_cache_t *zil_lwb_cache;
81
82 static void zil_async_to_sync(zilog_t *zilog, uint64_t foid);
83
84 #define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
85 sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused))
86
87
88 /*
89 * ziltest is by and large an ugly hack, but very useful in
90 * checking replay without tedious work.
91 * When running ziltest we want to keep all itx's and so maintain
862 zbookmark_t zb;
863
864 SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
865 ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
866 lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
867
868 if (zilog->zl_root_zio == NULL) {
869 zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL,
870 ZIO_FLAG_CANFAIL);
871 }
872 if (lwb->lwb_zio == NULL) {
873 lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
874 0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk),
875 zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE,
876 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
877 }
878 }
879
880 /*
881 * Define a limited set of intent log block sizes.
882 * These must be a multiple of 4KB. Note only the amount used (again
883 * aligned to 4KB) actually gets written. However, we can't always just
884 * allocate SPA_MAXBLOCKSIZE as the slog space could be exhausted.
885 */
886 uint64_t zil_block_buckets[] = {
887 4096, /* non TX_WRITE */
888 8192+4096, /* data base */
889 32*1024 + 4096, /* NFS writes */
890 UINT64_MAX
891 };
892
893 /*
894 * Use the slog as long as the logbias is 'latency' and the current commit size
895 * is less than the limit or the total list size is less than 2X the limit.
896 * Limit checking is disabled by setting zil_slog_limit to UINT64_MAX.
897 */
898 uint64_t zil_slog_limit = 1024 * 1024;
899 #define USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) && \
900 (((zilog)->zl_cur_used < zil_slog_limit) || \
901 ((zilog)->zl_itx_list_sz < (zil_slog_limit << 1))))
|
49 * requirement. In the event of a panic or power fail then those log
50 * records (transactions) are replayed.
51 *
52 * There is one ZIL per file system. Its on-disk (pool) format consists
53 * of 3 parts:
54 *
55 * - ZIL header
56 * - ZIL blocks
57 * - ZIL records
58 *
59 * A log record holds a system call transaction. Log blocks can
60 * hold many log records and the blocks are chained together.
61 * Each ZIL block contains a block pointer (blkptr_t) to the next
62 * ZIL block in the chain. The ZIL header points to the first
63 * block in the chain. Note there is not a fixed place in the pool
64 * to hold blocks. They are dynamically allocated and freed as
 * needed from the blocks available.
66 */
67
68 /*
69 * Disable intent logging replay. This global ZIL switch affects all pools.
70 */
71 int zil_replay_disable = 0;
72
73 /*
74 * Tunable parameter for debugging or performance analysis. Setting
75 * zfs_nocacheflush will cause corruption on power loss if a volatile
76 * out-of-order write cache is enabled.
77 */
78 boolean_t zfs_nocacheflush = B_FALSE;
79
80 static kmem_cache_t *zil_lwb_cache;
81
82 static void zil_async_to_sync(zilog_t *zilog, uint64_t foid);
83
84 #define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
85 sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused))
86
87
88 /*
89 * ziltest is by and large an ugly hack, but very useful in
90 * checking replay without tedious work.
91 * When running ziltest we want to keep all itx's and so maintain
862 zbookmark_t zb;
863
864 SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
865 ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
866 lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
867
868 if (zilog->zl_root_zio == NULL) {
869 zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL,
870 ZIO_FLAG_CANFAIL);
871 }
872 if (lwb->lwb_zio == NULL) {
873 lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
874 0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk),
875 zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE,
876 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
877 }
878 }
879
880 /*
881 * Define a limited set of intent log block sizes.
882 *
883 * These must be a multiple of 4KB. Note only the amount used (again
884 * aligned to 4KB) actually gets written. However, we can't always just
885 * allocate SPA_MAXBLOCKSIZE as the slog space could be exhausted.
886 */
887 uint64_t zil_block_buckets[] = {
888 4096, /* non TX_WRITE */
889 8192+4096, /* data base */
890 32*1024 + 4096, /* NFS writes */
891 UINT64_MAX
892 };
893
894 /*
895 * Use the slog as long as the logbias is 'latency' and the current commit size
896 * is less than the limit or the total list size is less than 2X the limit.
897 * Limit checking is disabled by setting zil_slog_limit to UINT64_MAX.
898 */
899 uint64_t zil_slog_limit = 1024 * 1024;
900 #define USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) && \
901 (((zilog)->zl_cur_used < zil_slog_limit) || \
902 ((zilog)->zl_itx_list_sz < (zil_slog_limit << 1))))
|