Print this page
4101 metaslab_debug should allow for fine-grained control
4102 space_maps should store more information about themselves
4103 space map object blocksize should be increased
4104 ::spa_space no longer works
4105 removing a mirrored log device results in a leaked object
4106 asynchronously load metaslab
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Sebastien Roy <seb@delphix.com>
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/fs/zfs/zfeature.c
+++ new/usr/src/uts/common/fs/zfs/zfeature.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2013 by Delphix. All rights reserved.
24 24 */
25 25
26 26 #include <sys/zfs_context.h>
27 27 #include <sys/zfeature.h>
28 28 #include <sys/dmu.h>
29 29 #include <sys/nvpair.h>
30 30 #include <sys/zap.h>
31 31 #include <sys/dmu_tx.h>
32 32 #include "zfeature_common.h"
33 33 #include <sys/spa_impl.h>
34 34
35 35 /*
36 36 * ZFS Feature Flags
37 37 * -----------------
38 38 *
39 39 * ZFS feature flags are used to provide fine-grained versioning to the ZFS
40 40 * on-disk format. Once enabled on a pool, feature flags replace the old
41 41 * spa_version() number.
42 42 *
43 43 * Each new on-disk format change will be given a uniquely identifying string
44 44 * guid rather than a version number. This avoids the problem of different
45 45 * organizations creating new on-disk formats with the same version number. To
46 46 * keep feature guids unique they should consist of the reverse dns name of the
47 47 * organization which implemented the feature and a short name for the feature,
48 48 * separated by a colon (e.g. com.delphix:async_destroy).
49 49 *
50 50 * Reference Counts
51 51 * ----------------
52 52 *
53 53 * Within each pool features can be in one of three states: disabled, enabled,
54 54 * or active. These states are differentiated by a reference count stored on
55 55 * disk for each feature:
56 56 *
57 57 * 1) If there is no reference count stored on disk the feature is disabled.
58 58 * 2) If the reference count is 0 a system administrator has enabled the
59 59 * feature, but the feature has not been used yet, so no on-disk
60 60 * format changes have been made.
61 61 * 3) If the reference count is greater than 0 the feature is active.
62 62 * The format changes required by the feature are currently on disk.
63 63 * Note that if the feature's format changes are reversed the feature
64 64 * may choose to set its reference count back to 0.
65 65 *
66 66 * Feature flags make no differentiation between non-zero reference counts
67 67 * for an active feature (e.g. a reference count of 1 means the same thing as a
68 68 * reference count of 27834721), but feature implementations may choose to use
69 69 * the reference count to store meaningful information. For example, a new RAID
70 70 * implementation might set the reference count to the number of vdevs using
71 71 * it. If all those disks are removed from the pool the feature goes back to
72 72 * having a reference count of 0.
73 73 *
74 74 * It is the responsibility of the individual features to maintain a non-zero
75 75 * reference count as long as the feature's format changes are present on disk.
76 76 *
77 77 * Dependencies
78 78 * ------------
79 79 *
80 80 * Each feature may depend on other features. The only effect of this
81 81 * relationship is that when a feature is enabled all of its dependencies are
82 82 * automatically enabled as well. Any future work to support disabling of
83 83 * features would need to ensure that features cannot be disabled if other
84 84 * enabled features depend on them.
85 85 *
86 86 * On-disk Format
87 87 * --------------
88 88 *
89 89 * When feature flags are enabled spa_version() is set to SPA_VERSION_FEATURES
90 90 * (5000). In order for this to work the pool is automatically upgraded to
91 91 * SPA_VERSION_BEFORE_FEATURES (28) first, so all pre-feature-flags on-disk
92 92 * format changes will be in use.
93 93 *
94 94 * Information about features is stored in 3 ZAP objects in the pool's MOS.
95 95 * These objects are linked to by the following names in the pool directory
96 96 * object:
97 97 *
98 98 * 1) features_for_read: feature guid -> reference count
99 99 * Features needed to open the pool for reading.
100 100 * 2) features_for_write: feature guid -> reference count
101 101 * Features needed to open the pool for writing.
102 102 * 3) feature_descriptions: feature guid -> descriptive string
103 103 * A human readable string.
104 104 *
105 105 * All enabled features appear in either features_for_read or
106 106 * features_for_write, but not both.
107 107 *
108 108 * To open a pool in read-only mode only the features listed in
109 109 * features_for_read need to be supported.
110 110 *
111 111 * To open the pool in read-write mode features in both features_for_read and
112 112 * features_for_write need to be supported.
113 113 *
114 114 * Some features may be required to read the ZAP objects containing feature
115 115 * information. To allow software to check for compatibility with these features
116 116 * before the pool is opened their names must be stored in the label in a
117 117 * new "features_for_read" entry (note that features that are only required
118 118 * to write to a pool never need to be stored in the label since the
119 119 * features_for_write ZAP object can be read before the pool is written to).
120 120 * To save space in the label features must be explicitly marked as needing to
121 121 * be written to the label. Also, reference counts are not stored in the label,
122 122 * instead any feature whose reference count drops to 0 is removed from the
123 123 * label.
124 124 *
125 125 * Adding New Features
126 126 * -------------------
127 127 *
128 128 * Features must be registered in the zpool_feature_init() function in
129 129 * zfeature_common.c using the zfeature_register() function. This function
130 130 * has arguments to specify if the feature should be stored in the
131 131 * features_for_read or features_for_write ZAP object and if it needs to be
132 132 * written to the label when active.
133 133 *
134 134 * Once a feature is registered it will appear as a "feature@<feature name>"
135 135 * property which can be set by an administrator. Feature implementors should
136 136 * use the spa_feature_is_enabled() and spa_feature_is_active() functions to
137 137 * query the state of a feature and the spa_feature_incr() and
138 138 * spa_feature_decr() functions to change an enabled feature's reference count.
139 139 * Reference counts may only be updated in the syncing context.
140 140 *
141 141 * Features may not perform enable-time initialization. Instead, any such
142 142 * initialization should occur when the feature is first used. This design
143 143 * enforces that on-disk changes be made only when features are used. Code
144 144 * should only check if a feature is enabled using spa_feature_is_enabled(),
145 145 * not by relying on any feature specific metadata existing. If a feature is
146 146 * enabled, but the feature's metadata is not on disk yet then it should be
147 147 * created as needed.
148 148 *
149 149 * As an example, consider the com.delphix:async_destroy feature. This feature
150 150 * relies on the existence of a bptree in the MOS that stores blocks for
151 151 * asynchronous freeing. This bptree is not created when async_destroy is
152 152 * enabled. Instead, when a dataset is destroyed spa_feature_is_enabled() is
153 153 * called to check if async_destroy is enabled. If it is and the bptree object
154 154 * does not exist yet, the bptree object is created as part of the dataset
155 155 * destroy and async_destroy's reference count is incremented to indicate it
156 156 * has made an on-disk format change. Later, after the destroyed dataset's
157 157 * blocks have all been asynchronously freed there is no longer any use for the
158 158 * bptree object, so it is destroyed and async_destroy's reference count is
159 159 * decremented back to 0 to indicate that it has undone its on-disk format
160 160 * changes.
161 161 */
162 162
163 163 typedef enum {
164 164 FEATURE_ACTION_ENABLE,
165 165 FEATURE_ACTION_INCR,
166 166 FEATURE_ACTION_DECR,
167 167 } feature_action_t;
168 168
169 169 /*
170 170 * Checks that the features active in the specified object are supported by
171 171 * this software. Adds each unsupported feature (name -> description) to
172 172 * the supplied nvlist.
173 173 */
174 174 boolean_t
175 175 feature_is_supported(objset_t *os, uint64_t obj, uint64_t desc_obj,
176 176 nvlist_t *unsup_feat, nvlist_t *enabled_feat)
177 177 {
178 178 boolean_t supported;
179 179 zap_cursor_t zc;
180 180 zap_attribute_t za;
181 181
182 182 supported = B_TRUE;
183 183 for (zap_cursor_init(&zc, os, obj);
184 184 zap_cursor_retrieve(&zc, &za) == 0;
185 185 zap_cursor_advance(&zc)) {
186 186 ASSERT(za.za_integer_length == sizeof (uint64_t) &&
187 187 za.za_num_integers == 1);
188 188
189 189 if (NULL != enabled_feat) {
190 190 fnvlist_add_uint64(enabled_feat, za.za_name,
191 191 za.za_first_integer);
192 192 }
193 193
194 194 if (za.za_first_integer != 0 &&
195 195 !zfeature_is_supported(za.za_name)) {
196 196 supported = B_FALSE;
197 197
198 198 if (NULL != unsup_feat) {
199 199 char *desc = "";
200 200 char buf[MAXPATHLEN];
201 201
202 202 if (zap_lookup(os, desc_obj, za.za_name,
203 203 1, sizeof (buf), buf) == 0)
204 204 desc = buf;
205 205
206 206 VERIFY(nvlist_add_string(unsup_feat, za.za_name,
207 207 desc) == 0);
208 208 }
209 209 }
210 210 }
211 211 zap_cursor_fini(&zc);
212 212
213 213 return (supported);
214 214 }
215 215
216 216 static int
217 217 feature_get_refcount(objset_t *os, uint64_t read_obj, uint64_t write_obj,
218 218 zfeature_info_t *feature, uint64_t *res)
219 219 {
220 220 int err;
221 221 uint64_t refcount;
222 222 uint64_t zapobj = feature->fi_can_readonly ? write_obj : read_obj;
223 223
224 224 /*
225 225 * If the pool is currently being created, the feature objects may not
226 226 * have been allocated yet. Act as though all features are disabled.
227 227 */
228 228 if (zapobj == 0)
229 229 return (SET_ERROR(ENOTSUP));
230 230
231 231 err = zap_lookup(os, zapobj, feature->fi_guid, sizeof (uint64_t), 1,
232 232 &refcount);
233 233 if (err != 0) {
234 234 if (err == ENOENT)
235 235 return (SET_ERROR(ENOTSUP));
236 236 else
237 237 return (err);
238 238 }
239 239 *res = refcount;
240 240 return (0);
241 241 }
242 242
243 243 static int
244 244 feature_do_action(objset_t *os, uint64_t read_obj, uint64_t write_obj,
245 245 uint64_t desc_obj, zfeature_info_t *feature, feature_action_t action,
246 246 dmu_tx_t *tx)
247 247 {
248 248 int error;
249 249 uint64_t refcount;
250 250 uint64_t zapobj = feature->fi_can_readonly ? write_obj : read_obj;
251 251
252 252 ASSERT(0 != zapobj);
253 253 ASSERT(zfeature_is_valid_guid(feature->fi_guid));
254 254
255 255 error = zap_lookup(os, zapobj, feature->fi_guid,
256 256 sizeof (uint64_t), 1, &refcount);
257 257
258 258 /*
259 259 * If we can't ascertain the status of the specified feature, an I/O
260 260 * error occurred.
261 261 */
262 262 if (error != 0 && error != ENOENT)
263 263 return (error);
264 264
265 265 switch (action) {
266 266 case FEATURE_ACTION_ENABLE:
267 267 /*
268 268 * If the feature is already enabled, ignore the request.
269 269 */
270 270 if (error == 0)
271 271 return (0);
272 272 refcount = 0;
273 273 break;
274 274 case FEATURE_ACTION_INCR:
275 275 if (error == ENOENT)
276 276 return (SET_ERROR(ENOTSUP));
277 277 if (refcount == UINT64_MAX)
278 278 return (SET_ERROR(EOVERFLOW));
279 279 refcount++;
280 280 break;
281 281 case FEATURE_ACTION_DECR:
282 282 if (error == ENOENT)
283 283 return (SET_ERROR(ENOTSUP));
284 284 if (refcount == 0)
285 285 return (SET_ERROR(EOVERFLOW));
286 286 refcount--;
287 287 break;
288 288 default:
289 289 ASSERT(0);
290 290 break;
291 291 }
292 292
293 293 if (action == FEATURE_ACTION_ENABLE) {
294 294 int i;
295 295
296 296 for (i = 0; feature->fi_depends[i] != NULL; i++) {
297 297 zfeature_info_t *dep = feature->fi_depends[i];
298 298
299 299 error = feature_do_action(os, read_obj, write_obj,
300 300 desc_obj, dep, FEATURE_ACTION_ENABLE, tx);
301 301 if (error != 0)
302 302 return (error);
303 303 }
304 304 }
305 305
306 306 error = zap_update(os, zapobj, feature->fi_guid,
307 307 sizeof (uint64_t), 1, &refcount, tx);
308 308 if (error != 0)
309 309 return (error);
310 310
311 311 if (action == FEATURE_ACTION_ENABLE) {
312 312 error = zap_update(os, desc_obj,
313 313 feature->fi_guid, 1, strlen(feature->fi_desc) + 1,
314 314 feature->fi_desc, tx);
315 315 if (error != 0)
316 316 return (error);
317 317 }
318 318
319 319 if (action == FEATURE_ACTION_INCR && refcount == 1 && feature->fi_mos) {
320 320 spa_activate_mos_feature(dmu_objset_spa(os), feature->fi_guid);
321 321 }
322 322
323 323 if (action == FEATURE_ACTION_DECR && refcount == 0) {
324 324 spa_deactivate_mos_feature(dmu_objset_spa(os),
325 325 feature->fi_guid);
326 326 }
327 327
328 328 return (0);
329 329 }
330 330
331 331 void
332 332 spa_feature_create_zap_objects(spa_t *spa, dmu_tx_t *tx)
333 333 {
334 334 /*
335 335 * We create feature flags ZAP objects in two instances: during pool
336 336 * creation and during pool upgrade.
337 337 */
338 338 ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)) || (!spa->spa_sync_on &&
339 339 tx->tx_txg == TXG_INITIAL));
340 340
341 341 spa->spa_feat_for_read_obj = zap_create_link(spa->spa_meta_objset,
342 342 DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
343 343 DMU_POOL_FEATURES_FOR_READ, tx);
344 344 spa->spa_feat_for_write_obj = zap_create_link(spa->spa_meta_objset,
345 345 DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
346 346 DMU_POOL_FEATURES_FOR_WRITE, tx);
347 347 spa->spa_feat_desc_obj = zap_create_link(spa->spa_meta_objset,
348 348 DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
349 349 DMU_POOL_FEATURE_DESCRIPTIONS, tx);
350 350 }
351 351
352 352 /*
353 353 * Enable any required dependencies, then enable the requested feature.
↓ open down ↓ |
353 lines elided |
↑ open up ↑ |
354 354 */
355 355 void
356 356 spa_feature_enable(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
357 357 {
358 358 ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
359 359 VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset,
360 360 spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
361 361 spa->spa_feat_desc_obj, feature, FEATURE_ACTION_ENABLE, tx));
362 362 }
363 363
364 -/*
365 - * If the specified feature has not yet been enabled, this function returns
366 - * ENOTSUP; otherwise, this function increments the feature's refcount (or
367 - * returns EOVERFLOW if the refcount cannot be incremented). This function must
368 - * be called from syncing context.
369 - */
370 364 void
371 365 spa_feature_incr(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
372 366 {
367 + ASSERT(dmu_tx_is_syncing(tx));
373 368 ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
374 369 VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset,
375 370 spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
376 371 spa->spa_feat_desc_obj, feature, FEATURE_ACTION_INCR, tx));
377 372 }
378 373
379 -/*
380 - * If the specified feature has not yet been enabled, this function returns
381 - * ENOTSUP; otherwise, this function decrements the feature's refcount (or
382 - * returns EOVERFLOW if the refcount is already 0). This function must
383 - * be called from syncing context.
384 - */
385 374 void
386 375 spa_feature_decr(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
387 376 {
377 + ASSERT(dmu_tx_is_syncing(tx));
388 378 ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
389 379 VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset,
390 380 spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
391 381 spa->spa_feat_desc_obj, feature, FEATURE_ACTION_DECR, tx));
392 382 }
393 383
384 +/*
385 + * This interface is for debugging only. Normal consumers should use
386 + * spa_feature_is_enabled/spa_feature_is_active.
387 + */
388 +int
389 +spa_feature_get_refcount(spa_t *spa, zfeature_info_t *feature)
390 +{
391 + int err;
392 + uint64_t refcount;
393 +
394 + if (spa_version(spa) < SPA_VERSION_FEATURES)
395 + return (B_FALSE);
396 +
397 + err = feature_get_refcount(spa->spa_meta_objset,
398 + spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
399 + feature, &refcount);
400 + ASSERT(err == 0 || err == ENOTSUP);
401 + return (err == 0 ? refcount : 0);
402 +}
403 +
394 404 boolean_t
395 405 spa_feature_is_enabled(spa_t *spa, zfeature_info_t *feature)
396 406 {
397 407 int err;
398 408 uint64_t refcount;
399 409
400 410 if (spa_version(spa) < SPA_VERSION_FEATURES)
401 411 return (B_FALSE);
402 412
403 413 err = feature_get_refcount(spa->spa_meta_objset,
404 414 spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
405 415 feature, &refcount);
406 416 ASSERT(err == 0 || err == ENOTSUP);
407 417 return (err == 0);
408 418 }
409 419
410 420 boolean_t
411 421 spa_feature_is_active(spa_t *spa, zfeature_info_t *feature)
412 422 {
413 423 int err;
414 424 uint64_t refcount;
415 425
416 426 if (spa_version(spa) < SPA_VERSION_FEATURES)
417 427 return (B_FALSE);
418 428
419 429 err = feature_get_refcount(spa->spa_meta_objset,
420 430 spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
421 431 feature, &refcount);
422 432 ASSERT(err == 0 || err == ENOTSUP);
423 433 return (err == 0 && refcount > 0);
424 434 }
↓ open down ↓ |
21 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX