Print this page
OS-1566 filesystem limits for ZFS datasets
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/fs/zfs/dsl_dir.c
+++ new/usr/src/uts/common/fs/zfs/dsl_dir.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
↓ open down ↓ |
13 lines elided |
↑ open up ↑ |
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2012 by Delphix. All rights reserved.
24 + * Copyright (c) 2012 Joyent, Inc. All rights reserved.
24 25 */
25 26
26 27 #include <sys/dmu.h>
27 28 #include <sys/dmu_objset.h>
28 29 #include <sys/dmu_tx.h>
29 30 #include <sys/dsl_dataset.h>
30 31 #include <sys/dsl_dir.h>
31 32 #include <sys/dsl_prop.h>
32 33 #include <sys/dsl_synctask.h>
33 34 #include <sys/dsl_deleg.h>
34 35 #include <sys/spa.h>
35 36 #include <sys/metaslab.h>
36 37 #include <sys/zap.h>
37 38 #include <sys/zio.h>
38 39 #include <sys/arc.h>
39 40 #include <sys/sunddi.h>
40 41 #include <sys/zfs_zone.h>
42 +#include <sys/zfeature.h>
41 43 #include "zfs_namecheck.h"
44 +#include "zfs_prop.h"
42 45
46 +/*
47 + * Filesystem and Snapshot Limits
48 + * ------------------------------
49 + *
50 + * These limits are used to restrict the number of filesystems and/or snapshots
51 + * that can be created at a given level in the tree or below. The standard
52 + * use-case is with a delegated dataset where the administrator wants to ensure
53 + * that a user within the zone is not creating too many additional filesystems
54 + * or snapshots, even though they're not exceeding their space quota.
55 + *
56 + * The count of filesystems and snapshots is stored in the dsl_dir_phys_t which
57 + * impacts the on-disk format. As such, this capability is controlled by a
58 + * feature flag and must be enabled to be used. Once enabled, the feature is
59 + * not active until the first limit is set. At that point, future operations to
60 + * create/destroy filesystems or snapshots will validate and update the counts.
61 + *
62 + * Because the on-disk counts will be uninitialized (0) before the feature is
63 + * active, the counts are updated when a limit is first set on an uninitialized
64 + * node (The filesystem/snapshot counts on a node includes all of the nested
65 + * filesystems/snapshots, plus the node itself. Thus, a new leaf node has a
66 + * filesystem count of 1 and a snapshot count of 0. A filesystem count of 0 on
67 + * a node indicates uninitialized counts on that node.) When setting a limit on
68 + * an uninitialized node, the code starts at the filesystem with the new limit
69 + * and descends into all sub-filesystems and updates the counts to be accurate.
70 + * In practice this is lightweight since a limit is typically set when the
71 + * filesystem is created and thus has no children. Once valid, changing the
72 + * limit value won't require a re-traversal since the counts are already valid.
73 + * When recursively fixing the counts, if a node with a limit is encountered
74 + * during the descent, the counts are known to be valid and there is no need to
75 + * descend into that filesystem's children. The counts on filesystems above the
76 + * one with the new limit will still be uninitialized (0), unless a limit is
77 + * eventually set on one of those filesystems. It is possible for the counts
78 + * to appear initialized, but be invalid, if the feature was previously active
79 + * but then deactivated. For this reason, the counts are always recursively
80 + * updated when a limit is set on a dataset, unless there is already a limit.
81 + * When a new limit value is set on a filesystem with an existing limit, the
82 + * new limit must be at least the current count at that level or an error
83 + * is returned and the limit is not changed.
84 + *
85 + * Once the feature is active, then whenever a filesystem or snapshot is
86 + * created, the code recurses up the tree, validating the new count against the
87 + * limit at each initialized level. In practice, most levels will not have a
88 + * limit set. If there is a limit at any initialized level up the tree, the
89 + * check must pass or the creation will fail. Likewise, when a filesystem or
90 + * snapshot is destroyed, the counts are recursively adjusted all the way up
91 + * the initialized nodes in the tree. Renaming a filesystem into a different point
92 + * in the tree will first validate, then update the counts on each branch up to
93 + * the common ancestor. A receive will also validate the counts and then update
94 + * them.
95 + *
96 + * An exception to the above behavior is that the limits are never enforced
97 + * for the administrative user in the global zone. This is primarily so that
98 + * recursive snapshots in the global zone always work. We want to prevent a
99 + * denial-of-service in which a lower level delegated dataset could max out its
100 + * limit and thus block recursive snapshots from being taken in the global zone.
101 + * Because of this, it is possible for the snapshot count to be over the limit
102 + * and snapshots taken in the global zone could cause a lower level dataset to
103 + * hit or exceed its limit. The administrator taking the global zone recursive
104 + * snapshot should be aware of this side-effect and behave accordingly.
105 + * For consistency, the filesystem limit is also not enforced for the admin
106 + * user in the global zone.
107 + *
108 + * The filesystem limit is validated by dsl_dir_fscount_check() and updated by
109 + * dsl_dir_fscount_adjust(). The snapshot limit is validated by
110 + * dsl_snapcount_check() and updated by dsl_snapcount_adjust().
111 + * A new limit value is validated in dsl_dir_validate_fs_ss_limit() and the
112 + * filesystem counts are adjusted, if necessary, by dsl_dir_set_fs_ss_count().
113 + */
114 +
43 115 static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
44 116 static void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd,
45 117 uint64_t value, dmu_tx_t *tx);
46 118
119 +extern dsl_syncfunc_t dsl_prop_set_sync;
120 +
/* ARGSUSED */
/*
 * Eviction callback for a dsl_dir_t's bonus buffer (registered via
 * dmu_buf_set_user_ie() in dsl_dir_open_obj()).  Asserts that no dirty
 * or reserved state remains, drops the instantiate-to-evict holds
 * (parent dir and spa), and frees the in-core dsl_dir_t.
 */
static void
dsl_dir_evict(dmu_buf_t *db, void *arg)
{
	dsl_dir_t *dd = arg;
	dsl_pool_t *dp = dd->dd_pool;
	int t;

	/* An evicted dir must not be on any dirty list or hold space. */
	for (t = 0; t < TXG_SIZE; t++) {
		ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
		ASSERT(dd->dd_tempreserved[t] == 0);
		ASSERT(dd->dd_space_towrite[t] == 0);
	}

	if (dd->dd_parent)
		dsl_dir_close(dd->dd_parent, dd);

	spa_close(dd->dd_pool->dp_spa, dd);

	/*
	 * The props callback list should have been cleaned up by
	 * objset_evict().
	 */
	list_destroy(&dd->dd_prop_cbs);
	mutex_destroy(&dd->dd_lock);
	kmem_free(dd, sizeof (dsl_dir_t));
}
74 148
/*
 * Open (hold) the dsl_dir_t for DSL directory object 'ddobj'.
 *
 * If no in-core dsl_dir_t exists yet for this object, one is constructed
 * from the bonus buffer and attached with dmu_buf_set_user_ie(); a
 * concurrent constructor may win that race, in which case our copy is
 * discarded and the winner is used.  'tail' optionally supplies the
 * directory's name (avoiding a ZAP value search in the parent).
 * On success, *ddp holds a reference owned by 'tag'; release with
 * dsl_dir_close().  Caller must hold dp_config_rwlock or be in
 * syncing context.
 */
int
dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
    const char *tail, void *tag, dsl_dir_t **ddp)
{
	dmu_buf_t *dbuf;
	dsl_dir_t *dd;
	int err;

	ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
	    dsl_pool_sync_context(dp));

	err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
	if (err)
		return (err);
	dd = dmu_buf_get_user(dbuf);
#ifdef ZFS_DEBUG
	{
		dmu_object_info_t doi;
		dmu_object_info_from_db(dbuf, &doi);
		ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DIR);
		ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));
	}
#endif
	if (dd == NULL) {
		/* No cached dsl_dir_t; construct one from the phys data. */
		dsl_dir_t *winner;

		dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
		dd->dd_object = ddobj;
		dd->dd_dbuf = dbuf;
		dd->dd_pool = dp;
		dd->dd_phys = dbuf->db_data;
		mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);

		list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t),
		    offsetof(dsl_prop_cb_record_t, cbr_node));

		dsl_dir_snap_cmtime_update(dd);

		if (dd->dd_phys->dd_parent_obj) {
			/* Hold our parent dir for the life of this dir. */
			err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj,
			    NULL, dd, &dd->dd_parent);
			if (err)
				goto errout;
			if (tail) {
#ifdef ZFS_DEBUG
				/* Verify 'tail' really names this object. */
				uint64_t foundobj;

				err = zap_lookup(dp->dp_meta_objset,
				    dd->dd_parent->dd_phys->dd_child_dir_zapobj,
				    tail, sizeof (foundobj), 1, &foundobj);
				ASSERT(err || foundobj == ddobj);
#endif
				(void) strcpy(dd->dd_myname, tail);
			} else {
				/* Recover our name from the parent's ZAP. */
				err = zap_value_search(dp->dp_meta_objset,
				    dd->dd_parent->dd_phys->dd_child_dir_zapobj,
				    ddobj, 0, dd->dd_myname);
			}
			if (err)
				goto errout;
		} else {
			/* The root dir is named after the pool. */
			(void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
		}

		if (dsl_dir_is_clone(dd)) {
			dmu_buf_t *origin_bonus;
			dsl_dataset_phys_t *origin_phys;

			/*
			 * We can't open the origin dataset, because
			 * that would require opening this dsl_dir.
			 * Just look at its phys directly instead.
			 */
			err = dmu_bonus_hold(dp->dp_meta_objset,
			    dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus);
			if (err)
				goto errout;
			origin_phys = origin_bonus->db_data;
			dd->dd_origin_txg =
			    origin_phys->ds_creation_txg;
			dmu_buf_rele(origin_bonus, FTAG);
		}

		/*
		 * Attach our dsl_dir_t to the buffer; if another thread
		 * raced us and attached one first, discard ours and use
		 * the winner's.
		 */
		winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys,
		    dsl_dir_evict);
		if (winner) {
			if (dd->dd_parent)
				dsl_dir_close(dd->dd_parent, dd);
			mutex_destroy(&dd->dd_lock);
			kmem_free(dd, sizeof (dsl_dir_t));
			dd = winner;
		} else {
			spa_open_ref(dp->dp_spa, dd);
		}
	}

	/*
	 * The dsl_dir_t has both open-to-close and instantiate-to-evict
	 * holds on the spa.  We need the open-to-close holds because
	 * otherwise the spa_refcnt wouldn't change when we open a
	 * dir which the spa also has open, so we could incorrectly
	 * think it was OK to unload/export/destroy the pool.  We need
	 * the instantiate-to-evict hold because the dsl_dir_t has a
	 * pointer to the dd_pool, which has a pointer to the spa_t.
	 */
	spa_open_ref(dp->dp_spa, tag);
	ASSERT3P(dd->dd_pool, ==, dp);
	ASSERT3U(dd->dd_object, ==, ddobj);
	ASSERT3P(dd->dd_dbuf, ==, dbuf);
	*ddp = dd;
	return (0);

errout:
	/* Undo partial construction: parent hold, lock, memory, buffer. */
	if (dd->dd_parent)
		dsl_dir_close(dd->dd_parent, dd);
	mutex_destroy(&dd->dd_lock);
	kmem_free(dd, sizeof (dsl_dir_t));
	dmu_buf_rele(dbuf, tag);
	return (err);
}
195 269
/*
 * Release a hold on 'dd' acquired by dsl_dir_open*() under 'tag':
 * drops the open-to-close spa reference and the bonus-buffer hold.
 * The dsl_dir_t itself is freed later via dsl_dir_evict() when the
 * buffer is evicted.
 */
void
dsl_dir_close(dsl_dir_t *dd, void *tag)
{
	dprintf_dd(dd, "%s\n", "");
	spa_close(dd->dd_pool->dp_spa, tag);
	dmu_buf_rele(dd->dd_dbuf, tag);
}
203 277
/* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */
/*
 * Build the full slash-separated name of 'dd' into 'buf' by recursing
 * to the root and appending each component on the way back down.
 */
void
dsl_dir_name(dsl_dir_t *dd, char *buf)
{
	if (dd->dd_parent) {
		dsl_dir_name(dd->dd_parent, buf);
		(void) strcat(buf, "/");
	} else {
		buf[0] = '\0';
	}
	if (!MUTEX_HELD(&dd->dd_lock)) {
		/*
		 * recursive mutex so that we can use
		 * dprintf_dd() with dd_lock held
		 */
		mutex_enter(&dd->dd_lock);
		(void) strcat(buf, dd->dd_myname);
		mutex_exit(&dd->dd_lock);
	} else {
		/* Caller already holds dd_lock (e.g. via dprintf_dd). */
		(void) strcat(buf, dd->dd_myname);
	}
}
226 300
227 301 /* Calculate name length, avoiding all the strcat calls of dsl_dir_name */
228 302 int
229 303 dsl_dir_namelen(dsl_dir_t *dd)
230 304 {
231 305 int result = 0;
232 306
233 307 if (dd->dd_parent) {
234 308 /* parent's name + 1 for the "/" */
235 309 result = dsl_dir_namelen(dd->dd_parent) + 1;
236 310 }
237 311
238 312 if (!MUTEX_HELD(&dd->dd_lock)) {
239 313 /* see dsl_dir_name */
240 314 mutex_enter(&dd->dd_lock);
241 315 result += strlen(dd->dd_myname);
242 316 mutex_exit(&dd->dd_lock);
243 317 } else {
244 318 result += strlen(dd->dd_myname);
245 319 }
246 320
247 321 return (result);
248 322 }
249 323
250 324 static int
251 325 getcomponent(const char *path, char *component, const char **nextp)
252 326 {
253 327 char *p;
254 328 if ((path == NULL) || (path[0] == '\0'))
255 329 return (ENOENT);
256 330 /* This would be a good place to reserve some namespace... */
257 331 p = strpbrk(path, "/@");
258 332 if (p && (p[1] == '/' || p[1] == '@')) {
259 333 /* two separators in a row */
260 334 return (EINVAL);
261 335 }
262 336 if (p == NULL || p == path) {
263 337 /*
264 338 * if the first thing is an @ or /, it had better be an
265 339 * @ and it had better not have any more ats or slashes,
266 340 * and it had better have something after the @.
267 341 */
268 342 if (p != NULL &&
269 343 (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
270 344 return (EINVAL);
271 345 if (strlen(path) >= MAXNAMELEN)
272 346 return (ENAMETOOLONG);
273 347 (void) strcpy(component, path);
274 348 p = NULL;
275 349 } else if (p[0] == '/') {
276 350 if (p-path >= MAXNAMELEN)
277 351 return (ENAMETOOLONG);
278 352 (void) strncpy(component, path, p - path);
279 353 component[p-path] = '\0';
280 354 p++;
281 355 } else if (p[0] == '@') {
282 356 /*
283 357 * if the next separator is an @, there better not be
284 358 * any more slashes.
285 359 */
286 360 if (strchr(path, '/'))
287 361 return (EINVAL);
288 362 if (p-path >= MAXNAMELEN)
289 363 return (ENAMETOOLONG);
290 364 (void) strncpy(component, path, p - path);
291 365 component[p-path] = '\0';
292 366 } else {
293 367 ASSERT(!"invalid p");
294 368 }
295 369 *nextp = p;
296 370 return (0);
297 371 }
298 372
/*
 * same as dsl_open_dir, ignore the first component of name and use the
 * spa instead
 */
/*
 * Walk 'name' component by component from the pool's root dir, holding
 * each child and releasing its parent as we descend.  If 'spa' is NULL
 * the pool named by the first component is opened (and closed again
 * before returning).  On success *ddp holds the deepest dir found and
 * *tailp (if non-NULL) points at the unresolved remainder, if any.
 */
int
dsl_dir_open_spa(spa_t *spa, const char *name, void *tag,
    dsl_dir_t **ddp, const char **tailp)
{
	char buf[MAXNAMELEN];
	const char *next, *nextnext = NULL;
	int err;
	dsl_dir_t *dd;
	dsl_pool_t *dp;
	uint64_t ddobj;
	int openedspa = FALSE;

	dprintf("%s\n", name);

	/* First component is the pool name. */
	err = getcomponent(name, buf, &next);
	if (err)
		return (err);
	if (spa == NULL) {
		err = spa_open(buf, &spa, FTAG);
		if (err) {
			dprintf("spa_open(%s) failed\n", buf);
			return (err);
		}
		openedspa = TRUE;

		/* XXX this assertion belongs in spa_open */
		ASSERT(!dsl_pool_sync_context(spa_get_dsl(spa)));
	}

	dp = spa_get_dsl(spa);

	rw_enter(&dp->dp_config_rwlock, RW_READER);
	err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
	if (err) {
		rw_exit(&dp->dp_config_rwlock);
		if (openedspa)
			spa_close(spa, FTAG);
		return (err);
	}

	/* Descend one component at a time, stopping at a snapshot ('@'). */
	while (next != NULL) {
		dsl_dir_t *child_ds;
		err = getcomponent(next, buf, &nextnext);
		if (err)
			break;
		ASSERT(next[0] != '\0');
		if (next[0] == '@')
			break;
		dprintf("looking up %s in obj%lld\n",
		    buf, dd->dd_phys->dd_child_dir_zapobj);

		err = zap_lookup(dp->dp_meta_objset,
		    dd->dd_phys->dd_child_dir_zapobj,
		    buf, sizeof (ddobj), 1, &ddobj);
		if (err) {
			/*
			 * Component not found: stop here with err cleared;
			 * the caller sees the remainder via *tailp.
			 */
			if (err == ENOENT)
				err = 0;
			break;
		}

		err = dsl_dir_open_obj(dp, ddobj, buf, tag, &child_ds);
		if (err)
			break;
		/* Swap our hold from the parent to the child. */
		dsl_dir_close(dd, tag);
		dd = child_ds;
		next = nextnext;
	}
	rw_exit(&dp->dp_config_rwlock);

	if (err) {
		dsl_dir_close(dd, tag);
		if (openedspa)
			spa_close(spa, FTAG);
		return (err);
	}

	/*
	 * It's an error if there's more than one component left, or
	 * tailp==NULL and there's any component left.
	 */
	if (next != NULL &&
	    (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
		/* bad path name */
		dsl_dir_close(dd, tag);
		dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
		err = ENOENT;
	}
	if (tailp)
		*tailp = next;
	if (openedspa)
		spa_close(spa, FTAG);
	*ddp = dd;
	return (err);
}
397 471
398 472 /*
399 473 * Return the dsl_dir_t, and possibly the last component which couldn't
↓ open down ↓ |
343 lines elided |
↑ open up ↑ |
400 474 * be found in *tail. Return NULL if the path is bogus, or if
401 475 * tail==NULL and we couldn't parse the whole name. (*tail)[0] == '@'
402 476 * means that the last component is a snapshot.
403 477 */
int
dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp)
{
	/* Convenience wrapper: derive the spa from the pool name in 'name'. */
	return (dsl_dir_open_spa(NULL, name, tag, ddp, tailp));
}
409 483
/*
 * Check if the counts are already valid for this filesystem and its
 * descendants. The counts on this filesystem, and those below, may be
 * uninitialized due to either the use of a pre-existing pool which did not
 * support the filesystem/snapshot limit feature, or one in which the feature
 * had not yet been enabled. The counts can also be invalid if the feature was
 * previously active but then deactivated.
 *
 * Recursively descend the filesystem tree and update the filesystem/snapshot
 * counts on each filesystem below, then update the cumulative count on the
 * current filesystem. If the filesystem already has a limit set on it,
 * then we know that its counts, and the counts on the filesystems below it,
 * have been updated to be correct, so we can skip this filesystem.
 */
static void
dsl_dir_set_fs_ss_count(const char *nm, dsl_dir_t *dd, dmu_tx_t *tx,
    uint64_t *fscnt, uint64_t *sscnt)
{
	uint64_t my_fs_cnt = 0;
	uint64_t my_ss_cnt = 0;
	objset_t *os = dd->dd_pool->dp_meta_objset;
	zap_cursor_t *zc;
	zap_attribute_t *za;
	char *namebuf;
	int err;
	boolean_t limit_set = B_FALSE;
	uint64_t fslimit, sslimit;
	dsl_dataset_t *ds;

	/*
	 * An effective limit (any value other than MAXLIMIT) on either
	 * property implies the counts here were validated when that limit
	 * was set.
	 */
	err = dsl_prop_get_dd(dd, zfs_prop_to_name(ZFS_PROP_FILESYSTEM_LIMIT),
	    8, 1, &fslimit, NULL, B_FALSE);
	if (err == 0 && fslimit != MAXLIMIT)
		limit_set = B_TRUE;

	if (!limit_set) {
		err = dsl_prop_get_dd(dd,
		    zfs_prop_to_name(ZFS_PROP_SNAPSHOT_LIMIT), 8, 1, &sslimit,
		    NULL, B_FALSE);
		if (err == 0 && sslimit != MAXLIMIT)
			limit_set = B_TRUE;
	}

	/*
	 * If the dd has a limit, we know its count is already good and we
	 * don't need to recurse down any further.
	 *
	 * We can't check for an initialized (non-0) count since the feature
	 * might have been previously active, then deactivated and is now
	 * being activated again.
	 */
	if (limit_set) {
		*fscnt = dd->dd_phys->dd_filesystem_count;
		*sscnt = dd->dd_phys->dd_snapshot_count;
		return;
	}

	zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
	za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
	namebuf = kmem_alloc(MAXPATHLEN, KM_SLEEP);

	/*
	 * NOTE(review): dd_lock is held across dsl_dir_open(),
	 * dsl_dataset_hold() and the recursive descent below, all of which
	 * can block.  Confirm this cannot deadlock against other dd_lock
	 * users (the recursion itself takes only child locks, never this
	 * node's lock again).
	 */
	mutex_enter(&dd->dd_lock);

	/* Iterate datasets */
	for (zap_cursor_init(zc, os, dd->dd_phys->dd_child_dir_zapobj);
	    zap_cursor_retrieve(zc, za) == 0;
	    zap_cursor_advance(zc)) {
		dsl_dir_t *chld_dd;
		uint64_t chld_fs_cnt = 0;
		uint64_t chld_ss_cnt = 0;

		(void) snprintf(namebuf, MAXPATHLEN, "%s/%s", nm, za->za_name);

		/* Best-effort: skip children we fail to open. */
		if (dsl_dir_open(namebuf, FTAG, &chld_dd, NULL))
			continue;

		dsl_dir_set_fs_ss_count(namebuf, chld_dd, tx, &chld_fs_cnt,
		    &chld_ss_cnt);

		dsl_dir_close(chld_dd, FTAG);

		my_fs_cnt += chld_fs_cnt;
		my_ss_cnt += chld_ss_cnt;
	}
	zap_cursor_fini(zc);

	kmem_free(namebuf, MAXPATHLEN);

	/* Iterate snapshots (cursor is re-initialized, so reuse is safe) */
	if (dsl_dataset_hold(nm, FTAG, &ds) == 0) {
		for (zap_cursor_init(zc, os, ds->ds_phys->ds_snapnames_zapobj);
		    zap_cursor_retrieve(zc, za) == 0;
		    zap_cursor_advance(zc)) {
			my_ss_cnt++;
		}
		zap_cursor_fini(zc);
		dsl_dataset_rele(ds, FTAG);
	}

	kmem_free(zc, sizeof (zap_cursor_t));
	kmem_free(za, sizeof (zap_attribute_t));

	/* Add 1 for self */
	my_fs_cnt++;

#ifdef _KERNEL
	extern void __dtrace_probe_zfs__fs__fix__count(char *, uint64_t,
	    uint64_t);
	__dtrace_probe_zfs__fs__fix__count((char *)nm, my_fs_cnt, my_ss_cnt);
#endif

	/* save updated counts */
	dmu_buf_will_dirty(dd->dd_dbuf, tx);
	dd->dd_phys->dd_filesystem_count = my_fs_cnt;
	dd->dd_phys->dd_snapshot_count = my_ss_cnt;

	mutex_exit(&dd->dd_lock);

	/* Return child dataset count plus self */
	*fscnt = my_fs_cnt;
	*sscnt = my_ss_cnt;
}
605 +
/*
 * Return ENOSPC if new limit is less than the existing count, otherwise return
 * -1 to force the zfs_set_prop_nvlist code down the default path to set the
 * value in the nvlist.
 *
 * 'ptype' selects which limit (ZFS_PROP_FILESYSTEM_LIMIT or
 * ZFS_PROP_SNAPSHOT_LIMIT) is being validated.  Setting the first limit
 * activates the feature (spa_feature_incr); removing the last one
 * (limit == MAXLIMIT) deactivates it (spa_feature_decr).  ENOTSUP is
 * returned if the feature flag is not enabled on the pool.
 *
 * NOTE(review): failures of dsl_dataset_hold()/dsl_dir_open() are
 * reported as EACCES, masking the underlying error code — confirm this
 * is intentional.
 */
int
dsl_dir_validate_fs_ss_limit(const char *ddname, uint64_t limit,
    zfs_prop_t ptype)
{
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	int err = -1;
	uint64_t count;
	dmu_tx_t *tx;
	uint64_t my_fs_cnt = 0;
	uint64_t my_ss_cnt = 0;
	uint64_t curr_limit;
	spa_t *spa;
	zfeature_info_t *limit_feat =
	    &spa_feature_table[SPA_FEATURE_FS_SS_LIMIT];

	if (dsl_dataset_hold(ddname, FTAG, &ds))
		return (EACCES);

	spa = dsl_dataset_get_spa(ds);
	if (!spa_feature_is_enabled(spa,
	    &spa_feature_table[SPA_FEATURE_FS_SS_LIMIT])) {
		dsl_dataset_rele(ds, FTAG);
		return (ENOTSUP);
	}

	if (dsl_dir_open(ddname, FTAG, &dd, NULL)) {
		dsl_dataset_rele(ds, FTAG);
		return (EACCES);
	}

	ASSERT(ds->ds_dir == dd);

	/* No existing value is treated as "no limit". */
	if (dsl_prop_get_dd(dd, zfs_prop_to_name(ptype), 8, 1, &curr_limit,
	    NULL, B_FALSE) != 0)
		curr_limit = MAXLIMIT;

	/*
	 * NOTE(review): this tx stays assigned across the full recursive
	 * count fixup below; confirm that's acceptable for large trees.
	 */
	tx = dmu_tx_create_dd(dd);
	if (dmu_tx_assign(tx, TXG_WAIT)) {
		dmu_tx_abort(tx);
		dsl_dir_close(dd, FTAG);
		dsl_dataset_rele(ds, FTAG);
		return (ENOSPC);
	}

	if (limit == MAXLIMIT) {
		/*
		 * If we had a limit, since we're now removing that limit,
		 * decrement the feature-active counter so that the feature
		 * becomes inactive (only enabled) if we remove the last limit.
		 */
		if (curr_limit != MAXLIMIT)
			spa_feature_decr(spa, limit_feat, tx);

		dmu_tx_commit(tx);
		dsl_dir_close(dd, FTAG);
		dsl_dataset_rele(ds, FTAG);
		return (-1);
	}

	/*
	 * Since we are now setting a non-MAXLIMIT on the filesystem, we need
	 * to ensure the counts are correct. Descend down the tree from this
	 * point and update all of the counts to be accurate.
	 */
	rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
	dsl_dir_set_fs_ss_count(ddname, dd, tx, &my_fs_cnt, &my_ss_cnt);
	rw_exit(&dd->dd_pool->dp_config_rwlock);

	/* Validate the new limit against the (now valid) on-disk count. */
	if (ptype == ZFS_PROP_FILESYSTEM_LIMIT)
		count = dd->dd_phys->dd_filesystem_count;
	else
		count = dd->dd_phys->dd_snapshot_count;

	if (limit < count) {
		err = ENOSPC;
	} else {
		/*
		 * If we had no limit, since we're now setting a limit
		 * increment the feature-active counter so that the feature
		 * either becomes active for the first time, or the count
		 * simply increases so that we can decrement it when we remove
		 * the limit.
		 */
		if (curr_limit == MAXLIMIT)
			spa_feature_incr(spa, limit_feat, tx);
	}

	dmu_tx_commit(tx);

	dsl_dir_close(dd, FTAG);
	dsl_dataset_rele(ds, FTAG);

	return (err);
}
706 +
/*
 * Check if adding additional child filesystem(s) would exceed any filesystem
 * limits. Note that all filesystem limits up to the root (or the highest
 * initialized) filesystem or the given ancestor must be satisfied.
 *
 * Returns 0 if the creation of 'cnt' additional filesystems is allowed,
 * EDQUOT if some level's limit would be exceeded, or an error from
 * dsl_prop_get_dd().
 */
int
dsl_dir_fscount_check(dsl_dir_t *dd, uint64_t cnt, dsl_dir_t *ancestor)
{
	uint64_t limit;
	int err = 0;

	VERIFY(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));

	/*
	 * The limit is never enforced for the admin user in global zone.
	 * If we're not in the global zone then we need to run this check in
	 * open context, since that's when we know what zone we're in and
	 * syncing is only performed in the global zone.
	 */
	if (INGLOBALZONE(curproc))
		return (0);

	/*
	 * If an ancestor has been provided, stop checking the limit once we
	 * hit that dir. We need this during rename so that we don't overcount
	 * the check once we recurse up to the common ancestor.
	 */
	if (ancestor == dd)
		return (0);

	/*
	 * If we hit an uninitialized node while recursing up the tree, we can
	 * stop since we know the counts are not valid on this node and we
	 * know we won't touch this node's counts.
	 */
	if (dd->dd_phys->dd_filesystem_count == 0)
		return (0);

	/*
	 * If there's no value for this property, there's no need to enforce a
	 * filesystem limit.
	 */
	err = dsl_prop_get_dd(dd, zfs_prop_to_name(ZFS_PROP_FILESYSTEM_LIMIT),
	    8, 1, &limit, NULL, B_FALSE);
	if (err == ENOENT)
		return (0);
	else if (err != 0)
		return (err);

#ifdef _KERNEL
	extern void __dtrace_probe_zfs__fs__limit(uint64_t, uint64_t, char *);
	__dtrace_probe_zfs__fs__limit(
	    (uint64_t)dd->dd_phys->dd_filesystem_count, (uint64_t)limit,
	    dd->dd_myname);
#endif

	/* MAXLIMIT means "no limit" at this level. */
	if (limit != MAXLIMIT &&
	    (dd->dd_phys->dd_filesystem_count + cnt) > limit)
		return (EDQUOT);

	/* Recurse upward; every initialized ancestor must also pass. */
	if (dd->dd_parent != NULL)
		err = dsl_dir_fscount_check(dd->dd_parent, cnt, ancestor);

	return (err);
}
772 +
/*
 * Adjust the filesystem count for the specified dsl_dir_t and all parent
 * filesystems. When a new filesystem is created, increment the count on all
 * parents, and when a filesystem is destroyed, decrement the count.
 *
 * 'first' is B_TRUE only at the initial (non-recursive) call, where the
 * feature-active check is performed once.  When 'syncing' is set, the tx
 * must be a syncing tx.
 */
void
dsl_dir_fscount_adjust(dsl_dir_t *dd, dmu_tx_t *tx, int64_t delta,
    boolean_t syncing, boolean_t first)
{
	VERIFY(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock));
	if (syncing)
		VERIFY(dmu_tx_is_syncing(tx));

	/*
	 * There is a special case where we are receiving a filesystem that
	 * already exists. In this case a temporary clone name of %X is created
	 * (see dmu_recv_begin). In dmu_recv_existing_end we destroy this
	 * temporary clone. We never update the filesystem counts for temporary
	 * clones. To detect this case we check the filesystem name to see if
	 * its a hidden filesystem (%X).
	 */
	if (dd->dd_myname[0] == '%')
		return;

	/*
	 * If we hit an uninitialized node while recursing up the tree, we can
	 * stop since we know the counts are not valid on this node and we
	 * know we shouldn't touch this node's counts. An uninitialized count
	 * on the node indicates that either the feature has not yet been
	 * activated or there are no limits on this part of the tree.
	 */
	if (dd->dd_phys->dd_filesystem_count == 0)
		return;

	/*
	 * The feature might have previously been active, so there could be
	 * non-0 counts on the nodes, but it might now be inactive.
	 *
	 * On initial entry we need to check if this feature is active, but
	 * we don't want to re-check this on each recursive call. Note: the
	 * feature cannot be active if its not enabled. If the feature is not
	 * active, don't touch the on-disk count fields.
	 */
	if (first) {
		dsl_dataset_t *ds = NULL;
		spa_t *spa;
		zfeature_info_t *quota_feat =
		    &spa_feature_table[SPA_FEATURE_FS_SS_LIMIT];

		VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
		    dd->dd_phys->dd_head_dataset_obj, FTAG, &ds));
		spa = dsl_dataset_get_spa(ds);
		dsl_dataset_rele(ds, FTAG);
		if (!spa_feature_is_active(spa, quota_feat))
			return;
	}

	dmu_buf_will_dirty(dd->dd_dbuf, tx);

	/*
	 * NOTE(review): this node's dd_lock is held across the recursive
	 * call into the parent, so locks are taken child-then-parent all
	 * the way up.  Confirm no other path acquires dd_locks in the
	 * opposite (parent-then-child) order.
	 */
	mutex_enter(&dd->dd_lock);

	dd->dd_phys->dd_filesystem_count += delta;

	if (dd->dd_parent != NULL)
		dsl_dir_fscount_adjust(dd->dd_parent, tx, delta, syncing,
		    B_FALSE);

	mutex_exit(&dd->dd_lock);
}
842 +
/*
 * Create a new DSL directory object named 'name' under parent 'pds'
 * (or as the pool's root dir when pds is NULL) and initialize its phys
 * structure.  Returns the new object number.  Runs in syncing context.
 */
uint64_t
dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
    dmu_tx_t *tx)
{
	objset_t *mos = dp->dp_meta_objset;
	uint64_t ddobj;
	dsl_dir_phys_t *ddphys;
	dmu_buf_t *dbuf;
	zfeature_info_t *limit_feat =
	    &spa_feature_table[SPA_FEATURE_FS_SS_LIMIT];

	ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
	    DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
	if (pds) {
		/* Link the new dir into its parent's child ZAP. */
		VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj,
		    name, sizeof (uint64_t), 1, &ddobj, tx));
	} else {
		/* it's the root dir */
		VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
	}
	VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
	dmu_buf_will_dirty(dbuf, tx);
	ddphys = dbuf->db_data;

	ddphys->dd_creation_time = gethrestime_sec();
	/*
	 * Only initialize the count if the limit feature is active; a zero
	 * count marks the node as uninitialized (see the limits comment at
	 * the top of this file).
	 */
	if (spa_feature_is_active(dp->dp_spa, limit_feat))
		ddphys->dd_filesystem_count = 1;
	if (pds)
		ddphys->dd_parent_obj = pds->dd_object;
	ddphys->dd_props_zapobj = zap_create(mos,
	    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
	ddphys->dd_child_dir_zapobj = zap_create(mos,
	    DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
	if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
		ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
	dmu_buf_rele(dbuf, FTAG);

	return (ddobj);
}
446 885
/* ARGSUSED */
/*
 * Check whether 'dd' can be destroyed: it must have no extra holds
 * (EBUSY) and no child directories (EEXIST).  Used as the check half of
 * the destroy synctask.
 */
int
dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dir_t *dd = arg1;
	dsl_pool_t *dp = dd->dd_pool;
	objset_t *mos = dp->dp_meta_objset;
	int err;
	uint64_t count;

	/*
	 * There should be exactly two holds, both from
	 * dsl_dataset_destroy: one on the dd directory, and one on its
	 * head ds.  If there are more holds, then a concurrent thread is
	 * performing a lookup inside this dir while we're trying to destroy
	 * it.  To minimize this possibility, we perform this check only
	 * in syncing context and fail the operation if we encounter
	 * additional holds.  The dp_config_rwlock ensures that nobody else
	 * opens it after we check.
	 */
	if (dmu_tx_is_syncing(tx) && dmu_buf_refcount(dd->dd_dbuf) > 2)
		return (EBUSY);

	err = zap_count(mos, dd->dd_phys->dd_child_dir_zapobj, &count);
	if (err)
		return (err);
	if (count != 0)
		return (EEXIST);

	return (0);
}
478 917
479 918 void
↓ open down ↓ |
36 lines elided |
↑ open up ↑ |
480 919 dsl_dir_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
481 920 {
482 921 dsl_dir_t *dd = arg1;
483 922 objset_t *mos = dd->dd_pool->dp_meta_objset;
484 923 uint64_t obj;
485 924 dd_used_t t;
486 925
487 926 ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock));
488 927 ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
489 928
929 + /* Decrement the filesystem count for all parent filesystems. */
930 + if (dd->dd_parent != NULL)
931 + dsl_dir_fscount_adjust(dd->dd_parent, tx, -1, B_TRUE, B_TRUE);
932 +
490 933 /*
491 934 * Remove our reservation. The impl() routine avoids setting the
492 935 * actual property, which would require the (already destroyed) ds.
493 936 */
494 937 dsl_dir_set_reservation_sync_impl(dd, 0, tx);
495 938
496 939 ASSERT0(dd->dd_phys->dd_used_bytes);
497 940 ASSERT0(dd->dd_phys->dd_reserved);
498 941 for (t = 0; t < DD_USED_NUM; t++)
499 942 ASSERT0(dd->dd_phys->dd_used_breakdown[t]);
500 943
501 944 VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx));
502 945 VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx));
503 946 VERIFY(0 == dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx));
504 947 VERIFY(0 == zap_remove(mos,
505 948 dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx));
506 949
507 950 obj = dd->dd_object;
508 951 dsl_dir_close(dd, tag);
509 952 VERIFY(0 == dmu_object_free(mos, obj, tx));
510 953 }
511 954
512 955 boolean_t
513 956 dsl_dir_is_clone(dsl_dir_t *dd)
514 957 {
515 958 return (dd->dd_phys->dd_origin_obj &&
516 959 (dd->dd_pool->dp_origin_snap == NULL ||
517 960 dd->dd_phys->dd_origin_obj !=
518 961 dd->dd_pool->dp_origin_snap->ds_object));
519 962 }
520 963
521 964 void
522 965 dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
523 966 {
524 967 mutex_enter(&dd->dd_lock);
525 968 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
526 969 dd->dd_phys->dd_used_bytes);
527 970 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dd->dd_phys->dd_quota);
528 971 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
529 972 dd->dd_phys->dd_reserved);
530 973 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
531 974 dd->dd_phys->dd_compressed_bytes == 0 ? 100 :
532 975 (dd->dd_phys->dd_uncompressed_bytes * 100 /
533 976 dd->dd_phys->dd_compressed_bytes));
534 977 if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
535 978 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP,
536 979 dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]);
537 980 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS,
538 981 dd->dd_phys->dd_used_breakdown[DD_USED_HEAD]);
539 982 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV,
540 983 dd->dd_phys->dd_used_breakdown[DD_USED_REFRSRV]);
541 984 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD,
542 985 dd->dd_phys->dd_used_breakdown[DD_USED_CHILD] +
543 986 dd->dd_phys->dd_used_breakdown[DD_USED_CHILD_RSRV]);
544 987 }
545 988 mutex_exit(&dd->dd_lock);
546 989
547 990 rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
548 991 if (dsl_dir_is_clone(dd)) {
549 992 dsl_dataset_t *ds;
550 993 char buf[MAXNAMELEN];
551 994
552 995 VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool,
553 996 dd->dd_phys->dd_origin_obj, FTAG, &ds));
554 997 dsl_dataset_name(ds, buf);
555 998 dsl_dataset_rele(ds, FTAG);
556 999 dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
557 1000 }
558 1001 rw_exit(&dd->dd_pool->dp_config_rwlock);
559 1002 }
560 1003
561 1004 void
562 1005 dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
563 1006 {
564 1007 dsl_pool_t *dp = dd->dd_pool;
565 1008
566 1009 ASSERT(dd->dd_phys);
567 1010
568 1011 if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg) == 0) {
569 1012 /* up the hold count until we can be written out */
570 1013 dmu_buf_add_ref(dd->dd_dbuf, dd);
571 1014 }
572 1015 }
573 1016
574 1017 static int64_t
575 1018 parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
576 1019 {
577 1020 uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved);
578 1021 uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved);
579 1022 return (new_accounted - old_accounted);
580 1023 }
581 1024
582 1025 void
583 1026 dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
584 1027 {
585 1028 ASSERT(dmu_tx_is_syncing(tx));
586 1029
587 1030 mutex_enter(&dd->dd_lock);
588 1031 ASSERT0(dd->dd_tempreserved[tx->tx_txg&TXG_MASK]);
589 1032 dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
590 1033 dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024);
591 1034 dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0;
592 1035 mutex_exit(&dd->dd_lock);
593 1036
594 1037 /* release the hold from dsl_dir_dirty */
595 1038 dmu_buf_rele(dd->dd_dbuf, dd);
596 1039 }
597 1040
598 1041 static uint64_t
599 1042 dsl_dir_space_towrite(dsl_dir_t *dd)
600 1043 {
601 1044 uint64_t space = 0;
602 1045 int i;
603 1046
604 1047 ASSERT(MUTEX_HELD(&dd->dd_lock));
605 1048
606 1049 for (i = 0; i < TXG_SIZE; i++) {
607 1050 space += dd->dd_space_towrite[i&TXG_MASK];
608 1051 ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0);
609 1052 }
610 1053 return (space);
611 1054 }
612 1055
613 1056 /*
614 1057 * How much space would dd have available if ancestor had delta applied
615 1058 * to it? If ondiskonly is set, we're only interested in what's
616 1059 * on-disk, not estimated pending changes.
617 1060 */
618 1061 uint64_t
619 1062 dsl_dir_space_available(dsl_dir_t *dd,
620 1063 dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
621 1064 {
622 1065 uint64_t parentspace, myspace, quota, used;
623 1066
624 1067 /*
625 1068 * If there are no restrictions otherwise, assume we have
626 1069 * unlimited space available.
627 1070 */
628 1071 quota = UINT64_MAX;
629 1072 parentspace = UINT64_MAX;
630 1073
631 1074 if (dd->dd_parent != NULL) {
632 1075 parentspace = dsl_dir_space_available(dd->dd_parent,
633 1076 ancestor, delta, ondiskonly);
634 1077 }
635 1078
636 1079 mutex_enter(&dd->dd_lock);
637 1080 if (dd->dd_phys->dd_quota != 0)
638 1081 quota = dd->dd_phys->dd_quota;
639 1082 used = dd->dd_phys->dd_used_bytes;
640 1083 if (!ondiskonly)
641 1084 used += dsl_dir_space_towrite(dd);
642 1085
643 1086 if (dd->dd_parent == NULL) {
644 1087 uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE);
645 1088 quota = MIN(quota, poolsize);
646 1089 }
647 1090
648 1091 if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) {
649 1092 /*
650 1093 * We have some space reserved, in addition to what our
651 1094 * parent gave us.
652 1095 */
653 1096 parentspace += dd->dd_phys->dd_reserved - used;
654 1097 }
655 1098
656 1099 if (dd == ancestor) {
657 1100 ASSERT(delta <= 0);
658 1101 ASSERT(used >= -delta);
659 1102 used += delta;
660 1103 if (parentspace != UINT64_MAX)
661 1104 parentspace -= delta;
662 1105 }
663 1106
664 1107 if (used > quota) {
665 1108 /* over quota */
666 1109 myspace = 0;
667 1110 } else {
668 1111 /*
669 1112 * the lesser of the space provided by our parent and
670 1113 * the space left in our quota
671 1114 */
672 1115 myspace = MIN(parentspace, quota - used);
673 1116 }
674 1117
675 1118 mutex_exit(&dd->dd_lock);
676 1119
677 1120 return (myspace);
678 1121 }
679 1122
680 1123 struct tempreserve {
681 1124 list_node_t tr_node;
682 1125 dsl_pool_t *tr_dp;
683 1126 dsl_dir_t *tr_ds;
684 1127 uint64_t tr_size;
685 1128 };
686 1129
687 1130 static int
688 1131 dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
689 1132 boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list,
690 1133 dmu_tx_t *tx, boolean_t first)
691 1134 {
692 1135 uint64_t txg = tx->tx_txg;
693 1136 uint64_t est_inflight, used_on_disk, quota, parent_rsrv;
694 1137 uint64_t deferred = 0;
695 1138 struct tempreserve *tr;
696 1139 int retval = EDQUOT;
697 1140 int txgidx = txg & TXG_MASK;
698 1141 int i;
699 1142 uint64_t ref_rsrv = 0;
700 1143
701 1144 ASSERT3U(txg, !=, 0);
702 1145 ASSERT3S(asize, >, 0);
703 1146
704 1147 mutex_enter(&dd->dd_lock);
705 1148
706 1149 /*
707 1150 * Check against the dsl_dir's quota. We don't add in the delta
708 1151 * when checking for over-quota because they get one free hit.
709 1152 */
710 1153 est_inflight = dsl_dir_space_towrite(dd);
711 1154 for (i = 0; i < TXG_SIZE; i++)
712 1155 est_inflight += dd->dd_tempreserved[i];
713 1156 used_on_disk = dd->dd_phys->dd_used_bytes;
714 1157
715 1158 /*
716 1159 * On the first iteration, fetch the dataset's used-on-disk and
717 1160 * refreservation values. Also, if checkrefquota is set, test if
718 1161 * allocating this space would exceed the dataset's refquota.
719 1162 */
720 1163 if (first && tx->tx_objset) {
721 1164 int error;
722 1165 dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;
723 1166
724 1167 error = dsl_dataset_check_quota(ds, checkrefquota,
725 1168 asize, est_inflight, &used_on_disk, &ref_rsrv);
726 1169 if (error) {
727 1170 mutex_exit(&dd->dd_lock);
728 1171 return (error);
729 1172 }
730 1173 }
731 1174
732 1175 /*
733 1176 * If this transaction will result in a net free of space,
734 1177 * we want to let it through.
735 1178 */
736 1179 if (ignorequota || netfree || dd->dd_phys->dd_quota == 0)
737 1180 quota = UINT64_MAX;
738 1181 else
739 1182 quota = dd->dd_phys->dd_quota;
740 1183
741 1184 /*
742 1185 * Adjust the quota against the actual pool size at the root
743 1186 * minus any outstanding deferred frees.
744 1187 * To ensure that it's possible to remove files from a full
745 1188 * pool without inducing transient overcommits, we throttle
746 1189 * netfree transactions against a quota that is slightly larger,
747 1190 * but still within the pool's allocation slop. In cases where
748 1191 * we're very close to full, this will allow a steady trickle of
749 1192 * removes to get through.
750 1193 */
751 1194 if (dd->dd_parent == NULL) {
752 1195 spa_t *spa = dd->dd_pool->dp_spa;
753 1196 uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
754 1197 deferred = metaslab_class_get_deferred(spa_normal_class(spa));
755 1198 if (poolsize - deferred < quota) {
756 1199 quota = poolsize - deferred;
757 1200 retval = ENOSPC;
758 1201 }
759 1202 }
760 1203
761 1204 /*
762 1205 * If they are requesting more space, and our current estimate
763 1206 * is over quota, they get to try again unless the actual
764 1207 * on-disk is over quota and there are no pending changes (which
765 1208 * may free up space for us).
766 1209 */
767 1210 if (used_on_disk + est_inflight >= quota) {
768 1211 if (est_inflight > 0 || used_on_disk < quota ||
769 1212 (retval == ENOSPC && used_on_disk < quota + deferred))
770 1213 retval = ERESTART;
771 1214 dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
772 1215 "quota=%lluK tr=%lluK err=%d\n",
773 1216 used_on_disk>>10, est_inflight>>10,
774 1217 quota>>10, asize>>10, retval);
775 1218 mutex_exit(&dd->dd_lock);
776 1219 return (retval);
777 1220 }
778 1221
779 1222 /* We need to up our estimated delta before dropping dd_lock */
780 1223 dd->dd_tempreserved[txgidx] += asize;
781 1224
782 1225 parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
783 1226 asize - ref_rsrv);
784 1227 mutex_exit(&dd->dd_lock);
785 1228
786 1229 tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
787 1230 tr->tr_ds = dd;
788 1231 tr->tr_size = asize;
789 1232 list_insert_tail(tr_list, tr);
790 1233
791 1234 /* see if it's OK with our parent */
792 1235 if (dd->dd_parent && parent_rsrv) {
793 1236 boolean_t ismos = (dd->dd_phys->dd_head_dataset_obj == 0);
794 1237
795 1238 return (dsl_dir_tempreserve_impl(dd->dd_parent,
796 1239 parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE));
797 1240 } else {
798 1241 return (0);
799 1242 }
800 1243 }
801 1244
802 1245 /*
803 1246 * Reserve space in this dsl_dir, to be used in this tx's txg.
804 1247 * After the space has been dirtied (and dsl_dir_willuse_space()
805 1248 * has been called), the reservation should be canceled, using
806 1249 * dsl_dir_tempreserve_clear().
807 1250 */
808 1251 int
809 1252 dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
810 1253 uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx)
811 1254 {
812 1255 int err;
813 1256 list_t *tr_list;
814 1257
815 1258 if (asize == 0) {
816 1259 *tr_cookiep = NULL;
817 1260 return (0);
818 1261 }
819 1262
820 1263 tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
821 1264 list_create(tr_list, sizeof (struct tempreserve),
822 1265 offsetof(struct tempreserve, tr_node));
823 1266 ASSERT3S(asize, >, 0);
824 1267 ASSERT3S(fsize, >=, 0);
825 1268
826 1269 err = arc_tempreserve_space(lsize, tx->tx_txg);
827 1270 if (err == 0) {
828 1271 struct tempreserve *tr;
829 1272
830 1273 tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
831 1274 tr->tr_size = lsize;
832 1275 list_insert_tail(tr_list, tr);
833 1276
834 1277 err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx);
835 1278 } else {
836 1279 if (err == EAGAIN) {
837 1280 txg_delay(dd->dd_pool, tx->tx_txg,
838 1281 zfs_zone_txg_delay());
839 1282 err = ERESTART;
840 1283 }
841 1284 dsl_pool_memory_pressure(dd->dd_pool);
842 1285 }
843 1286
844 1287 if (err == 0) {
845 1288 struct tempreserve *tr;
846 1289
847 1290 tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
848 1291 tr->tr_dp = dd->dd_pool;
849 1292 tr->tr_size = asize;
850 1293 list_insert_tail(tr_list, tr);
851 1294
852 1295 err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
853 1296 FALSE, asize > usize, tr_list, tx, TRUE);
854 1297 }
855 1298
856 1299 if (err)
857 1300 dsl_dir_tempreserve_clear(tr_list, tx);
858 1301 else
859 1302 *tr_cookiep = tr_list;
860 1303
861 1304 return (err);
862 1305 }
863 1306
864 1307 /*
865 1308 * Clear a temporary reservation that we previously made with
866 1309 * dsl_dir_tempreserve_space().
867 1310 */
868 1311 void
869 1312 dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
870 1313 {
871 1314 int txgidx = tx->tx_txg & TXG_MASK;
872 1315 list_t *tr_list = tr_cookie;
873 1316 struct tempreserve *tr;
874 1317
875 1318 ASSERT3U(tx->tx_txg, !=, 0);
876 1319
877 1320 if (tr_cookie == NULL)
878 1321 return;
879 1322
880 1323 while (tr = list_head(tr_list)) {
881 1324 if (tr->tr_dp) {
882 1325 dsl_pool_tempreserve_clear(tr->tr_dp, tr->tr_size, tx);
883 1326 } else if (tr->tr_ds) {
884 1327 mutex_enter(&tr->tr_ds->dd_lock);
885 1328 ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
886 1329 tr->tr_size);
887 1330 tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
888 1331 mutex_exit(&tr->tr_ds->dd_lock);
889 1332 } else {
890 1333 arc_tempreserve_clear(tr->tr_size);
891 1334 }
892 1335 list_remove(tr_list, tr);
893 1336 kmem_free(tr, sizeof (struct tempreserve));
894 1337 }
895 1338
896 1339 kmem_free(tr_list, sizeof (list_t));
897 1340 }
898 1341
899 1342 static void
900 1343 dsl_dir_willuse_space_impl(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
901 1344 {
902 1345 int64_t parent_space;
903 1346 uint64_t est_used;
904 1347
905 1348 mutex_enter(&dd->dd_lock);
906 1349 if (space > 0)
907 1350 dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
908 1351
909 1352 est_used = dsl_dir_space_towrite(dd) + dd->dd_phys->dd_used_bytes;
910 1353 parent_space = parent_delta(dd, est_used, space);
911 1354 mutex_exit(&dd->dd_lock);
912 1355
913 1356 /* Make sure that we clean up dd_space_to* */
914 1357 dsl_dir_dirty(dd, tx);
915 1358
916 1359 /* XXX this is potentially expensive and unnecessary... */
917 1360 if (parent_space && dd->dd_parent)
918 1361 dsl_dir_willuse_space_impl(dd->dd_parent, parent_space, tx);
919 1362 }
920 1363
921 1364 /*
922 1365 * Call in open context when we think we're going to write/free space,
923 1366 * eg. when dirtying data. Be conservative (ie. OK to write less than
924 1367 * this or free more than this, but don't write more or free less).
925 1368 */
926 1369 void
927 1370 dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
928 1371 {
929 1372 dsl_pool_willuse_space(dd->dd_pool, space, tx);
930 1373 dsl_dir_willuse_space_impl(dd, space, tx);
931 1374 }
932 1375
933 1376 /* call from syncing context when we actually write/free space for this dd */
934 1377 void
935 1378 dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
936 1379 int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
937 1380 {
938 1381 int64_t accounted_delta;
939 1382 boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
940 1383
941 1384 ASSERT(dmu_tx_is_syncing(tx));
942 1385 ASSERT(type < DD_USED_NUM);
943 1386
944 1387 if (needlock)
945 1388 mutex_enter(&dd->dd_lock);
946 1389 accounted_delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, used);
947 1390 ASSERT(used >= 0 || dd->dd_phys->dd_used_bytes >= -used);
948 1391 ASSERT(compressed >= 0 ||
949 1392 dd->dd_phys->dd_compressed_bytes >= -compressed);
950 1393 ASSERT(uncompressed >= 0 ||
951 1394 dd->dd_phys->dd_uncompressed_bytes >= -uncompressed);
952 1395 dmu_buf_will_dirty(dd->dd_dbuf, tx);
953 1396 dd->dd_phys->dd_used_bytes += used;
954 1397 dd->dd_phys->dd_uncompressed_bytes += uncompressed;
955 1398 dd->dd_phys->dd_compressed_bytes += compressed;
956 1399
957 1400 if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
958 1401 ASSERT(used > 0 ||
959 1402 dd->dd_phys->dd_used_breakdown[type] >= -used);
960 1403 dd->dd_phys->dd_used_breakdown[type] += used;
961 1404 #ifdef DEBUG
962 1405 dd_used_t t;
963 1406 uint64_t u = 0;
964 1407 for (t = 0; t < DD_USED_NUM; t++)
965 1408 u += dd->dd_phys->dd_used_breakdown[t];
966 1409 ASSERT3U(u, ==, dd->dd_phys->dd_used_bytes);
967 1410 #endif
968 1411 }
969 1412 if (needlock)
970 1413 mutex_exit(&dd->dd_lock);
971 1414
972 1415 if (dd->dd_parent != NULL) {
973 1416 dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
974 1417 accounted_delta, compressed, uncompressed, tx);
975 1418 dsl_dir_transfer_space(dd->dd_parent,
976 1419 used - accounted_delta,
977 1420 DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
978 1421 }
979 1422 }
980 1423
981 1424 void
982 1425 dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
983 1426 dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
984 1427 {
985 1428 boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
986 1429
987 1430 ASSERT(dmu_tx_is_syncing(tx));
988 1431 ASSERT(oldtype < DD_USED_NUM);
989 1432 ASSERT(newtype < DD_USED_NUM);
990 1433
991 1434 if (delta == 0 || !(dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN))
992 1435 return;
993 1436
994 1437 if (needlock)
995 1438 mutex_enter(&dd->dd_lock);
996 1439 ASSERT(delta > 0 ?
997 1440 dd->dd_phys->dd_used_breakdown[oldtype] >= delta :
998 1441 dd->dd_phys->dd_used_breakdown[newtype] >= -delta);
999 1442 ASSERT(dd->dd_phys->dd_used_bytes >= ABS(delta));
1000 1443 dmu_buf_will_dirty(dd->dd_dbuf, tx);
1001 1444 dd->dd_phys->dd_used_breakdown[oldtype] -= delta;
1002 1445 dd->dd_phys->dd_used_breakdown[newtype] += delta;
1003 1446 if (needlock)
1004 1447 mutex_exit(&dd->dd_lock);
1005 1448 }
1006 1449
1007 1450 static int
1008 1451 dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx)
1009 1452 {
1010 1453 dsl_dataset_t *ds = arg1;
1011 1454 dsl_dir_t *dd = ds->ds_dir;
1012 1455 dsl_prop_setarg_t *psa = arg2;
1013 1456 int err;
1014 1457 uint64_t towrite;
1015 1458
1016 1459 if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
1017 1460 return (err);
1018 1461
1019 1462 if (psa->psa_effective_value == 0)
1020 1463 return (0);
1021 1464
1022 1465 mutex_enter(&dd->dd_lock);
1023 1466 /*
1024 1467 * If we are doing the preliminary check in open context, and
1025 1468 * there are pending changes, then don't fail it, since the
1026 1469 * pending changes could under-estimate the amount of space to be
1027 1470 * freed up.
1028 1471 */
↓ open down ↓ |
529 lines elided |
↑ open up ↑ |
1029 1472 towrite = dsl_dir_space_towrite(dd);
1030 1473 if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
1031 1474 (psa->psa_effective_value < dd->dd_phys->dd_reserved ||
1032 1475 psa->psa_effective_value < dd->dd_phys->dd_used_bytes + towrite)) {
1033 1476 err = ENOSPC;
1034 1477 }
1035 1478 mutex_exit(&dd->dd_lock);
1036 1479 return (err);
1037 1480 }
1038 1481
1039 -extern dsl_syncfunc_t dsl_prop_set_sync;
1040 -
1041 1482 static void
1042 1483 dsl_dir_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx)
1043 1484 {
1044 1485 dsl_dataset_t *ds = arg1;
1045 1486 dsl_dir_t *dd = ds->ds_dir;
1046 1487 dsl_prop_setarg_t *psa = arg2;
1047 1488 uint64_t effective_value = psa->psa_effective_value;
1048 1489
1049 1490 dsl_prop_set_sync(ds, psa, tx);
1050 1491 DSL_PROP_CHECK_PREDICTION(dd, psa);
1051 1492
1052 1493 dmu_buf_will_dirty(dd->dd_dbuf, tx);
1053 1494
1054 1495 mutex_enter(&dd->dd_lock);
1055 1496 dd->dd_phys->dd_quota = effective_value;
1056 1497 mutex_exit(&dd->dd_lock);
1057 1498 }
1058 1499
1059 1500 int
1060 1501 dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
1061 1502 {
1062 1503 dsl_dir_t *dd;
1063 1504 dsl_dataset_t *ds;
1064 1505 dsl_prop_setarg_t psa;
1065 1506 int err;
1066 1507
1067 1508 dsl_prop_setarg_init_uint64(&psa, "quota", source, "a);
1068 1509
1069 1510 err = dsl_dataset_hold(ddname, FTAG, &ds);
1070 1511 if (err)
1071 1512 return (err);
1072 1513
1073 1514 err = dsl_dir_open(ddname, FTAG, &dd, NULL);
1074 1515 if (err) {
1075 1516 dsl_dataset_rele(ds, FTAG);
1076 1517 return (err);
1077 1518 }
1078 1519
1079 1520 ASSERT(ds->ds_dir == dd);
1080 1521
1081 1522 /*
1082 1523 * If someone removes a file, then tries to set the quota, we want to
1083 1524 * make sure the file freeing takes effect.
1084 1525 */
1085 1526 txg_wait_open(dd->dd_pool, 0);
1086 1527
1087 1528 err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check,
1088 1529 dsl_dir_set_quota_sync, ds, &psa, 0);
1089 1530
1090 1531 dsl_dir_close(dd, FTAG);
1091 1532 dsl_dataset_rele(ds, FTAG);
1092 1533 return (err);
1093 1534 }
1094 1535
1095 1536 int
1096 1537 dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx)
1097 1538 {
1098 1539 dsl_dataset_t *ds = arg1;
1099 1540 dsl_dir_t *dd = ds->ds_dir;
1100 1541 dsl_prop_setarg_t *psa = arg2;
1101 1542 uint64_t effective_value;
1102 1543 uint64_t used, avail;
1103 1544 int err;
1104 1545
1105 1546 if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0)
1106 1547 return (err);
1107 1548
1108 1549 effective_value = psa->psa_effective_value;
1109 1550
1110 1551 /*
1111 1552 * If we are doing the preliminary check in open context, the
1112 1553 * space estimates may be inaccurate.
1113 1554 */
1114 1555 if (!dmu_tx_is_syncing(tx))
1115 1556 return (0);
1116 1557
1117 1558 mutex_enter(&dd->dd_lock);
1118 1559 used = dd->dd_phys->dd_used_bytes;
1119 1560 mutex_exit(&dd->dd_lock);
1120 1561
1121 1562 if (dd->dd_parent) {
1122 1563 avail = dsl_dir_space_available(dd->dd_parent,
1123 1564 NULL, 0, FALSE);
1124 1565 } else {
1125 1566 avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
1126 1567 }
1127 1568
1128 1569 if (MAX(used, effective_value) > MAX(used, dd->dd_phys->dd_reserved)) {
1129 1570 uint64_t delta = MAX(used, effective_value) -
1130 1571 MAX(used, dd->dd_phys->dd_reserved);
1131 1572
1132 1573 if (delta > avail)
1133 1574 return (ENOSPC);
1134 1575 if (dd->dd_phys->dd_quota > 0 &&
1135 1576 effective_value > dd->dd_phys->dd_quota)
1136 1577 return (ENOSPC);
1137 1578 }
1138 1579
1139 1580 return (0);
1140 1581 }
1141 1582
1142 1583 static void
1143 1584 dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx)
1144 1585 {
1145 1586 uint64_t used;
1146 1587 int64_t delta;
1147 1588
1148 1589 dmu_buf_will_dirty(dd->dd_dbuf, tx);
1149 1590
1150 1591 mutex_enter(&dd->dd_lock);
1151 1592 used = dd->dd_phys->dd_used_bytes;
1152 1593 delta = MAX(used, value) - MAX(used, dd->dd_phys->dd_reserved);
1153 1594 dd->dd_phys->dd_reserved = value;
1154 1595
1155 1596 if (dd->dd_parent != NULL) {
1156 1597 /* Roll up this additional usage into our ancestors */
1157 1598 dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
1158 1599 delta, 0, 0, tx);
1159 1600 }
1160 1601 mutex_exit(&dd->dd_lock);
1161 1602 }
1162 1603
1163 1604
1164 1605 static void
1165 1606 dsl_dir_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx)
1166 1607 {
1167 1608 dsl_dataset_t *ds = arg1;
1168 1609 dsl_dir_t *dd = ds->ds_dir;
1169 1610 dsl_prop_setarg_t *psa = arg2;
1170 1611 uint64_t value = psa->psa_effective_value;
1171 1612
1172 1613 dsl_prop_set_sync(ds, psa, tx);
1173 1614 DSL_PROP_CHECK_PREDICTION(dd, psa);
1174 1615
1175 1616 dsl_dir_set_reservation_sync_impl(dd, value, tx);
1176 1617 }
1177 1618
1178 1619 int
1179 1620 dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
1180 1621 uint64_t reservation)
1181 1622 {
1182 1623 dsl_dir_t *dd;
1183 1624 dsl_dataset_t *ds;
1184 1625 dsl_prop_setarg_t psa;
1185 1626 int err;
1186 1627
1187 1628 dsl_prop_setarg_init_uint64(&psa, "reservation", source, &reservation);
1188 1629
1189 1630 err = dsl_dataset_hold(ddname, FTAG, &ds);
1190 1631 if (err)
1191 1632 return (err);
1192 1633
1193 1634 err = dsl_dir_open(ddname, FTAG, &dd, NULL);
1194 1635 if (err) {
1195 1636 dsl_dataset_rele(ds, FTAG);
1196 1637 return (err);
1197 1638 }
1198 1639
1199 1640 ASSERT(ds->ds_dir == dd);
1200 1641
1201 1642 err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_reservation_check,
1202 1643 dsl_dir_set_reservation_sync, ds, &psa, 0);
1203 1644
1204 1645 dsl_dir_close(dd, FTAG);
1205 1646 dsl_dataset_rele(ds, FTAG);
1206 1647 return (err);
1207 1648 }
1208 1649
1209 1650 static dsl_dir_t *
1210 1651 closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
1211 1652 {
1212 1653 for (; ds1; ds1 = ds1->dd_parent) {
1213 1654 dsl_dir_t *dd;
1214 1655 for (dd = ds2; dd; dd = dd->dd_parent) {
1215 1656 if (ds1 == dd)
1216 1657 return (dd);
1217 1658 }
1218 1659 }
1219 1660 return (NULL);
1220 1661 }
1221 1662
1222 1663 /*
1223 1664 * If delta is applied to dd, how much of that delta would be applied to
1224 1665 * ancestor? Syncing context only.
1225 1666 */
1226 1667 static int64_t
1227 1668 would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
1228 1669 {
1229 1670 if (dd == ancestor)
1230 1671 return (delta);
1231 1672
1232 1673 mutex_enter(&dd->dd_lock);
1233 1674 delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, delta);
1234 1675 mutex_exit(&dd->dd_lock);
1235 1676 return (would_change(dd->dd_parent, delta, ancestor));
1236 1677 }
1237 1678
1238 1679 struct renamearg {
1239 1680 dsl_dir_t *newparent;
1240 1681 const char *mynewname;
1241 1682 };
1242 1683
1243 1684 static int
1244 1685 dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx)
1245 1686 {
1246 1687 dsl_dir_t *dd = arg1;
1247 1688 struct renamearg *ra = arg2;
1248 1689 dsl_pool_t *dp = dd->dd_pool;
1249 1690 objset_t *mos = dp->dp_meta_objset;
1250 1691 int err;
1251 1692 uint64_t val;
1252 1693
1253 1694 /*
1254 1695 * There should only be one reference, from dmu_objset_rename().
1255 1696 * Fleeting holds are also possible (eg, from "zfs list" getting
1256 1697 * stats), but any that are present in open context will likely
1257 1698 * be gone by syncing context, so only fail from syncing
1258 1699 * context.
1259 1700 */
1260 1701 if (dmu_tx_is_syncing(tx) && dmu_buf_refcount(dd->dd_dbuf) > 1)
1261 1702 return (EBUSY);
1262 1703
1263 1704 /* check for existing name */
1264 1705 err = zap_lookup(mos, ra->newparent->dd_phys->dd_child_dir_zapobj,
1265 1706 ra->mynewname, 8, 1, &val);
1266 1707 if (err == 0)
1267 1708 return (EEXIST);
1268 1709 if (err != ENOENT)
1269 1710 return (err);
1270 1711
↓ open down ↓ |
220 lines elided |
↑ open up ↑ |
1271 1712 if (ra->newparent != dd->dd_parent) {
1272 1713 /* is there enough space? */
1273 1714 uint64_t myspace =
1274 1715 MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved);
1275 1716
1276 1717 /* no rename into our descendant */
1277 1718 if (closest_common_ancestor(dd, ra->newparent) == dd)
1278 1719 return (EINVAL);
1279 1720
1280 1721 if (err = dsl_dir_transfer_possible(dd->dd_parent,
1281 - ra->newparent, myspace))
1722 + ra->newparent, dd, myspace, tx))
1282 1723 return (err);
1283 1724 }
1284 1725
1285 1726 return (0);
1286 1727 }
1287 1728
1288 1729 static void
1289 1730 dsl_dir_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx)
1290 1731 {
1291 1732 dsl_dir_t *dd = arg1;
1292 1733 struct renamearg *ra = arg2;
1293 1734 dsl_pool_t *dp = dd->dd_pool;
1294 1735 objset_t *mos = dp->dp_meta_objset;
1295 1736 int err;
↓ open down ↓ |
4 lines elided |
↑ open up ↑ |
1296 1737 char namebuf[MAXNAMELEN];
1297 1738
1298 1739 ASSERT(dmu_buf_refcount(dd->dd_dbuf) <= 2);
1299 1740
1300 1741 /* Log this before we change the name. */
1301 1742 dsl_dir_name(ra->newparent, namebuf);
1302 1743 spa_history_log_internal_dd(dd, "rename", tx,
1303 1744 "-> %s/%s", namebuf, ra->mynewname);
1304 1745
1305 1746 if (ra->newparent != dd->dd_parent) {
1747 + int cnt;
1748 +
1749 + mutex_enter(&dd->dd_lock);
1750 +
1751 + cnt = dd->dd_phys->dd_filesystem_count;
1752 + dsl_dir_fscount_adjust(dd->dd_parent, tx, -cnt, B_TRUE, B_TRUE);
1753 + dsl_dir_fscount_adjust(ra->newparent, tx, cnt, B_TRUE, B_TRUE);
1754 +
1755 + cnt = dd->dd_phys->dd_snapshot_count;
1756 + dsl_snapcount_adjust(dd->dd_parent, tx, -cnt, B_TRUE);
1757 + dsl_snapcount_adjust(ra->newparent, tx, cnt, B_TRUE);
1758 +
1759 + mutex_exit(&dd->dd_lock);
1760 +
1306 1761 dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
1307 1762 -dd->dd_phys->dd_used_bytes,
1308 1763 -dd->dd_phys->dd_compressed_bytes,
1309 1764 -dd->dd_phys->dd_uncompressed_bytes, tx);
1310 1765 dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD,
1311 1766 dd->dd_phys->dd_used_bytes,
1312 1767 dd->dd_phys->dd_compressed_bytes,
1313 1768 dd->dd_phys->dd_uncompressed_bytes, tx);
1314 1769
1315 1770 if (dd->dd_phys->dd_reserved > dd->dd_phys->dd_used_bytes) {
1316 1771 uint64_t unused_rsrv = dd->dd_phys->dd_reserved -
1317 1772 dd->dd_phys->dd_used_bytes;
1318 1773
1319 1774 dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
1320 1775 -unused_rsrv, 0, 0, tx);
1321 1776 dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD_RSRV,
1322 1777 unused_rsrv, 0, 0, tx);
1323 1778 }
1324 1779 }
1325 1780
1326 1781 dmu_buf_will_dirty(dd->dd_dbuf, tx);
1327 1782
1328 1783 /* remove from old parent zapobj */
1329 1784 err = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj,
1330 1785 dd->dd_myname, tx);
1331 1786 ASSERT0(err);
1332 1787
1333 1788 (void) strcpy(dd->dd_myname, ra->mynewname);
1334 1789 dsl_dir_close(dd->dd_parent, dd);
1335 1790 dd->dd_phys->dd_parent_obj = ra->newparent->dd_object;
1336 1791 VERIFY(0 == dsl_dir_open_obj(dd->dd_pool,
1337 1792 ra->newparent->dd_object, NULL, dd, &dd->dd_parent));
1338 1793
1339 1794 /* add to new parent zapobj */
1340 1795 err = zap_add(mos, ra->newparent->dd_phys->dd_child_dir_zapobj,
1341 1796 dd->dd_myname, 8, 1, &dd->dd_object, tx);
1342 1797 ASSERT0(err);
1343 1798
1344 1799 }
1345 1800
1346 1801 int
1347 1802 dsl_dir_rename(dsl_dir_t *dd, const char *newname)
1348 1803 {
1349 1804 struct renamearg ra;
1350 1805 int err;
1351 1806
1352 1807 /* new parent should exist */
1353 1808 err = dsl_dir_open(newname, FTAG, &ra.newparent, &ra.mynewname);
1354 1809 if (err)
1355 1810 return (err);
1356 1811
1357 1812 /* can't rename to different pool */
1358 1813 if (dd->dd_pool != ra.newparent->dd_pool) {
1359 1814 err = ENXIO;
1360 1815 goto out;
1361 1816 }
1362 1817
1363 1818 /* new name should not already exist */
1364 1819 if (ra.mynewname == NULL) {
1365 1820 err = EEXIST;
1366 1821 goto out;
1367 1822 }
↓ open down ↓ |
52 lines elided |
↑ open up ↑ |
1368 1823
1369 1824 err = dsl_sync_task_do(dd->dd_pool,
1370 1825 dsl_dir_rename_check, dsl_dir_rename_sync, dd, &ra, 3);
1371 1826
1372 1827 out:
1373 1828 dsl_dir_close(ra.newparent, FTAG);
1374 1829 return (err);
1375 1830 }
1376 1831
1377 1832 int
1378 -dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space)
1833 +dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, dsl_dir_t *moving_dd,
1834 + uint64_t space, dmu_tx_t *tx)
1379 1835 {
1380 1836 dsl_dir_t *ancestor;
1381 1837 int64_t adelta;
1382 1838 uint64_t avail;
1839 + int err;
1383 1840
1384 1841 ancestor = closest_common_ancestor(sdd, tdd);
1385 1842 adelta = would_change(sdd, -space, ancestor);
1386 1843 avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
1387 1844 if (avail < space)
1388 1845 return (ENOSPC);
1389 1846
1847 + if (sdd != moving_dd) {
1848 + err = dsl_dir_fscount_check(tdd,
1849 + moving_dd->dd_phys->dd_filesystem_count, ancestor);
1850 + if (err != 0)
1851 + return (err);
1852 + }
1853 + err = dsl_snapcount_check(tdd, moving_dd->dd_phys->dd_snapshot_count,
1854 + ancestor);
1855 + if (err != 0)
1856 + return (err);
1857 +
1390 1858 return (0);
1391 1859 }
1392 1860
1393 1861 timestruc_t
1394 1862 dsl_dir_snap_cmtime(dsl_dir_t *dd)
1395 1863 {
1396 1864 timestruc_t t;
1397 1865
1398 1866 mutex_enter(&dd->dd_lock);
1399 1867 t = dd->dd_snap_cmtime;
1400 1868 mutex_exit(&dd->dd_lock);
1401 1869
1402 1870 return (t);
1403 1871 }
1404 1872
1405 1873 void
1406 1874 dsl_dir_snap_cmtime_update(dsl_dir_t *dd)
1407 1875 {
1408 1876 timestruc_t t;
1409 1877
1410 1878 gethrestime(&t);
1411 1879 mutex_enter(&dd->dd_lock);
1412 1880 dd->dd_snap_cmtime = t;
1413 1881 mutex_exit(&dd->dd_lock);
1414 1882 }
↓ open down ↓ |
15 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX