4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
--- old/usr/src/uts/common/fs/zfs/zfs_vnops.c
+++ new/usr/src/uts/common/fs/zfs/zfs_vnops.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2013 by Delphix. All rights reserved.
24 24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25 25 */
26 26
27 27 /* Portions Copyright 2007 Jeremy Teo */
28 28 /* Portions Copyright 2010 Robert Milkowski */
29 29
30 30 #include <sys/types.h>
31 31 #include <sys/param.h>
32 32 #include <sys/time.h>
33 33 #include <sys/systm.h>
34 34 #include <sys/sysmacros.h>
35 35 #include <sys/resource.h>
36 36 #include <sys/vfs.h>
37 37 #include <sys/vfs_opreg.h>
38 38 #include <sys/vnode.h>
39 39 #include <sys/file.h>
40 40 #include <sys/stat.h>
41 41 #include <sys/kmem.h>
42 42 #include <sys/taskq.h>
43 43 #include <sys/uio.h>
44 44 #include <sys/vmsystm.h>
45 45 #include <sys/atomic.h>
46 46 #include <sys/vm.h>
47 47 #include <vm/seg_vn.h>
48 48 #include <vm/pvn.h>
49 49 #include <vm/as.h>
50 50 #include <vm/kpm.h>
51 51 #include <vm/seg_kpm.h>
52 52 #include <sys/mman.h>
53 53 #include <sys/pathname.h>
54 54 #include <sys/cmn_err.h>
55 55 #include <sys/errno.h>
56 56 #include <sys/unistd.h>
57 57 #include <sys/zfs_dir.h>
58 58 #include <sys/zfs_acl.h>
59 59 #include <sys/zfs_ioctl.h>
60 60 #include <sys/fs/zfs.h>
61 61 #include <sys/dmu.h>
62 62 #include <sys/dmu_objset.h>
63 63 #include <sys/spa.h>
64 64 #include <sys/txg.h>
65 65 #include <sys/dbuf.h>
66 66 #include <sys/zap.h>
67 67 #include <sys/sa.h>
68 68 #include <sys/dirent.h>
69 69 #include <sys/policy.h>
70 70 #include <sys/sunddi.h>
71 71 #include <sys/filio.h>
72 72 #include <sys/sid.h>
73 73 #include "fs/fs_subr.h"
74 74 #include <sys/zfs_ctldir.h>
75 75 #include <sys/zfs_fuid.h>
76 76 #include <sys/zfs_sa.h>
77 77 #include <sys/dnlc.h>
78 78 #include <sys/zfs_rlock.h>
79 79 #include <sys/extdirent.h>
80 80 #include <sys/kidmap.h>
81 81 #include <sys/cred.h>
82 82 #include <sys/attr.h>
83 83
84 84 /*
85 85 * Programming rules.
86 86 *
87 87 * Each vnode op performs some logical unit of work. To do this, the ZPL must
88 88 * properly lock its in-core state, create a DMU transaction, do the work,
89 89 * record this work in the intent log (ZIL), commit the DMU transaction,
90 90 * and wait for the intent log to commit if it is a synchronous operation.
91 91 * Moreover, the vnode ops must work in both normal and log replay context.
92 92 * The ordering of events is important to avoid deadlocks and references
93 93 * to freed memory. The example below illustrates the following Big Rules:
94 94 *
95 95 * (1) A check must be made in each zfs thread for a mounted file system.
96 96 * This is done, while avoiding races, using ZFS_ENTER(zfsvfs).
97 97 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
98 98 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
99 99 * can return EIO from the calling function.
100 100 *
101 101 * (2) VN_RELE() should always be the last thing except for zil_commit()
102 102 * (if necessary) and ZFS_EXIT(). This is for 3 reasons:
103 103 * First, if it's the last reference, the vnode/znode
104 104 * can be freed, so the zp may point to freed memory. Second, the last
105 105 * reference will call zfs_zinactive(), which may induce a lot of work --
106 106 * pushing cached pages (which acquires range locks) and syncing out
107 107 * cached atime changes. Third, zfs_zinactive() may require a new tx,
108 108 * which could deadlock the system if you were already holding one.
109 109 * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
110 110 *
111 111 * (3) All range locks must be grabbed before calling dmu_tx_assign(),
112 112 * as they can span dmu_tx_assign() calls.
113 113 *
114 114 * (4) Always pass TXG_NOWAIT as the second argument to dmu_tx_assign().
115 115 * This is critical because we don't want to block while holding locks.
116 116 * Note, in particular, that if a lock is sometimes acquired before
117 117 * the tx assigns, and sometimes after (e.g. z_lock), then failing to
118 118 * use a non-blocking assign can deadlock the system. The scenario:
119 119 *
120 120 * Thread A has grabbed a lock before calling dmu_tx_assign().
121 121 * Thread B is in an already-assigned tx, and blocks for this lock.
122 122 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
123 123 * forever, because the previous txg can't quiesce until B's tx commits.
124 124 *
125 125 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
126 - * then drop all locks, call dmu_tx_wait(), and try again.
126 + * then drop all locks, call dmu_tx_wait(), and try again. On subsequent
127 + * calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
128 + * to indicate that this operation has already called dmu_tx_wait().
129 + * This will ensure that we don't retry forever, waiting a short bit
130 + * each time.
127 131 *
128 132 * (5) If the operation succeeded, generate the intent log entry for it
129 133 * before dropping locks. This ensures that the ordering of events
130 134 * in the intent log matches the order in which they actually occurred.
131 135 * During ZIL replay the zfs_log_* functions will update the sequence
132 136 * number to indicate the zil transaction has replayed.
133 137 *
134 138 * (6) At the end of each vnode op, the DMU tx must always commit,
135 139 * regardless of whether there were any errors.
136 140 *
137 141 * (7) After dropping all locks, invoke zil_commit(zilog, foid)
138 142 * to ensure that synchronous semantics are provided when necessary.
139 143 *
140 144 * In general, this is how things should be ordered in each vnode op:
141 145 *
142 146 * ZFS_ENTER(zfsvfs); // exit if unmounted
143 147 * top:
144 148 * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD())
145 149 * rw_enter(...); // grab any other locks you need
146 150 * tx = dmu_tx_create(...); // get DMU tx
147 151 * dmu_tx_hold_*(); // hold each object you might modify
148 - * error = dmu_tx_assign(tx, TXG_NOWAIT); // try to assign
152 + * error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
149 153 * if (error) {
150 154 * rw_exit(...); // drop locks
151 155 * zfs_dirent_unlock(dl); // unlock directory entry
152 156 * VN_RELE(...); // release held vnodes
153 157 * if (error == ERESTART) {
158 + * waited = B_TRUE;
154 159 * dmu_tx_wait(tx);
155 160 * dmu_tx_abort(tx);
156 161 * goto top;
157 162 * }
158 163 * dmu_tx_abort(tx); // abort DMU tx
159 164 * ZFS_EXIT(zfsvfs); // finished in zfs
160 165 * return (error); // really out of space
161 166 * }
162 167 * error = do_real_work(); // do whatever this VOP does
163 168 * if (error == 0)
164 169 * zfs_log_*(...); // on success, make ZIL entry
165 170 * dmu_tx_commit(tx); // commit DMU tx -- error or not
166 171 * rw_exit(...); // drop locks
167 172 * zfs_dirent_unlock(dl); // unlock directory entry
168 173 * VN_RELE(...); // release held vnodes
169 174 * zil_commit(zilog, foid); // synchronous when necessary
170 175 * ZFS_EXIT(zfsvfs); // finished in zfs
171 176 * return (error); // done, report error
172 177 */
173 178
174 179 /* ARGSUSED */
175 180 static int
176 181 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
177 182 {
178 183 znode_t *zp = VTOZ(*vpp);
179 184 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
180 185
181 186 ZFS_ENTER(zfsvfs);
182 187 ZFS_VERIFY_ZP(zp);
183 188
184 189 if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
185 190 ((flag & FAPPEND) == 0)) {
186 191 ZFS_EXIT(zfsvfs);
187 192 return (SET_ERROR(EPERM));
188 193 }
189 194
190 195 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
191 196 ZTOV(zp)->v_type == VREG &&
192 197 !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
193 198 if (fs_vscan(*vpp, cr, 0) != 0) {
194 199 ZFS_EXIT(zfsvfs);
195 200 return (SET_ERROR(EACCES));
196 201 }
197 202 }
198 203
199 204 /* Keep a count of the synchronous opens in the znode */
200 205 if (flag & (FSYNC | FDSYNC))
201 206 atomic_inc_32(&zp->z_sync_cnt);
202 207
203 208 ZFS_EXIT(zfsvfs);
204 209 return (0);
205 210 }
206 211
207 212 /* ARGSUSED */
208 213 static int
209 214 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
210 215 caller_context_t *ct)
211 216 {
212 217 znode_t *zp = VTOZ(vp);
213 218 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
214 219
215 220 /*
216 221 * Clean up any locks held by this process on the vp.
217 222 */
218 223 cleanlocks(vp, ddi_get_pid(), 0);
219 224 cleanshares(vp, ddi_get_pid());
220 225
221 226 ZFS_ENTER(zfsvfs);
222 227 ZFS_VERIFY_ZP(zp);
223 228
224 229 /* Decrement the synchronous opens in the znode */
225 230 if ((flag & (FSYNC | FDSYNC)) && (count == 1))
226 231 atomic_dec_32(&zp->z_sync_cnt);
227 232
228 233 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
229 234 ZTOV(zp)->v_type == VREG &&
230 235 !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
231 236 VERIFY(fs_vscan(vp, cr, 1) == 0);
232 237
233 238 ZFS_EXIT(zfsvfs);
234 239 return (0);
235 240 }
236 241
237 242 /*
238 243 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
239 244 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
240 245 */
241 246 static int
242 247 zfs_holey(vnode_t *vp, int cmd, offset_t *off)
243 248 {
244 249 znode_t *zp = VTOZ(vp);
245 250 uint64_t noff = (uint64_t)*off; /* new offset */
246 251 uint64_t file_sz;
247 252 int error;
248 253 boolean_t hole;
249 254
250 255 file_sz = zp->z_size;
251 256 if (noff >= file_sz) {
252 257 return (SET_ERROR(ENXIO));
253 258 }
254 259
255 260 if (cmd == _FIO_SEEK_HOLE)
256 261 hole = B_TRUE;
257 262 else
258 263 hole = B_FALSE;
259 264
260 265 error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
261 266
262 267 /* end of file? */
263 268 if ((error == ESRCH) || (noff > file_sz)) {
264 269 /*
265 270 * Handle the virtual hole at the end of file.
266 271 */
267 272 if (hole) {
268 273 *off = file_sz;
269 274 return (0);
270 275 }
271 276 return (SET_ERROR(ENXIO));
272 277 }
273 278
274 279 if (noff < *off)
275 280 return (error);
276 281 *off = noff;
277 282 return (error);
278 283 }
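
From userland this path is reached through lseek(2) with SEEK_HOLE/SEEK_DATA,
which illumos implements on top of the _FIO_SEEK_* ioctls handled by
zfs_ioctl() just below. A minimal sketch of a sparse-file walker over that
interface -- the helper name is ours, and all error handling beyond the
terminating ENXIO is elided:

	#include <sys/types.h>
	#include <unistd.h>
	#include <fcntl.h>
	#include <stdio.h>

	/*
	 * Print the data regions of a (possibly sparse) file. zfs_holey()
	 * reports a "virtual hole" at EOF, so SEEK_HOLE always succeeds
	 * for any offset inside the file; SEEK_DATA past the last data
	 * returns -1 with errno ENXIO, which terminates the loop.
	 */
	static void
	print_data_regions(int fd)
	{
		off_t data = 0, hole;

		while ((data = lseek(fd, data, SEEK_DATA)) != -1) {
			hole = lseek(fd, data, SEEK_HOLE);
			(void) printf("data: [%lld, %lld)\n",
			    (long long)data, (long long)hole);
			data = hole;
		}
	}
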
279 284
280 285 /* ARGSUSED */
281 286 static int
282 287 zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
283 288 int *rvalp, caller_context_t *ct)
284 289 {
285 290 offset_t off;
286 291 int error;
287 292 zfsvfs_t *zfsvfs;
288 293 znode_t *zp;
289 294
290 295 switch (com) {
291 296 case _FIOFFS:
292 297 return (zfs_sync(vp->v_vfsp, 0, cred));
293 298
294 299 /*
295 300 * The following two ioctls are used by bfu. Faking them out
296 301 * is necessary to avoid bfu errors.
297 302 */
298 303 case _FIOGDIO:
299 304 case _FIOSDIO:
300 305 return (0);
301 306
302 307 case _FIO_SEEK_DATA:
303 308 case _FIO_SEEK_HOLE:
304 309 if (ddi_copyin((void *)data, &off, sizeof (off), flag))
305 310 return (SET_ERROR(EFAULT));
306 311
307 312 zp = VTOZ(vp);
308 313 zfsvfs = zp->z_zfsvfs;
309 314 ZFS_ENTER(zfsvfs);
310 315 ZFS_VERIFY_ZP(zp);
311 316
312 317 /* offset parameter is in/out */
313 318 error = zfs_holey(vp, com, &off);
314 319 ZFS_EXIT(zfsvfs);
315 320 if (error)
316 321 return (error);
317 322 if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
318 323 return (SET_ERROR(EFAULT));
319 324 return (0);
320 325 }
321 326 return (SET_ERROR(ENOTTY));
322 327 }
323 328
324 329 /*
325 330 * Utility functions to map and unmap a single physical page. These
326 331 * are used to manage the mappable copies of ZFS file data, and therefore
327 332 * do not update ref/mod bits.
328 333 */
329 334 caddr_t
330 335 zfs_map_page(page_t *pp, enum seg_rw rw)
331 336 {
332 337 if (kpm_enable)
333 338 return (hat_kpm_mapin(pp, 0));
334 339 ASSERT(rw == S_READ || rw == S_WRITE);
335 340 return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0),
336 341 (caddr_t)-1));
337 342 }
338 343
339 344 void
340 345 zfs_unmap_page(page_t *pp, caddr_t addr)
341 346 {
342 347 if (kpm_enable) {
343 348 hat_kpm_mapout(pp, 0, addr);
344 349 } else {
345 350 ppmapout(addr);
346 351 }
347 352 }
348 353
349 354 /*
350 355 * When a file is memory mapped, we must keep the IO data synchronized
351 356 * between the DMU cache and the memory mapped pages. What this means:
352 357 *
353 358 * On Write: If we find a memory mapped page, we write to *both*
354 359 * the page and the dmu buffer.
355 360 */
356 361 static void
357 362 update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid)
358 363 {
359 364 int64_t off;
360 365
361 366 off = start & PAGEOFFSET;
362 367 for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
363 368 page_t *pp;
364 369 uint64_t nbytes = MIN(PAGESIZE - off, len);
365 370
366 371 if (pp = page_lookup(vp, start, SE_SHARED)) {
367 372 caddr_t va;
368 373
369 374 va = zfs_map_page(pp, S_WRITE);
370 375 (void) dmu_read(os, oid, start+off, nbytes, va+off,
371 376 DMU_READ_PREFETCH);
372 377 zfs_unmap_page(pp, va);
373 378 page_unlock(pp);
374 379 }
375 380 len -= nbytes;
376 381 off = 0;
377 382 }
378 383 }
379 384
380 385 /*
381 386 * When a file is memory mapped, we must keep the IO data synchronized
382 387 * between the DMU cache and the memory mapped pages. What this means:
383 388 *
384 389 * On Read: We "read" preferentially from memory mapped pages,
385 390 * else we default to the dmu buffer.
386 391 *
387 392 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
388 393 * the file is memory mapped.
389 394 */
390 395 static int
391 396 mappedread(vnode_t *vp, int nbytes, uio_t *uio)
392 397 {
393 398 znode_t *zp = VTOZ(vp);
394 399 objset_t *os = zp->z_zfsvfs->z_os;
395 400 int64_t start, off;
396 401 int len = nbytes;
397 402 int error = 0;
398 403
399 404 start = uio->uio_loffset;
400 405 off = start & PAGEOFFSET;
401 406 for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
402 407 page_t *pp;
403 408 uint64_t bytes = MIN(PAGESIZE - off, len);
404 409
405 410 if (pp = page_lookup(vp, start, SE_SHARED)) {
406 411 caddr_t va;
407 412
408 413 va = zfs_map_page(pp, S_READ);
409 414 error = uiomove(va + off, bytes, UIO_READ, uio);
410 415 zfs_unmap_page(pp, va);
411 416 page_unlock(pp);
412 417 } else {
413 418 error = dmu_read_uio(os, zp->z_id, uio, bytes);
414 419 }
415 420 len -= bytes;
416 421 off = 0;
417 422 if (error)
418 423 break;
419 424 }
420 425 return (error);
421 426 }
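
Taken together, update_pages() and mappedread() keep write(2) traffic and
mmap(2) mappings of the same file coherent. A small illustrative userland
check of that behavior -- a sketch assuming a ZFS-backed file, with asserts
standing in for the return-value checks a real program would do:

	#include <sys/mman.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <string.h>
	#include <assert.h>

	/*
	 * A write(2) that lands on a page we already have mapped is
	 * immediately visible through the mapping, because update_pages()
	 * copies the new data into the cached page as well as the dmu
	 * buffer.
	 */
	static void
	coherency_demo(const char *path)
	{
		int fd = open(path, O_RDWR);
		char *p;

		assert(fd != -1);
		assert(ftruncate(fd, 4096) == 0);
		p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
		assert(p != MAP_FAILED);
		assert(pwrite(fd, "hello", 5, 0) == 5);
		assert(memcmp(p, "hello", 5) == 0);	/* coherent */
		(void) munmap(p, 4096);
		(void) close(fd);
	}
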
422 427
423 428 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
424 429
425 430 /*
426 431 * Read bytes from specified file into supplied buffer.
427 432 *
428 433 * IN: vp - vnode of file to be read from.
429 434 * uio - structure supplying read location, range info,
430 435 * and return buffer.
431 436 * ioflag - SYNC flags; used to provide FRSYNC semantics.
432 437 * cr - credentials of caller.
433 438 * ct - caller context
434 439 *
435 440 * OUT: uio - updated offset and range, buffer filled.
436 441 *
437 442 * RETURN: 0 on success, error code on failure.
438 443 *
439 444 * Side Effects:
440 445 * vp - atime updated if byte count > 0
441 446 */
442 447 /* ARGSUSED */
443 448 static int
444 449 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
445 450 {
446 451 znode_t *zp = VTOZ(vp);
447 452 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
448 453 objset_t *os;
449 454 ssize_t n, nbytes;
450 455 int error = 0;
451 456 rl_t *rl;
452 457 xuio_t *xuio = NULL;
453 458
454 459 ZFS_ENTER(zfsvfs);
455 460 ZFS_VERIFY_ZP(zp);
456 461 os = zfsvfs->z_os;
457 462
458 463 if (zp->z_pflags & ZFS_AV_QUARANTINED) {
459 464 ZFS_EXIT(zfsvfs);
460 465 return (SET_ERROR(EACCES));
461 466 }
462 467
463 468 /*
464 469 * Validate file offset
465 470 */
466 471 if (uio->uio_loffset < (offset_t)0) {
467 472 ZFS_EXIT(zfsvfs);
468 473 return (SET_ERROR(EINVAL));
469 474 }
470 475
471 476 /*
472 477 * Fasttrack empty reads
473 478 */
474 479 if (uio->uio_resid == 0) {
475 480 ZFS_EXIT(zfsvfs);
476 481 return (0);
477 482 }
478 483
479 484 /*
480 485 * Check for mandatory locks
481 486 */
482 487 if (MANDMODE(zp->z_mode)) {
483 488 if (error = chklock(vp, FREAD,
484 489 uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
485 490 ZFS_EXIT(zfsvfs);
486 491 return (error);
487 492 }
488 493 }
489 494
490 495 /*
491 496 * If we're in FRSYNC mode, sync out this znode before reading it.
492 497 */
493 498 if (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
494 499 zil_commit(zfsvfs->z_log, zp->z_id);
495 500
496 501 /*
497 502 * Lock the range against changes.
498 503 */
499 504 rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
500 505
501 506 /*
502 507 * If we are reading past end-of-file we can skip
503 508 * to the end; but we might still need to set atime.
504 509 */
505 510 if (uio->uio_loffset >= zp->z_size) {
506 511 error = 0;
507 512 goto out;
508 513 }
509 514
510 515 ASSERT(uio->uio_loffset < zp->z_size);
511 516 n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
512 517
513 518 if ((uio->uio_extflg == UIO_XUIO) &&
514 519 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
515 520 int nblk;
516 521 int blksz = zp->z_blksz;
517 522 uint64_t offset = uio->uio_loffset;
518 523
519 524 xuio = (xuio_t *)uio;
520 525 if ((ISP2(blksz))) {
521 526 nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
522 527 blksz)) / blksz;
523 528 } else {
524 529 ASSERT(offset + n <= blksz);
525 530 nblk = 1;
526 531 }
527 532 (void) dmu_xuio_init(xuio, nblk);
528 533
529 534 if (vn_has_cached_data(vp)) {
530 535 /*
531 536 * For simplicity, we always allocate a full buffer
532 537 * even if we only expect to read a portion of a block.
533 538 */
534 539 while (--nblk >= 0) {
535 540 (void) dmu_xuio_add(xuio,
536 541 dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
537 542 blksz), 0, blksz);
538 543 }
539 544 }
540 545 }
541 546
542 547 while (n > 0) {
543 548 nbytes = MIN(n, zfs_read_chunk_size -
544 549 P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
545 550
546 551 if (vn_has_cached_data(vp))
547 552 error = mappedread(vp, nbytes, uio);
548 553 else
549 554 error = dmu_read_uio(os, zp->z_id, uio, nbytes);
550 555 if (error) {
551 556 /* convert checksum errors into IO errors */
552 557 if (error == ECKSUM)
553 558 error = SET_ERROR(EIO);
554 559 break;
555 560 }
556 561
557 562 n -= nbytes;
558 563 }
559 564 out:
560 565 zfs_range_unlock(rl);
561 566
562 567 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
563 568 ZFS_EXIT(zfsvfs);
564 569 return (error);
565 570 }
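
The FRSYNC branch in zfs_read() above is what gives O_RSYNC its meaning on
ZFS: the file's pending intent-log records are committed before any data is
copied out. A sketch of requesting that behavior from userland, assuming the
usual illumos mapping of O_RSYNC onto the kernel's FRSYNC flag (the helper
name is ours):

	#include <fcntl.h>

	/*
	 * Read-sync semantics: with O_RSYNC|O_SYNC, zfs_read() calls
	 * zil_commit() on the file before copying data out, so what the
	 * read returns is already on stable storage.
	 */
	static int
	open_rsync(const char *path)
	{
		return (open(path, O_RDONLY | O_RSYNC | O_SYNC));
	}
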
566 571
567 572 /*
568 573 * Write the bytes to a file.
569 574 *
570 575 * IN: vp - vnode of file to be written to.
571 576 * uio - structure supplying write location, range info,
572 577 * and data buffer.
573 578 * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is
574 579 * set if in append mode.
575 580 * cr - credentials of caller.
576 581 * ct - caller context (NFS/CIFS fem monitor only)
577 582 *
578 583 * OUT: uio - updated offset and range.
579 584 *
580 585 * RETURN: 0 on success, error code on failure.
581 586 *
582 587 * Timestamps:
583 588 * vp - ctime|mtime updated if byte count > 0
584 589 */
585 590
586 591 /* ARGSUSED */
587 592 static int
588 593 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
589 594 {
590 595 znode_t *zp = VTOZ(vp);
591 596 rlim64_t limit = uio->uio_llimit;
592 597 ssize_t start_resid = uio->uio_resid;
593 598 ssize_t tx_bytes;
594 599 uint64_t end_size;
595 600 dmu_tx_t *tx;
596 601 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
597 602 zilog_t *zilog;
598 603 offset_t woff;
599 604 ssize_t n, nbytes;
600 605 rl_t *rl;
601 606 int max_blksz = zfsvfs->z_max_blksz;
602 607 int error = 0;
603 608 arc_buf_t *abuf;
604 609 iovec_t *aiov = NULL;
605 610 xuio_t *xuio = NULL;
606 611 int i_iov = 0;
607 612 int iovcnt = uio->uio_iovcnt;
608 613 iovec_t *iovp = uio->uio_iov;
609 614 int write_eof;
610 615 int count = 0;
611 616 sa_bulk_attr_t bulk[4];
612 617 uint64_t mtime[2], ctime[2];
613 618
614 619 /*
615 620 * Fasttrack empty write
616 621 */
617 622 n = start_resid;
618 623 if (n == 0)
619 624 return (0);
620 625
621 626 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
622 627 limit = MAXOFFSET_T;
623 628
624 629 ZFS_ENTER(zfsvfs);
625 630 ZFS_VERIFY_ZP(zp);
626 631
627 632 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
628 633 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
629 634 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
630 635 &zp->z_size, 8);
631 636 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
632 637 &zp->z_pflags, 8);
633 638
634 639 /*
635 640 * If immutable or not appending then return EPERM
636 641 */
637 642 if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
638 643 ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
639 644 (uio->uio_loffset < zp->z_size))) {
640 645 ZFS_EXIT(zfsvfs);
641 646 return (SET_ERROR(EPERM));
642 647 }
643 648
644 649 zilog = zfsvfs->z_log;
645 650
646 651 /*
647 652 * Validate file offset
648 653 */
649 654 woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
650 655 if (woff < 0) {
651 656 ZFS_EXIT(zfsvfs);
652 657 return (SET_ERROR(EINVAL));
653 658 }
654 659
655 660 /*
656 661 * Check for mandatory locks before calling zfs_range_lock()
657 662 * in order to prevent a deadlock with locks set via fcntl().
658 663 */
659 664 if (MANDMODE((mode_t)zp->z_mode) &&
660 665 (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
661 666 ZFS_EXIT(zfsvfs);
662 667 return (error);
663 668 }
664 669
665 670 /*
666 671 * Pre-fault the pages to ensure slow (e.g. NFS) pages
667 672 * don't hold up txg.
668 673 * Skip this if uio contains loaned arc_buf.
669 674 */
670 675 if ((uio->uio_extflg == UIO_XUIO) &&
671 676 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
672 677 xuio = (xuio_t *)uio;
673 678 else
674 679 uio_prefaultpages(MIN(n, max_blksz), uio);
675 680
676 681 /*
677 682 * If in append mode, set the io offset pointer to eof.
678 683 */
679 684 if (ioflag & FAPPEND) {
680 685 /*
681 686 * Obtain an appending range lock to guarantee file append
682 687 * semantics. We reset the write offset once we have the lock.
683 688 */
684 689 rl = zfs_range_lock(zp, 0, n, RL_APPEND);
685 690 woff = rl->r_off;
686 691 if (rl->r_len == UINT64_MAX) {
687 692 /*
688 693 * We overlocked the file because this write will cause
689 694 * the file block size to increase.
690 695 * Note that zp_size cannot change with this lock held.
691 696 */
692 697 woff = zp->z_size;
693 698 }
694 699 uio->uio_loffset = woff;
695 700 } else {
696 701 /*
697 702 * Note that if the file block size will change as a result of
698 703 * this write, then this range lock will lock the entire file
699 704 * so that we can re-write the block safely.
700 705 */
701 706 rl = zfs_range_lock(zp, woff, n, RL_WRITER);
702 707 }
703 708
704 709 if (woff >= limit) {
705 710 zfs_range_unlock(rl);
706 711 ZFS_EXIT(zfsvfs);
707 712 return (SET_ERROR(EFBIG));
708 713 }
709 714
710 715 if ((woff + n) > limit || woff > (limit - n))
711 716 n = limit - woff;
712 717
713 718 /* Will this write extend the file length? */
714 719 write_eof = (woff + n > zp->z_size);
715 720
716 721 end_size = MAX(zp->z_size, woff + n);
717 722
718 723 /*
719 724 * Write the file in reasonable size chunks. Each chunk is written
720 725 * in a separate transaction; this keeps the intent log records small
721 726 * and allows us to do more fine-grained space accounting.
722 727 */
723 728 while (n > 0) {
724 729 abuf = NULL;
725 730 woff = uio->uio_loffset;
726 731 again:
727 732 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
728 733 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
729 734 if (abuf != NULL)
730 735 dmu_return_arcbuf(abuf);
731 736 error = SET_ERROR(EDQUOT);
732 737 break;
733 738 }
734 739
735 740 if (xuio && abuf == NULL) {
736 741 ASSERT(i_iov < iovcnt);
737 742 aiov = &iovp[i_iov];
738 743 abuf = dmu_xuio_arcbuf(xuio, i_iov);
739 744 dmu_xuio_clear(xuio, i_iov);
740 745 DTRACE_PROBE3(zfs_cp_write, int, i_iov,
741 746 iovec_t *, aiov, arc_buf_t *, abuf);
742 747 ASSERT((aiov->iov_base == abuf->b_data) ||
743 748 ((char *)aiov->iov_base - (char *)abuf->b_data +
744 749 aiov->iov_len == arc_buf_size(abuf)));
745 750 i_iov++;
746 751 } else if (abuf == NULL && n >= max_blksz &&
747 752 woff >= zp->z_size &&
748 753 P2PHASE(woff, max_blksz) == 0 &&
749 754 zp->z_blksz == max_blksz) {
750 755 /*
751 756 * This write covers a full block. "Borrow" a buffer
752 757 * from the dmu so that we can fill it before we enter
753 758 * a transaction. This avoids the possibility of
754 759 * holding up the transaction if the data copy hangs
755 760 * up on a pagefault (e.g., from an NFS server mapping).
756 761 */
757 762 size_t cbytes;
758 763
759 764 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
760 765 max_blksz);
761 766 ASSERT(abuf != NULL);
762 767 ASSERT(arc_buf_size(abuf) == max_blksz);
763 768 if (error = uiocopy(abuf->b_data, max_blksz,
764 769 UIO_WRITE, uio, &cbytes)) {
765 770 dmu_return_arcbuf(abuf);
766 771 break;
767 772 }
768 773 ASSERT(cbytes == max_blksz);
769 774 }
770 775
771 776 /*
772 777 * Start a transaction.
773 778 */
774 779 tx = dmu_tx_create(zfsvfs->z_os);
775 780 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
776 781 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
777 782 zfs_sa_upgrade_txholds(tx, zp);
778 783 error = dmu_tx_assign(tx, TXG_NOWAIT);
779 784 if (error) {
780 785 if (error == ERESTART) {
781 786 dmu_tx_wait(tx);
782 787 dmu_tx_abort(tx);
783 788 goto again;
784 789 }
785 790 dmu_tx_abort(tx);
786 791 if (abuf != NULL)
787 792 dmu_return_arcbuf(abuf);
788 793 break;
789 794 }
790 795
791 796 /*
792 797 * If zfs_range_lock() over-locked we grow the blocksize
793 798 * and then reduce the lock range. This will only happen
794 799 * on the first iteration since zfs_range_reduce() will
795 800 * shrink down r_len to the appropriate size.
796 801 */
797 802 if (rl->r_len == UINT64_MAX) {
798 803 uint64_t new_blksz;
799 804
800 805 if (zp->z_blksz > max_blksz) {
801 806 ASSERT(!ISP2(zp->z_blksz));
802 807 new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
803 808 } else {
804 809 new_blksz = MIN(end_size, max_blksz);
805 810 }
806 811 zfs_grow_blocksize(zp, new_blksz, tx);
807 812 zfs_range_reduce(rl, woff, n);
808 813 }
809 814
810 815 /*
811 816 * XXX - should we really limit each write to z_max_blksz?
812 817 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
813 818 */
814 819 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
815 820
816 821 if (abuf == NULL) {
817 822 tx_bytes = uio->uio_resid;
818 823 error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
819 824 uio, nbytes, tx);
820 825 tx_bytes -= uio->uio_resid;
821 826 } else {
822 827 tx_bytes = nbytes;
823 828 ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
824 829 /*
825 830 * If this is not a full block write, but we are
826 831 * extending the file past EOF and this data starts
827 832 * block-aligned, use assign_arcbuf(). Otherwise,
828 833 * write via dmu_write().
829 834 */
830 835 if (tx_bytes < max_blksz && (!write_eof ||
831 836 aiov->iov_base != abuf->b_data)) {
832 837 ASSERT(xuio);
833 838 dmu_write(zfsvfs->z_os, zp->z_id, woff,
834 839 aiov->iov_len, aiov->iov_base, tx);
835 840 dmu_return_arcbuf(abuf);
836 841 xuio_stat_wbuf_copied();
837 842 } else {
838 843 ASSERT(xuio || tx_bytes == max_blksz);
839 844 dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
840 845 woff, abuf, tx);
841 846 }
842 847 ASSERT(tx_bytes <= uio->uio_resid);
843 848 uioskip(uio, tx_bytes);
844 849 }
845 850 if (tx_bytes && vn_has_cached_data(vp)) {
846 851 update_pages(vp, woff,
847 852 tx_bytes, zfsvfs->z_os, zp->z_id);
848 853 }
849 854
850 855 /*
851 856 * If we made no progress, we're done. If we made even
852 857 * partial progress, update the znode and ZIL accordingly.
853 858 */
854 859 if (tx_bytes == 0) {
855 860 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
856 861 (void *)&zp->z_size, sizeof (uint64_t), tx);
857 862 dmu_tx_commit(tx);
858 863 ASSERT(error != 0);
859 864 break;
860 865 }
861 866
862 867 /*
863 868 * Clear Set-UID/Set-GID bits on successful write if not
864 869 * privileged and at least one of the execute bits is set.
865 870 *
866 871 * It would be nice to do this after all writes have
867 872 * been done, but that would still expose the ISUID/ISGID
868 873 * to another app after the partial write is committed.
869 874 *
870 875 * Note: we don't call zfs_fuid_map_id() here because
871 876 * user 0 is not an ephemeral uid.
872 877 */
873 878 mutex_enter(&zp->z_acl_lock);
874 879 if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
875 880 (S_IXUSR >> 6))) != 0 &&
876 881 (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
877 882 secpolicy_vnode_setid_retain(cr,
878 883 (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
879 884 uint64_t newmode;
880 885 zp->z_mode &= ~(S_ISUID | S_ISGID);
881 886 newmode = zp->z_mode;
882 887 (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
883 888 (void *)&newmode, sizeof (uint64_t), tx);
884 889 }
885 890 mutex_exit(&zp->z_acl_lock);
886 891
887 892 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
888 893 B_TRUE);
889 894
890 895 /*
891 896 * Update the file size (zp_size) if it has changed;
892 897 * account for possible concurrent updates.
893 898 */
894 899 while ((end_size = zp->z_size) < uio->uio_loffset) {
895 900 (void) atomic_cas_64(&zp->z_size, end_size,
896 901 uio->uio_loffset);
897 902 ASSERT(error == 0);
898 903 }
899 904 /*
900 905 * If we are replaying and eof is non-zero then force
901 906 * the file size to the specified eof. Note, there's no
902 907 * concurrency during replay.
903 908 */
904 909 if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
905 910 zp->z_size = zfsvfs->z_replay_eof;
906 911
907 912 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
908 913
909 914 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
910 915 dmu_tx_commit(tx);
911 916
912 917 if (error != 0)
913 918 break;
914 919 ASSERT(tx_bytes == nbytes);
915 920 n -= nbytes;
916 921
917 922 if (!xuio && n > 0)
918 923 uio_prefaultpages(MIN(n, max_blksz), uio);
919 924 }
920 925
921 926 zfs_range_unlock(rl);
922 927
923 928 /*
924 929 * If we're in replay mode, or we made no progress, return error.
925 930 * Otherwise, it's at least a partial write, so it's successful.
926 931 */
927 932 if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
928 933 ZFS_EXIT(zfsvfs);
929 934 return (error);
930 935 }
931 936
932 937 if (ioflag & (FSYNC | FDSYNC) ||
933 938 zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
934 939 zil_commit(zilog, zp->z_id);
935 940
936 941 ZFS_EXIT(zfsvfs);
937 942 return (0);
938 943 }
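
Note the consequence for callers: when zfs_write() makes partial progress,
the syscall reports a short byte count as success, and the error (EDQUOT,
for example) only surfaces on the next call. The usual userland discipline
of looping on short writes therefore applies; a minimal sketch, with the
helper name ours:

	#include <sys/types.h>
	#include <unistd.h>
	#include <errno.h>

	/*
	 * Keep issuing write(2) until the whole buffer is consumed or a
	 * hard error occurs. A short count is legitimate, e.g. when
	 * zfs_write() hits EDQUOT after committing some chunks.
	 */
	static ssize_t
	full_write(int fd, const char *buf, size_t len)
	{
		size_t done = 0;

		while (done < len) {
			ssize_t n = write(fd, buf + done, len - done);

			if (n == -1) {
				if (errno == EINTR)
					continue;
				return (-1);
			}
			done += n;
		}
		return ((ssize_t)done);
	}
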
939 944
940 945 void
941 946 zfs_get_done(zgd_t *zgd, int error)
942 947 {
943 948 znode_t *zp = zgd->zgd_private;
944 949 objset_t *os = zp->z_zfsvfs->z_os;
945 950
946 951 if (zgd->zgd_db)
947 952 dmu_buf_rele(zgd->zgd_db, zgd);
948 953
949 954 zfs_range_unlock(zgd->zgd_rl);
950 955
951 956 /*
952 957 * Release the vnode asynchronously as we currently have the
953 958 * txg stopped from syncing.
954 959 */
955 960 VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
956 961
957 962 if (error == 0 && zgd->zgd_bp)
958 963 zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
959 964
960 965 kmem_free(zgd, sizeof (zgd_t));
961 966 }
962 967
963 968 #ifdef DEBUG
964 969 static int zil_fault_io = 0;
965 970 #endif
966 971
967 972 /*
968 973 * Get data to generate a TX_WRITE intent log record.
969 974 */
970 975 int
971 976 zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
972 977 {
973 978 zfsvfs_t *zfsvfs = arg;
974 979 objset_t *os = zfsvfs->z_os;
975 980 znode_t *zp;
976 981 uint64_t object = lr->lr_foid;
977 982 uint64_t offset = lr->lr_offset;
978 983 uint64_t size = lr->lr_length;
979 984 blkptr_t *bp = &lr->lr_blkptr;
980 985 dmu_buf_t *db;
981 986 zgd_t *zgd;
982 987 int error = 0;
983 988
984 989 ASSERT(zio != NULL);
985 990 ASSERT(size != 0);
986 991
987 992 /*
988 993 * Nothing to do if the file has been removed
989 994 */
990 995 if (zfs_zget(zfsvfs, object, &zp) != 0)
991 996 return (SET_ERROR(ENOENT));
992 997 if (zp->z_unlinked) {
993 998 /*
994 999 * Release the vnode asynchronously as we currently have the
995 1000 * txg stopped from syncing.
996 1001 */
997 1002 VN_RELE_ASYNC(ZTOV(zp),
998 1003 dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
999 1004 return (SET_ERROR(ENOENT));
1000 1005 }
1001 1006
1002 1007 zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1003 1008 zgd->zgd_zilog = zfsvfs->z_log;
1004 1009 zgd->zgd_private = zp;
1005 1010
1006 1011 /*
1007 1012 * Write records come in two flavors: immediate and indirect.
1008 1013 * For small writes it's cheaper to store the data with the
1009 1014 * log record (immediate); for large writes it's cheaper to
1010 1015 * sync the data and get a pointer to it (indirect) so that
1011 1016 * we don't have to write the data twice.
1012 1017 */
1013 1018 if (buf != NULL) { /* immediate write */
1014 1019 zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
1015 1020 /* test for truncation needs to be done while range locked */
1016 1021 if (offset >= zp->z_size) {
1017 1022 error = SET_ERROR(ENOENT);
1018 1023 } else {
1019 1024 error = dmu_read(os, object, offset, size, buf,
1020 1025 DMU_READ_NO_PREFETCH);
1021 1026 }
1022 1027 ASSERT(error == 0 || error == ENOENT);
1023 1028 } else { /* indirect write */
1024 1029 /*
1025 1030 * Have to lock the whole block to ensure when it's
1026 1031 * written out and its checksum is being calculated
1027 1032 * that no one can change the data. We need to re-check
1028 1033 * blocksize after we get the lock in case it's changed!
1029 1034 */
1030 1035 for (;;) {
1031 1036 uint64_t blkoff;
1032 1037 size = zp->z_blksz;
1033 1038 blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1034 1039 offset -= blkoff;
1035 1040 zgd->zgd_rl = zfs_range_lock(zp, offset, size,
1036 1041 RL_READER);
1037 1042 if (zp->z_blksz == size)
1038 1043 break;
1039 1044 offset += blkoff;
1040 1045 zfs_range_unlock(zgd->zgd_rl);
1041 1046 }
1042 1047 /* test for truncation needs to be done while range locked */
1043 1048 if (lr->lr_offset >= zp->z_size)
1044 1049 error = SET_ERROR(ENOENT);
1045 1050 #ifdef DEBUG
1046 1051 if (zil_fault_io) {
1047 1052 error = SET_ERROR(EIO);
1048 1053 zil_fault_io = 0;
1049 1054 }
1050 1055 #endif
1051 1056 if (error == 0)
1052 1057 error = dmu_buf_hold(os, object, offset, zgd, &db,
1053 1058 DMU_READ_NO_PREFETCH);
1054 1059
1055 1060 if (error == 0) {
1056 1061 blkptr_t *obp = dmu_buf_get_blkptr(db);
1057 1062 if (obp) {
1058 1063 ASSERT(BP_IS_HOLE(bp));
1059 1064 *bp = *obp;
1060 1065 }
1061 1066
1062 1067 zgd->zgd_db = db;
1063 1068 zgd->zgd_bp = bp;
1064 1069
1065 1070 ASSERT(db->db_offset == offset);
1066 1071 ASSERT(db->db_size == size);
1067 1072
1068 1073 error = dmu_sync(zio, lr->lr_common.lrc_txg,
1069 1074 zfs_get_done, zgd);
1070 1075 ASSERT(error || lr->lr_length <= zp->z_blksz);
1071 1076
1072 1077 /*
1073 1078 * On success, we need to wait for the write I/O
1074 1079 * initiated by dmu_sync() to complete before we can
1075 1080 * release this dbuf. We will finish everything up
1076 1081 * in the zfs_get_done() callback.
1077 1082 */
1078 1083 if (error == 0)
1079 1084 return (0);
1080 1085
1081 1086 if (error == EALREADY) {
1082 1087 lr->lr_common.lrc_txtype = TX_WRITE2;
1083 1088 error = 0;
1084 1089 }
1085 1090 }
1086 1091 }
1087 1092
1088 1093 zfs_get_done(zgd, error);
1089 1094
1090 1095 return (error);
1091 1096 }
1092 1097
1093 1098 /*ARGSUSED*/
1094 1099 static int
1095 1100 zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1096 1101 caller_context_t *ct)
1097 1102 {
1098 1103 znode_t *zp = VTOZ(vp);
1099 1104 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1100 1105 int error;
1101 1106
1102 1107 ZFS_ENTER(zfsvfs);
1103 1108 ZFS_VERIFY_ZP(zp);
1104 1109
1105 1110 if (flag & V_ACE_MASK)
1106 1111 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1107 1112 else
1108 1113 error = zfs_zaccess_rwx(zp, mode, flag, cr);
1109 1114
1110 1115 ZFS_EXIT(zfsvfs);
1111 1116 return (error);
1112 1117 }
1113 1118
1114 1119 /*
1115 1120 * If vnode is for a device return a specfs vnode instead.
1116 1121 */
1117 1122 static int
1118 1123 specvp_check(vnode_t **vpp, cred_t *cr)
1119 1124 {
1120 1125 int error = 0;
1121 1126
1122 1127 if (IS_DEVVP(*vpp)) {
1123 1128 struct vnode *svp;
1124 1129
1125 1130 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1126 1131 VN_RELE(*vpp);
1127 1132 if (svp == NULL)
1128 1133 error = SET_ERROR(ENOSYS);
1129 1134 *vpp = svp;
1130 1135 }
1131 1136 return (error);
1132 1137 }
1133 1138
1134 1139
1135 1140 /*
1136 1141 * Lookup an entry in a directory, or an extended attribute directory.
1137 1142 * If it exists, return a held vnode reference for it.
1138 1143 *
1139 1144 * IN: dvp - vnode of directory to search.
1140 1145 * nm - name of entry to lookup.
1141 1146 * pnp - full pathname to lookup [UNUSED].
1142 1147 * flags - LOOKUP_XATTR set if looking for an attribute.
1143 1148 * rdir - root directory vnode [UNUSED].
1144 1149 * cr - credentials of caller.
1145 1150 * ct - caller context
1146 1151 * direntflags - directory lookup flags
1147 1152 * realpnp - returned pathname.
1148 1153 *
1149 1154 * OUT: vpp - vnode of located entry, NULL if not found.
1150 1155 *
1151 1156 * RETURN: 0 on success, error code on failure.
1152 1157 *
1153 1158 * Timestamps:
1154 1159 * NA
1155 1160 */
1156 1161 /* ARGSUSED */
1157 1162 static int
1158 1163 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1159 1164 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
1160 1165 int *direntflags, pathname_t *realpnp)
1161 1166 {
1162 1167 znode_t *zdp = VTOZ(dvp);
1163 1168 zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1164 1169 int error = 0;
1165 1170
1166 1171 /* fast path */
1167 1172 if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
1168 1173
1169 1174 if (dvp->v_type != VDIR) {
1170 1175 return (SET_ERROR(ENOTDIR));
1171 1176 } else if (zdp->z_sa_hdl == NULL) {
1172 1177 return (SET_ERROR(EIO));
1173 1178 }
1174 1179
1175 1180 if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
1176 1181 error = zfs_fastaccesschk_execute(zdp, cr);
1177 1182 if (!error) {
1178 1183 *vpp = dvp;
1179 1184 VN_HOLD(*vpp);
1180 1185 return (0);
1181 1186 }
1182 1187 return (error);
1183 1188 } else {
1184 1189 vnode_t *tvp = dnlc_lookup(dvp, nm);
1185 1190
1186 1191 if (tvp) {
1187 1192 error = zfs_fastaccesschk_execute(zdp, cr);
1188 1193 if (error) {
1189 1194 VN_RELE(tvp);
1190 1195 return (error);
1191 1196 }
1192 1197 if (tvp == DNLC_NO_VNODE) {
1193 1198 VN_RELE(tvp);
1194 1199 return (SET_ERROR(ENOENT));
1195 1200 } else {
1196 1201 *vpp = tvp;
1197 1202 return (specvp_check(vpp, cr));
1198 1203 }
1199 1204 }
1200 1205 }
1201 1206 }
1202 1207
1203 1208 DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
1204 1209
1205 1210 ZFS_ENTER(zfsvfs);
1206 1211 ZFS_VERIFY_ZP(zdp);
1207 1212
1208 1213 *vpp = NULL;
1209 1214
1210 1215 if (flags & LOOKUP_XATTR) {
1211 1216 /*
1212 1217 * If the xattr property is off, refuse the lookup request.
1213 1218 */
1214 1219 if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1215 1220 ZFS_EXIT(zfsvfs);
1216 1221 return (SET_ERROR(EINVAL));
1217 1222 }
1218 1223
1219 1224 /*
1220 1225 * We don't allow recursive attributes.
1221 1226 * Maybe someday we will.
1222 1227 */
1223 1228 if (zdp->z_pflags & ZFS_XATTR) {
1224 1229 ZFS_EXIT(zfsvfs);
1225 1230 return (SET_ERROR(EINVAL));
1226 1231 }
1227 1232
1228 1233 if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1229 1234 ZFS_EXIT(zfsvfs);
1230 1235 return (error);
1231 1236 }
1232 1237
1233 1238 /*
1234 1239 * Do we have permission to get into attribute directory?
1235 1240 */
1236 1241
1237 1242 if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1238 1243 B_FALSE, cr)) {
1239 1244 VN_RELE(*vpp);
1240 1245 *vpp = NULL;
1241 1246 }
1242 1247
1243 1248 ZFS_EXIT(zfsvfs);
1244 1249 return (error);
1245 1250 }
1246 1251
1247 1252 if (dvp->v_type != VDIR) {
1248 1253 ZFS_EXIT(zfsvfs);
1249 1254 return (SET_ERROR(ENOTDIR));
1250 1255 }
1251 1256
1252 1257 /*
1253 1258 * Check accessibility of directory.
1254 1259 */
1255 1260
1256 1261 if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
1257 1262 ZFS_EXIT(zfsvfs);
1258 1263 return (error);
1259 1264 }
1260 1265
1261 1266 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1262 1267 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1263 1268 ZFS_EXIT(zfsvfs);
1264 1269 return (SET_ERROR(EILSEQ));
1265 1270 }
1266 1271
1267 1272 error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
1268 1273 if (error == 0)
1269 1274 error = specvp_check(vpp, cr);
1270 1275
1271 1276 ZFS_EXIT(zfsvfs);
1272 1277 return (error);
1273 1278 }
1274 1279
1275 1280 /*
1276 1281 * Attempt to create a new entry in a directory. If the entry
1277 1282 * already exists, truncate the file if permissible, else return
1278 1283 * an error. Return the vp of the created or trunc'd file.
1279 1284 *
1280 1285 * IN: dvp - vnode of directory to put new file entry in.
1281 1286 * name - name of new file entry.
1282 1287 * vap - attributes of new file.
1283 1288 * excl - flag indicating exclusive or non-exclusive mode.
1284 1289 * mode - mode to open file with.
1285 1290 * cr - credentials of caller.
1286 1291 * flag - large file flag [UNUSED].
1287 1292 * ct - caller context
1288 1293 * vsecp - ACL to be set
1289 1294 *
1290 1295 * OUT: vpp - vnode of created or trunc'd entry.
1291 1296 *
1292 1297 * RETURN: 0 on success, error code on failure.
1293 1298 *
1294 1299 * Timestamps:
1295 1300 * dvp - ctime|mtime updated if new entry created
1296 1301 * vp - ctime|mtime always, atime if new
1297 1302 */
1298 1303
1299 1304 /* ARGSUSED */
1300 1305 static int
1301 1306 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl,
1302 1307 int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
1303 1308 vsecattr_t *vsecp)
1304 1309 {
1305 1310 znode_t *zp, *dzp = VTOZ(dvp);
1306 1311 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1307 1312 zilog_t *zilog;
1308 1313 objset_t *os;
1309 1314 zfs_dirlock_t *dl;
1310 1315 dmu_tx_t *tx;
1311 1316 int error;
1312 1317 ksid_t *ksid;
1313 1318 uid_t uid;
1314 1319 gid_t gid = crgetgid(cr);
1315 1320 zfs_acl_ids_t acl_ids;
1316 1321 boolean_t fuid_dirtied;
1317 1322 boolean_t have_acl = B_FALSE;
1323 + boolean_t waited = B_FALSE;
1318 1324
1319 1325 /*
1320 1326 * If we have an ephemeral id, ACL, or XVATTR then
1321 1327 * make sure file system is at proper version
1322 1328 */
1323 1329
1324 1330 ksid = crgetsid(cr, KSID_OWNER);
1325 1331 if (ksid)
1326 1332 uid = ksid_getid(ksid);
1327 1333 else
1328 1334 uid = crgetuid(cr);
1329 1335
1330 1336 if (zfsvfs->z_use_fuids == B_FALSE &&
1331 1337 (vsecp || (vap->va_mask & AT_XVATTR) ||
1332 1338 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1333 1339 return (SET_ERROR(EINVAL));
1334 1340
1335 1341 ZFS_ENTER(zfsvfs);
1336 1342 ZFS_VERIFY_ZP(dzp);
1337 1343 os = zfsvfs->z_os;
1338 1344 zilog = zfsvfs->z_log;
1339 1345
1340 1346 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1341 1347 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1342 1348 ZFS_EXIT(zfsvfs);
1343 1349 return (SET_ERROR(EILSEQ));
1344 1350 }
1345 1351
1346 1352 if (vap->va_mask & AT_XVATTR) {
1347 1353 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1348 1354 crgetuid(cr), cr, vap->va_type)) != 0) {
1349 1355 ZFS_EXIT(zfsvfs);
1350 1356 return (error);
1351 1357 }
1352 1358 }
1353 1359 top:
1354 1360 *vpp = NULL;
1355 1361
1356 1362 if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
1357 1363 vap->va_mode &= ~VSVTX;
1358 1364
1359 1365 if (*name == '\0') {
1360 1366 /*
1361 1367 * Null component name refers to the directory itself.
1362 1368 */
1363 1369 VN_HOLD(dvp);
1364 1370 zp = dzp;
1365 1371 dl = NULL;
1366 1372 error = 0;
1367 1373 } else {
1368 1374 /* possible VN_HOLD(zp) */
1369 1375 int zflg = 0;
1370 1376
1371 1377 if (flag & FIGNORECASE)
1372 1378 zflg |= ZCILOOK;
1373 1379
1374 1380 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1375 1381 NULL, NULL);
1376 1382 if (error) {
1377 1383 if (have_acl)
1378 1384 zfs_acl_ids_free(&acl_ids);
1379 1385 if (strcmp(name, "..") == 0)
1380 1386 error = SET_ERROR(EISDIR);
1381 1387 ZFS_EXIT(zfsvfs);
1382 1388 return (error);
1383 1389 }
1384 1390 }
1385 1391
1386 1392 if (zp == NULL) {
1387 1393 uint64_t txtype;
1388 1394
1389 1395 /*
1390 1396 * Create a new file object and update the directory
1391 1397 * to reference it.
1392 1398 */
1393 1399 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1394 1400 if (have_acl)
1395 1401 zfs_acl_ids_free(&acl_ids);
1396 1402 goto out;
1397 1403 }
1398 1404
1399 1405 /*
1400 1406 * We only support the creation of regular files in
1401 1407 * extended attribute directories.
1402 1408 */
1403 1409
1404 1410 if ((dzp->z_pflags & ZFS_XATTR) &&
1405 1411 (vap->va_type != VREG)) {
1406 1412 if (have_acl)
1407 1413 zfs_acl_ids_free(&acl_ids);
1408 1414 error = SET_ERROR(EINVAL);
1409 1415 goto out;
1410 1416 }
1411 1417
1412 1418 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
1413 1419 cr, vsecp, &acl_ids)) != 0)
1414 1420 goto out;
1415 1421 have_acl = B_TRUE;
1416 1422
1417 1423 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1418 1424 zfs_acl_ids_free(&acl_ids);
1419 1425 error = SET_ERROR(EDQUOT);
1420 1426 goto out;
1421 1427 }
1422 1428
1423 1429 tx = dmu_tx_create(os);
1424 1430
1425 1431 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1426 1432 ZFS_SA_BASE_ATTR_SIZE);
1427 1433
1428 1434 fuid_dirtied = zfsvfs->z_fuid_dirty;
1429 1435 if (fuid_dirtied)
1430 1436 zfs_fuid_txhold(zfsvfs, tx);
1431 1437 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1432 1438 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1433 1439 if (!zfsvfs->z_use_sa &&
1434 1440 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1435 1441 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1436 1442 0, acl_ids.z_aclp->z_acl_bytes);
1437 1443 }
1438 - error = dmu_tx_assign(tx, TXG_NOWAIT);
1444 + error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
1439 1445 if (error) {
1440 1446 zfs_dirent_unlock(dl);
1441 1447 if (error == ERESTART) {
1448 + waited = B_TRUE;
1442 1449 dmu_tx_wait(tx);
1443 1450 dmu_tx_abort(tx);
1444 1451 goto top;
1445 1452 }
1446 1453 zfs_acl_ids_free(&acl_ids);
1447 1454 dmu_tx_abort(tx);
1448 1455 ZFS_EXIT(zfsvfs);
1449 1456 return (error);
1450 1457 }
1451 1458 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1452 1459
1453 1460 if (fuid_dirtied)
1454 1461 zfs_fuid_sync(zfsvfs, tx);
1455 1462
1456 1463 (void) zfs_link_create(dl, zp, tx, ZNEW);
1457 1464 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1458 1465 if (flag & FIGNORECASE)
1459 1466 txtype |= TX_CI;
1460 1467 zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1461 1468 vsecp, acl_ids.z_fuidp, vap);
1462 1469 zfs_acl_ids_free(&acl_ids);
1463 1470 dmu_tx_commit(tx);
1464 1471 } else {
1465 1472 int aflags = (flag & FAPPEND) ? V_APPEND : 0;
1466 1473
1467 1474 if (have_acl)
1468 1475 zfs_acl_ids_free(&acl_ids);
1469 1476 have_acl = B_FALSE;
1470 1477
1471 1478 /*
1472 1479 * A directory entry already exists for this name.
1473 1480 */
1474 1481 /*
1475 1482 * Can't truncate an existing file if in exclusive mode.
1476 1483 */
1477 1484 if (excl == EXCL) {
1478 1485 error = SET_ERROR(EEXIST);
1479 1486 goto out;
1480 1487 }
1481 1488 /*
1482 1489 * Can't open a directory for writing.
1483 1490 */
1484 1491 if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
1485 1492 error = SET_ERROR(EISDIR);
1486 1493 goto out;
1487 1494 }
1488 1495 /*
1489 1496 * Verify requested access to file.
1490 1497 */
1491 1498 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
1492 1499 goto out;
1493 1500 }
1494 1501
1495 1502 mutex_enter(&dzp->z_lock);
1496 1503 dzp->z_seq++;
1497 1504 mutex_exit(&dzp->z_lock);
1498 1505
1499 1506 /*
1500 1507 * Truncate regular files if requested.
1501 1508 */
1502 1509 if ((ZTOV(zp)->v_type == VREG) &&
1503 1510 (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
1504 1511 /* we can't hold any locks when calling zfs_freesp() */
1505 1512 zfs_dirent_unlock(dl);
1506 1513 dl = NULL;
1507 1514 error = zfs_freesp(zp, 0, 0, mode, TRUE);
1508 1515 if (error == 0) {
1509 1516 vnevent_create(ZTOV(zp), ct);
1510 1517 }
1511 1518 }
1512 1519 }
1513 1520 out:
1514 1521
1515 1522 if (dl)
1516 1523 zfs_dirent_unlock(dl);
1517 1524
1518 1525 if (error) {
1519 1526 if (zp)
1520 1527 VN_RELE(ZTOV(zp));
1521 1528 } else {
1522 1529 *vpp = ZTOV(zp);
1523 1530 error = specvp_check(vpp, cr);
1524 1531 }
1525 1532
1526 1533 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1527 1534 zil_commit(zilog, 0);
1528 1535
1529 1536 ZFS_EXIT(zfsvfs);
1530 1537 return (error);
1531 1538 }
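
The waited/TXG_WAITED conversion above follows rule (4) from the comment at
the top of this file, and the remaining call sites in this change are
converted the same way. Distilled into one place -- a sketch built from this
file's declarations, not a drop-in function:

	boolean_t waited = B_FALSE;
	dmu_tx_t *tx;
	int error;
top:
	tx = dmu_tx_create(os);
	/* ... dmu_tx_hold_*() everything the operation may modify ... */
	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
	if (error != 0) {
		/* drop locks and held vnodes here, before waiting */
		if (error == ERESTART) {
			waited = B_TRUE;	/* next assign passes TXG_WAITED */
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		return (error);		/* really out of space */
	}
	/* ... do the work; zfs_log_*() on success ... */
	dmu_tx_commit(tx);
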
1532 1539
1533 1540 /*
1534 1541 * Remove an entry from a directory.
1535 1542 *
1536 1543 * IN: dvp - vnode of directory to remove entry from.
1537 1544 * name - name of entry to remove.
1538 1545 * cr - credentials of caller.
1539 1546 * ct - caller context
1540 1547 * flags - case flags
1541 1548 *
1542 1549 * RETURN: 0 on success, error code on failure.
1543 1550 *
1544 1551 * Timestamps:
1545 1552 * dvp - ctime|mtime
1546 1553 * vp - ctime (if nlink > 0)
1547 1554 */
1548 1555
1549 1556 uint64_t null_xattr = 0;
1550 1557
1551 1558 /*ARGSUSED*/
1552 1559 static int
1553 1560 zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
1554 1561 int flags)
1555 1562 {
1556 1563 znode_t *zp, *dzp = VTOZ(dvp);
1557 1564 znode_t *xzp;
1558 1565 vnode_t *vp;
1559 1566 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1560 1567 zilog_t *zilog;
1561 1568 uint64_t acl_obj, xattr_obj;
1562 1569 uint64_t xattr_obj_unlinked = 0;
1563 1570 uint64_t obj = 0;
1564 1571 zfs_dirlock_t *dl;
1565 1572 dmu_tx_t *tx;
1566 1573 boolean_t may_delete_now, delete_now = FALSE;
1567 1574 boolean_t unlinked, toobig = FALSE;
1568 1575 uint64_t txtype;
1569 1576 pathname_t *realnmp = NULL;
1570 1577 pathname_t realnm;
1571 1578 int error;
1572 1579 int zflg = ZEXISTS;
1580 + boolean_t waited = B_FALSE;
1573 1581
1574 1582 ZFS_ENTER(zfsvfs);
1575 1583 ZFS_VERIFY_ZP(dzp);
1576 1584 zilog = zfsvfs->z_log;
1577 1585
1578 1586 if (flags & FIGNORECASE) {
1579 1587 zflg |= ZCILOOK;
1580 1588 pn_alloc(&realnm);
1581 1589 realnmp = &realnm;
1582 1590 }
1583 1591
1584 1592 top:
1585 1593 xattr_obj = 0;
1586 1594 xzp = NULL;
1587 1595 /*
1588 1596 * Attempt to lock directory; fail if entry doesn't exist.
1589 1597 */
1590 1598 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1591 1599 NULL, realnmp)) {
1592 1600 if (realnmp)
1593 1601 pn_free(realnmp);
1594 1602 ZFS_EXIT(zfsvfs);
1595 1603 return (error);
1596 1604 }
1597 1605
1598 1606 vp = ZTOV(zp);
1599 1607
1600 1608 if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1601 1609 goto out;
1602 1610 }
1603 1611
1604 1612 /*
1605 1613 * Need to use rmdir for removing directories.
1606 1614 */
1607 1615 if (vp->v_type == VDIR) {
1608 1616 error = SET_ERROR(EPERM);
1609 1617 goto out;
1610 1618 }
1611 1619
1612 1620 vnevent_remove(vp, dvp, name, ct);
1613 1621
1614 1622 if (realnmp)
1615 1623 dnlc_remove(dvp, realnmp->pn_buf);
1616 1624 else
1617 1625 dnlc_remove(dvp, name);
1618 1626
1619 1627 mutex_enter(&vp->v_lock);
1620 1628 may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
1621 1629 mutex_exit(&vp->v_lock);
1622 1630
1623 1631 /*
1624 1632 * We may delete the znode now, or we may put it in the unlinked set;
1625 1633 * it depends on whether we're the last link, and on whether there are
1626 1634 * other holds on the vnode. So we dmu_tx_hold() the right things to
1627 1635 * allow for either case.
1628 1636 */
1629 1637 obj = zp->z_id;
1630 1638 tx = dmu_tx_create(zfsvfs->z_os);
1631 1639 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1632 1640 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1633 1641 zfs_sa_upgrade_txholds(tx, zp);
1634 1642 zfs_sa_upgrade_txholds(tx, dzp);
1635 1643 if (may_delete_now) {
1636 1644 toobig =
1637 1645 zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
1638 1646 /* if the file is too big, only hold_free a token amount */
1639 1647 dmu_tx_hold_free(tx, zp->z_id, 0,
1640 1648 (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1641 1649 }
1642 1650
1643 1651 /* are there any extended attributes? */
1644 1652 error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1645 1653 &xattr_obj, sizeof (xattr_obj));
1646 1654 if (error == 0 && xattr_obj) {
1647 1655 error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1648 1656 ASSERT0(error);
1649 1657 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1650 1658 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1651 1659 }
1652 1660
1653 1661 mutex_enter(&zp->z_lock);
1654 1662 if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
1655 1663 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1656 1664 mutex_exit(&zp->z_lock);
1657 1665
1658 1666 /* charge as an update -- would be nice not to charge at all */
1659 1667 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1660 1668
1661 - error = dmu_tx_assign(tx, TXG_NOWAIT);
1669 + error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
1662 1670 if (error) {
1663 1671 zfs_dirent_unlock(dl);
1664 1672 VN_RELE(vp);
1665 1673 if (xzp)
1666 1674 VN_RELE(ZTOV(xzp));
1667 1675 if (error == ERESTART) {
1676 + waited = B_TRUE;
1668 1677 dmu_tx_wait(tx);
1669 1678 dmu_tx_abort(tx);
1670 1679 goto top;
1671 1680 }
1672 1681 if (realnmp)
1673 1682 pn_free(realnmp);
1674 1683 dmu_tx_abort(tx);
1675 1684 ZFS_EXIT(zfsvfs);
1676 1685 return (error);
1677 1686 }
1678 1687
1679 1688 /*
1680 1689 * Remove the directory entry.
1681 1690 */
1682 1691 error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1683 1692
1684 1693 if (error) {
1685 1694 dmu_tx_commit(tx);
1686 1695 goto out;
1687 1696 }
1688 1697
1689 1698 if (unlinked) {
1690 1699
1691 1700 /*
1692 1701 * Hold z_lock so that we can make sure that the ACL obj
1693 1702 * hasn't changed. Could have been deleted due to
1694 1703 * zfs_sa_upgrade().
1695 1704 */
1696 1705 mutex_enter(&zp->z_lock);
1697 1706 mutex_enter(&vp->v_lock);
1698 1707 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1699 1708 &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
1700 1709 delete_now = may_delete_now && !toobig &&
1701 1710 vp->v_count == 1 && !vn_has_cached_data(vp) &&
1702 1711 xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
1703 1712 acl_obj;
1704 1713 mutex_exit(&vp->v_lock);
1705 1714 }
1706 1715
1707 1716 if (delete_now) {
1708 1717 if (xattr_obj_unlinked) {
1709 1718 ASSERT3U(xzp->z_links, ==, 2);
1710 1719 mutex_enter(&xzp->z_lock);
1711 1720 xzp->z_unlinked = 1;
1712 1721 xzp->z_links = 0;
1713 1722 error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
1714 1723 &xzp->z_links, sizeof (xzp->z_links), tx);
1715 1724 ASSERT3U(error, ==, 0);
1716 1725 mutex_exit(&xzp->z_lock);
1717 1726 zfs_unlinked_add(xzp, tx);
1718 1727
1719 1728 if (zp->z_is_sa)
1720 1729 error = sa_remove(zp->z_sa_hdl,
1721 1730 SA_ZPL_XATTR(zfsvfs), tx);
1722 1731 else
1723 1732 error = sa_update(zp->z_sa_hdl,
1724 1733 SA_ZPL_XATTR(zfsvfs), &null_xattr,
1725 1734 sizeof (uint64_t), tx);
1726 1735 ASSERT0(error);
1727 1736 }
1728 1737 mutex_enter(&vp->v_lock);
1729 1738 vp->v_count--;
1730 1739 ASSERT0(vp->v_count);
1731 1740 mutex_exit(&vp->v_lock);
1732 1741 mutex_exit(&zp->z_lock);
1733 1742 zfs_znode_delete(zp, tx);
1734 1743 } else if (unlinked) {
1735 1744 mutex_exit(&zp->z_lock);
1736 1745 zfs_unlinked_add(zp, tx);
1737 1746 }
1738 1747
1739 1748 txtype = TX_REMOVE;
1740 1749 if (flags & FIGNORECASE)
1741 1750 txtype |= TX_CI;
1742 1751 zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
1743 1752
1744 1753 dmu_tx_commit(tx);
1745 1754 out:
1746 1755 if (realnmp)
1747 1756 pn_free(realnmp);
1748 1757
1749 1758 zfs_dirent_unlock(dl);
1750 1759
1751 1760 if (!delete_now)
1752 1761 VN_RELE(vp);
1753 1762 if (xzp)
1754 1763 VN_RELE(ZTOV(xzp));
1755 1764
1756 1765 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1757 1766 zil_commit(zilog, 0);
1758 1767
1759 1768 ZFS_EXIT(zfsvfs);
1760 1769 return (error);
1761 1770 }
1762 1771
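The substantive change in zfs_remove() above, repeated in each vnode op below, is the retry handshake with the reworked write throttle. A minimal sketch of the pattern, using only the calls visible in this diff (illustrative, not verbatim source):

	boolean_t waited = B_FALSE;
top:
	/* take locks, build the tx with dmu_tx_hold_*() calls */
	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
	if (error) {
		/* drop everything acquired since top: before sleeping */
		if (error == ERESTART) {
			waited = B_TRUE;	/* remember that we slept */
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		return (error);
	}
	/* do the work, log it, dmu_tx_commit(tx) */

The waited flag records that this thread has already slept in dmu_tx_wait(), so the retried assignment passes TXG_WAITED, which tells the DMU not to throttle the same operation a second time.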
1763 1772 /*
1764 1773 * Create a new directory and insert it into dvp using the name
1765 1774 * provided. Return a pointer to the inserted directory.
1766 1775 *
1767 1776 * IN: dvp - vnode of directory to add subdir to.
1768 1777 * dirname - name of new directory.
1769 1778 * vap - attributes of new directory.
1770 1779 * cr - credentials of caller.
1771 1780 * ct - caller context
1772 1781 * flags - case flags
1773 1782 * vsecp - ACL to be set
1774 1783 *
1775 1784 * OUT: vpp - vnode of created directory.
1776 1785 *
1777 1786 * RETURN: 0 on success, error code on failure.
1778 1787 *
1779 1788 * Timestamps:
1780 1789 * dvp - ctime|mtime updated
1781 1790 * vp - ctime|mtime|atime updated
1782 1791 */
1783 1792 /*ARGSUSED*/
1784 1793 static int
1785 1794 zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
1786 1795 caller_context_t *ct, int flags, vsecattr_t *vsecp)
1787 1796 {
1788 1797 znode_t *zp, *dzp = VTOZ(dvp);
1789 1798 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1790 1799 zilog_t *zilog;
1791 1800 zfs_dirlock_t *dl;
1792 1801 uint64_t txtype;
1793 1802 dmu_tx_t *tx;
1794 1803 int error;
1795 1804 int zf = ZNEW;
1796 1805 ksid_t *ksid;
1797 1806 uid_t uid;
1798 1807 gid_t gid = crgetgid(cr);
1799 1808 zfs_acl_ids_t acl_ids;
1800 1809 boolean_t fuid_dirtied;
1810 + boolean_t waited = B_FALSE;
1801 1811
1802 1812 ASSERT(vap->va_type == VDIR);
1803 1813
1804 1814 /*
1805 1815 * If we have an ephemeral id, ACL, or XVATTR then
1806 1816 * make sure the file system is at the proper version.
1807 1817 */
1808 1818
1809 1819 ksid = crgetsid(cr, KSID_OWNER);
1810 1820 if (ksid)
1811 1821 uid = ksid_getid(ksid);
1812 1822 else
1813 1823 uid = crgetuid(cr);
1814 1824 if (zfsvfs->z_use_fuids == B_FALSE &&
1815 1825 (vsecp || (vap->va_mask & AT_XVATTR) ||
1816 1826 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1817 1827 return (SET_ERROR(EINVAL));
1818 1828
1819 1829 ZFS_ENTER(zfsvfs);
1820 1830 ZFS_VERIFY_ZP(dzp);
1821 1831 zilog = zfsvfs->z_log;
1822 1832
1823 1833 if (dzp->z_pflags & ZFS_XATTR) {
1824 1834 ZFS_EXIT(zfsvfs);
1825 1835 return (SET_ERROR(EINVAL));
1826 1836 }
1827 1837
1828 1838 if (zfsvfs->z_utf8 && u8_validate(dirname,
1829 1839 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1830 1840 ZFS_EXIT(zfsvfs);
1831 1841 return (SET_ERROR(EILSEQ));
1832 1842 }
1833 1843 if (flags & FIGNORECASE)
1834 1844 zf |= ZCILOOK;
1835 1845
1836 1846 if (vap->va_mask & AT_XVATTR) {
1837 1847 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1838 1848 crgetuid(cr), cr, vap->va_type)) != 0) {
1839 1849 ZFS_EXIT(zfsvfs);
1840 1850 return (error);
1841 1851 }
1842 1852 }
1843 1853
1844 1854 if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
1845 1855 vsecp, &acl_ids)) != 0) {
1846 1856 ZFS_EXIT(zfsvfs);
1847 1857 return (error);
1848 1858 }
1849 1859 /*
1850 1860 * First make sure the new directory doesn't exist.
1851 1861 *
1852 1862 * Existence is checked first to make sure we don't return
1853 1863 * EACCES instead of EEXIST which can cause some applications
1854 1864 * to fail.
1855 1865 */
1856 1866 top:
1857 1867 *vpp = NULL;
1858 1868
1859 1869 if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
1860 1870 NULL, NULL)) {
1861 1871 zfs_acl_ids_free(&acl_ids);
1862 1872 ZFS_EXIT(zfsvfs);
1863 1873 return (error);
1864 1874 }
1865 1875
1866 1876 if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
1867 1877 zfs_acl_ids_free(&acl_ids);
1868 1878 zfs_dirent_unlock(dl);
1869 1879 ZFS_EXIT(zfsvfs);
1870 1880 return (error);
1871 1881 }
1872 1882
1873 1883 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1874 1884 zfs_acl_ids_free(&acl_ids);
1875 1885 zfs_dirent_unlock(dl);
1876 1886 ZFS_EXIT(zfsvfs);
1877 1887 return (SET_ERROR(EDQUOT));
1878 1888 }
1879 1889
1880 1890 /*
1881 1891 * Add a new entry to the directory.
1882 1892 */
1883 1893 tx = dmu_tx_create(zfsvfs->z_os);
1884 1894 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1885 1895 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1886 1896 fuid_dirtied = zfsvfs->z_fuid_dirty;
1887 1897 if (fuid_dirtied)
1888 1898 zfs_fuid_txhold(zfsvfs, tx);
1889 1899 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1890 1900 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1891 1901 acl_ids.z_aclp->z_acl_bytes);
1892 1902 }
1893 1903
1894 1904 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1895 1905 ZFS_SA_BASE_ATTR_SIZE);
1896 1906
1897 - error = dmu_tx_assign(tx, TXG_NOWAIT);
1907 + error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
1898 1908 if (error) {
1899 1909 zfs_dirent_unlock(dl);
1900 1910 if (error == ERESTART) {
1911 + waited = B_TRUE;
1901 1912 dmu_tx_wait(tx);
1902 1913 dmu_tx_abort(tx);
1903 1914 goto top;
1904 1915 }
1905 1916 zfs_acl_ids_free(&acl_ids);
1906 1917 dmu_tx_abort(tx);
1907 1918 ZFS_EXIT(zfsvfs);
1908 1919 return (error);
1909 1920 }
1910 1921
1911 1922 /*
1912 1923 * Create new node.
1913 1924 */
1914 1925 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1915 1926
1916 1927 if (fuid_dirtied)
1917 1928 zfs_fuid_sync(zfsvfs, tx);
1918 1929
1919 1930 /*
1920 1931 * Now put new name in parent dir.
1921 1932 */
1922 1933 (void) zfs_link_create(dl, zp, tx, ZNEW);
1923 1934
1924 1935 *vpp = ZTOV(zp);
1925 1936
1926 1937 txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
1927 1938 if (flags & FIGNORECASE)
1928 1939 txtype |= TX_CI;
1929 1940 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
1930 1941 acl_ids.z_fuidp, vap);
1931 1942
1932 1943 zfs_acl_ids_free(&acl_ids);
1933 1944
1934 1945 dmu_tx_commit(tx);
1935 1946
1936 1947 zfs_dirent_unlock(dl);
1937 1948
1938 1949 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1939 1950 zil_commit(zilog, 0);
1940 1951
1941 1952 ZFS_EXIT(zfsvfs);
1942 1953 return (0);
1943 1954 }
1944 1955
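Note the retry invariant in zfs_mkdir() above: zfs_acl_ids_create() runs once, before the top: label, so every ERESTART pass reuses the same acl_ids, and every exit path after top: (success included) must free them exactly once. Condensed (a sketch, not the verbatim code):

	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
	    vsecp, &acl_ids)) != 0)
		return (error);		/* built once, outside the loop */
top:
	...
	if (error == ERESTART) {
		waited = B_TRUE;
		dmu_tx_wait(tx);
		dmu_tx_abort(tx);
		goto top;		/* acl_ids deliberately kept */
	}
	zfs_acl_ids_free(&acl_ids);	/* every terminal path, once */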
1945 1956 /*
1946 1957 * Remove a directory subdir entry. If the current working
1947 1958 * directory is the same as the subdir to be removed, the
1948 1959 * remove will fail.
1949 1960 *
1950 1961 * IN: dvp - vnode of directory to remove from.
1951 1962 * name - name of directory to be removed.
1952 1963 * cwd - vnode of current working directory.
1953 1964 * cr - credentials of caller.
1954 1965 * ct - caller context
1955 1966 * flags - case flags
1956 1967 *
1957 1968 * RETURN: 0 on success, error code on failure.
1958 1969 *
1959 1970 * Timestamps:
1960 1971 * dvp - ctime|mtime updated
1961 1972 */
1962 1973 /*ARGSUSED*/
1963 1974 static int
1964 1975 zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
1965 1976 caller_context_t *ct, int flags)
1966 1977 {
1967 1978 znode_t *dzp = VTOZ(dvp);
1968 1979 znode_t *zp;
1969 1980 vnode_t *vp;
1970 1981 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1971 1982 zilog_t *zilog;
1972 1983 zfs_dirlock_t *dl;
1973 1984 dmu_tx_t *tx;
1974 1985 int error;
1975 1986 int zflg = ZEXISTS;
1987 + boolean_t waited = B_FALSE;
1976 1988
1977 1989 ZFS_ENTER(zfsvfs);
1978 1990 ZFS_VERIFY_ZP(dzp);
1979 1991 zilog = zfsvfs->z_log;
1980 1992
1981 1993 if (flags & FIGNORECASE)
1982 1994 zflg |= ZCILOOK;
1983 1995 top:
1984 1996 zp = NULL;
1985 1997
1986 1998 /*
1987 1999 * Attempt to lock directory; fail if entry doesn't exist.
1988 2000 */
1989 2001 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1990 2002 NULL, NULL)) {
1991 2003 ZFS_EXIT(zfsvfs);
1992 2004 return (error);
1993 2005 }
1994 2006
1995 2007 vp = ZTOV(zp);
1996 2008
1997 2009 if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1998 2010 goto out;
1999 2011 }
2000 2012
2001 2013 if (vp->v_type != VDIR) {
2002 2014 error = SET_ERROR(ENOTDIR);
2003 2015 goto out;
2004 2016 }
2005 2017
2006 2018 if (vp == cwd) {
2007 2019 error = SET_ERROR(EINVAL);
2008 2020 goto out;
2009 2021 }
2010 2022
2011 2023 vnevent_rmdir(vp, dvp, name, ct);
2012 2024
2013 2025 /*
2014 2026 * Grab a lock on the directory to make sure that no one is
2015 2027 * trying to add (or lookup) entries while we are removing it.
2016 2028 */
2017 2029 rw_enter(&zp->z_name_lock, RW_WRITER);
2018 2030
2019 2031 /*
2020 2032 * Grab a lock on the parent pointer to make sure we play well
2021 2033 * with the treewalk and directory rename code.
2022 2034 */
2023 2035 rw_enter(&zp->z_parent_lock, RW_WRITER);
2024 2036
2025 2037 tx = dmu_tx_create(zfsvfs->z_os);
2026 2038 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2027 2039 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2028 2040 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2029 2041 zfs_sa_upgrade_txholds(tx, zp);
2030 2042 zfs_sa_upgrade_txholds(tx, dzp);
2031 - error = dmu_tx_assign(tx, TXG_NOWAIT);
2043 + error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
2032 2044 if (error) {
2033 2045 rw_exit(&zp->z_parent_lock);
2034 2046 rw_exit(&zp->z_name_lock);
2035 2047 zfs_dirent_unlock(dl);
2036 2048 VN_RELE(vp);
2037 2049 if (error == ERESTART) {
2050 + waited = B_TRUE;
2038 2051 dmu_tx_wait(tx);
2039 2052 dmu_tx_abort(tx);
2040 2053 goto top;
2041 2054 }
2042 2055 dmu_tx_abort(tx);
2043 2056 ZFS_EXIT(zfsvfs);
2044 2057 return (error);
2045 2058 }
2046 2059
2047 2060 error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
2048 2061
2049 2062 if (error == 0) {
2050 2063 uint64_t txtype = TX_RMDIR;
2051 2064 if (flags & FIGNORECASE)
2052 2065 txtype |= TX_CI;
2053 2066 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2054 2067 }
2055 2068
2056 2069 dmu_tx_commit(tx);
2057 2070
2058 2071 rw_exit(&zp->z_parent_lock);
2059 2072 rw_exit(&zp->z_name_lock);
2060 2073 out:
2061 2074 zfs_dirent_unlock(dl);
2062 2075
2063 2076 VN_RELE(vp);
2064 2077
2065 2078 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2066 2079 zil_commit(zilog, 0);
2067 2080
2068 2081 ZFS_EXIT(zfsvfs);
2069 2082 return (error);
2070 2083 }
2071 2084
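zfs_rmdir() above shows the other half of the retry contract: every lock taken after top: (the dirent lock, z_name_lock, z_parent_lock, and the vnode hold) is released before dmu_tx_wait() sleeps, since blocking for a txg while holding directory locks could stall other lookups for the duration of a sync. Condensed from the error path above:

	if (error) {
		rw_exit(&zp->z_parent_lock);
		rw_exit(&zp->z_name_lock);
		zfs_dirent_unlock(dl);
		VN_RELE(vp);
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);	/* sleep with no locks held */
			dmu_tx_abort(tx);
			goto top;		/* reacquire everything */
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}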
2072 2085 /*
2073 2086 * Read as many directory entries as will fit into the provided
2074 2087 * buffer from the given directory cursor position (specified in
2075 2088 * the uio structure).
2076 2089 *
2077 2090 * IN: vp - vnode of directory to read.
2078 2091 * uio - structure supplying read location, range info,
2079 2092 * and return buffer.
2080 2093 * cr - credentials of caller.
2081 2094 * ct - caller context
2082 2095 * flags - case flags
2083 2096 *
2084 2097 * OUT: uio - updated offset and range, buffer filled.
2085 2098 * eofp - set to true if end-of-file detected.
2086 2099 *
2087 2100 * RETURN: 0 on success, error code on failure.
2088 2101 *
2089 2102 * Timestamps:
2090 2103 * vp - atime updated
2091 2104 *
2092 2105 * Note that the low 4 bits of the cookie returned by zap are always zero.
2093 2106 * This allows us to use the low range for "special" directory entries:
2094 2107 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
2095 2108 * we use the offset 2 for the '.zfs' directory.
2096 2109 */
2097 2110 /* ARGSUSED */
2098 2111 static int
2099 2112 zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
2100 2113 caller_context_t *ct, int flags)
2101 2114 {
2102 2115 znode_t *zp = VTOZ(vp);
2103 2116 iovec_t *iovp;
2104 2117 edirent_t *eodp;
2105 2118 dirent64_t *odp;
2106 2119 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2107 2120 objset_t *os;
2108 2121 caddr_t outbuf;
2109 2122 size_t bufsize;
2110 2123 zap_cursor_t zc;
2111 2124 zap_attribute_t zap;
2112 2125 uint_t bytes_wanted;
2113 2126 uint64_t offset; /* must be unsigned; checks for < 1 */
2114 2127 uint64_t parent;
2115 2128 int local_eof;
2116 2129 int outcount;
2117 2130 int error;
2118 2131 uint8_t prefetch;
2119 2132 boolean_t check_sysattrs;
2120 2133
2121 2134 ZFS_ENTER(zfsvfs);
2122 2135 ZFS_VERIFY_ZP(zp);
2123 2136
2124 2137 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2125 2138 &parent, sizeof (parent))) != 0) {
2126 2139 ZFS_EXIT(zfsvfs);
2127 2140 return (error);
2128 2141 }
2129 2142
2130 2143 /*
2131 2144 * If we are not given an eof variable,
2132 2145 * use a local one.
2133 2146 */
2134 2147 if (eofp == NULL)
2135 2148 eofp = &local_eof;
2136 2149
2137 2150 /*
2138 2151 * Check for valid iov_len.
2139 2152 */
2140 2153 if (uio->uio_iov->iov_len <= 0) {
2141 2154 ZFS_EXIT(zfsvfs);
2142 2155 return (SET_ERROR(EINVAL));
2143 2156 }
2144 2157
2145 2158 /*
2146 2159 * Quit if directory has been removed (posix)
2147 2160 */
2148 2161 if ((*eofp = zp->z_unlinked) != 0) {
2149 2162 ZFS_EXIT(zfsvfs);
2150 2163 return (0);
2151 2164 }
2152 2165
2153 2166 error = 0;
2154 2167 os = zfsvfs->z_os;
2155 2168 offset = uio->uio_loffset;
2156 2169 prefetch = zp->z_zn_prefetch;
2157 2170
2158 2171 /*
2159 2172 * Initialize the iterator cursor.
2160 2173 */
2161 2174 if (offset <= 3) {
2162 2175 /*
2163 2176 * Start iteration from the beginning of the directory.
2164 2177 */
2165 2178 zap_cursor_init(&zc, os, zp->z_id);
2166 2179 } else {
2167 2180 /*
2168 2181 * The offset is a serialized cursor.
2169 2182 */
2170 2183 zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2171 2184 }
2172 2185
2173 2186 /*
2174 2187 * Get space to change directory entries into fs independent format.
2175 2188 */
2176 2189 iovp = uio->uio_iov;
2177 2190 bytes_wanted = iovp->iov_len;
2178 2191 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
2179 2192 bufsize = bytes_wanted;
2180 2193 outbuf = kmem_alloc(bufsize, KM_SLEEP);
2181 2194 odp = (struct dirent64 *)outbuf;
2182 2195 } else {
2183 2196 bufsize = bytes_wanted;
2184 2197 outbuf = NULL;
2185 2198 odp = (struct dirent64 *)iovp->iov_base;
2186 2199 }
2187 2200 eodp = (struct edirent *)odp;
2188 2201
2189 2202 /*
2190 2203 * If this VFS supports the system attribute view interface; and
2191 2204 * we're looking at an extended attribute directory; and we care
2192 2205 * about normalization conflicts on this vfs; then we must check
2193 2206 * for normalization conflicts with the sysattr name space.
2194 2207 */
2195 2208 check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2196 2209 (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2197 2210 (flags & V_RDDIR_ENTFLAGS);
2198 2211
2199 2212 /*
2200 2213 * Transform to file-system independent format
2201 2214 */
2202 2215 outcount = 0;
2203 2216 while (outcount < bytes_wanted) {
2204 2217 ino64_t objnum;
2205 2218 ushort_t reclen;
2206 2219 off64_t *next = NULL;
2207 2220
2208 2221 /*
2209 2222 * Special case `.', `..', and `.zfs'.
2210 2223 */
2211 2224 if (offset == 0) {
2212 2225 (void) strcpy(zap.za_name, ".");
2213 2226 zap.za_normalization_conflict = 0;
2214 2227 objnum = zp->z_id;
2215 2228 } else if (offset == 1) {
2216 2229 (void) strcpy(zap.za_name, "..");
2217 2230 zap.za_normalization_conflict = 0;
2218 2231 objnum = parent;
2219 2232 } else if (offset == 2 && zfs_show_ctldir(zp)) {
2220 2233 (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2221 2234 zap.za_normalization_conflict = 0;
2222 2235 objnum = ZFSCTL_INO_ROOT;
2223 2236 } else {
2224 2237 /*
2225 2238 * Grab next entry.
2226 2239 */
2227 2240 if (error = zap_cursor_retrieve(&zc, &zap)) {
2228 2241 if ((*eofp = (error == ENOENT)) != 0)
2229 2242 break;
2230 2243 else
2231 2244 goto update;
2232 2245 }
2233 2246
2234 2247 if (zap.za_integer_length != 8 ||
2235 2248 zap.za_num_integers != 1) {
2236 2249 cmn_err(CE_WARN, "zap_readdir: bad directory "
2237 2250 "entry, obj = %lld, offset = %lld\n",
2238 2251 (u_longlong_t)zp->z_id,
2239 2252 (u_longlong_t)offset);
2240 2253 error = SET_ERROR(ENXIO);
2241 2254 goto update;
2242 2255 }
2243 2256
2244 2257 objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2245 2258 /*
2246 2259 * MacOS X can extract the object type here such as:
2247 2260 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2248 2261 */
2249 2262
2250 2263 if (check_sysattrs && !zap.za_normalization_conflict) {
2251 2264 zap.za_normalization_conflict =
2252 2265 xattr_sysattr_casechk(zap.za_name);
2253 2266 }
2254 2267 }
2255 2268
2256 2269 if (flags & V_RDDIR_ACCFILTER) {
2257 2270 /*
2258 2271 * If we have no access at all, don't include
2259 2272 * this entry in the returned information
2260 2273 */
2261 2274 znode_t *ezp;
2262 2275 if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2263 2276 goto skip_entry;
2264 2277 if (!zfs_has_access(ezp, cr)) {
2265 2278 VN_RELE(ZTOV(ezp));
2266 2279 goto skip_entry;
2267 2280 }
2268 2281 VN_RELE(ZTOV(ezp));
2269 2282 }
2270 2283
2271 2284 if (flags & V_RDDIR_ENTFLAGS)
2272 2285 reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2273 2286 else
2274 2287 reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2275 2288
2276 2289 /*
2277 2290 * Will this entry fit in the buffer?
2278 2291 */
2279 2292 if (outcount + reclen > bufsize) {
2280 2293 /*
2281 2294 * Did we manage to fit anything in the buffer?
2282 2295 */
2283 2296 if (!outcount) {
2284 2297 error = SET_ERROR(EINVAL);
2285 2298 goto update;
2286 2299 }
2287 2300 break;
2288 2301 }
2289 2302 if (flags & V_RDDIR_ENTFLAGS) {
2290 2303 /*
2291 2304 * Add extended flag entry:
2292 2305 */
2293 2306 eodp->ed_ino = objnum;
2294 2307 eodp->ed_reclen = reclen;
2295 2308 /* NOTE: ed_off is the offset for the *next* entry */
2296 2309 next = &(eodp->ed_off);
2297 2310 eodp->ed_eflags = zap.za_normalization_conflict ?
2298 2311 ED_CASE_CONFLICT : 0;
2299 2312 (void) strncpy(eodp->ed_name, zap.za_name,
2300 2313 EDIRENT_NAMELEN(reclen));
2301 2314 eodp = (edirent_t *)((intptr_t)eodp + reclen);
2302 2315 } else {
2303 2316 /*
2304 2317 * Add normal entry:
2305 2318 */
2306 2319 odp->d_ino = objnum;
2307 2320 odp->d_reclen = reclen;
2308 2321 /* NOTE: d_off is the offset for the *next* entry */
2309 2322 next = &(odp->d_off);
2310 2323 (void) strncpy(odp->d_name, zap.za_name,
2311 2324 DIRENT64_NAMELEN(reclen));
2312 2325 odp = (dirent64_t *)((intptr_t)odp + reclen);
2313 2326 }
2314 2327 outcount += reclen;
2315 2328
2316 2329 ASSERT(outcount <= bufsize);
2317 2330
2318 2331 /* Prefetch znode */
2319 2332 if (prefetch)
2320 2333 dmu_prefetch(os, objnum, 0, 0);
2321 2334
2322 2335 skip_entry:
2323 2336 /*
2324 2337 * Move to the next entry, fill in the previous offset.
2325 2338 */
2326 2339 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2327 2340 zap_cursor_advance(&zc);
2328 2341 offset = zap_cursor_serialize(&zc);
2329 2342 } else {
2330 2343 offset += 1;
2331 2344 }
2332 2345 if (next)
2333 2346 *next = offset;
2334 2347 }
2335 2348 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2336 2349
2337 2350 if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
2338 2351 iovp->iov_base += outcount;
2339 2352 iovp->iov_len -= outcount;
2340 2353 uio->uio_resid -= outcount;
2341 2354 } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2342 2355 /*
2343 2356 * Reset the pointer.
2344 2357 */
2345 2358 offset = uio->uio_loffset;
2346 2359 }
2347 2360
2348 2361 update:
2349 2362 zap_cursor_fini(&zc);
2350 2363 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
2351 2364 kmem_free(outbuf, bufsize);
2352 2365
2353 2366 if (error == ENOENT)
2354 2367 error = 0;
2355 2368
2356 2369 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2357 2370
2358 2371 uio->uio_loffset = offset;
2359 2372 ZFS_EXIT(zfsvfs);
2360 2373 return (error);
2361 2374 }
2362 2375
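Because each returned entry's d_off (or ed_off) holds the cursor for the *next* entry, a consumer can suspend a directory scan at any point and resume it later. A hypothetical userland consumer, using the portable telldir(3C)/seekdir(3C) interfaces, which ride on exactly this cursor:

	#include <dirent.h>

	void
	scan_suspend_resume(DIR *dirp)
	{
		struct dirent *dp;
		long resume;

		dp = readdir(dirp);	/* consume one entry */
		if (dp == NULL)
			return;
		resume = telldir(dirp);	/* cursor for the next entry */
		rewinddir(dirp);	/* lose our place ... */
		seekdir(dirp, resume);	/* ... and get it back */
		dp = readdir(dirp);	/* continues after the saved entry */
	}

The reserved offsets 0-2 ('.', '..', '.zfs') sit below any serialized ZAP cursor, whose low 4 bits are always zero, which is why zfs_readdir() can treat any offset at or below 3 as a restart from the beginning.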
2363 2376 ulong_t zfs_fsync_sync_cnt = 4;
2364 2377
2365 2378 static int
2366 2379 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2367 2380 {
2368 2381 znode_t *zp = VTOZ(vp);
2369 2382 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2370 2383
2371 2384 /*
2372 2385 * Regardless of whether this is required for standards conformance,
2373 2386 * this is the logical behavior when fsync() is called on a file with
2374 2387 * dirty pages. We use B_ASYNC since the ZIL transactions are already
2375 2388 * going to be pushed out as part of the zil_commit().
2376 2389 */
2377 2390 if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) &&
2378 2391 (vp->v_type == VREG) && !(IS_SWAPVP(vp)))
2379 2392 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_ASYNC, cr, ct);
2380 2393
2381 2394 (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2382 2395
2383 2396 if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2384 2397 ZFS_ENTER(zfsvfs);
2385 2398 ZFS_VERIFY_ZP(zp);
2386 2399 zil_commit(zfsvfs->z_log, zp->z_id);
2387 2400 ZFS_EXIT(zfsvfs);
2388 2401 }
2389 2402 return (0);
2390 2403 }
2391 2404
2392 2405
2393 2406 /*
2394 2407 * Get the requested file attributes and place them in the provided
2395 2408 * vattr structure.
2396 2409 *
2397 2410 * IN: vp - vnode of file.
2398 2411 * vap - va_mask identifies requested attributes.
2399 2412 * If AT_XVATTR set, then optional attrs are requested
2400 2413 * flags - ATTR_NOACLCHECK (CIFS server context)
2401 2414 * cr - credentials of caller.
2402 2415 * ct - caller context
2403 2416 *
2404 2417 * OUT: vap - attribute values.
2405 2418 *
2406 2419 * RETURN: 0 (always succeeds).
2407 2420 */
2408 2421 /* ARGSUSED */
2409 2422 static int
2410 2423 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2411 2424 caller_context_t *ct)
2412 2425 {
2413 2426 znode_t *zp = VTOZ(vp);
2414 2427 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2415 2428 int error = 0;
2416 2429 uint64_t links;
2417 2430 uint64_t mtime[2], ctime[2];
2418 2431 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
2419 2432 xoptattr_t *xoap = NULL;
2420 2433 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2421 2434 sa_bulk_attr_t bulk[2];
2422 2435 int count = 0;
2423 2436
2424 2437 ZFS_ENTER(zfsvfs);
2425 2438 ZFS_VERIFY_ZP(zp);
2426 2439
2427 2440 zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2428 2441
2429 2442 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
2430 2443 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
2431 2444
2432 2445 if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
2433 2446 ZFS_EXIT(zfsvfs);
2434 2447 return (error);
2435 2448 }
2436 2449
2437 2450 /*
2438 2451 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
2439 2452 * Also, if we are the owner don't bother, since owner should
2440 2453 * always be allowed to read basic attributes of file.
2441 2454 */
2442 2455 if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
2443 2456 (vap->va_uid != crgetuid(cr))) {
2444 2457 if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2445 2458 skipaclchk, cr)) {
2446 2459 ZFS_EXIT(zfsvfs);
2447 2460 return (error);
2448 2461 }
2449 2462 }
2450 2463
2451 2464 /*
2452 2465 * Return all attributes. It's cheaper to provide the answer
2453 2466 * than to determine whether we were asked the question.
2454 2467 */
2455 2468
2456 2469 mutex_enter(&zp->z_lock);
2457 2470 vap->va_type = vp->v_type;
2458 2471 vap->va_mode = zp->z_mode & MODEMASK;
2459 2472 vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
2460 2473 vap->va_nodeid = zp->z_id;
2461 2474 if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
2462 2475 links = zp->z_links + 1;
2463 2476 else
2464 2477 links = zp->z_links;
2465 2478 vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */
2466 2479 vap->va_size = zp->z_size;
2467 2480 vap->va_rdev = vp->v_rdev;
2468 2481 vap->va_seq = zp->z_seq;
2469 2482
2470 2483 /*
2471 2484 * Add in any requested optional attributes and the create time.
2472 2485 * Also set the corresponding bits in the returned attribute bitmap.
2473 2486 */
2474 2487 if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2475 2488 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2476 2489 xoap->xoa_archive =
2477 2490 ((zp->z_pflags & ZFS_ARCHIVE) != 0);
2478 2491 XVA_SET_RTN(xvap, XAT_ARCHIVE);
2479 2492 }
2480 2493
2481 2494 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2482 2495 xoap->xoa_readonly =
2483 2496 ((zp->z_pflags & ZFS_READONLY) != 0);
2484 2497 XVA_SET_RTN(xvap, XAT_READONLY);
2485 2498 }
2486 2499
2487 2500 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2488 2501 xoap->xoa_system =
2489 2502 ((zp->z_pflags & ZFS_SYSTEM) != 0);
2490 2503 XVA_SET_RTN(xvap, XAT_SYSTEM);
2491 2504 }
2492 2505
2493 2506 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2494 2507 xoap->xoa_hidden =
2495 2508 ((zp->z_pflags & ZFS_HIDDEN) != 0);
2496 2509 XVA_SET_RTN(xvap, XAT_HIDDEN);
2497 2510 }
2498 2511
2499 2512 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2500 2513 xoap->xoa_nounlink =
2501 2514 ((zp->z_pflags & ZFS_NOUNLINK) != 0);
2502 2515 XVA_SET_RTN(xvap, XAT_NOUNLINK);
2503 2516 }
2504 2517
2505 2518 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2506 2519 xoap->xoa_immutable =
2507 2520 ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
2508 2521 XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2509 2522 }
2510 2523
2511 2524 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2512 2525 xoap->xoa_appendonly =
2513 2526 ((zp->z_pflags & ZFS_APPENDONLY) != 0);
2514 2527 XVA_SET_RTN(xvap, XAT_APPENDONLY);
2515 2528 }
2516 2529
2517 2530 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2518 2531 xoap->xoa_nodump =
2519 2532 ((zp->z_pflags & ZFS_NODUMP) != 0);
2520 2533 XVA_SET_RTN(xvap, XAT_NODUMP);
2521 2534 }
2522 2535
2523 2536 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2524 2537 xoap->xoa_opaque =
2525 2538 ((zp->z_pflags & ZFS_OPAQUE) != 0);
2526 2539 XVA_SET_RTN(xvap, XAT_OPAQUE);
2527 2540 }
2528 2541
2529 2542 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2530 2543 xoap->xoa_av_quarantined =
2531 2544 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
2532 2545 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2533 2546 }
2534 2547
2535 2548 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2536 2549 xoap->xoa_av_modified =
2537 2550 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
2538 2551 XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2539 2552 }
2540 2553
2541 2554 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2542 2555 vp->v_type == VREG) {
2543 2556 zfs_sa_get_scanstamp(zp, xvap);
2544 2557 }
2545 2558
2546 2559 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2547 2560 uint64_t times[2];
2548 2561
2549 2562 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
2550 2563 times, sizeof (times));
2551 2564 ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
2552 2565 XVA_SET_RTN(xvap, XAT_CREATETIME);
2553 2566 }
2554 2567
2555 2568 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2556 2569 xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
2557 2570 XVA_SET_RTN(xvap, XAT_REPARSE);
2558 2571 }
2559 2572 if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
2560 2573 xoap->xoa_generation = zp->z_gen;
2561 2574 XVA_SET_RTN(xvap, XAT_GEN);
2562 2575 }
2563 2576
2564 2577 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
2565 2578 xoap->xoa_offline =
2566 2579 ((zp->z_pflags & ZFS_OFFLINE) != 0);
2567 2580 XVA_SET_RTN(xvap, XAT_OFFLINE);
2568 2581 }
2569 2582
2570 2583 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2571 2584 xoap->xoa_sparse =
2572 2585 ((zp->z_pflags & ZFS_SPARSE) != 0);
2573 2586 XVA_SET_RTN(xvap, XAT_SPARSE);
2574 2587 }
2575 2588 }
2576 2589
2577 2590 ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
2578 2591 ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2579 2592 ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2580 2593
2581 2594 mutex_exit(&zp->z_lock);
2582 2595
2583 2596 sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks);
2584 2597
2585 2598 if (zp->z_blksz == 0) {
2586 2599 /*
2587 2600 * Block size hasn't been set; suggest maximal I/O transfers.
2588 2601 */
2589 2602 vap->va_blksize = zfsvfs->z_max_blksz;
2590 2603 }
2591 2604
2592 2605 ZFS_EXIT(zfsvfs);
2593 2606 return (0);
2594 2607 }
2595 2608
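The XVA_ISSET_REQ()/XVA_SET_RTN() pairing in zfs_getattr() is the request/return half of the xvattr protocol. A sketch of a caller's side, assuming the standard illumos xvattr helpers:

	/* sketch: ask whether a file carries the READONLY system attribute */
	static boolean_t
	file_is_readonly(vnode_t *vp, cred_t *cr)
	{
		xvattr_t xva;
		xoptattr_t *xoap;

		xva_init(&xva);			/* va_mask = AT_XVATTR */
		XVA_SET_REQ(&xva, XAT_READONLY);
		if (VOP_GETATTR(vp, &xva.xva_vattr, 0, cr, NULL) != 0)
			return (B_FALSE);
		if (!XVA_ISSET_RTN(&xva, XAT_READONLY))
			return (B_FALSE);	/* fs didn't answer */
		xoap = xva_getxoptattr(&xva);
		return (xoap->xoa_readonly);
	}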
2596 2609 /*
2597 2610 * Set the file attributes to the values contained in the
2598 2611 * vattr structure.
2599 2612 *
2600 2613 * IN: vp - vnode of file to be modified.
2601 2614 * vap - new attribute values.
2602 2615 * If AT_XVATTR set, then optional attrs are being set
2603 2616 * flags - ATTR_UTIME set if non-default time values provided.
2604 2617 * - ATTR_NOACLCHECK (CIFS context only).
2605 2618 * cr - credentials of caller.
2606 2619 * ct - caller context
2607 2620 *
2608 2621 * RETURN: 0 on success, error code on failure.
2609 2622 *
2610 2623 * Timestamps:
2611 2624 * vp - ctime updated, mtime updated if size changed.
2612 2625 */
2613 2626 /* ARGSUSED */
2614 2627 static int
2615 2628 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2616 2629 caller_context_t *ct)
2617 2630 {
2618 2631 znode_t *zp = VTOZ(vp);
2619 2632 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2620 2633 zilog_t *zilog;
2621 2634 dmu_tx_t *tx;
2622 2635 vattr_t oldva;
2623 2636 xvattr_t tmpxvattr;
2624 2637 uint_t mask = vap->va_mask;
2625 2638 uint_t saved_mask = 0;
2626 2639 int trim_mask = 0;
2627 2640 uint64_t new_mode;
2628 2641 uint64_t new_uid, new_gid;
2629 2642 uint64_t xattr_obj;
2630 2643 uint64_t mtime[2], ctime[2];
2631 2644 znode_t *attrzp;
2632 2645 int need_policy = FALSE;
2633 2646 int err, err2;
2634 2647 zfs_fuid_info_t *fuidp = NULL;
2635 2648 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
2636 2649 xoptattr_t *xoap;
2637 2650 zfs_acl_t *aclp;
2638 2651 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2639 2652 boolean_t fuid_dirtied = B_FALSE;
2640 2653 sa_bulk_attr_t bulk[7], xattr_bulk[7];
2641 2654 int count = 0, xattr_count = 0;
2642 2655
2643 2656 if (mask == 0)
2644 2657 return (0);
2645 2658
2646 2659 if (mask & AT_NOSET)
2647 2660 return (SET_ERROR(EINVAL));
2648 2661
2649 2662 ZFS_ENTER(zfsvfs);
2650 2663 ZFS_VERIFY_ZP(zp);
2651 2664
2652 2665 zilog = zfsvfs->z_log;
2653 2666
2654 2667 /*
2655 2668 * Make sure that if an ephemeral uid/gid or an xvattr is specified,
2656 2669 * the file system is at the proper version level.
2657 2670 */
2658 2671
2659 2672 if (zfsvfs->z_use_fuids == B_FALSE &&
2660 2673 (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2661 2674 ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2662 2675 (mask & AT_XVATTR))) {
2663 2676 ZFS_EXIT(zfsvfs);
2664 2677 return (SET_ERROR(EINVAL));
2665 2678 }
2666 2679
2667 2680 if (mask & AT_SIZE && vp->v_type == VDIR) {
2668 2681 ZFS_EXIT(zfsvfs);
2669 2682 return (SET_ERROR(EISDIR));
2670 2683 }
2671 2684
2672 2685 if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2673 2686 ZFS_EXIT(zfsvfs);
2674 2687 return (SET_ERROR(EINVAL));
2675 2688 }
2676 2689
2677 2690 /*
2678 2691 * If this is an xvattr_t, then get a pointer to the structure of
2679 2692 * optional attributes. If this is NULL, then we have a vattr_t.
2680 2693 */
2681 2694 xoap = xva_getxoptattr(xvap);
2682 2695
2683 2696 xva_init(&tmpxvattr);
2684 2697
2685 2698 /*
2686 2699 * On immutable files, only the immutable bit and atime may be altered.
2687 2700 */
2688 2701 if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2689 2702 ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2690 2703 ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2691 2704 ZFS_EXIT(zfsvfs);
2692 2705 return (SET_ERROR(EPERM));
2693 2706 }
2694 2707
2695 2708 if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
2696 2709 ZFS_EXIT(zfsvfs);
2697 2710 return (SET_ERROR(EPERM));
2698 2711 }
2699 2712
2700 2713 /*
2701 2714 * Verify that the timestamps don't overflow 32 bits.
2702 2715 * ZFS can handle large timestamps, but 32-bit syscalls can't
2703 2716 * handle times greater than 2039. This check should be removed
2704 2717 * once large timestamps are fully supported.
2705 2718 */
2706 2719 if (mask & (AT_ATIME | AT_MTIME)) {
2707 2720 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2708 2721 ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2709 2722 ZFS_EXIT(zfsvfs);
2710 2723 return (SET_ERROR(EOVERFLOW));
2711 2724 }
2712 2725 }
2713 2726
2714 2727 top:
2715 2728 attrzp = NULL;
2716 2729 aclp = NULL;
2717 2730
2718 2731 /* Can this be moved to before the top label? */
2719 2732 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2720 2733 ZFS_EXIT(zfsvfs);
2721 2734 return (SET_ERROR(EROFS));
2722 2735 }
2723 2736
2724 2737 /*
2725 2738 * First validate permissions
2726 2739 */
2727 2740
2728 2741 if (mask & AT_SIZE) {
2729 2742 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
2730 2743 if (err) {
2731 2744 ZFS_EXIT(zfsvfs);
2732 2745 return (err);
2733 2746 }
2734 2747 /*
2735 2748 * XXX - Note, we are not providing any open
2736 2749 * mode flags here (like FNDELAY), so we may
2737 2750 * block if there are locks present... this
2738 2751 * should be addressed in openat().
2739 2752 */
2740 2753 /* XXX - would it be OK to generate a log record here? */
2741 2754 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2742 2755 if (err) {
2743 2756 ZFS_EXIT(zfsvfs);
2744 2757 return (err);
2745 2758 }
2746 2759 }
2747 2760
2748 2761 if (mask & (AT_ATIME|AT_MTIME) ||
2749 2762 ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2750 2763 XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2751 2764 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2752 2765 XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2753 2766 XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2754 2767 XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2755 2768 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2756 2769 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2757 2770 skipaclchk, cr);
2758 2771 }
2759 2772
2760 2773 if (mask & (AT_UID|AT_GID)) {
2761 2774 int idmask = (mask & (AT_UID|AT_GID));
2762 2775 int take_owner;
2763 2776 int take_group;
2764 2777
2765 2778 /*
2766 2779 * NOTE: even if a new mode is being set,
2767 2780 * we may clear S_ISUID/S_ISGID bits.
2768 2781 */
2769 2782
2770 2783 if (!(mask & AT_MODE))
2771 2784 vap->va_mode = zp->z_mode;
2772 2785
2773 2786 /*
2774 2787 * Take ownership or chgrp to group we are a member of
2775 2788 */
2776 2789
2777 2790 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
2778 2791 take_group = (mask & AT_GID) &&
2779 2792 zfs_groupmember(zfsvfs, vap->va_gid, cr);
2780 2793
2781 2794 /*
2782 2795 * If both AT_UID and AT_GID are set then take_owner and
2783 2796 * take_group must both be set in order to allow taking
2784 2797 * ownership.
2785 2798 *
2786 2799 * Otherwise, send the check through secpolicy_vnode_setattr()
2787 2800 *
2788 2801 */
2789 2802
2790 2803 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
2791 2804 ((idmask == AT_UID) && take_owner) ||
2792 2805 ((idmask == AT_GID) && take_group)) {
2793 2806 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2794 2807 skipaclchk, cr) == 0) {
2795 2808 /*
2796 2809 * Remove setuid/setgid for non-privileged users
2797 2810 */
2798 2811 secpolicy_setid_clear(vap, cr);
2799 2812 trim_mask = (mask & (AT_UID|AT_GID));
2800 2813 } else {
2801 2814 need_policy = TRUE;
2802 2815 }
2803 2816 } else {
2804 2817 need_policy = TRUE;
2805 2818 }
2806 2819 }
2807 2820
2808 2821 mutex_enter(&zp->z_lock);
2809 2822 oldva.va_mode = zp->z_mode;
2810 2823 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2811 2824 if (mask & AT_XVATTR) {
2812 2825 /*
2813 2826 * Update xvattr mask to include only those attributes
2814 2827 * that are actually changing.
2815 2828 *
2816 2829 * The bits will be restored prior to actually setting
2817 2830 * the attributes, so the caller thinks they were set.
2818 2831 */
2819 2832 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2820 2833 if (xoap->xoa_appendonly !=
2821 2834 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
2822 2835 need_policy = TRUE;
2823 2836 } else {
2824 2837 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
2825 2838 XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
2826 2839 }
2827 2840 }
2828 2841
2829 2842 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2830 2843 if (xoap->xoa_nounlink !=
2831 2844 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
2832 2845 need_policy = TRUE;
2833 2846 } else {
2834 2847 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
2835 2848 XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
2836 2849 }
2837 2850 }
2838 2851
2839 2852 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2840 2853 if (xoap->xoa_immutable !=
2841 2854 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
2842 2855 need_policy = TRUE;
2843 2856 } else {
2844 2857 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
2845 2858 XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
2846 2859 }
2847 2860 }
2848 2861
2849 2862 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2850 2863 if (xoap->xoa_nodump !=
2851 2864 ((zp->z_pflags & ZFS_NODUMP) != 0)) {
2852 2865 need_policy = TRUE;
2853 2866 } else {
2854 2867 XVA_CLR_REQ(xvap, XAT_NODUMP);
2855 2868 XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
2856 2869 }
2857 2870 }
2858 2871
2859 2872 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2860 2873 if (xoap->xoa_av_modified !=
2861 2874 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
2862 2875 need_policy = TRUE;
2863 2876 } else {
2864 2877 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
2865 2878 XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
2866 2879 }
2867 2880 }
2868 2881
2869 2882 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2870 2883 if ((vp->v_type != VREG &&
2871 2884 xoap->xoa_av_quarantined) ||
2872 2885 xoap->xoa_av_quarantined !=
2873 2886 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
2874 2887 need_policy = TRUE;
2875 2888 } else {
2876 2889 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
2877 2890 XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
2878 2891 }
2879 2892 }
2880 2893
2881 2894 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2882 2895 mutex_exit(&zp->z_lock);
2883 2896 ZFS_EXIT(zfsvfs);
2884 2897 return (SET_ERROR(EPERM));
2885 2898 }
2886 2899
2887 2900 if (need_policy == FALSE &&
2888 2901 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
2889 2902 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2890 2903 need_policy = TRUE;
2891 2904 }
2892 2905 }
2893 2906
2894 2907 mutex_exit(&zp->z_lock);
2895 2908
2896 2909 if (mask & AT_MODE) {
2897 2910 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
2898 2911 err = secpolicy_setid_setsticky_clear(vp, vap,
2899 2912 &oldva, cr);
2900 2913 if (err) {
2901 2914 ZFS_EXIT(zfsvfs);
2902 2915 return (err);
2903 2916 }
2904 2917 trim_mask |= AT_MODE;
2905 2918 } else {
2906 2919 need_policy = TRUE;
2907 2920 }
2908 2921 }
2909 2922
2910 2923 if (need_policy) {
2911 2924 /*
2912 2925 * If trim_mask is set then take-ownership
2913 2926 * has been granted, or write_acl is present and the user
2914 2927 * has the ability to modify the mode. In that case remove
2915 2928 * UID|GID and/or MODE from the mask so that
2916 2929 * secpolicy_vnode_setattr() doesn't revoke it.
2917 2930 */
2918 2931
2919 2932 if (trim_mask) {
2920 2933 saved_mask = vap->va_mask;
2921 2934 vap->va_mask &= ~trim_mask;
2922 2935 }
2923 2936 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
2924 2937 (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
2925 2938 if (err) {
2926 2939 ZFS_EXIT(zfsvfs);
2927 2940 return (err);
2928 2941 }
2929 2942
2930 2943 if (trim_mask)
2931 2944 vap->va_mask |= saved_mask;
2932 2945 }
2933 2946
2934 2947 /*
2935 2948 * secpolicy_vnode_setattr() or take-ownership may have
2936 2949 * changed va_mask.
2937 2950 */
2938 2951 mask = vap->va_mask;
2939 2952
2940 2953 if ((mask & (AT_UID | AT_GID))) {
2941 2954 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
2942 2955 &xattr_obj, sizeof (xattr_obj));
2943 2956
2944 2957 if (err == 0 && xattr_obj) {
2945 2958 err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
2946 2959 if (err)
2947 2960 goto out2;
2948 2961 }
2949 2962 if (mask & AT_UID) {
2950 2963 new_uid = zfs_fuid_create(zfsvfs,
2951 2964 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
2952 2965 if (new_uid != zp->z_uid &&
2953 2966 zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
2954 2967 if (attrzp)
2955 2968 VN_RELE(ZTOV(attrzp));
2956 2969 err = SET_ERROR(EDQUOT);
2957 2970 goto out2;
2958 2971 }
2959 2972 }
2960 2973
2961 2974 if (mask & AT_GID) {
2962 2975 new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
2963 2976 cr, ZFS_GROUP, &fuidp);
2964 2977 if (new_gid != zp->z_gid &&
2965 2978 zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
2966 2979 if (attrzp)
2967 2980 VN_RELE(ZTOV(attrzp));
2968 2981 err = SET_ERROR(EDQUOT);
2969 2982 goto out2;
2970 2983 }
2971 2984 }
2972 2985 }
2973 2986 tx = dmu_tx_create(zfsvfs->z_os);
2974 2987
2975 2988 if (mask & AT_MODE) {
2976 2989 uint64_t pmode = zp->z_mode;
2977 2990 uint64_t acl_obj;
2978 2991 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
2979 2992
2980 2993 if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
2981 2994 !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
2982 2995 err = SET_ERROR(EPERM);
2983 2996 goto out;
2984 2997 }
2985 2998
2986 2999 if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
2987 3000 goto out;
2988 3001
2989 3002 mutex_enter(&zp->z_lock);
2990 3003 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
2991 3004 /*
2992 3005 * Are we upgrading ACL from old V0 format
2993 3006 * to V1 format?
2994 3007 */
2995 3008 if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
2996 3009 zfs_znode_acl_version(zp) ==
2997 3010 ZFS_ACL_VERSION_INITIAL) {
2998 3011 dmu_tx_hold_free(tx, acl_obj, 0,
2999 3012 DMU_OBJECT_END);
3000 3013 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3001 3014 0, aclp->z_acl_bytes);
3002 3015 } else {
3003 3016 dmu_tx_hold_write(tx, acl_obj, 0,
3004 3017 aclp->z_acl_bytes);
3005 3018 }
3006 3019 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3007 3020 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3008 3021 0, aclp->z_acl_bytes);
3009 3022 }
3010 3023 mutex_exit(&zp->z_lock);
3011 3024 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3012 3025 } else {
3013 3026 if ((mask & AT_XVATTR) &&
3014 3027 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3015 3028 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3016 3029 else
3017 3030 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3018 3031 }
3019 3032
3020 3033 if (attrzp) {
3021 3034 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3022 3035 }
3023 3036
3024 3037 fuid_dirtied = zfsvfs->z_fuid_dirty;
3025 3038 if (fuid_dirtied)
3026 3039 zfs_fuid_txhold(zfsvfs, tx);
3027 3040
3028 3041 zfs_sa_upgrade_txholds(tx, zp);
3029 3042
3030 3043 err = dmu_tx_assign(tx, TXG_NOWAIT);
3031 3044 if (err) {
3032 3045 if (err == ERESTART)
3033 3046 dmu_tx_wait(tx);
3034 3047 goto out;
3035 3048 }
3036 3049
3037 3050 count = 0;
3038 3051 /*
3039 3052 * Set each attribute requested.
3040 3053 * We group settings according to the locks they need to acquire.
3041 3054 *
3042 3055 * Note: you cannot set ctime directly, although it will be
3043 3056 * updated as a side-effect of calling this function.
3044 3057 */
3045 3058
3046 3059
3047 3060 if (mask & (AT_UID|AT_GID|AT_MODE))
3048 3061 mutex_enter(&zp->z_acl_lock);
3049 3062 mutex_enter(&zp->z_lock);
3050 3063
3051 3064 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3052 3065 &zp->z_pflags, sizeof (zp->z_pflags));
3053 3066
3054 3067 if (attrzp) {
3055 3068 if (mask & (AT_UID|AT_GID|AT_MODE))
3056 3069 mutex_enter(&attrzp->z_acl_lock);
3057 3070 mutex_enter(&attrzp->z_lock);
3058 3071 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3059 3072 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3060 3073 sizeof (attrzp->z_pflags));
3061 3074 }
3062 3075
3063 3076 if (mask & (AT_UID|AT_GID)) {
3064 3077
3065 3078 if (mask & AT_UID) {
3066 3079 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3067 3080 &new_uid, sizeof (new_uid));
3068 3081 zp->z_uid = new_uid;
3069 3082 if (attrzp) {
3070 3083 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3071 3084 SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3072 3085 sizeof (new_uid));
3073 3086 attrzp->z_uid = new_uid;
3074 3087 }
3075 3088 }
3076 3089
3077 3090 if (mask & AT_GID) {
3078 3091 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3079 3092 NULL, &new_gid, sizeof (new_gid));
3080 3093 zp->z_gid = new_gid;
3081 3094 if (attrzp) {
3082 3095 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3083 3096 SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3084 3097 sizeof (new_gid));
3085 3098 attrzp->z_gid = new_gid;
3086 3099 }
3087 3100 }
3088 3101 if (!(mask & AT_MODE)) {
3089 3102 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3090 3103 NULL, &new_mode, sizeof (new_mode));
3091 3104 new_mode = zp->z_mode;
3092 3105 }
3093 3106 err = zfs_acl_chown_setattr(zp);
3094 3107 ASSERT(err == 0);
3095 3108 if (attrzp) {
3096 3109 err = zfs_acl_chown_setattr(attrzp);
3097 3110 ASSERT(err == 0);
3098 3111 }
3099 3112 }
3100 3113
3101 3114 if (mask & AT_MODE) {
3102 3115 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3103 3116 &new_mode, sizeof (new_mode));
3104 3117 zp->z_mode = new_mode;
3105 3118 ASSERT3U((uintptr_t)aclp, !=, NULL);
3106 3119 err = zfs_aclset_common(zp, aclp, cr, tx);
3107 3120 ASSERT0(err);
3108 3121 if (zp->z_acl_cached)
3109 3122 zfs_acl_free(zp->z_acl_cached);
3110 3123 zp->z_acl_cached = aclp;
3111 3124 aclp = NULL;
3112 3125 }
3113 3126
3114 3127
3115 3128 if (mask & AT_ATIME) {
3116 3129 ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3117 3130 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3118 3131 &zp->z_atime, sizeof (zp->z_atime));
3119 3132 }
3120 3133
3121 3134 if (mask & AT_MTIME) {
3122 3135 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3123 3136 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3124 3137 mtime, sizeof (mtime));
3125 3138 }
3126 3139
3127 3140 /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3128 3141 if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3129 3142 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3130 3143 NULL, mtime, sizeof (mtime));
3131 3144 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3132 3145 &ctime, sizeof (ctime));
3133 3146 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3134 3147 B_TRUE);
3135 3148 } else if (mask != 0) {
3136 3149 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3137 3150 &ctime, sizeof (ctime));
3138 3151 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3139 3152 B_TRUE);
3140 3153 if (attrzp) {
3141 3154 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3142 3155 SA_ZPL_CTIME(zfsvfs), NULL,
3143 3156 &ctime, sizeof (ctime));
3144 3157 zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3145 3158 mtime, ctime, B_TRUE);
3146 3159 }
3147 3160 }
3148 3161 /*
3149 3162 * Do this after setting timestamps to prevent timestamp
3150 3163 * update from toggling bit
3151 3164 */
3152 3165
3153 3166 if (xoap && (mask & AT_XVATTR)) {
3154 3167
3155 3168 /*
3156 3169 * Restore the trimmed-off masks
3157 3170 * so that return masks can be set for the caller.
3158 3171 */
3159 3172
3160 3173 if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3161 3174 XVA_SET_REQ(xvap, XAT_APPENDONLY);
3162 3175 }
3163 3176 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3164 3177 XVA_SET_REQ(xvap, XAT_NOUNLINK);
3165 3178 }
3166 3179 if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3167 3180 XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3168 3181 }
3169 3182 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3170 3183 XVA_SET_REQ(xvap, XAT_NODUMP);
3171 3184 }
3172 3185 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3173 3186 XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3174 3187 }
3175 3188 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3176 3189 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3177 3190 }
3178 3191
3179 3192 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3180 3193 ASSERT(vp->v_type == VREG);
3181 3194
3182 3195 zfs_xvattr_set(zp, xvap, tx);
3183 3196 }
3184 3197
3185 3198 if (fuid_dirtied)
3186 3199 zfs_fuid_sync(zfsvfs, tx);
3187 3200
3188 3201 if (mask != 0)
3189 3202 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3190 3203
3191 3204 mutex_exit(&zp->z_lock);
3192 3205 if (mask & (AT_UID|AT_GID|AT_MODE))
3193 3206 mutex_exit(&zp->z_acl_lock);
3194 3207
3195 3208 if (attrzp) {
3196 3209 if (mask & (AT_UID|AT_GID|AT_MODE))
3197 3210 mutex_exit(&attrzp->z_acl_lock);
3198 3211 mutex_exit(&attrzp->z_lock);
3199 3212 }
3200 3213 out:
3201 3214 if (err == 0 && attrzp) {
3202 3215 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3203 3216 xattr_count, tx);
3204 3217 ASSERT(err2 == 0);
3205 3218 }
3206 3219
3207 3220 if (attrzp)
3208 3221 VN_RELE(ZTOV(attrzp));
3209 3222
3210 3223 if (aclp)
3211 3224 zfs_acl_free(aclp);
3212 3225
3213 3226 if (fuidp) {
3214 3227 zfs_fuid_info_free(fuidp);
3215 3228 fuidp = NULL;
3216 3229 }
3217 3230
3218 3231 if (err) {
3219 3232 dmu_tx_abort(tx);
3220 3233 if (err == ERESTART)
3221 3234 goto top;
3222 3235 } else {
3223 3236 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3224 3237 dmu_tx_commit(tx);
3225 3238 }
3226 3239
3227 3240 out2:
3228 3241 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3229 3242 zil_commit(zilog, 0);
3230 3243
3231 3244 ZFS_EXIT(zfsvfs);
3232 3245 return (err);
3233 3246 }
3234 3247
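zfs_setattr() keeps plain TXG_NOWAIT in this webrev: its ERESTART handling is split between the assignment and the out: label, which aborts the tx and jumps back to top: without recording the wait, so a retried setattr can be throttled afresh. Condensed from the code above:

	err = dmu_tx_assign(tx, TXG_NOWAIT);
	if (err) {
		if (err == ERESTART)
			dmu_tx_wait(tx);
		goto out;	/* out: runs dmu_tx_abort(tx), then goto top */
	}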
3235 3248 typedef struct zfs_zlock {
3236 3249 krwlock_t *zl_rwlock; /* lock we acquired */
3237 3250 znode_t *zl_znode; /* znode we held */
3238 3251 struct zfs_zlock *zl_next; /* next in list */
3239 3252 } zfs_zlock_t;
3240 3253
3241 3254 /*
3242 3255 * Drop locks and release vnodes that were held by zfs_rename_lock().
3243 3256 */
3244 3257 static void
3245 3258 zfs_rename_unlock(zfs_zlock_t **zlpp)
3246 3259 {
3247 3260 zfs_zlock_t *zl;
3248 3261
3249 3262 while ((zl = *zlpp) != NULL) {
3250 3263 if (zl->zl_znode != NULL)
3251 3264 VN_RELE(ZTOV(zl->zl_znode));
3252 3265 rw_exit(zl->zl_rwlock);
3253 3266 *zlpp = zl->zl_next;
3254 3267 kmem_free(zl, sizeof (*zl));
3255 3268 }
3256 3269 }
3257 3270
3258 3271 /*
3259 3272 * Search back through the directory tree, using the ".." entries.
3260 3273 * Lock each directory in the chain to prevent concurrent renames.
3261 3274 * Fail any attempt to move a directory into one of its own descendants.
3262 3275 * XXX - z_parent_lock can overlap with map or grow locks
3263 3276 */
3264 3277 static int
3265 3278 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
3266 3279 {
3267 3280 zfs_zlock_t *zl;
3268 3281 znode_t *zp = tdzp;
3269 3282 uint64_t rootid = zp->z_zfsvfs->z_root;
3270 3283 uint64_t oidp = zp->z_id;
3271 3284 krwlock_t *rwlp = &szp->z_parent_lock;
3272 3285 krw_t rw = RW_WRITER;
3273 3286
3274 3287 /*
3275 3288 * First pass write-locks szp and compares to zp->z_id.
3276 3289 * Later passes read-lock zp and compare to zp->z_parent.
3277 3290 */
3278 3291 do {
3279 3292 if (!rw_tryenter(rwlp, rw)) {
3280 3293 /*
3281 3294 * Another thread is renaming in this path.
3282 3295 * Note that if we are a WRITER, we don't have any
3283 3296 * parent_locks held yet.
3284 3297 */
3285 3298 if (rw == RW_READER && zp->z_id > szp->z_id) {
3286 3299 /*
3287 3300 * Drop our locks and restart
3288 3301 */
3289 3302 zfs_rename_unlock(&zl);
3290 3303 *zlpp = NULL;
3291 3304 zp = tdzp;
3292 3305 oidp = zp->z_id;
3293 3306 rwlp = &szp->z_parent_lock;
3294 3307 rw = RW_WRITER;
3295 3308 continue;
3296 3309 } else {
3297 3310 /*
3298 3311 * Wait for other thread to drop its locks
3299 3312 */
3300 3313 rw_enter(rwlp, rw);
3301 3314 }
3302 3315 }
3303 3316
3304 3317 zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
3305 3318 zl->zl_rwlock = rwlp;
3306 3319 zl->zl_znode = NULL;
3307 3320 zl->zl_next = *zlpp;
3308 3321 *zlpp = zl;
3309 3322
3310 3323 if (oidp == szp->z_id) /* We're a descendant of szp */
3311 3324 return (SET_ERROR(EINVAL));
3312 3325
3313 3326 if (oidp == rootid) /* We've hit the top */
3314 3327 return (0);
3315 3328
3316 3329 if (rw == RW_READER) { /* i.e. not the first pass */
3317 3330 int error = zfs_zget(zp->z_zfsvfs, oidp, &zp);
3318 3331 if (error)
3319 3332 return (error);
3320 3333 zl->zl_znode = zp;
3321 3334 }
3322 3335 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
3323 3336 &oidp, sizeof (oidp));
3324 3337 rwlp = &zp->z_parent_lock;
3325 3338 rw = RW_READER;
3326 3339
3327 3340 } while (zp->z_id != sdzp->z_id);
3328 3341
3329 3342 return (0);
3330 3343 }
3331 3344
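A worked example of zfs_rename_lock()'s descendant check, using the /usr/a/b to /usr/a/b/c/d move that zfs_rename() cites below (hypothetical object ids):

	/*
	 * pass 1: oidp = d  -> not szp, not root; parent(d) = c
	 * pass 2: oidp = c  -> not szp, not root; parent(c) = b
	 * pass 3: oidp = b == szp->z_id           -> EINVAL
	 *
	 * Reaching z_root instead would make the rename legal; the chain
	 * of parent locks taken along the way holds off concurrent
	 * renames until zfs_rename_unlock() releases it.
	 */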
3332 3345 /*
3333 3346 * Move an entry from the provided source directory to the target
3334 3347 * directory. Change the entry name as indicated.
3335 3348 *
3336 3349 * IN: sdvp - Source directory containing the "old entry".
3337 3350 * snm - Old entry name.
3338 3351 * tdvp - Target directory to contain the "new entry".
3339 3352 * tnm - New entry name.
3340 3353 * cr - credentials of caller.
3341 3354 * ct - caller context
3342 3355 * flags - case flags
3343 3356 *
3344 3357 * RETURN: 0 on success, error code on failure.
3345 3358 *
3346 3359 * Timestamps:
3347 3360 * sdvp,tdvp - ctime|mtime updated
3348 3361 */
3349 3362 /*ARGSUSED*/
3350 3363 static int
3351 3364 zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
3352 3365 caller_context_t *ct, int flags)
3353 3366 {
3354 3367 znode_t *tdzp, *szp, *tzp;
3355 3368 znode_t *sdzp = VTOZ(sdvp);
3356 3369 zfsvfs_t *zfsvfs = sdzp->z_zfsvfs;
3357 3370 zilog_t *zilog;
3358 3371 vnode_t *realvp;
3359 3372 zfs_dirlock_t *sdl, *tdl;
3360 3373 dmu_tx_t *tx;
3361 3374 zfs_zlock_t *zl;
3362 3375 int cmp, serr, terr;
3363 3376 int error = 0;
3364 3377 int zflg = 0;
3378 + boolean_t waited = B_FALSE;
3365 3379
3366 3380 ZFS_ENTER(zfsvfs);
3367 3381 ZFS_VERIFY_ZP(sdzp);
3368 3382 zilog = zfsvfs->z_log;
3369 3383
3370 3384 /*
3371 3385 * Make sure we have the real vp for the target directory.
3372 3386 */
3373 3387 if (VOP_REALVP(tdvp, &realvp, ct) == 0)
3374 3388 tdvp = realvp;
3375 3389
3376 3390 tdzp = VTOZ(tdvp);
3377 3391 ZFS_VERIFY_ZP(tdzp);
3378 3392
3379 3393 /*
3380 3394 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
3381 3395 * ctldir appear to have the same v_vfsp.
3382 3396 */
3383 3397 if (tdzp->z_zfsvfs != zfsvfs || zfsctl_is_node(tdvp)) {
3384 3398 ZFS_EXIT(zfsvfs);
3385 3399 return (SET_ERROR(EXDEV));
3386 3400 }
3387 3401
3388 3402 if (zfsvfs->z_utf8 && u8_validate(tnm,
3389 3403 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3390 3404 ZFS_EXIT(zfsvfs);
3391 3405 return (SET_ERROR(EILSEQ));
3392 3406 }
3393 3407
3394 3408 if (flags & FIGNORECASE)
3395 3409 zflg |= ZCILOOK;
3396 3410
3397 3411 top:
3398 3412 szp = NULL;
3399 3413 tzp = NULL;
3400 3414 zl = NULL;
3401 3415
3402 3416 /*
3403 3417 * This is to prevent the creation of links into attribute space
3404 3418 * by renaming a linked file into/out of an attribute directory.
3405 3419 * See the comment in zfs_link() for why this is considered bad.
3406 3420 */
3407 3421 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
3408 3422 ZFS_EXIT(zfsvfs);
3409 3423 return (SET_ERROR(EINVAL));
3410 3424 }
3411 3425
3412 3426 /*
3413 3427 * Lock source and target directory entries. To prevent deadlock,
3414 3428 * a lock ordering must be defined. We lock the directory with
3415 3429 * the smallest object id first, or if it's a tie, the one with
3416 3430 * the lexically first name.
3417 3431 */
3418 3432 if (sdzp->z_id < tdzp->z_id) {
3419 3433 cmp = -1;
3420 3434 } else if (sdzp->z_id > tdzp->z_id) {
3421 3435 cmp = 1;
3422 3436 } else {
3423 3437 /*
3424 3438 * First compare the two name arguments without
3425 3439 * considering any case folding.
3426 3440 */
3427 3441 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
3428 3442
3429 3443 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
3430 3444 ASSERT(error == 0 || !zfsvfs->z_utf8);
3431 3445 if (cmp == 0) {
3432 3446 /*
3433 3447 * POSIX: "If the old argument and the new argument
3434 3448 * both refer to links to the same existing file,
3435 3449 * the rename() function shall return successfully
3436 3450 * and perform no other action."
3437 3451 */
3438 3452 ZFS_EXIT(zfsvfs);
3439 3453 return (0);
3440 3454 }
3441 3455 /*
3442 3456 * If the file system is case-folding, then we may
3443 3457 * have some more checking to do. A case-folding file
3444 3458 * system is either supporting mixed case sensitivity
3445 3459 * access or is completely case-insensitive. Note
3446 3460 * that the file system is always case preserving.
3447 3461 *
3448 3462 * In mixed sensitivity mode case sensitive behavior
3449 3463 * is the default. FIGNORECASE must be used to
3450 3464 * explicitly request case insensitive behavior.
3451 3465 *
3452 3466 * If the source and target names provided differ only
3453 3467 * by case (e.g., a request to rename 'tim' to 'Tim'),
3454 3468 * we will treat this as a special case in the
3455 3469 * case-insensitive mode: as long as the source name
3456 3470 * is an exact match, we will allow this to proceed as
3457 3471 * a name-change request.
3458 3472 */
3459 3473 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
3460 3474 (zfsvfs->z_case == ZFS_CASE_MIXED &&
3461 3475 flags & FIGNORECASE)) &&
3462 3476 u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
3463 3477 &error) == 0) {
3464 3478 /*
3465 3479 * case preserving rename request, require exact
3466 3480 * name matches
3467 3481 */
3468 3482 zflg |= ZCIEXACT;
3469 3483 zflg &= ~ZCILOOK;
3470 3484 }
3471 3485 }
3472 3486
3473 3487 /*
3474 3488 * If the source and destination directories are the same, we should
3475 3489 * grab the z_name_lock of that directory only once.
3476 3490 */
3477 3491 if (sdzp == tdzp) {
3478 3492 zflg |= ZHAVELOCK;
3479 3493 rw_enter(&sdzp->z_name_lock, RW_READER);
3480 3494 }
3481 3495
3482 3496 if (cmp < 0) {
3483 3497 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
3484 3498 ZEXISTS | zflg, NULL, NULL);
3485 3499 terr = zfs_dirent_lock(&tdl,
3486 3500 tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
3487 3501 } else {
3488 3502 terr = zfs_dirent_lock(&tdl,
3489 3503 tdzp, tnm, &tzp, zflg, NULL, NULL);
3490 3504 serr = zfs_dirent_lock(&sdl,
3491 3505 sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
3492 3506 NULL, NULL);
3493 3507 }
3494 3508
3495 3509 if (serr) {
3496 3510 /*
3497 3511 * Source entry invalid or not there.
3498 3512 */
3499 3513 if (!terr) {
3500 3514 zfs_dirent_unlock(tdl);
3501 3515 if (tzp)
3502 3516 VN_RELE(ZTOV(tzp));
3503 3517 }
3504 3518
3505 3519 if (sdzp == tdzp)
3506 3520 rw_exit(&sdzp->z_name_lock);
3507 3521
3508 3522 if (strcmp(snm, "..") == 0)
3509 3523 serr = SET_ERROR(EINVAL);
3510 3524 ZFS_EXIT(zfsvfs);
3511 3525 return (serr);
3512 3526 }
3513 3527 if (terr) {
3514 3528 zfs_dirent_unlock(sdl);
3515 3529 VN_RELE(ZTOV(szp));
3516 3530
3517 3531 if (sdzp == tdzp)
3518 3532 rw_exit(&sdzp->z_name_lock);
3519 3533
3520 3534 if (strcmp(tnm, "..") == 0)
3521 3535 terr = SET_ERROR(EINVAL);
3522 3536 ZFS_EXIT(zfsvfs);
3523 3537 return (terr);
3524 3538 }
3525 3539
3526 3540 /*
3527 3541 * Must have write access at the source to remove the old entry
3528 3542 * and write access at the target to create the new entry.
3529 3543 * Note that if target and source are the same, this can be
3530 3544 * done in a single check.
3531 3545 */
3532 3546
3533 3547 if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
3534 3548 goto out;
3535 3549
3536 3550 if (ZTOV(szp)->v_type == VDIR) {
3537 3551 /*
3538 3552 * Check to make sure rename is valid.
3539 3553 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3540 3554 */
3541 3555 if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
3542 3556 goto out;
3543 3557 }
3544 3558
3545 3559 /*
3546 3560 * Does target exist?
3547 3561 */
3548 3562 if (tzp) {
3549 3563 /*
3550 3564 * Source and target must be the same type.
3551 3565 */
3552 3566 if (ZTOV(szp)->v_type == VDIR) {
3553 3567 if (ZTOV(tzp)->v_type != VDIR) {
3554 3568 error = SET_ERROR(ENOTDIR);
3555 3569 goto out;
3556 3570 }
3557 3571 } else {
3558 3572 if (ZTOV(tzp)->v_type == VDIR) {
3559 3573 error = SET_ERROR(EISDIR);
3560 3574 goto out;
3561 3575 }
3562 3576 }
3563 3577 /*
3564 3578 * POSIX dictates that when the source and target
3565 3579 * entries refer to the same file object, rename
3566 3580 * must do nothing and exit without error.
3567 3581 */
3568 3582 if (szp->z_id == tzp->z_id) {
3569 3583 error = 0;
3570 3584 goto out;
3571 3585 }
3572 3586 }
3573 3587
3574 3588 vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
3575 3589 if (tzp)
3576 3590 vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
3577 3591
3578 3592 /*
3579 3593 * notify the target directory if it is not the same
3580 3594 * as source directory.
3581 3595 */
3582 3596 if (tdvp != sdvp) {
3583 3597 vnevent_rename_dest_dir(tdvp, ct);
3584 3598 }
3585 3599
3586 3600 tx = dmu_tx_create(zfsvfs->z_os);
3587 3601 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3588 3602 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3589 3603 dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3590 3604 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3591 3605 if (sdzp != tdzp) {
3592 3606 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3593 3607 zfs_sa_upgrade_txholds(tx, tdzp);
3594 3608 }
3595 3609 if (tzp) {
3596 3610 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3597 3611 zfs_sa_upgrade_txholds(tx, tzp);
3598 3612 }
3599 3613
3600 3614 zfs_sa_upgrade_txholds(tx, szp);
3601 3615 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3602 - error = dmu_tx_assign(tx, TXG_NOWAIT);
3616 + error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
3603 3617 if (error) {
3604 3618 if (zl != NULL)
3605 3619 zfs_rename_unlock(&zl);
3606 3620 zfs_dirent_unlock(sdl);
3607 3621 zfs_dirent_unlock(tdl);
3608 3622
3609 3623 if (sdzp == tdzp)
3610 3624 rw_exit(&sdzp->z_name_lock);
3611 3625
3612 3626 VN_RELE(ZTOV(szp));
3613 3627 if (tzp)
3614 3628 VN_RELE(ZTOV(tzp));
3615 3629 if (error == ERESTART) {
3630 + waited = B_TRUE;
3616 3631 dmu_tx_wait(tx);
3617 3632 dmu_tx_abort(tx);
3618 3633 goto top;
3619 3634 }
3620 3635 dmu_tx_abort(tx);
3621 3636 ZFS_EXIT(zfsvfs);
3622 3637 return (error);
3623 3638 }
3624 3639
3625 3640 if (tzp) /* Attempt to remove the existing target */
3626 3641 error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
3627 3642
3628 3643 if (error == 0) {
3629 3644 error = zfs_link_create(tdl, szp, tx, ZRENAMING);
3630 3645 if (error == 0) {
3631 3646 szp->z_pflags |= ZFS_AV_MODIFIED;
3632 3647
3633 3648 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3634 3649 (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3635 3650 ASSERT0(error);
3636 3651
3637 3652 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
3638 3653 if (error == 0) {
3639 3654 zfs_log_rename(zilog, tx, TX_RENAME |
3640 3655 (flags & FIGNORECASE ? TX_CI : 0), sdzp,
3641 3656 sdl->dl_name, tdzp, tdl->dl_name, szp);
3642 3657
3643 3658 /*
3644 3659 * Update path information for the target vnode
3645 3660 */
3646 3661 vn_renamepath(tdvp, ZTOV(szp), tnm,
3647 3662 strlen(tnm));
3648 3663 } else {
3649 3664 /*
3650 3665 * At this point, we have successfully created
3651 3666 * the target name, but have failed to remove
3652 3667 * the source name. Since the create was done
3653 3668 * with the ZRENAMING flag, there are
3654 3669 * complications; for one, the link count is
3655 3670 * wrong. The easiest way to deal with this
3656 3671 * is to remove the newly created target, and
3657 3672 * return the original error. This must
3658 3673 * succeed; fortunately, it is very unlikely to
3659 3674 * fail, since we just created it.
3660 3675 */
3661 3676 VERIFY3U(zfs_link_destroy(tdl, szp, tx,
3662 3677 ZRENAMING, NULL), ==, 0);
3663 3678 }
3664 3679 }
3665 3680 }
3666 3681
3667 3682 dmu_tx_commit(tx);
3668 3683 out:
3669 3684 if (zl != NULL)
3670 3685 zfs_rename_unlock(&zl);
3671 3686
3672 3687 zfs_dirent_unlock(sdl);
3673 3688 zfs_dirent_unlock(tdl);
3674 3689
3675 3690 if (sdzp == tdzp)
3676 3691 rw_exit(&sdzp->z_name_lock);
3677 3692
3678 3693
3679 3694 VN_RELE(ZTOV(szp));
3680 3695 if (tzp)
3681 3696 VN_RELE(ZTOV(tzp));
3682 3697
3683 3698 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3684 3699 zil_commit(zilog, 0);
3685 3700
3686 3701 ZFS_EXIT(zfsvfs);
3687 3702 return (error);
3688 3703 }
3689 3704
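The substantive change in this function (the same delta repeats in zfs_symlink() and zfs_link() below) is the dmu_tx_assign() call: on the ERESTART retry path the thread now records that it has already slept in dmu_tx_wait() and passes TXG_WAITED instead of TXG_NOWAIT, signaling the DMU that this caller has waited once already. Distilled to a minimal sketch, using only DMU calls that appear in this file (the helper name and its single SA hold are hypothetical):

/*
 * Minimal sketch of the tx-assign retry idiom introduced above; the
 * helper name and its single SA hold are hypothetical, but every DMU
 * call is one used in this file.
 */
static int
example_tx_retry(objset_t *os, znode_t *zp)
{
	dmu_tx_t *tx;
	boolean_t waited = B_FALSE;
	int error;

top:
	tx = dmu_tx_create(os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	/* TXG_WAITED tells the DMU this thread already waited once. */
	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
	if (error) {
		if (error == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);	/* sleep until a txg opens */
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		return (error);
	}
	/* ... modify objects under the assigned transaction ... */
	dmu_tx_commit(tx);
	return (0);
}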
3690 3705 /*
3691 3706 * Insert the indicated symbolic reference entry into the directory.
3692 3707 *
3693 3708 * IN: dvp - Directory to contain new symbolic link.
3694 3709 * link - Name for new symlink entry.
3695 3710 * vap - Attributes of new entry.
3696 3711 * cr - credentials of caller.
3697 3712 * ct - caller context
3698 3713 * flags - case flags
3699 3714 *
3700 3715 * RETURN: 0 on success, error code on failure.
3701 3716 *
3702 3717 * Timestamps:
3703 3718 * dvp - ctime|mtime updated
3704 3719 */
3705 3720 /*ARGSUSED*/
3706 3721 static int
3707 3722 zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr,
3708 3723 caller_context_t *ct, int flags)
3709 3724 {
3710 3725 znode_t *zp, *dzp = VTOZ(dvp);
3711 3726 zfs_dirlock_t *dl;
3712 3727 dmu_tx_t *tx;
3713 3728 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
3714 3729 zilog_t *zilog;
3715 3730 uint64_t len = strlen(link);
3716 3731 int error;
3717 3732 int zflg = ZNEW;
3718 3733 zfs_acl_ids_t acl_ids;
3719 3734 boolean_t fuid_dirtied;
3720 3735 uint64_t txtype = TX_SYMLINK;
3736 + boolean_t waited = B_FALSE;
3721 3737
3722 3738 ASSERT(vap->va_type == VLNK);
3723 3739
3724 3740 ZFS_ENTER(zfsvfs);
3725 3741 ZFS_VERIFY_ZP(dzp);
3726 3742 zilog = zfsvfs->z_log;
3727 3743
3728 3744 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3729 3745 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3730 3746 ZFS_EXIT(zfsvfs);
3731 3747 return (SET_ERROR(EILSEQ));
3732 3748 }
3733 3749 if (flags & FIGNORECASE)
3734 3750 zflg |= ZCILOOK;
3735 3751
3736 3752 if (len > MAXPATHLEN) {
3737 3753 ZFS_EXIT(zfsvfs);
3738 3754 return (SET_ERROR(ENAMETOOLONG));
3739 3755 }
3740 3756
3741 3757 if ((error = zfs_acl_ids_create(dzp, 0,
3742 3758 vap, cr, NULL, &acl_ids)) != 0) {
3743 3759 ZFS_EXIT(zfsvfs);
3744 3760 return (error);
3745 3761 }
3746 3762 top:
3747 3763 /*
3748 3764 * Attempt to lock directory; fail if entry already exists.
3749 3765 */
3750 3766 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
3751 3767 if (error) {
3752 3768 zfs_acl_ids_free(&acl_ids);
3753 3769 ZFS_EXIT(zfsvfs);
3754 3770 return (error);
3755 3771 }
3756 3772
3757 3773 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
3758 3774 zfs_acl_ids_free(&acl_ids);
3759 3775 zfs_dirent_unlock(dl);
3760 3776 ZFS_EXIT(zfsvfs);
3761 3777 return (error);
3762 3778 }
3763 3779
3764 3780 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
3765 3781 zfs_acl_ids_free(&acl_ids);
3766 3782 zfs_dirent_unlock(dl);
3767 3783 ZFS_EXIT(zfsvfs);
3768 3784 return (SET_ERROR(EDQUOT));
3769 3785 }
3770 3786 tx = dmu_tx_create(zfsvfs->z_os);
3771 3787 fuid_dirtied = zfsvfs->z_fuid_dirty;
3772 3788 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3773 3789 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3774 3790 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3775 3791 ZFS_SA_BASE_ATTR_SIZE + len);
3776 3792 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
3777 3793 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3778 3794 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3779 3795 acl_ids.z_aclp->z_acl_bytes);
3780 3796 }
3781 3797 if (fuid_dirtied)
3782 3798 zfs_fuid_txhold(zfsvfs, tx);
3783 - error = dmu_tx_assign(tx, TXG_NOWAIT);
3799 + error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
3784 3800 if (error) {
3785 3801 zfs_dirent_unlock(dl);
3786 3802 if (error == ERESTART) {
3803 + waited = B_TRUE;
3787 3804 dmu_tx_wait(tx);
3788 3805 dmu_tx_abort(tx);
3789 3806 goto top;
3790 3807 }
3791 3808 zfs_acl_ids_free(&acl_ids);
3792 3809 dmu_tx_abort(tx);
3793 3810 ZFS_EXIT(zfsvfs);
3794 3811 return (error);
3795 3812 }
3796 3813
3797 3814 /*
3798 3815 * Create a new object for the symlink.
3799 3816 	 * For version 4 ZPL datasets the symlink will be an SA attribute.
3800 3817 */
3801 3818 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
3802 3819
3803 3820 if (fuid_dirtied)
3804 3821 zfs_fuid_sync(zfsvfs, tx);
3805 3822
3806 3823 mutex_enter(&zp->z_lock);
3807 3824 if (zp->z_is_sa)
3808 3825 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
3809 3826 link, len, tx);
3810 3827 else
3811 3828 zfs_sa_symlink(zp, link, len, tx);
3812 3829 mutex_exit(&zp->z_lock);
3813 3830
3814 3831 zp->z_size = len;
3815 3832 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
3816 3833 &zp->z_size, sizeof (zp->z_size), tx);
3817 3834 /*
3818 3835 * Insert the new object into the directory.
3819 3836 */
3820 3837 (void) zfs_link_create(dl, zp, tx, ZNEW);
3821 3838
3822 3839 if (flags & FIGNORECASE)
3823 3840 txtype |= TX_CI;
3824 3841 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3825 3842
3826 3843 zfs_acl_ids_free(&acl_ids);
3827 3844
3828 3845 dmu_tx_commit(tx);
3829 3846
3830 3847 zfs_dirent_unlock(dl);
3831 3848
3832 3849 VN_RELE(ZTOV(zp));
3833 3850
3834 3851 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3835 3852 zil_commit(zilog, 0);
3836 3853
3837 3854 ZFS_EXIT(zfsvfs);
3838 3855 return (error);
3839 3856 }
3840 3857
3841 3858 /*
3842 3859 * Return, in the buffer contained in the provided uio structure,
3843 3860 * the symbolic path referred to by vp.
3844 3861 *
3845 3862 * IN: vp - vnode of symbolic link.
3846 3863 * uio - structure to contain the link path.
3847 3864 * cr - credentials of caller.
3848 3865 * ct - caller context
3849 3866 *
3850 3867 * OUT: uio - structure containing the link path.
3851 3868 *
3852 3869 * RETURN: 0 on success, error code on failure.
3853 3870 *
3854 3871 * Timestamps:
3855 3872 * vp - atime updated
3856 3873 */
3857 3874 /* ARGSUSED */
3858 3875 static int
3859 3876 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
3860 3877 {
3861 3878 znode_t *zp = VTOZ(vp);
3862 3879 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3863 3880 int error;
3864 3881
3865 3882 ZFS_ENTER(zfsvfs);
3866 3883 ZFS_VERIFY_ZP(zp);
3867 3884
3868 3885 mutex_enter(&zp->z_lock);
3869 3886 if (zp->z_is_sa)
3870 3887 error = sa_lookup_uio(zp->z_sa_hdl,
3871 3888 SA_ZPL_SYMLINK(zfsvfs), uio);
3872 3889 else
3873 3890 error = zfs_sa_readlink(zp, uio);
3874 3891 mutex_exit(&zp->z_lock);
3875 3892
3876 3893 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
3877 3894
3878 3895 ZFS_EXIT(zfsvfs);
3879 3896 return (error);
3880 3897 }
3881 3898
3882 3899 /*
3883 3900 * Insert a new entry into directory tdvp referencing svp.
3884 3901 *
3885 3902 * IN: tdvp - Directory to contain new entry.
3886 3903 * svp - vnode of new entry.
3887 3904 * name - name of new entry.
3888 3905 * cr - credentials of caller.
3889 3906 * ct - caller context
3890 3907 *
3891 3908 * RETURN: 0 on success, error code on failure.
3892 3909 *
3893 3910 * Timestamps:
3894 3911 * tdvp - ctime|mtime updated
3895 3912 * svp - ctime updated
3896 3913 */
3897 3914 /* ARGSUSED */
3898 3915 static int
3899 3916 zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
3900 3917 caller_context_t *ct, int flags)
3901 3918 {
3902 3919 znode_t *dzp = VTOZ(tdvp);
3903 3920 znode_t *tzp, *szp;
3904 3921 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
3905 3922 zilog_t *zilog;
3906 3923 zfs_dirlock_t *dl;
3907 3924 dmu_tx_t *tx;
3908 3925 vnode_t *realvp;
3909 3926 int error;
3910 3927 int zf = ZNEW;
3911 3928 uint64_t parent;
3912 3929 uid_t owner;
3930 + boolean_t waited = B_FALSE;
3913 3931
3914 3932 ASSERT(tdvp->v_type == VDIR);
3915 3933
3916 3934 ZFS_ENTER(zfsvfs);
3917 3935 ZFS_VERIFY_ZP(dzp);
3918 3936 zilog = zfsvfs->z_log;
3919 3937
3920 3938 if (VOP_REALVP(svp, &realvp, ct) == 0)
3921 3939 svp = realvp;
3922 3940
3923 3941 /*
3924 3942 * POSIX dictates that we return EPERM here.
3925 3943 * Better choices include ENOTSUP or EISDIR.
3926 3944 */
3927 3945 if (svp->v_type == VDIR) {
3928 3946 ZFS_EXIT(zfsvfs);
3929 3947 return (SET_ERROR(EPERM));
3930 3948 }
3931 3949
3932 3950 szp = VTOZ(svp);
3933 3951 ZFS_VERIFY_ZP(szp);
3934 3952
3935 3953 /*
3936 3954 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
3937 3955 * ctldir appear to have the same v_vfsp.
3938 3956 */
3939 3957 if (szp->z_zfsvfs != zfsvfs || zfsctl_is_node(svp)) {
3940 3958 ZFS_EXIT(zfsvfs);
3941 3959 return (SET_ERROR(EXDEV));
3942 3960 }
3943 3961
3944 3962 /* Prevent links to .zfs/shares files */
3945 3963
3946 3964 if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
3947 3965 &parent, sizeof (uint64_t))) != 0) {
3948 3966 ZFS_EXIT(zfsvfs);
3949 3967 return (error);
3950 3968 }
3951 3969 if (parent == zfsvfs->z_shares_dir) {
3952 3970 ZFS_EXIT(zfsvfs);
3953 3971 return (SET_ERROR(EPERM));
3954 3972 }
3955 3973
3956 3974 if (zfsvfs->z_utf8 && u8_validate(name,
3957 3975 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3958 3976 ZFS_EXIT(zfsvfs);
3959 3977 return (SET_ERROR(EILSEQ));
3960 3978 }
3961 3979 if (flags & FIGNORECASE)
3962 3980 zf |= ZCILOOK;
3963 3981
3964 3982 /*
3965 3983 * We do not support links between attributes and non-attributes
3966 3984 * because of the potential security risk of creating links
3967 3985 * into "normal" file space in order to circumvent restrictions
3968 3986 * imposed in attribute space.
3969 3987 */
3970 3988 if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
3971 3989 ZFS_EXIT(zfsvfs);
3972 3990 return (SET_ERROR(EINVAL));
3973 3991 }
3974 3992
3975 3993
3976 3994 owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
3977 3995 if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
3978 3996 ZFS_EXIT(zfsvfs);
3979 3997 return (SET_ERROR(EPERM));
3980 3998 }
3981 3999
3982 4000 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
3983 4001 ZFS_EXIT(zfsvfs);
3984 4002 return (error);
3985 4003 }
3986 4004
3987 4005 top:
3988 4006 /*
3989 4007 * Attempt to lock directory; fail if entry already exists.
3990 4008 */
3991 4009 error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
3992 4010 if (error) {
3993 4011 ZFS_EXIT(zfsvfs);
3994 4012 return (error);
3995 4013 }
3996 4014
3997 4015 tx = dmu_tx_create(zfsvfs->z_os);
3998 4016 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3999 4017 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4000 4018 zfs_sa_upgrade_txholds(tx, szp);
4001 4019 zfs_sa_upgrade_txholds(tx, dzp);
4002 - error = dmu_tx_assign(tx, TXG_NOWAIT);
4020 + error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
4003 4021 if (error) {
4004 4022 zfs_dirent_unlock(dl);
4005 4023 if (error == ERESTART) {
4024 + waited = B_TRUE;
4006 4025 dmu_tx_wait(tx);
4007 4026 dmu_tx_abort(tx);
4008 4027 goto top;
4009 4028 }
4010 4029 dmu_tx_abort(tx);
4011 4030 ZFS_EXIT(zfsvfs);
4012 4031 return (error);
4013 4032 }
4014 4033
4015 4034 error = zfs_link_create(dl, szp, tx, 0);
4016 4035
4017 4036 if (error == 0) {
4018 4037 uint64_t txtype = TX_LINK;
4019 4038 if (flags & FIGNORECASE)
4020 4039 txtype |= TX_CI;
4021 4040 zfs_log_link(zilog, tx, txtype, dzp, szp, name);
4022 4041 }
4023 4042
4024 4043 dmu_tx_commit(tx);
4025 4044
4026 4045 zfs_dirent_unlock(dl);
4027 4046
4028 4047 if (error == 0) {
4029 4048 vnevent_link(svp, ct);
4030 4049 }
4031 4050
4032 4051 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4033 4052 zil_commit(zilog, 0);
4034 4053
4035 4054 ZFS_EXIT(zfsvfs);
4036 4055 return (error);
4037 4056 }
4038 4057
4039 4058 /*
4040 4059 * zfs_null_putapage() is used when the file system has been force
4041 4060 * unmounted. It just drops the pages.
4042 4061 */
4043 4062 /* ARGSUSED */
4044 4063 static int
4045 4064 zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4046 4065 size_t *lenp, int flags, cred_t *cr)
4047 4066 {
4048 4067 pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
4049 4068 return (0);
4050 4069 }
4051 4070
4052 4071 /*
4053 4072 * Push a page out to disk, klustering if possible.
4054 4073 *
4055 4074 * IN: vp - file to push page to.
4056 4075 * pp - page to push.
4057 4076 * flags - additional flags.
4058 4077 * cr - credentials of caller.
4059 4078 *
4060 4079 * OUT: offp - start of range pushed.
4061 4080 * lenp - len of range pushed.
4062 4081 *
4063 4082 * RETURN: 0 on success, error code on failure.
4064 4083 *
4065 4084 * NOTE: callers must have locked the page to be pushed. On
4066 4085 * exit, the page (and all other pages in the kluster) must be
4067 4086 * unlocked.
4068 4087 */
4069 4088 /* ARGSUSED */
4070 4089 static int
4071 4090 zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4072 4091 size_t *lenp, int flags, cred_t *cr)
4073 4092 {
4074 4093 znode_t *zp = VTOZ(vp);
4075 4094 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4076 4095 dmu_tx_t *tx;
4077 4096 u_offset_t off, koff;
4078 4097 size_t len, klen;
4079 4098 int err;
4080 4099
4081 4100 off = pp->p_offset;
4082 4101 len = PAGESIZE;
4083 4102 /*
4084 4103 * If our blocksize is bigger than the page size, try to kluster
4085 4104 * multiple pages so that we write a full block (thus avoiding
4086 4105 * a read-modify-write).
4087 4106 */
4088 4107 if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
4089 4108 klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
4090 4109 koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
4091 4110 ASSERT(koff <= zp->z_size);
4092 4111 if (koff + klen > zp->z_size)
4093 4112 klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
4094 4113 pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
4095 4114 }
4096 4115 ASSERT3U(btop(len), ==, btopr(len));
4097 4116
4098 4117 /*
4099 4118 * Can't push pages past end-of-file.
4100 4119 */
4101 4120 if (off >= zp->z_size) {
4102 4121 /* ignore all pages */
4103 4122 err = 0;
4104 4123 goto out;
4105 4124 } else if (off + len > zp->z_size) {
4106 4125 int npages = btopr(zp->z_size - off);
4107 4126 page_t *trunc;
4108 4127
4109 4128 page_list_break(&pp, &trunc, npages);
4110 4129 /* ignore pages past end of file */
4111 4130 if (trunc)
4112 4131 pvn_write_done(trunc, flags);
4113 4132 len = zp->z_size - off;
4114 4133 }
4115 4134
4116 4135 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4117 4136 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4118 4137 err = SET_ERROR(EDQUOT);
4119 4138 goto out;
4120 4139 }
4121 4140 top:
4122 4141 tx = dmu_tx_create(zfsvfs->z_os);
4123 4142 dmu_tx_hold_write(tx, zp->z_id, off, len);
4124 4143
4125 4144 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4126 4145 zfs_sa_upgrade_txholds(tx, zp);
4127 4146 err = dmu_tx_assign(tx, TXG_NOWAIT);
4128 4147 if (err != 0) {
4129 4148 if (err == ERESTART) {
4130 4149 dmu_tx_wait(tx);
4131 4150 dmu_tx_abort(tx);
4132 4151 goto top;
4133 4152 }
4134 4153 dmu_tx_abort(tx);
4135 4154 goto out;
4136 4155 }
4137 4156
4138 4157 if (zp->z_blksz <= PAGESIZE) {
4139 4158 caddr_t va = zfs_map_page(pp, S_READ);
4140 4159 ASSERT3U(len, <=, PAGESIZE);
4141 4160 dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
4142 4161 zfs_unmap_page(pp, va);
4143 4162 } else {
4144 4163 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
4145 4164 }
4146 4165
4147 4166 if (err == 0) {
4148 4167 uint64_t mtime[2], ctime[2];
4149 4168 sa_bulk_attr_t bulk[3];
4150 4169 int count = 0;
4151 4170
4152 4171 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4153 4172 &mtime, 16);
4154 4173 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4155 4174 &ctime, 16);
4156 4175 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4157 4176 &zp->z_pflags, 8);
4158 4177 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
4159 4178 B_TRUE);
4160 4179 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
4161 4180 }
4162 4181 dmu_tx_commit(tx);
4163 4182
4164 4183 out:
4165 4184 pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
4166 4185 if (offp)
4167 4186 *offp = off;
4168 4187 if (lenp)
4169 4188 *lenp = len;
4170 4189
4171 4190 return (err);
4172 4191 }
4173 4192
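The kluster sizing in zfs_putapage() is pure power-of-two arithmetic: round the block size up to whole pages, align the faulting offset down to that kluster, and clip at end-of-file. A standalone userland sketch of the same arithmetic (the P2 macros are redefined locally with their illumos meanings; the sample values are arbitrary):

#include <stdio.h>
#include <stdint.h>

#define	PAGESIZE	4096ULL
/* Local redefinitions matching the illumos P2 macros (power-of-2 math). */
#define	P2ALIGN(x, a)	((x) & -(a))
#define	P2ROUNDUP(x, a)	(-(-(x) & -(a)))
#define	ISP2(x)		(((x) & ((x) - 1)) == 0)

int
main(void)
{
	uint64_t blksz = 131072;	/* 128K file block, sample value */
	uint64_t off = 0x25000;		/* faulting page offset, sample */
	uint64_t size = 0x30000;	/* sample file size (192K) */
	uint64_t klen, koff;

	klen = P2ROUNDUP(blksz, PAGESIZE);		/* 0x20000 */
	koff = ISP2(klen) ? P2ALIGN(off, klen) : 0;	/* 0x20000 */
	if (koff + klen > size)				/* clip at EOF */
		klen = P2ROUNDUP(size - koff, PAGESIZE);	/* 0x10000 */

	(void) printf("koff=0x%llx klen=0x%llx\n",
	    (unsigned long long)koff, (unsigned long long)klen);
	return (0);
}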
4174 4193 /*
4175 4194 * Copy the portion of the file indicated from pages into the file.
4176 4195 	 * The pages are stored in a page list attached to the file's vnode.
4177 4196 *
4178 4197 * IN: vp - vnode of file to push page data to.
4179 4198 * off - position in file to put data.
4180 4199 * len - amount of data to write.
4181 4200 * flags - flags to control the operation.
4182 4201 * cr - credentials of caller.
4183 4202 * ct - caller context.
4184 4203 *
4185 4204 * RETURN: 0 on success, error code on failure.
4186 4205 *
4187 4206 * Timestamps:
4188 4207 * vp - ctime|mtime updated
4189 4208 */
4190 4209 /*ARGSUSED*/
4191 4210 static int
4192 4211 zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4193 4212 caller_context_t *ct)
4194 4213 {
4195 4214 znode_t *zp = VTOZ(vp);
4196 4215 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4197 4216 page_t *pp;
4198 4217 size_t io_len;
4199 4218 u_offset_t io_off;
4200 4219 uint_t blksz;
4201 4220 rl_t *rl;
4202 4221 int error = 0;
4203 4222
4204 4223 ZFS_ENTER(zfsvfs);
4205 4224 ZFS_VERIFY_ZP(zp);
4206 4225
4207 4226 /*
4208 4227 * There's nothing to do if no data is cached.
4209 4228 */
4210 4229 if (!vn_has_cached_data(vp)) {
4211 4230 ZFS_EXIT(zfsvfs);
4212 4231 return (0);
4213 4232 }
4214 4233
4215 4234 /*
4216 4235 * Align this request to the file block size in case we kluster.
4217 4236 	 * XXX - this can result in pretty aggressive locking, which can
4218 4237 	 * impact simultaneous read/write access.  One option might be
4219 4238 * to break up long requests (len == 0) into block-by-block
4220 4239 * operations to get narrower locking.
4221 4240 */
4222 4241 blksz = zp->z_blksz;
4223 4242 if (ISP2(blksz))
4224 4243 io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
4225 4244 else
4226 4245 io_off = 0;
4227 4246 if (len > 0 && ISP2(blksz))
4228 4247 io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
4229 4248 else
4230 4249 io_len = 0;
4231 4250
4232 4251 if (io_len == 0) {
4233 4252 /*
4234 4253 * Search the entire vp list for pages >= io_off.
4235 4254 */
4236 4255 rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
4237 4256 error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
4238 4257 goto out;
4239 4258 }
4240 4259 rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
4241 4260
4242 4261 if (off > zp->z_size) {
4243 4262 /* past end of file */
4244 4263 zfs_range_unlock(rl);
4245 4264 ZFS_EXIT(zfsvfs);
4246 4265 return (0);
4247 4266 }
4248 4267
4249 4268 len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
4250 4269
4251 4270 for (off = io_off; io_off < off + len; io_off += io_len) {
4252 4271 if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
4253 4272 pp = page_lookup(vp, io_off,
4254 4273 (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
4255 4274 } else {
4256 4275 pp = page_lookup_nowait(vp, io_off,
4257 4276 (flags & B_FREE) ? SE_EXCL : SE_SHARED);
4258 4277 }
4259 4278
4260 4279 if (pp != NULL && pvn_getdirty(pp, flags)) {
4261 4280 int err;
4262 4281
4263 4282 /*
4264 4283 * Found a dirty page to push
4265 4284 */
4266 4285 err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
4267 4286 if (err)
4268 4287 error = err;
4269 4288 } else {
4270 4289 io_len = PAGESIZE;
4271 4290 }
4272 4291 }
4273 4292 out:
4274 4293 zfs_range_unlock(rl);
4275 4294 if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4276 4295 zil_commit(zfsvfs->z_log, zp->z_id);
4277 4296 ZFS_EXIT(zfsvfs);
4278 4297 return (error);
4279 4298 }
4280 4299
4281 4300 /*ARGSUSED*/
4282 4301 void
4283 4302 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4284 4303 {
4285 4304 znode_t *zp = VTOZ(vp);
4286 4305 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4287 4306 int error;
4288 4307
4289 4308 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4290 4309 if (zp->z_sa_hdl == NULL) {
4291 4310 /*
4292 4311 * The fs has been unmounted, or we did a
4293 4312 * suspend/resume and this file no longer exists.
4294 4313 */
4295 4314 if (vn_has_cached_data(vp)) {
4296 4315 (void) pvn_vplist_dirty(vp, 0, zfs_null_putapage,
4297 4316 B_INVAL, cr);
4298 4317 }
4299 4318
4300 4319 mutex_enter(&zp->z_lock);
4301 4320 mutex_enter(&vp->v_lock);
4302 4321 ASSERT(vp->v_count == 1);
4303 4322 vp->v_count = 0;
4304 4323 mutex_exit(&vp->v_lock);
4305 4324 mutex_exit(&zp->z_lock);
4306 4325 rw_exit(&zfsvfs->z_teardown_inactive_lock);
4307 4326 zfs_znode_free(zp);
4308 4327 return;
4309 4328 }
4310 4329
4311 4330 /*
4312 4331 * Attempt to push any data in the page cache. If this fails
4313 4332 * we will get kicked out later in zfs_zinactive().
4314 4333 */
4315 4334 if (vn_has_cached_data(vp)) {
4316 4335 (void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC,
4317 4336 cr);
4318 4337 }
4319 4338
4320 4339 if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4321 4340 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4322 4341
4323 4342 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4324 4343 zfs_sa_upgrade_txholds(tx, zp);
4325 4344 error = dmu_tx_assign(tx, TXG_WAIT);
4326 4345 if (error) {
4327 4346 dmu_tx_abort(tx);
4328 4347 } else {
4329 4348 mutex_enter(&zp->z_lock);
4330 4349 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4331 4350 (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4332 4351 zp->z_atime_dirty = 0;
4333 4352 mutex_exit(&zp->z_lock);
4334 4353 dmu_tx_commit(tx);
4335 4354 }
4336 4355 }
4337 4356
4338 4357 zfs_zinactive(zp);
4339 4358 rw_exit(&zfsvfs->z_teardown_inactive_lock);
4340 4359 }
4341 4360
4342 4361 /*
4343 4362 * Bounds-check the seek operation.
4344 4363 *
4345 4364 * IN: vp - vnode seeking within
4346 4365 * ooff - old file offset
4347 4366 * noffp - pointer to new file offset
4348 4367 * ct - caller context
4349 4368 *
4350 4369 * RETURN: 0 on success, EINVAL if new offset invalid.
4351 4370 */
4352 4371 /* ARGSUSED */
4353 4372 static int
4354 4373 zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
4355 4374 caller_context_t *ct)
4356 4375 {
4357 4376 if (vp->v_type == VDIR)
4358 4377 return (0);
4359 4378 return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
4360 4379 }
4361 4380
4362 4381 /*
4363 4382 * Pre-filter the generic locking function to trap attempts to place
4364 4383 * a mandatory lock on a memory mapped file.
4365 4384 */
4366 4385 static int
4367 4386 zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
4368 4387 flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
4369 4388 {
4370 4389 znode_t *zp = VTOZ(vp);
4371 4390 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4372 4391
4373 4392 ZFS_ENTER(zfsvfs);
4374 4393 ZFS_VERIFY_ZP(zp);
4375 4394
4376 4395 /*
4377 4396 * We are following the UFS semantics with respect to mapcnt
4378 4397 * here: If we see that the file is mapped already, then we will
4379 4398 * return an error, but we don't worry about races between this
4380 4399 * function and zfs_map().
4381 4400 */
4382 4401 if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
4383 4402 ZFS_EXIT(zfsvfs);
4384 4403 return (SET_ERROR(EAGAIN));
4385 4404 }
4386 4405 ZFS_EXIT(zfsvfs);
4387 4406 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
4388 4407 }
4389 4408
4390 4409 /*
4391 4410 * If we can't find a page in the cache, we will create a new page
4392 4411 * and fill it with file data. For efficiency, we may try to fill
4393 4412 * multiple pages at once (klustering) to fill up the supplied page
4394 4413 * list. Note that the pages to be filled are held with an exclusive
4395 4414 * lock to prevent access by other threads while they are being filled.
4396 4415 */
4397 4416 static int
4398 4417 zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
4399 4418 caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
4400 4419 {
4401 4420 znode_t *zp = VTOZ(vp);
4402 4421 page_t *pp, *cur_pp;
4403 4422 objset_t *os = zp->z_zfsvfs->z_os;
4404 4423 u_offset_t io_off, total;
4405 4424 size_t io_len;
4406 4425 int err;
4407 4426
4408 4427 if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
4409 4428 /*
4410 4429 		 * We only have a single page; don't bother klustering.
4411 4430 */
4412 4431 io_off = off;
4413 4432 io_len = PAGESIZE;
4414 4433 pp = page_create_va(vp, io_off, io_len,
4415 4434 PG_EXCL | PG_WAIT, seg, addr);
4416 4435 } else {
4417 4436 /*
4418 4437 * Try to find enough pages to fill the page list
4419 4438 */
4420 4439 pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
4421 4440 &io_len, off, plsz, 0);
4422 4441 }
4423 4442 if (pp == NULL) {
4424 4443 /*
4425 4444 * The page already exists, nothing to do here.
4426 4445 */
4427 4446 *pl = NULL;
4428 4447 return (0);
4429 4448 }
4430 4449
4431 4450 /*
4432 4451 * Fill the pages in the kluster.
4433 4452 */
4434 4453 cur_pp = pp;
4435 4454 for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
4436 4455 caddr_t va;
4437 4456
4438 4457 ASSERT3U(io_off, ==, cur_pp->p_offset);
4439 4458 va = zfs_map_page(cur_pp, S_WRITE);
4440 4459 err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
4441 4460 DMU_READ_PREFETCH);
4442 4461 zfs_unmap_page(cur_pp, va);
4443 4462 if (err) {
4444 4463 /* On error, toss the entire kluster */
4445 4464 pvn_read_done(pp, B_ERROR);
4446 4465 /* convert checksum errors into IO errors */
4447 4466 if (err == ECKSUM)
4448 4467 err = SET_ERROR(EIO);
4449 4468 return (err);
4450 4469 }
4451 4470 cur_pp = cur_pp->p_next;
4452 4471 }
4453 4472
4454 4473 /*
4455 4474 * Fill in the page list array from the kluster starting
4456 4475 * from the desired offset `off'.
4457 4476 * NOTE: the page list will always be null terminated.
4458 4477 */
4459 4478 pvn_plist_init(pp, pl, plsz, off, io_len, rw);
4460 4479 ASSERT(pl == NULL || (*pl)->p_offset == off);
4461 4480
4462 4481 return (0);
4463 4482 }
4464 4483
4465 4484 /*
4466 4485 * Return pointers to the pages for the file region [off, off + len]
4467 4486 * in the pl array. If plsz is greater than len, this function may
4468 4487 * also return page pointers from after the specified region
4469 4488 * (i.e. the region [off, off + plsz]). These additional pages are
4470 4489 * only returned if they are already in the cache, or were created as
4471 4490 * part of a klustered read.
4472 4491 *
4473 4492 * IN: vp - vnode of file to get data from.
4474 4493 * off - position in file to get data from.
4475 4494 * len - amount of data to retrieve.
4476 4495 * plsz - length of provided page list.
4477 4496 * seg - segment to obtain pages for.
4478 4497 * addr - virtual address of fault.
4479 4498 * rw - mode of created pages.
4480 4499 * cr - credentials of caller.
4481 4500 * ct - caller context.
4482 4501 *
4483 4502 * OUT: protp - protection mode of created pages.
4484 4503 * pl - list of pages created.
4485 4504 *
4486 4505 * RETURN: 0 on success, error code on failure.
4487 4506 *
4488 4507 * Timestamps:
4489 4508 * vp - atime updated
4490 4509 */
4491 4510 /* ARGSUSED */
4492 4511 static int
4493 4512 zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
4494 4513 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4495 4514 enum seg_rw rw, cred_t *cr, caller_context_t *ct)
4496 4515 {
4497 4516 znode_t *zp = VTOZ(vp);
4498 4517 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4499 4518 page_t **pl0 = pl;
4500 4519 int err = 0;
4501 4520
4502 4521 /* we do our own caching, faultahead is unnecessary */
4503 4522 if (pl == NULL)
4504 4523 return (0);
4505 4524 else if (len > plsz)
4506 4525 len = plsz;
4507 4526 else
4508 4527 len = P2ROUNDUP(len, PAGESIZE);
4509 4528 ASSERT(plsz >= len);
4510 4529
4511 4530 ZFS_ENTER(zfsvfs);
4512 4531 ZFS_VERIFY_ZP(zp);
4513 4532
4514 4533 if (protp)
4515 4534 *protp = PROT_ALL;
4516 4535
4517 4536 /*
4518 4537 * Loop through the requested range [off, off + len) looking
4519 4538 * for pages. If we don't find a page, we will need to create
4520 4539 * a new page and fill it with data from the file.
4521 4540 */
4522 4541 while (len > 0) {
4523 4542 if (*pl = page_lookup(vp, off, SE_SHARED))
4524 4543 *(pl+1) = NULL;
4525 4544 else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
4526 4545 goto out;
4527 4546 while (*pl) {
4528 4547 ASSERT3U((*pl)->p_offset, ==, off);
4529 4548 off += PAGESIZE;
4530 4549 addr += PAGESIZE;
4531 4550 if (len > 0) {
4532 4551 ASSERT3U(len, >=, PAGESIZE);
4533 4552 len -= PAGESIZE;
4534 4553 }
4535 4554 ASSERT3U(plsz, >=, PAGESIZE);
4536 4555 plsz -= PAGESIZE;
4537 4556 pl++;
4538 4557 }
4539 4558 }
4540 4559
4541 4560 /*
4542 4561 * Fill out the page array with any pages already in the cache.
4543 4562 */
4544 4563 while (plsz > 0 &&
4545 4564 (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
4546 4565 off += PAGESIZE;
4547 4566 plsz -= PAGESIZE;
4548 4567 }
4549 4568 out:
4550 4569 if (err) {
4551 4570 /*
4552 4571 * Release any pages we have previously locked.
4553 4572 */
4554 4573 while (pl > pl0)
4555 4574 page_unlock(*--pl);
4556 4575 } else {
4557 4576 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4558 4577 }
4559 4578
4560 4579 *pl = NULL;
4561 4580
4562 4581 ZFS_EXIT(zfsvfs);
4563 4582 return (err);
4564 4583 }
4565 4584
4566 4585 /*
4567 4586 * Request a memory map for a section of a file. This code interacts
4568 4587 * with common code and the VM system as follows:
4569 4588 *
4570 4589 * - common code calls mmap(), which ends up in smmap_common()
4571 4590 * - this calls VOP_MAP(), which takes you into (say) zfs
4572 4591 * - zfs_map() calls as_map(), passing segvn_create() as the callback
4573 4592 * - segvn_create() creates the new segment and calls VOP_ADDMAP()
4574 4593 * - zfs_addmap() updates z_mapcnt
4575 4594 */
4576 4595 /*ARGSUSED*/
4577 4596 static int
4578 4597 zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
4579 4598 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4580 4599 caller_context_t *ct)
4581 4600 {
4582 4601 znode_t *zp = VTOZ(vp);
4583 4602 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4584 4603 segvn_crargs_t vn_a;
4585 4604 int error;
4586 4605
4587 4606 ZFS_ENTER(zfsvfs);
4588 4607 ZFS_VERIFY_ZP(zp);
4589 4608
4590 4609 if ((prot & PROT_WRITE) && (zp->z_pflags &
4591 4610 (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
4592 4611 ZFS_EXIT(zfsvfs);
4593 4612 return (SET_ERROR(EPERM));
4594 4613 }
4595 4614
4596 4615 if ((prot & (PROT_READ | PROT_EXEC)) &&
4597 4616 (zp->z_pflags & ZFS_AV_QUARANTINED)) {
4598 4617 ZFS_EXIT(zfsvfs);
4599 4618 return (SET_ERROR(EACCES));
4600 4619 }
4601 4620
4602 4621 if (vp->v_flag & VNOMAP) {
4603 4622 ZFS_EXIT(zfsvfs);
4604 4623 return (SET_ERROR(ENOSYS));
4605 4624 }
4606 4625
4607 4626 if (off < 0 || len > MAXOFFSET_T - off) {
4608 4627 ZFS_EXIT(zfsvfs);
4609 4628 return (SET_ERROR(ENXIO));
4610 4629 }
4611 4630
4612 4631 if (vp->v_type != VREG) {
4613 4632 ZFS_EXIT(zfsvfs);
4614 4633 return (SET_ERROR(ENODEV));
4615 4634 }
4616 4635
4617 4636 /*
4618 4637 * If file is locked, disallow mapping.
4619 4638 */
4620 4639 if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
4621 4640 ZFS_EXIT(zfsvfs);
4622 4641 return (SET_ERROR(EAGAIN));
4623 4642 }
4624 4643
4625 4644 as_rangelock(as);
4626 4645 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
4627 4646 if (error != 0) {
4628 4647 as_rangeunlock(as);
4629 4648 ZFS_EXIT(zfsvfs);
4630 4649 return (error);
4631 4650 }
4632 4651
4633 4652 vn_a.vp = vp;
4634 4653 vn_a.offset = (u_offset_t)off;
4635 4654 vn_a.type = flags & MAP_TYPE;
4636 4655 vn_a.prot = prot;
4637 4656 vn_a.maxprot = maxprot;
4638 4657 vn_a.cred = cr;
4639 4658 vn_a.amp = NULL;
4640 4659 vn_a.flags = flags & ~MAP_TYPE;
4641 4660 vn_a.szc = 0;
4642 4661 vn_a.lgrp_mem_policy_flags = 0;
4643 4662
4644 4663 error = as_map(as, *addrp, len, segvn_create, &vn_a);
4645 4664
4646 4665 as_rangeunlock(as);
4647 4666 ZFS_EXIT(zfsvfs);
4648 4667 return (error);
4649 4668 }
4650 4669
4651 4670 /* ARGSUSED */
4652 4671 static int
4653 4672 zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4654 4673 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4655 4674 caller_context_t *ct)
4656 4675 {
4657 4676 uint64_t pages = btopr(len);
4658 4677
4659 4678 atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
4660 4679 return (0);
4661 4680 }
4662 4681
4663 4682 /*
4664 4683 * The reason we push dirty pages as part of zfs_delmap() is so that we get a
4665 4684 * more accurate mtime for the associated file. Since we don't have a way of
4666 4685 * detecting when the data was actually modified, we have to resort to
4667 4686 * heuristics. If an explicit msync() is done, then we mark the mtime when the
4668 4687 * last page is pushed. The problem occurs when the msync() call is omitted,
4669 4688  * which is by far the most common case:
4670 4689 *
4671 4690 * open()
4672 4691 * mmap()
4673 4692 * <modify memory>
4674 4693 * munmap()
4675 4694 * close()
4676 4695 * <time lapse>
4677 4696 * putpage() via fsflush
4678 4697 *
4679 4698 * If we wait until fsflush to come along, we can have a modification time that
4680 4699 * is some arbitrary point in the future. In order to prevent this in the
4681 4700 * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
4682 4701 * torn down.
4683 4702 */
4684 4703 /* ARGSUSED */
4685 4704 static int
4686 4705 zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4687 4706 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
4688 4707 caller_context_t *ct)
4689 4708 {
4690 4709 uint64_t pages = btopr(len);
4691 4710
4692 4711 ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
4693 4712 atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
4694 4713
4695 4714 if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
4696 4715 vn_has_cached_data(vp))
4697 4716 (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
4698 4717
4699 4718 return (0);
4700 4719 }
4701 4720
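The open/mmap/modify/munmap/close sequence in the comment above is easy to reproduce from userland: under a MAP_SHARED, PROT_WRITE mapping with no msync(), the flush happens at unmap time via zfs_delmap(). A minimal illustration (the path and sizes are arbitrary):

#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
	char *p;
	int fd;

	fd = open("/tmp/example", O_RDWR | O_CREAT, 0644);
	if (fd < 0)
		return (1);
	if (ftruncate(fd, 4096) != 0)
		return (1);

	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return (1);

	(void) memcpy(p, "hello", 5);	/* dirty the page; no write(2) */

	/*
	 * No msync(): tearing down this MAP_SHARED/PROT_WRITE mapping
	 * reaches zfs_delmap(), which pushes the dirty page (async),
	 * so the mtime tracks the unmap rather than a later fsflush.
	 */
	(void) munmap(p, 4096);
	(void) close(fd);
	return (0);
}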
4702 4721 /*
4703 4722 * Free or allocate space in a file. Currently, this function only
4704 4723 * supports the `F_FREESP' command. However, this command is somewhat
4705 4724 * misnamed, as its functionality includes the ability to allocate as
4706 4725 * well as free space.
4707 4726 *
4708 4727 * IN: vp - vnode of file to free data in.
4709 4728 * cmd - action to take (only F_FREESP supported).
4710 4729 * bfp - section of file to free/alloc.
4711 4730 * flag - current file open mode flags.
4712 4731 * offset - current file offset.
4713 4732 * cr - credentials of caller [UNUSED].
4714 4733 * ct - caller context.
4715 4734 *
4716 4735 * RETURN: 0 on success, error code on failure.
4717 4736 *
4718 4737 * Timestamps:
4719 4738 * vp - ctime|mtime updated
4720 4739 */
4721 4740 /* ARGSUSED */
4722 4741 static int
4723 4742 zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
4724 4743 offset_t offset, cred_t *cr, caller_context_t *ct)
4725 4744 {
4726 4745 znode_t *zp = VTOZ(vp);
4727 4746 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4728 4747 uint64_t off, len;
4729 4748 int error;
4730 4749
4731 4750 ZFS_ENTER(zfsvfs);
4732 4751 ZFS_VERIFY_ZP(zp);
4733 4752
4734 4753 if (cmd != F_FREESP) {
4735 4754 ZFS_EXIT(zfsvfs);
4736 4755 return (SET_ERROR(EINVAL));
4737 4756 }
4738 4757
4739 4758 if (error = convoff(vp, bfp, 0, offset)) {
4740 4759 ZFS_EXIT(zfsvfs);
4741 4760 return (error);
4742 4761 }
4743 4762
4744 4763 if (bfp->l_len < 0) {
4745 4764 ZFS_EXIT(zfsvfs);
4746 4765 return (SET_ERROR(EINVAL));
4747 4766 }
4748 4767
4749 4768 off = bfp->l_start;
4750 4769 len = bfp->l_len; /* 0 means from off to end of file */
4751 4770
4752 4771 error = zfs_freesp(zp, off, len, flag, TRUE);
4753 4772
4754 4773 ZFS_EXIT(zfsvfs);
4755 4774 return (error);
4756 4775 }
4757 4776
4758 4777 /*ARGSUSED*/
4759 4778 static int
4760 4779 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4761 4780 {
4762 4781 znode_t *zp = VTOZ(vp);
4763 4782 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4764 4783 uint32_t gen;
4765 4784 uint64_t gen64;
4766 4785 uint64_t object = zp->z_id;
4767 4786 zfid_short_t *zfid;
4768 4787 int size, i, error;
4769 4788
4770 4789 ZFS_ENTER(zfsvfs);
4771 4790 ZFS_VERIFY_ZP(zp);
4772 4791
4773 4792 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4774 4793 &gen64, sizeof (uint64_t))) != 0) {
4775 4794 ZFS_EXIT(zfsvfs);
4776 4795 return (error);
4777 4796 }
4778 4797
4779 4798 gen = (uint32_t)gen64;
4780 4799
4781 4800 size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
4782 4801 if (fidp->fid_len < size) {
4783 4802 fidp->fid_len = size;
4784 4803 ZFS_EXIT(zfsvfs);
4785 4804 return (SET_ERROR(ENOSPC));
4786 4805 }
4787 4806
4788 4807 zfid = (zfid_short_t *)fidp;
4789 4808
4790 4809 zfid->zf_len = size;
4791 4810
4792 4811 for (i = 0; i < sizeof (zfid->zf_object); i++)
4793 4812 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4794 4813
4795 4814 /* Must have a non-zero generation number to distinguish from .zfs */
4796 4815 if (gen == 0)
4797 4816 gen = 1;
4798 4817 for (i = 0; i < sizeof (zfid->zf_gen); i++)
4799 4818 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4800 4819
4801 4820 if (size == LONG_FID_LEN) {
4802 4821 uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
4803 4822 zfid_long_t *zlfid;
4804 4823
4805 4824 zlfid = (zfid_long_t *)fidp;
4806 4825
4807 4826 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
4808 4827 zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
4809 4828
4810 4829 /* XXX - this should be the generation number for the objset */
4811 4830 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
4812 4831 zlfid->zf_setgen[i] = 0;
4813 4832 }
4814 4833
4815 4834 ZFS_EXIT(zfsvfs);
4816 4835 return (0);
4817 4836 }
4818 4837
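The packing loops in zfs_fid() serialize the object number and generation into the fid least-significant byte first. A userland sketch of the packing and its inverse (the struct is a simplified stand-in for zfid_short_t; the values are samples):

#include <stdio.h>
#include <stdint.h>

/* Simplified stand-in for zfid_short_t: 6-byte object, 4-byte gen. */
typedef struct {
	uint8_t	zf_object[6];
	uint8_t	zf_gen[4];
} fid_sketch_t;

int
main(void)
{
	uint64_t object = 0x123456789aULL;	/* sample object number */
	uint32_t gen = 7;			/* sample generation */
	uint64_t obj2 = 0;
	fid_sketch_t f;
	int i;

	/* Pack least-significant byte first, as zfs_fid() does. */
	for (i = 0; i < (int)sizeof (f.zf_object); i++)
		f.zf_object[i] = (uint8_t)(object >> (8 * i));
	for (i = 0; i < (int)sizeof (f.zf_gen); i++)
		f.zf_gen[i] = (uint8_t)(gen >> (8 * i));

	/* Unpack to verify the round trip. */
	for (i = 0; i < (int)sizeof (f.zf_object); i++)
		obj2 |= (uint64_t)f.zf_object[i] << (8 * i);

	(void) printf("object=0x%llx round-trip=0x%llx\n",
	    (unsigned long long)object, (unsigned long long)obj2);
	return (0);
}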
4819 4838 static int
4820 4839 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4821 4840 caller_context_t *ct)
4822 4841 {
4823 4842 znode_t *zp, *xzp;
4824 4843 zfsvfs_t *zfsvfs;
4825 4844 zfs_dirlock_t *dl;
4826 4845 int error;
4827 4846
4828 4847 switch (cmd) {
4829 4848 case _PC_LINK_MAX:
4830 4849 *valp = ULONG_MAX;
4831 4850 return (0);
4832 4851
4833 4852 case _PC_FILESIZEBITS:
4834 4853 *valp = 64;
4835 4854 return (0);
4836 4855
4837 4856 case _PC_XATTR_EXISTS:
4838 4857 zp = VTOZ(vp);
4839 4858 zfsvfs = zp->z_zfsvfs;
4840 4859 ZFS_ENTER(zfsvfs);
4841 4860 ZFS_VERIFY_ZP(zp);
4842 4861 *valp = 0;
4843 4862 error = zfs_dirent_lock(&dl, zp, "", &xzp,
4844 4863 ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
4845 4864 if (error == 0) {
4846 4865 zfs_dirent_unlock(dl);
4847 4866 if (!zfs_dirempty(xzp))
4848 4867 *valp = 1;
4849 4868 VN_RELE(ZTOV(xzp));
4850 4869 } else if (error == ENOENT) {
4851 4870 /*
4852 4871 * If there aren't extended attributes, it's the
4853 4872 * same as having zero of them.
4854 4873 */
4855 4874 error = 0;
4856 4875 }
4857 4876 ZFS_EXIT(zfsvfs);
4858 4877 return (error);
4859 4878
4860 4879 case _PC_SATTR_ENABLED:
4861 4880 case _PC_SATTR_EXISTS:
4862 4881 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
4863 4882 (vp->v_type == VREG || vp->v_type == VDIR);
4864 4883 return (0);
4865 4884
4866 4885 case _PC_ACCESS_FILTERING:
4867 4886 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
4868 4887 vp->v_type == VDIR;
4869 4888 return (0);
4870 4889
4871 4890 case _PC_ACL_ENABLED:
4872 4891 *valp = _ACL_ACE_ENABLED;
4873 4892 return (0);
4874 4893
4875 4894 case _PC_MIN_HOLE_SIZE:
4876 4895 *valp = (ulong_t)SPA_MINBLOCKSIZE;
4877 4896 return (0);
4878 4897
4879 4898 case _PC_TIMESTAMP_RESOLUTION:
4880 4899 /* nanosecond timestamp resolution */
4881 4900 *valp = 1L;
4882 4901 return (0);
4883 4902
4884 4903 default:
4885 4904 return (fs_pathconf(vp, cmd, valp, cr, ct));
4886 4905 }
4887 4906 }
4888 4907
4889 4908 /*ARGSUSED*/
4890 4909 static int
4891 4910 zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4892 4911 caller_context_t *ct)
4893 4912 {
4894 4913 znode_t *zp = VTOZ(vp);
4895 4914 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4896 4915 int error;
4897 4916 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4898 4917
4899 4918 ZFS_ENTER(zfsvfs);
4900 4919 ZFS_VERIFY_ZP(zp);
4901 4920 error = zfs_getacl(zp, vsecp, skipaclchk, cr);
4902 4921 ZFS_EXIT(zfsvfs);
4903 4922
4904 4923 return (error);
4905 4924 }
4906 4925
4907 4926 /*ARGSUSED*/
4908 4927 static int
4909 4928 zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4910 4929 caller_context_t *ct)
4911 4930 {
4912 4931 znode_t *zp = VTOZ(vp);
4913 4932 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4914 4933 int error;
4915 4934 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4916 4935 zilog_t *zilog = zfsvfs->z_log;
4917 4936
4918 4937 ZFS_ENTER(zfsvfs);
4919 4938 ZFS_VERIFY_ZP(zp);
4920 4939
4921 4940 error = zfs_setacl(zp, vsecp, skipaclchk, cr);
4922 4941
4923 4942 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4924 4943 zil_commit(zilog, 0);
4925 4944
4926 4945 ZFS_EXIT(zfsvfs);
4927 4946 return (error);
4928 4947 }
4929 4948
4930 4949 /*
4931 4950 * The smallest read we may consider to loan out an arcbuf.
4932 4951 * This must be a power of 2.
4933 4952 */
4934 4953 int zcr_blksz_min = (1 << 10); /* 1K */
4935 4954 /*
4936 4955 * If set to less than the file block size, allow loaning out of an
4937 4956 * arcbuf for a partial block read. This must be a power of 2.
4938 4957 */
4939 4958 int zcr_blksz_max = (1 << 17); /* 128K */
4940 4959
4941 4960 /*ARGSUSED*/
4942 4961 static int
4943 4962 zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
4944 4963 caller_context_t *ct)
4945 4964 {
4946 4965 znode_t *zp = VTOZ(vp);
4947 4966 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4948 4967 int max_blksz = zfsvfs->z_max_blksz;
4949 4968 uio_t *uio = &xuio->xu_uio;
4950 4969 ssize_t size = uio->uio_resid;
4951 4970 offset_t offset = uio->uio_loffset;
4952 4971 int blksz;
4953 4972 int fullblk, i;
4954 4973 arc_buf_t *abuf;
4955 4974 ssize_t maxsize;
4956 4975 int preamble, postamble;
4957 4976
4958 4977 if (xuio->xu_type != UIOTYPE_ZEROCOPY)
4959 4978 return (SET_ERROR(EINVAL));
4960 4979
4961 4980 ZFS_ENTER(zfsvfs);
4962 4981 ZFS_VERIFY_ZP(zp);
4963 4982 switch (ioflag) {
4964 4983 case UIO_WRITE:
4965 4984 /*
4966 4985 * Loan out an arc_buf for write if write size is bigger than
4967 4986 * max_blksz, and the file's block size is also max_blksz.
4968 4987 */
4969 4988 blksz = max_blksz;
4970 4989 if (size < blksz || zp->z_blksz != blksz) {
4971 4990 ZFS_EXIT(zfsvfs);
4972 4991 return (SET_ERROR(EINVAL));
4973 4992 }
4974 4993 /*
4975 4994 * Caller requests buffers for write before knowing where the
4976 4995 * write offset might be (e.g. NFS TCP write).
4977 4996 */
4978 4997 if (offset == -1) {
4979 4998 preamble = 0;
4980 4999 } else {
4981 5000 preamble = P2PHASE(offset, blksz);
4982 5001 if (preamble) {
4983 5002 preamble = blksz - preamble;
4984 5003 size -= preamble;
4985 5004 }
4986 5005 }
4987 5006
4988 5007 postamble = P2PHASE(size, blksz);
4989 5008 size -= postamble;
4990 5009
4991 5010 fullblk = size / blksz;
4992 5011 (void) dmu_xuio_init(xuio,
4993 5012 (preamble != 0) + fullblk + (postamble != 0));
4994 5013 DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
4995 5014 int, postamble, int,
4996 5015 (preamble != 0) + fullblk + (postamble != 0));
4997 5016
4998 5017 /*
4999 5018 * Have to fix iov base/len for partial buffers. They
5000 5019 		 * currently represent full arc_bufs.
5001 5020 */
5002 5021 if (preamble) {
5003 5022 /* data begins in the middle of the arc_buf */
5004 5023 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5005 5024 blksz);
5006 5025 ASSERT(abuf);
5007 5026 (void) dmu_xuio_add(xuio, abuf,
5008 5027 blksz - preamble, preamble);
5009 5028 }
5010 5029
5011 5030 for (i = 0; i < fullblk; i++) {
5012 5031 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5013 5032 blksz);
5014 5033 ASSERT(abuf);
5015 5034 (void) dmu_xuio_add(xuio, abuf, 0, blksz);
5016 5035 }
5017 5036
5018 5037 if (postamble) {
5019 5038 /* data ends in the middle of the arc_buf */
5020 5039 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5021 5040 blksz);
5022 5041 ASSERT(abuf);
5023 5042 (void) dmu_xuio_add(xuio, abuf, 0, postamble);
5024 5043 }
5025 5044 break;
5026 5045 case UIO_READ:
5027 5046 /*
5028 5047 * Loan out an arc_buf for read if the read size is larger than
5029 5048 * the current file block size. Block alignment is not
5030 5049 * considered. Partial arc_buf will be loaned out for read.
5031 5050 */
5032 5051 blksz = zp->z_blksz;
5033 5052 if (blksz < zcr_blksz_min)
5034 5053 blksz = zcr_blksz_min;
5035 5054 if (blksz > zcr_blksz_max)
5036 5055 blksz = zcr_blksz_max;
5037 5056 /* avoid potential complexity of dealing with it */
5038 5057 if (blksz > max_blksz) {
5039 5058 ZFS_EXIT(zfsvfs);
5040 5059 return (SET_ERROR(EINVAL));
5041 5060 }
5042 5061
5043 5062 maxsize = zp->z_size - uio->uio_loffset;
5044 5063 if (size > maxsize)
5045 5064 size = maxsize;
5046 5065
5047 5066 if (size < blksz || vn_has_cached_data(vp)) {
5048 5067 ZFS_EXIT(zfsvfs);
5049 5068 return (SET_ERROR(EINVAL));
5050 5069 }
5051 5070 break;
5052 5071 default:
5053 5072 ZFS_EXIT(zfsvfs);
5054 5073 return (SET_ERROR(EINVAL));
5055 5074 }
5056 5075
5057 5076 uio->uio_extflg = UIO_XUIO;
5058 5077 XUIO_XUZC_RW(xuio) = ioflag;
5059 5078 ZFS_EXIT(zfsvfs);
5060 5079 return (0);
5061 5080 }
5062 5081
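The preamble/postamble split in the UIO_WRITE arm is modular arithmetic on the block size: the preamble covers the distance to the next block boundary, the postamble is whatever trails the last full block, and one arc_buf is requested per piece. A userland sketch with sample numbers (P2PHASE is redefined locally with its illumos meaning):

#include <stdio.h>
#include <stdint.h>

/* Local redefinition matching the illumos P2PHASE macro. */
#define	P2PHASE(x, a)	((x) & ((a) - 1))

int
main(void)
{
	uint64_t blksz = 131072;	/* max_blksz, sample value */
	uint64_t offset = 0x21000;	/* write offset, sample value */
	uint64_t size = 0x60000;	/* write size (384K), sample value */
	uint64_t preamble, postamble, fullblk;

	preamble = P2PHASE(offset, blksz);		/* 0x1000 */
	if (preamble != 0) {
		preamble = blksz - preamble;		/* 0x1f000 */
		size -= preamble;			/* 0x41000 */
	}
	postamble = P2PHASE(size, blksz);		/* 0x1000 */
	size -= postamble;				/* 0x40000 */
	fullblk = size / blksz;				/* 2 full blocks */

	(void) printf("arc bufs needed: %llu "
	    "(preamble %llu bytes, %llu full, postamble %llu bytes)\n",
	    (unsigned long long)((preamble != 0) + fullblk +
	    (postamble != 0)),
	    (unsigned long long)preamble, (unsigned long long)fullblk,
	    (unsigned long long)postamble);
	return (0);
}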
5063 5082 /*ARGSUSED*/
5064 5083 static int
5065 5084 zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
5066 5085 {
5067 5086 int i;
5068 5087 arc_buf_t *abuf;
5069 5088 int ioflag = XUIO_XUZC_RW(xuio);
5070 5089
5071 5090 ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
5072 5091
5073 5092 i = dmu_xuio_cnt(xuio);
5074 5093 while (i-- > 0) {
5075 5094 abuf = dmu_xuio_arcbuf(xuio, i);
5076 5095 /*
5077 5096 * if abuf == NULL, it must be a write buffer
5078 5097 * that has been returned in zfs_write().
5079 5098 */
5080 5099 if (abuf)
5081 5100 dmu_return_arcbuf(abuf);
5082 5101 ASSERT(abuf || ioflag == UIO_WRITE);
5083 5102 }
5084 5103
5085 5104 dmu_xuio_fini(xuio);
5086 5105 return (0);
5087 5106 }
5088 5107
5089 5108 /*
5090 5109 * Predeclare these here so that the compiler assumes that
5091 5110 * this is an "old style" function declaration that does
5092 5111 * not include arguments => we won't get type mismatch errors
5093 5112 * in the initializations that follow.
5094 5113 */
5095 5114 static int zfs_inval();
5096 5115 static int zfs_isdir();
5097 5116
5098 5117 static int
5099 5118 zfs_inval()
5100 5119 {
5101 5120 return (SET_ERROR(EINVAL));
5102 5121 }
5103 5122
5104 5123 static int
5105 5124 zfs_isdir()
5106 5125 {
5107 5126 return (SET_ERROR(EISDIR));
5108 5127 }
5109 5128 /*
5110 5129 * Directory vnode operations template
5111 5130 */
5112 5131 vnodeops_t *zfs_dvnodeops;
5113 5132 const fs_operation_def_t zfs_dvnodeops_template[] = {
5114 5133 VOPNAME_OPEN, { .vop_open = zfs_open },
5115 5134 VOPNAME_CLOSE, { .vop_close = zfs_close },
5116 5135 VOPNAME_READ, { .error = zfs_isdir },
5117 5136 VOPNAME_WRITE, { .error = zfs_isdir },
5118 5137 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
5119 5138 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
5120 5139 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
5121 5140 VOPNAME_ACCESS, { .vop_access = zfs_access },
5122 5141 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
5123 5142 VOPNAME_CREATE, { .vop_create = zfs_create },
5124 5143 VOPNAME_REMOVE, { .vop_remove = zfs_remove },
5125 5144 VOPNAME_LINK, { .vop_link = zfs_link },
5126 5145 VOPNAME_RENAME, { .vop_rename = zfs_rename },
5127 5146 VOPNAME_MKDIR, { .vop_mkdir = zfs_mkdir },
5128 5147 VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir },
5129 5148 VOPNAME_READDIR, { .vop_readdir = zfs_readdir },
5130 5149 VOPNAME_SYMLINK, { .vop_symlink = zfs_symlink },
5131 5150 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
5132 5151 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5133 5152 VOPNAME_FID, { .vop_fid = zfs_fid },
5134 5153 VOPNAME_SEEK, { .vop_seek = zfs_seek },
5135 5154 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5136 5155 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
5137 5156 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
5138 5157 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
5139 5158 NULL, NULL
5140 5159 };
5141 5160
5142 5161 /*
5143 5162 * Regular file vnode operations template
5144 5163 */
5145 5164 vnodeops_t *zfs_fvnodeops;
5146 5165 const fs_operation_def_t zfs_fvnodeops_template[] = {
5147 5166 VOPNAME_OPEN, { .vop_open = zfs_open },
5148 5167 VOPNAME_CLOSE, { .vop_close = zfs_close },
5149 5168 VOPNAME_READ, { .vop_read = zfs_read },
5150 5169 VOPNAME_WRITE, { .vop_write = zfs_write },
5151 5170 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
5152 5171 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
5153 5172 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
5154 5173 VOPNAME_ACCESS, { .vop_access = zfs_access },
5155 5174 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
5156 5175 VOPNAME_RENAME, { .vop_rename = zfs_rename },
5157 5176 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
5158 5177 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5159 5178 VOPNAME_FID, { .vop_fid = zfs_fid },
5160 5179 VOPNAME_SEEK, { .vop_seek = zfs_seek },
5161 5180 VOPNAME_FRLOCK, { .vop_frlock = zfs_frlock },
5162 5181 VOPNAME_SPACE, { .vop_space = zfs_space },
5163 5182 VOPNAME_GETPAGE, { .vop_getpage = zfs_getpage },
5164 5183 VOPNAME_PUTPAGE, { .vop_putpage = zfs_putpage },
5165 5184 VOPNAME_MAP, { .vop_map = zfs_map },
5166 5185 VOPNAME_ADDMAP, { .vop_addmap = zfs_addmap },
5167 5186 VOPNAME_DELMAP, { .vop_delmap = zfs_delmap },
5168 5187 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5169 5188 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
5170 5189 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
5171 5190 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
5172 5191 VOPNAME_REQZCBUF, { .vop_reqzcbuf = zfs_reqzcbuf },
5173 5192 VOPNAME_RETZCBUF, { .vop_retzcbuf = zfs_retzcbuf },
5174 5193 NULL, NULL
5175 5194 };
5176 5195
5177 5196 /*
5178 5197 * Symbolic link vnode operations template
5179 5198 */
5180 5199 vnodeops_t *zfs_symvnodeops;
5181 5200 const fs_operation_def_t zfs_symvnodeops_template[] = {
5182 5201 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
5183 5202 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
5184 5203 VOPNAME_ACCESS, { .vop_access = zfs_access },
5185 5204 VOPNAME_RENAME, { .vop_rename = zfs_rename },
5186 5205 VOPNAME_READLINK, { .vop_readlink = zfs_readlink },
5187 5206 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5188 5207 VOPNAME_FID, { .vop_fid = zfs_fid },
5189 5208 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5190 5209 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
5191 5210 NULL, NULL
5192 5211 };
5193 5212
5194 5213 /*
5195 5214 * special share hidden files vnode operations template
5196 5215 */
5197 5216 vnodeops_t *zfs_sharevnodeops;
5198 5217 const fs_operation_def_t zfs_sharevnodeops_template[] = {
5199 5218 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
5200 5219 VOPNAME_ACCESS, { .vop_access = zfs_access },
5201 5220 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5202 5221 VOPNAME_FID, { .vop_fid = zfs_fid },
5203 5222 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5204 5223 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
5205 5224 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
5206 5225 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
5207 5226 NULL, NULL
5208 5227 };
5209 5228
5210 5229 /*
5211 5230 * Extended attribute directory vnode operations template
5212 5231 *
5213 5232 * This template is identical to the directory vnodes
5214 5233 * operation template except for restricted operations:
5215 5234 * VOP_MKDIR()
5216 5235 * VOP_SYMLINK()
5217 5236 *
5218 5237 * Note that there are other restrictions embedded in:
5219 5238 * zfs_create() - restrict type to VREG
5220 5239 * zfs_link() - no links into/out of attribute space
5221 5240 * zfs_rename() - no moves into/out of attribute space
5222 5241 */
5223 5242 vnodeops_t *zfs_xdvnodeops;
5224 5243 const fs_operation_def_t zfs_xdvnodeops_template[] = {
5225 5244 VOPNAME_OPEN, { .vop_open = zfs_open },
5226 5245 VOPNAME_CLOSE, { .vop_close = zfs_close },
5227 5246 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
5228 5247 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
5229 5248 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
5230 5249 VOPNAME_ACCESS, { .vop_access = zfs_access },
5231 5250 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
5232 5251 VOPNAME_CREATE, { .vop_create = zfs_create },
5233 5252 VOPNAME_REMOVE, { .vop_remove = zfs_remove },
5234 5253 VOPNAME_LINK, { .vop_link = zfs_link },
5235 5254 VOPNAME_RENAME, { .vop_rename = zfs_rename },
5236 5255 VOPNAME_MKDIR, { .error = zfs_inval },
5237 5256 VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir },
5238 5257 VOPNAME_READDIR, { .vop_readdir = zfs_readdir },
5239 5258 VOPNAME_SYMLINK, { .error = zfs_inval },
5240 5259 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
5241 5260 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5242 5261 VOPNAME_FID, { .vop_fid = zfs_fid },
5243 5262 VOPNAME_SEEK, { .vop_seek = zfs_seek },
5244 5263 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5245 5264 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
5246 5265 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
5247 5266 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
5248 5267 NULL, NULL
5249 5268 };
5250 5269
5251 5270 /*
5252 5271 * Error vnode operations template
5253 5272 */
5254 5273 vnodeops_t *zfs_evnodeops;
5255 5274 const fs_operation_def_t zfs_evnodeops_template[] = {
5256 5275 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5257 5276 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5258 5277 NULL, NULL
5259 5278 };
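Each template above is a NULL-terminated array of (name, operation) pairs that gets compiled into a vnodeops_t at module initialization. A hedged sketch of that registration, assuming the standard illumos vn_make_ops() interface (the real call site lives in zfs_vfsops.c, outside this file):

/*
 * Sketch only: assumes the stock vn_make_ops() interface; the real
 * registration for these templates happens in zfs_vfsops.c.
 */
static int
example_register_dvnodeops(void)
{
	/* Builds a vnodeops_t from the NULL-terminated template. */
	return (vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
	    &zfs_dvnodeops));
}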