3006 VERIFY[S,U,P] and ASSERT[S,U,P] frequently check if first argument is zero
--- old/usr/src/uts/common/fs/zfs/zfs_vnops.c
+++ new/usr/src/uts/common/fs/zfs/zfs_vnops.c
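The hunks below are part of a tree-wide cleanup: checks of the form ASSERT3U(x, ==, 0) and VERIFY3U(x, ==, 0) are replaced with dedicated ASSERT0()/VERIFY0() macros. A minimal sketch of the idea, layered on the existing three-way comparison macros (the real definitions in sys/debug.h share a common implementation macro and differ in detail):

    /* Hedged sketch -- not the verbatim sys/debug.h definitions. */
    #define VERIFY0(x)  VERIFY3U((x), ==, 0)
    #define ASSERT0(x)  ASSERT3U((x), ==, 0)

With these in place, a call such as ASSERT3U(error, ==, 0) in zfs_remove() becomes simply ASSERT0(error), as the changed lines in this file show.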
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 + * Copyright (c) 2012 by Delphix. All rights reserved.
23 24 */
24 25
26 +
27 +
28 +
25 29 /* Portions Copyright 2007 Jeremy Teo */
26 30 /* Portions Copyright 2010 Robert Milkowski */
27 31
28 32 #include <sys/types.h>
29 33 #include <sys/param.h>
30 34 #include <sys/time.h>
31 35 #include <sys/systm.h>
32 36 #include <sys/sysmacros.h>
33 37 #include <sys/resource.h>
34 38 #include <sys/vfs.h>
35 39 #include <sys/vfs_opreg.h>
36 40 #include <sys/vnode.h>
37 41 #include <sys/file.h>
38 42 #include <sys/stat.h>
39 43 #include <sys/kmem.h>
40 44 #include <sys/taskq.h>
41 45 #include <sys/uio.h>
42 46 #include <sys/vmsystm.h>
43 47 #include <sys/atomic.h>
44 48 #include <sys/vm.h>
45 49 #include <vm/seg_vn.h>
46 50 #include <vm/pvn.h>
47 51 #include <vm/as.h>
48 52 #include <vm/kpm.h>
49 53 #include <vm/seg_kpm.h>
50 54 #include <sys/mman.h>
51 55 #include <sys/pathname.h>
52 56 #include <sys/cmn_err.h>
53 57 #include <sys/errno.h>
54 58 #include <sys/unistd.h>
55 59 #include <sys/zfs_dir.h>
56 60 #include <sys/zfs_acl.h>
57 61 #include <sys/zfs_ioctl.h>
58 62 #include <sys/fs/zfs.h>
59 63 #include <sys/dmu.h>
60 64 #include <sys/dmu_objset.h>
61 65 #include <sys/spa.h>
62 66 #include <sys/txg.h>
63 67 #include <sys/dbuf.h>
64 68 #include <sys/zap.h>
65 69 #include <sys/sa.h>
66 70 #include <sys/dirent.h>
67 71 #include <sys/policy.h>
68 72 #include <sys/sunddi.h>
69 73 #include <sys/filio.h>
70 74 #include <sys/sid.h>
71 75 #include "fs/fs_subr.h"
72 76 #include <sys/zfs_ctldir.h>
73 77 #include <sys/zfs_fuid.h>
74 78 #include <sys/zfs_sa.h>
75 79 #include <sys/dnlc.h>
76 80 #include <sys/zfs_rlock.h>
77 81 #include <sys/extdirent.h>
78 82 #include <sys/kidmap.h>
79 83 #include <sys/cred.h>
80 84 #include <sys/attr.h>
81 85
82 86 /*
83 87 * Programming rules.
84 88 *
85 89 * Each vnode op performs some logical unit of work. To do this, the ZPL must
86 90 * properly lock its in-core state, create a DMU transaction, do the work,
87 91 * record this work in the intent log (ZIL), commit the DMU transaction,
88 92 * and wait for the intent log to commit if it is a synchronous operation.
89 93 * Moreover, the vnode ops must work in both normal and log replay context.
90 94 * The ordering of events is important to avoid deadlocks and references
91 95 * to freed memory. The example below illustrates the following Big Rules:
92 96 *
93 97 * (1) A check must be made in each zfs thread for a mounted file system.
  94   98  *      This is done, avoiding races, using ZFS_ENTER(zfsvfs).
95 99 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
96 100 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
97 101 * can return EIO from the calling function.
98 102 *
99 103 * (2) VN_RELE() should always be the last thing except for zil_commit()
100 104 * (if necessary) and ZFS_EXIT(). This is for 3 reasons:
101 105 * First, if it's the last reference, the vnode/znode
102 106 * can be freed, so the zp may point to freed memory. Second, the last
103 107 * reference will call zfs_zinactive(), which may induce a lot of work --
104 108 * pushing cached pages (which acquires range locks) and syncing out
105 109 * cached atime changes. Third, zfs_zinactive() may require a new tx,
106 110 * which could deadlock the system if you were already holding one.
107 111 * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
108 112 *
109 113 * (3) All range locks must be grabbed before calling dmu_tx_assign(),
110 114 * as they can span dmu_tx_assign() calls.
111 115 *
112 116 * (4) Always pass TXG_NOWAIT as the second argument to dmu_tx_assign().
113 117 * This is critical because we don't want to block while holding locks.
114 118 * Note, in particular, that if a lock is sometimes acquired before
115 119 * the tx assigns, and sometimes after (e.g. z_lock), then failing to
116 120 * use a non-blocking assign can deadlock the system. The scenario:
117 121 *
118 122 * Thread A has grabbed a lock before calling dmu_tx_assign().
119 123 * Thread B is in an already-assigned tx, and blocks for this lock.
120 124 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
121 125 * forever, because the previous txg can't quiesce until B's tx commits.
122 126 *
123 127 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
124 128 * then drop all locks, call dmu_tx_wait(), and try again.
125 129 *
126 130 * (5) If the operation succeeded, generate the intent log entry for it
127 131 * before dropping locks. This ensures that the ordering of events
128 132 * in the intent log matches the order in which they actually occurred.
129 133 * During ZIL replay the zfs_log_* functions will update the sequence
130 134 * number to indicate the zil transaction has replayed.
131 135 *
132 136 * (6) At the end of each vnode op, the DMU tx must always commit,
133 137 * regardless of whether there were any errors.
134 138 *
135 139 * (7) After dropping all locks, invoke zil_commit(zilog, foid)
136 140 * to ensure that synchronous semantics are provided when necessary.
137 141 *
138 142 * In general, this is how things should be ordered in each vnode op:
139 143 *
140 144 * ZFS_ENTER(zfsvfs); // exit if unmounted
141 145 * top:
142 146 * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD())
143 147 * rw_enter(...); // grab any other locks you need
144 148 * tx = dmu_tx_create(...); // get DMU tx
145 149 * dmu_tx_hold_*(); // hold each object you might modify
146 150 * error = dmu_tx_assign(tx, TXG_NOWAIT); // try to assign
147 151 * if (error) {
148 152 * rw_exit(...); // drop locks
149 153 * zfs_dirent_unlock(dl); // unlock directory entry
150 154 * VN_RELE(...); // release held vnodes
151 155 * if (error == ERESTART) {
152 156 * dmu_tx_wait(tx);
153 157 * dmu_tx_abort(tx);
154 158 * goto top;
155 159 * }
156 160 * dmu_tx_abort(tx); // abort DMU tx
157 161 * ZFS_EXIT(zfsvfs); // finished in zfs
158 162 * return (error); // really out of space
159 163 * }
160 164 * error = do_real_work(); // do whatever this VOP does
161 165 * if (error == 0)
162 166 * zfs_log_*(...); // on success, make ZIL entry
163 167 * dmu_tx_commit(tx); // commit DMU tx -- error or not
164 168 * rw_exit(...); // drop locks
165 169 * zfs_dirent_unlock(dl); // unlock directory entry
166 170 * VN_RELE(...); // release held vnodes
167 171 * zil_commit(zilog, foid); // synchronous when necessary
168 172 * ZFS_EXIT(zfsvfs); // finished in zfs
169 173 * return (error); // done, report error
170 174 */
171 175
172 176 /* ARGSUSED */
173 177 static int
174 178 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
175 179 {
176 180 znode_t *zp = VTOZ(*vpp);
177 181 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
178 182
179 183 ZFS_ENTER(zfsvfs);
180 184 ZFS_VERIFY_ZP(zp);
181 185
182 186 if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
183 187 ((flag & FAPPEND) == 0)) {
184 188 ZFS_EXIT(zfsvfs);
185 189 return (EPERM);
186 190 }
187 191
188 192 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
189 193 ZTOV(zp)->v_type == VREG &&
190 194 !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
191 195 if (fs_vscan(*vpp, cr, 0) != 0) {
192 196 ZFS_EXIT(zfsvfs);
193 197 return (EACCES);
194 198 }
195 199 }
196 200
197 201 /* Keep a count of the synchronous opens in the znode */
198 202 if (flag & (FSYNC | FDSYNC))
199 203 atomic_inc_32(&zp->z_sync_cnt);
200 204
201 205 ZFS_EXIT(zfsvfs);
202 206 return (0);
203 207 }
204 208
205 209 /* ARGSUSED */
206 210 static int
207 211 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
208 212 caller_context_t *ct)
209 213 {
210 214 znode_t *zp = VTOZ(vp);
211 215 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
212 216
213 217 /*
214 218 * Clean up any locks held by this process on the vp.
215 219 */
216 220 cleanlocks(vp, ddi_get_pid(), 0);
217 221 cleanshares(vp, ddi_get_pid());
218 222
219 223 ZFS_ENTER(zfsvfs);
220 224 ZFS_VERIFY_ZP(zp);
221 225
222 226 /* Decrement the synchronous opens in the znode */
223 227 if ((flag & (FSYNC | FDSYNC)) && (count == 1))
224 228 atomic_dec_32(&zp->z_sync_cnt);
225 229
226 230 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
227 231 ZTOV(zp)->v_type == VREG &&
228 232 !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
229 233 VERIFY(fs_vscan(vp, cr, 1) == 0);
230 234
231 235 ZFS_EXIT(zfsvfs);
232 236 return (0);
233 237 }
234 238
235 239 /*
236 240 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
237 241 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
238 242 */
239 243 static int
240 244 zfs_holey(vnode_t *vp, int cmd, offset_t *off)
241 245 {
242 246 znode_t *zp = VTOZ(vp);
243 247 uint64_t noff = (uint64_t)*off; /* new offset */
244 248 uint64_t file_sz;
245 249 int error;
246 250 boolean_t hole;
247 251
248 252 file_sz = zp->z_size;
249 253 if (noff >= file_sz) {
250 254 return (ENXIO);
251 255 }
252 256
253 257 if (cmd == _FIO_SEEK_HOLE)
254 258 hole = B_TRUE;
255 259 else
256 260 hole = B_FALSE;
257 261
258 262 error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
259 263
260 264 /* end of file? */
261 265 if ((error == ESRCH) || (noff > file_sz)) {
262 266 /*
263 267 * Handle the virtual hole at the end of file.
264 268 */
265 269 if (hole) {
266 270 *off = file_sz;
267 271 return (0);
268 272 }
269 273 return (ENXIO);
270 274 }
271 275
272 276 if (noff < *off)
273 277 return (error);
274 278 *off = noff;
275 279 return (error);
276 280 }
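zfs_holey() ultimately services the _FIO_SEEK_HOLE and _FIO_SEEK_DATA ioctls dispatched from zfs_ioctl() below, and the same semantics are exposed through lseek(2) with SEEK_HOLE/SEEK_DATA. A minimal user-level sketch (a hypothetical helper for illustration, assuming fd refers to a file on ZFS and that SEEK_HOLE/SEEK_DATA are visible from <unistd.h>, as on illumos) that enumerates the data regions of a sparse file:

    #include <sys/types.h>
    #include <unistd.h>
    #include <stdio.h>

    /* Print each data region of a sparse file as [start, end). */
    static void
    walk_data_regions(int fd)
    {
            off_t data = 0, hole;

            for (;;) {
                    /* Find the next region containing data. */
                    if ((data = lseek(fd, data, SEEK_DATA)) < 0)
                            break;  /* ENXIO: no data past this offset */
                    /* Find where that data region ends. */
                    if ((hole = lseek(fd, data, SEEK_HOLE)) < 0)
                            break;
                    (void) printf("data: [%lld, %lld)\n",
                        (long long)data, (long long)hole);
                    data = hole;
            }
    }

Note that zfs_holey() reports a virtual hole at end-of-file, so a SEEK_HOLE issued from inside the last data region always succeeds; the loop terminates when SEEK_DATA runs past EOF and fails with ENXIO.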
277 281
278 282 /* ARGSUSED */
279 283 static int
280 284 zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
281 285 int *rvalp, caller_context_t *ct)
282 286 {
283 287 offset_t off;
284 288 int error;
285 289 zfsvfs_t *zfsvfs;
286 290 znode_t *zp;
287 291
288 292 switch (com) {
289 293 case _FIOFFS:
290 294 return (zfs_sync(vp->v_vfsp, 0, cred));
291 295
292 296 /*
 293  297          * The following two ioctls are used by bfu.  Faking them
 294  298          * out is necessary to avoid bfu errors.
295 299 */
296 300 case _FIOGDIO:
297 301 case _FIOSDIO:
298 302 return (0);
299 303
300 304 case _FIO_SEEK_DATA:
301 305 case _FIO_SEEK_HOLE:
302 306 if (ddi_copyin((void *)data, &off, sizeof (off), flag))
303 307 return (EFAULT);
304 308
305 309 zp = VTOZ(vp);
306 310 zfsvfs = zp->z_zfsvfs;
307 311 ZFS_ENTER(zfsvfs);
308 312 ZFS_VERIFY_ZP(zp);
309 313
310 314 /* offset parameter is in/out */
311 315 error = zfs_holey(vp, com, &off);
312 316 ZFS_EXIT(zfsvfs);
313 317 if (error)
314 318 return (error);
315 319 if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
316 320 return (EFAULT);
317 321 return (0);
318 322 }
319 323 return (ENOTTY);
320 324 }
321 325
322 326 /*
323 327 * Utility functions to map and unmap a single physical page. These
324 328 * are used to manage the mappable copies of ZFS file data, and therefore
325 329 * do not update ref/mod bits.
326 330 */
327 331 caddr_t
328 332 zfs_map_page(page_t *pp, enum seg_rw rw)
329 333 {
330 334 if (kpm_enable)
331 335 return (hat_kpm_mapin(pp, 0));
332 336 ASSERT(rw == S_READ || rw == S_WRITE);
333 337 return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0),
334 338 (caddr_t)-1));
335 339 }
336 340
337 341 void
338 342 zfs_unmap_page(page_t *pp, caddr_t addr)
339 343 {
340 344 if (kpm_enable) {
341 345 hat_kpm_mapout(pp, 0, addr);
342 346 } else {
343 347 ppmapout(addr);
344 348 }
345 349 }
346 350
347 351 /*
348 352 * When a file is memory mapped, we must keep the IO data synchronized
349 353 * between the DMU cache and the memory mapped pages. What this means:
350 354 *
351 355 * On Write: If we find a memory mapped page, we write to *both*
352 356 * the page and the dmu buffer.
353 357 */
354 358 static void
355 359 update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid)
356 360 {
357 361 int64_t off;
358 362
359 363 off = start & PAGEOFFSET;
360 364 for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
361 365 page_t *pp;
362 366 uint64_t nbytes = MIN(PAGESIZE - off, len);
363 367
364 368 if (pp = page_lookup(vp, start, SE_SHARED)) {
365 369 caddr_t va;
366 370
367 371 va = zfs_map_page(pp, S_WRITE);
368 372 (void) dmu_read(os, oid, start+off, nbytes, va+off,
369 373 DMU_READ_PREFETCH);
370 374 zfs_unmap_page(pp, va);
371 375 page_unlock(pp);
372 376 }
373 377 len -= nbytes;
374 378 off = 0;
375 379 }
376 380 }
377 381
378 382 /*
379 383 * When a file is memory mapped, we must keep the IO data synchronized
380 384 * between the DMU cache and the memory mapped pages. What this means:
381 385 *
382 386 * On Read: We "read" preferentially from memory mapped pages,
 383  387  *      else we fall back to the dmu buffer.
384 388 *
385 389 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
386 390 * the file is memory mapped.
387 391 */
388 392 static int
389 393 mappedread(vnode_t *vp, int nbytes, uio_t *uio)
390 394 {
391 395 znode_t *zp = VTOZ(vp);
392 396 objset_t *os = zp->z_zfsvfs->z_os;
393 397 int64_t start, off;
394 398 int len = nbytes;
395 399 int error = 0;
396 400
397 401 start = uio->uio_loffset;
398 402 off = start & PAGEOFFSET;
399 403 for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
400 404 page_t *pp;
401 405 uint64_t bytes = MIN(PAGESIZE - off, len);
402 406
403 407 if (pp = page_lookup(vp, start, SE_SHARED)) {
404 408 caddr_t va;
405 409
406 410 va = zfs_map_page(pp, S_READ);
407 411 error = uiomove(va + off, bytes, UIO_READ, uio);
408 412 zfs_unmap_page(pp, va);
409 413 page_unlock(pp);
410 414 } else {
411 415 error = dmu_read_uio(os, zp->z_id, uio, bytes);
412 416 }
413 417 len -= bytes;
414 418 off = 0;
415 419 if (error)
416 420 break;
417 421 }
418 422 return (error);
419 423 }
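Both update_pages() above and mappedread() rely on the same page-chunking arithmetic: the first piece of an unaligned request covers PAGESIZE - off bytes, and every later piece starts page-aligned. A standalone sketch of just that arithmetic (assuming a 4K page; show_chunks() and the MY_* macros are hypothetical, introduced only for illustration):

    #include <stdio.h>

    #define MY_PAGESIZE    4096LL
    #define MY_PAGEOFFSET  (MY_PAGESIZE - 1)
    #define MY_PAGEMASK    (~MY_PAGEOFFSET)

    /* Split [start, start + len) into page-bounded pieces. */
    static void
    show_chunks(long long start, long long len)
    {
            long long off = start & MY_PAGEOFFSET;

            for (start &= MY_PAGEMASK; len > 0; start += MY_PAGESIZE) {
                    long long bytes = MY_PAGESIZE - off < len ?
                        MY_PAGESIZE - off : len;
                    (void) printf("page %lld: %lld bytes at offset %lld\n",
                        start, bytes, off);
                    len -= bytes;
                    off = 0;        /* later pieces are page-aligned */
            }
    }

For example, show_chunks(6000, 5000) yields 2192 bytes at offset 1904 within page 4096, then 2808 bytes at offset 0 within page 8192, mirroring how mappedread() breaks a read into at most PAGESIZE-sized uiomoves.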
420 424
421 425 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
422 426
423 427 /*
424 428 * Read bytes from specified file into supplied buffer.
425 429 *
426 430 * IN: vp - vnode of file to be read from.
427 431 * uio - structure supplying read location, range info,
428 432 * and return buffer.
429 433 * ioflag - SYNC flags; used to provide FRSYNC semantics.
430 434 * cr - credentials of caller.
431 435 * ct - caller context
432 436 *
433 437 * OUT: uio - updated offset and range, buffer filled.
434 438 *
435 439 * RETURN: 0 if success
436 440 * error code if failure
437 441 *
438 442 * Side Effects:
439 443 * vp - atime updated if byte count > 0
440 444 */
441 445 /* ARGSUSED */
442 446 static int
443 447 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
444 448 {
445 449 znode_t *zp = VTOZ(vp);
446 450 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
447 451 objset_t *os;
448 452 ssize_t n, nbytes;
449 453 int error;
450 454 rl_t *rl;
451 455 xuio_t *xuio = NULL;
452 456
453 457 ZFS_ENTER(zfsvfs);
454 458 ZFS_VERIFY_ZP(zp);
455 459 os = zfsvfs->z_os;
456 460
457 461 if (zp->z_pflags & ZFS_AV_QUARANTINED) {
458 462 ZFS_EXIT(zfsvfs);
459 463 return (EACCES);
460 464 }
461 465
462 466 /*
463 467 * Validate file offset
464 468 */
465 469 if (uio->uio_loffset < (offset_t)0) {
466 470 ZFS_EXIT(zfsvfs);
467 471 return (EINVAL);
468 472 }
469 473
470 474 /*
471 475 * Fasttrack empty reads
472 476 */
473 477 if (uio->uio_resid == 0) {
474 478 ZFS_EXIT(zfsvfs);
475 479 return (0);
476 480 }
477 481
478 482 /*
479 483 * Check for mandatory locks
480 484 */
481 485 if (MANDMODE(zp->z_mode)) {
482 486 if (error = chklock(vp, FREAD,
483 487 uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
484 488 ZFS_EXIT(zfsvfs);
485 489 return (error);
486 490 }
487 491 }
488 492
489 493 /*
490 494 * If we're in FRSYNC mode, sync out this znode before reading it.
491 495 */
492 496 if (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
493 497 zil_commit(zfsvfs->z_log, zp->z_id);
494 498
495 499 /*
496 500 * Lock the range against changes.
497 501 */
498 502 rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
499 503
500 504 /*
501 505 * If we are reading past end-of-file we can skip
502 506 * to the end; but we might still need to set atime.
503 507 */
504 508 if (uio->uio_loffset >= zp->z_size) {
505 509 error = 0;
506 510 goto out;
507 511 }
508 512
509 513 ASSERT(uio->uio_loffset < zp->z_size);
510 514 n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
511 515
512 516 if ((uio->uio_extflg == UIO_XUIO) &&
513 517 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
514 518 int nblk;
515 519 int blksz = zp->z_blksz;
516 520 uint64_t offset = uio->uio_loffset;
517 521
518 522 xuio = (xuio_t *)uio;
519 523 if ((ISP2(blksz))) {
520 524 nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
521 525 blksz)) / blksz;
522 526 } else {
523 527 ASSERT(offset + n <= blksz);
524 528 nblk = 1;
525 529 }
526 530 (void) dmu_xuio_init(xuio, nblk);
527 531
528 532 if (vn_has_cached_data(vp)) {
529 533 /*
530 534 * For simplicity, we always allocate a full buffer
531 535 * even if we only expect to read a portion of a block.
532 536 */
533 537 while (--nblk >= 0) {
534 538 (void) dmu_xuio_add(xuio,
535 539 dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
536 540 blksz), 0, blksz);
537 541 }
538 542 }
539 543 }
540 544
541 545 while (n > 0) {
542 546 nbytes = MIN(n, zfs_read_chunk_size -
543 547 P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
544 548
545 549 if (vn_has_cached_data(vp))
546 550 error = mappedread(vp, nbytes, uio);
547 551 else
548 552 error = dmu_read_uio(os, zp->z_id, uio, nbytes);
549 553 if (error) {
550 554 /* convert checksum errors into IO errors */
551 555 if (error == ECKSUM)
552 556 error = EIO;
553 557 break;
554 558 }
555 559
556 560 n -= nbytes;
557 561 }
558 562 out:
559 563 zfs_range_unlock(rl);
560 564
561 565 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
562 566 ZFS_EXIT(zfsvfs);
563 567 return (error);
564 568 }
565 569
566 570 /*
567 571 * Write the bytes to a file.
568 572 *
569 573 * IN: vp - vnode of file to be written to.
570 574 * uio - structure supplying write location, range info,
571 575 * and data buffer.
572 576 * ioflag - FAPPEND flag set if in append mode.
573 577 * cr - credentials of caller.
574 578 * ct - caller context (NFS/CIFS fem monitor only)
575 579 *
576 580 * OUT: uio - updated offset and range.
577 581 *
578 582 * RETURN: 0 if success
579 583 * error code if failure
580 584 *
581 585 * Timestamps:
582 586 * vp - ctime|mtime updated if byte count > 0
583 587 */
584 588
585 589 /* ARGSUSED */
586 590 static int
587 591 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
588 592 {
589 593 znode_t *zp = VTOZ(vp);
590 594 rlim64_t limit = uio->uio_llimit;
591 595 ssize_t start_resid = uio->uio_resid;
592 596 ssize_t tx_bytes;
593 597 uint64_t end_size;
594 598 dmu_tx_t *tx;
595 599 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
596 600 zilog_t *zilog;
597 601 offset_t woff;
598 602 ssize_t n, nbytes;
599 603 rl_t *rl;
600 604 int max_blksz = zfsvfs->z_max_blksz;
601 605 int error;
602 606 arc_buf_t *abuf;
603 607 iovec_t *aiov;
604 608 xuio_t *xuio = NULL;
605 609 int i_iov = 0;
606 610 int iovcnt = uio->uio_iovcnt;
607 611 iovec_t *iovp = uio->uio_iov;
608 612 int write_eof;
609 613 int count = 0;
610 614 sa_bulk_attr_t bulk[4];
611 615 uint64_t mtime[2], ctime[2];
612 616
613 617 /*
614 618 * Fasttrack empty write
615 619 */
616 620 n = start_resid;
617 621 if (n == 0)
618 622 return (0);
619 623
620 624 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
621 625 limit = MAXOFFSET_T;
622 626
623 627 ZFS_ENTER(zfsvfs);
624 628 ZFS_VERIFY_ZP(zp);
625 629
626 630 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
627 631 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
628 632 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
629 633 &zp->z_size, 8);
630 634 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
631 635 &zp->z_pflags, 8);
632 636
633 637 /*
634 638 * If immutable or not appending then return EPERM
635 639 */
636 640 if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
637 641 ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
638 642 (uio->uio_loffset < zp->z_size))) {
639 643 ZFS_EXIT(zfsvfs);
640 644 return (EPERM);
641 645 }
642 646
643 647 zilog = zfsvfs->z_log;
644 648
645 649 /*
646 650 * Validate file offset
647 651 */
648 652 woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
649 653 if (woff < 0) {
650 654 ZFS_EXIT(zfsvfs);
651 655 return (EINVAL);
652 656 }
653 657
654 658 /*
655 659 * Check for mandatory locks before calling zfs_range_lock()
656 660 * in order to prevent a deadlock with locks set via fcntl().
657 661 */
658 662 if (MANDMODE((mode_t)zp->z_mode) &&
659 663 (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
660 664 ZFS_EXIT(zfsvfs);
661 665 return (error);
662 666 }
663 667
664 668 /*
 665  669          * Pre-fault the pages to ensure slow (e.g., NFS) pages
666 670 * don't hold up txg.
667 671 * Skip this if uio contains loaned arc_buf.
668 672 */
669 673 if ((uio->uio_extflg == UIO_XUIO) &&
670 674 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
671 675 xuio = (xuio_t *)uio;
672 676 else
673 677 uio_prefaultpages(MIN(n, max_blksz), uio);
674 678
675 679 /*
676 680 * If in append mode, set the io offset pointer to eof.
677 681 */
678 682 if (ioflag & FAPPEND) {
679 683 /*
680 684 * Obtain an appending range lock to guarantee file append
681 685 * semantics. We reset the write offset once we have the lock.
682 686 */
683 687 rl = zfs_range_lock(zp, 0, n, RL_APPEND);
684 688 woff = rl->r_off;
685 689 if (rl->r_len == UINT64_MAX) {
686 690 /*
687 691 * We overlocked the file because this write will cause
688 692 * the file block size to increase.
689 693 * Note that zp_size cannot change with this lock held.
690 694 */
691 695 woff = zp->z_size;
692 696 }
693 697 uio->uio_loffset = woff;
694 698 } else {
695 699 /*
696 700 * Note that if the file block size will change as a result of
697 701 * this write, then this range lock will lock the entire file
698 702 * so that we can re-write the block safely.
699 703 */
700 704 rl = zfs_range_lock(zp, woff, n, RL_WRITER);
701 705 }
702 706
703 707 if (woff >= limit) {
704 708 zfs_range_unlock(rl);
705 709 ZFS_EXIT(zfsvfs);
706 710 return (EFBIG);
707 711 }
708 712
709 713 if ((woff + n) > limit || woff > (limit - n))
710 714 n = limit - woff;
711 715
712 716 /* Will this write extend the file length? */
713 717 write_eof = (woff + n > zp->z_size);
714 718
715 719 end_size = MAX(zp->z_size, woff + n);
716 720
717 721 /*
718 722 * Write the file in reasonable size chunks. Each chunk is written
719 723 * in a separate transaction; this keeps the intent log records small
720 724 * and allows us to do more fine-grained space accounting.
721 725 */
722 726 while (n > 0) {
723 727 abuf = NULL;
724 728 woff = uio->uio_loffset;
725 729 again:
726 730 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
727 731 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
728 732 if (abuf != NULL)
729 733 dmu_return_arcbuf(abuf);
730 734 error = EDQUOT;
731 735 break;
732 736 }
733 737
734 738 if (xuio && abuf == NULL) {
735 739 ASSERT(i_iov < iovcnt);
736 740 aiov = &iovp[i_iov];
737 741 abuf = dmu_xuio_arcbuf(xuio, i_iov);
738 742 dmu_xuio_clear(xuio, i_iov);
739 743 DTRACE_PROBE3(zfs_cp_write, int, i_iov,
740 744 iovec_t *, aiov, arc_buf_t *, abuf);
741 745 ASSERT((aiov->iov_base == abuf->b_data) ||
742 746 ((char *)aiov->iov_base - (char *)abuf->b_data +
743 747 aiov->iov_len == arc_buf_size(abuf)));
744 748 i_iov++;
745 749 } else if (abuf == NULL && n >= max_blksz &&
746 750 woff >= zp->z_size &&
747 751 P2PHASE(woff, max_blksz) == 0 &&
748 752 zp->z_blksz == max_blksz) {
749 753 /*
750 754 * This write covers a full block. "Borrow" a buffer
751 755 * from the dmu so that we can fill it before we enter
752 756 * a transaction. This avoids the possibility of
753 757 * holding up the transaction if the data copy hangs
754 758 * up on a pagefault (e.g., from an NFS server mapping).
755 759 */
756 760 size_t cbytes;
757 761
758 762 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
759 763 max_blksz);
760 764 ASSERT(abuf != NULL);
761 765 ASSERT(arc_buf_size(abuf) == max_blksz);
762 766 if (error = uiocopy(abuf->b_data, max_blksz,
763 767 UIO_WRITE, uio, &cbytes)) {
764 768 dmu_return_arcbuf(abuf);
765 769 break;
766 770 }
767 771 ASSERT(cbytes == max_blksz);
768 772 }
769 773
770 774 /*
771 775 * Start a transaction.
772 776 */
773 777 tx = dmu_tx_create(zfsvfs->z_os);
774 778 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
775 779 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
776 780 zfs_sa_upgrade_txholds(tx, zp);
777 781 error = dmu_tx_assign(tx, TXG_NOWAIT);
778 782 if (error) {
779 783 if (error == ERESTART) {
780 784 dmu_tx_wait(tx);
781 785 dmu_tx_abort(tx);
782 786 goto again;
783 787 }
784 788 dmu_tx_abort(tx);
785 789 if (abuf != NULL)
786 790 dmu_return_arcbuf(abuf);
787 791 break;
788 792 }
789 793
790 794 /*
791 795 * If zfs_range_lock() over-locked we grow the blocksize
792 796 * and then reduce the lock range. This will only happen
793 797 * on the first iteration since zfs_range_reduce() will
794 798 * shrink down r_len to the appropriate size.
795 799 */
796 800 if (rl->r_len == UINT64_MAX) {
797 801 uint64_t new_blksz;
798 802
799 803 if (zp->z_blksz > max_blksz) {
800 804 ASSERT(!ISP2(zp->z_blksz));
801 805 new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
802 806 } else {
803 807 new_blksz = MIN(end_size, max_blksz);
804 808 }
805 809 zfs_grow_blocksize(zp, new_blksz, tx);
806 810 zfs_range_reduce(rl, woff, n);
807 811 }
808 812
809 813 /*
810 814 * XXX - should we really limit each write to z_max_blksz?
811 815 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
812 816 */
813 817 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
814 818
815 819 if (abuf == NULL) {
816 820 tx_bytes = uio->uio_resid;
817 821 error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
818 822 uio, nbytes, tx);
819 823 tx_bytes -= uio->uio_resid;
820 824 } else {
821 825 tx_bytes = nbytes;
822 826 ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
823 827 /*
824 828 * If this is not a full block write, but we are
825 829 * extending the file past EOF and this data starts
826 830 * block-aligned, use assign_arcbuf(). Otherwise,
827 831 * write via dmu_write().
828 832 */
829 833 if (tx_bytes < max_blksz && (!write_eof ||
830 834 aiov->iov_base != abuf->b_data)) {
831 835 ASSERT(xuio);
832 836 dmu_write(zfsvfs->z_os, zp->z_id, woff,
833 837 aiov->iov_len, aiov->iov_base, tx);
834 838 dmu_return_arcbuf(abuf);
835 839 xuio_stat_wbuf_copied();
836 840 } else {
837 841 ASSERT(xuio || tx_bytes == max_blksz);
838 842 dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
839 843 woff, abuf, tx);
840 844 }
841 845 ASSERT(tx_bytes <= uio->uio_resid);
842 846 uioskip(uio, tx_bytes);
843 847 }
844 848 if (tx_bytes && vn_has_cached_data(vp)) {
845 849 update_pages(vp, woff,
846 850 tx_bytes, zfsvfs->z_os, zp->z_id);
847 851 }
848 852
849 853 /*
850 854 * If we made no progress, we're done. If we made even
851 855 * partial progress, update the znode and ZIL accordingly.
852 856 */
853 857 if (tx_bytes == 0) {
854 858 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
855 859 (void *)&zp->z_size, sizeof (uint64_t), tx);
856 860 dmu_tx_commit(tx);
857 861 ASSERT(error != 0);
858 862 break;
859 863 }
860 864
861 865 /*
862 866 * Clear Set-UID/Set-GID bits on successful write if not
 863  867                  * privileged and at least one of the execute bits is set.
864 868 *
 865  869                  * It would be nice to do this after all writes have
866 870 * been done, but that would still expose the ISUID/ISGID
867 871 * to another app after the partial write is committed.
868 872 *
869 873 * Note: we don't call zfs_fuid_map_id() here because
870 874 * user 0 is not an ephemeral uid.
871 875 */
872 876 mutex_enter(&zp->z_acl_lock);
873 877 if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
874 878 (S_IXUSR >> 6))) != 0 &&
875 879 (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
876 880 secpolicy_vnode_setid_retain(cr,
877 881 (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
878 882 uint64_t newmode;
879 883 zp->z_mode &= ~(S_ISUID | S_ISGID);
880 884 newmode = zp->z_mode;
881 885 (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
882 886 (void *)&newmode, sizeof (uint64_t), tx);
883 887 }
884 888 mutex_exit(&zp->z_acl_lock);
885 889
886 890 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
887 891 B_TRUE);
888 892
889 893 /*
890 894 * Update the file size (zp_size) if it has changed;
891 895 * account for possible concurrent updates.
892 896 */
893 897 while ((end_size = zp->z_size) < uio->uio_loffset) {
894 898 (void) atomic_cas_64(&zp->z_size, end_size,
895 899 uio->uio_loffset);
896 900 ASSERT(error == 0);
897 901 }
898 902 /*
 899  903                  * If we are replaying and eof is non-zero then force
900 904 * the file size to the specified eof. Note, there's no
901 905 * concurrency during replay.
902 906 */
903 907 if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
904 908 zp->z_size = zfsvfs->z_replay_eof;
905 909
906 910 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
907 911
908 912 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
909 913 dmu_tx_commit(tx);
910 914
911 915 if (error != 0)
912 916 break;
913 917 ASSERT(tx_bytes == nbytes);
914 918 n -= nbytes;
915 919
916 920 if (!xuio && n > 0)
917 921 uio_prefaultpages(MIN(n, max_blksz), uio);
918 922 }
919 923
920 924 zfs_range_unlock(rl);
921 925
922 926 /*
923 927 * If we're in replay mode, or we made no progress, return error.
924 928 * Otherwise, it's at least a partial write, so it's successful.
925 929 */
926 930 if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
927 931 ZFS_EXIT(zfsvfs);
928 932 return (error);
929 933 }
930 934
931 935 if (ioflag & (FSYNC | FDSYNC) ||
932 936 zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
933 937 zil_commit(zilog, zp->z_id);
934 938
935 939 ZFS_EXIT(zfsvfs);
936 940 return (0);
937 941 }
938 942
939 943 void
940 944 zfs_get_done(zgd_t *zgd, int error)
941 945 {
942 946 znode_t *zp = zgd->zgd_private;
943 947 objset_t *os = zp->z_zfsvfs->z_os;
944 948
945 949 if (zgd->zgd_db)
946 950 dmu_buf_rele(zgd->zgd_db, zgd);
947 951
948 952 zfs_range_unlock(zgd->zgd_rl);
949 953
950 954 /*
951 955 * Release the vnode asynchronously as we currently have the
952 956 * txg stopped from syncing.
953 957 */
954 958 VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
955 959
956 960 if (error == 0 && zgd->zgd_bp)
957 961 zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
958 962
959 963 kmem_free(zgd, sizeof (zgd_t));
960 964 }
961 965
962 966 #ifdef DEBUG
963 967 static int zil_fault_io = 0;
964 968 #endif
965 969
966 970 /*
967 971 * Get data to generate a TX_WRITE intent log record.
968 972 */
969 973 int
970 974 zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
971 975 {
972 976 zfsvfs_t *zfsvfs = arg;
973 977 objset_t *os = zfsvfs->z_os;
974 978 znode_t *zp;
975 979 uint64_t object = lr->lr_foid;
976 980 uint64_t offset = lr->lr_offset;
977 981 uint64_t size = lr->lr_length;
978 982 blkptr_t *bp = &lr->lr_blkptr;
979 983 dmu_buf_t *db;
980 984 zgd_t *zgd;
981 985 int error = 0;
982 986
983 987 ASSERT(zio != NULL);
984 988 ASSERT(size != 0);
985 989
986 990 /*
987 991 * Nothing to do if the file has been removed
988 992 */
989 993 if (zfs_zget(zfsvfs, object, &zp) != 0)
990 994 return (ENOENT);
991 995 if (zp->z_unlinked) {
992 996 /*
993 997 * Release the vnode asynchronously as we currently have the
994 998 * txg stopped from syncing.
995 999 */
996 1000 VN_RELE_ASYNC(ZTOV(zp),
997 1001 dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
998 1002 return (ENOENT);
999 1003 }
1000 1004
1001 1005 zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1002 1006 zgd->zgd_zilog = zfsvfs->z_log;
1003 1007 zgd->zgd_private = zp;
1004 1008
1005 1009 /*
1006 1010 * Write records come in two flavors: immediate and indirect.
1007 1011 * For small writes it's cheaper to store the data with the
1008 1012 * log record (immediate); for large writes it's cheaper to
1009 1013 * sync the data and get a pointer to it (indirect) so that
1010 1014 * we don't have to write the data twice.
1011 1015 */
1012 1016 if (buf != NULL) { /* immediate write */
1013 1017 zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
1014 1018 /* test for truncation needs to be done while range locked */
1015 1019 if (offset >= zp->z_size) {
1016 1020 error = ENOENT;
1017 1021 } else {
1018 1022 error = dmu_read(os, object, offset, size, buf,
1019 1023 DMU_READ_NO_PREFETCH);
1020 1024 }
1021 1025 ASSERT(error == 0 || error == ENOENT);
1022 1026 } else { /* indirect write */
1023 1027 /*
1024 1028 * Have to lock the whole block to ensure when it's
1025 1029                  * written out and its checksum is being calculated
1026 1030 * that no one can change the data. We need to re-check
1027 1031 * blocksize after we get the lock in case it's changed!
1028 1032 */
1029 1033 for (;;) {
1030 1034 uint64_t blkoff;
1031 1035 size = zp->z_blksz;
1032 1036 blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1033 1037 offset -= blkoff;
1034 1038 zgd->zgd_rl = zfs_range_lock(zp, offset, size,
1035 1039 RL_READER);
1036 1040 if (zp->z_blksz == size)
1037 1041 break;
1038 1042 offset += blkoff;
1039 1043 zfs_range_unlock(zgd->zgd_rl);
1040 1044 }
1041 1045 /* test for truncation needs to be done while range locked */
1042 1046 if (lr->lr_offset >= zp->z_size)
1043 1047 error = ENOENT;
1044 1048 #ifdef DEBUG
1045 1049 if (zil_fault_io) {
1046 1050 error = EIO;
1047 1051 zil_fault_io = 0;
1048 1052 }
1049 1053 #endif
1050 1054 if (error == 0)
1051 1055 error = dmu_buf_hold(os, object, offset, zgd, &db,
1052 1056 DMU_READ_NO_PREFETCH);
1053 1057
1054 1058 if (error == 0) {
1055 1059 zgd->zgd_db = db;
1056 1060 zgd->zgd_bp = bp;
1057 1061
1058 1062 ASSERT(db->db_offset == offset);
1059 1063 ASSERT(db->db_size == size);
1060 1064
1061 1065 error = dmu_sync(zio, lr->lr_common.lrc_txg,
1062 1066 zfs_get_done, zgd);
1063 1067 ASSERT(error || lr->lr_length <= zp->z_blksz);
1064 1068
1065 1069 /*
1066 1070 * On success, we need to wait for the write I/O
1067 1071 * initiated by dmu_sync() to complete before we can
1068 1072 * release this dbuf. We will finish everything up
1069 1073 * in the zfs_get_done() callback.
1070 1074 */
1071 1075 if (error == 0)
1072 1076 return (0);
1073 1077
1074 1078 if (error == EALREADY) {
1075 1079 lr->lr_common.lrc_txtype = TX_WRITE2;
1076 1080 error = 0;
1077 1081 }
1078 1082 }
1079 1083 }
1080 1084
1081 1085 zfs_get_done(zgd, error);
1082 1086
1083 1087 return (error);
1084 1088 }
1085 1089
1086 1090 /*ARGSUSED*/
1087 1091 static int
1088 1092 zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1089 1093 caller_context_t *ct)
1090 1094 {
1091 1095 znode_t *zp = VTOZ(vp);
1092 1096 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1093 1097 int error;
1094 1098
1095 1099 ZFS_ENTER(zfsvfs);
1096 1100 ZFS_VERIFY_ZP(zp);
1097 1101
1098 1102 if (flag & V_ACE_MASK)
1099 1103 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1100 1104 else
1101 1105 error = zfs_zaccess_rwx(zp, mode, flag, cr);
1102 1106
1103 1107 ZFS_EXIT(zfsvfs);
1104 1108 return (error);
1105 1109 }
1106 1110
1107 1111 /*
1108 1112 * If vnode is for a device return a specfs vnode instead.
1109 1113 */
1110 1114 static int
1111 1115 specvp_check(vnode_t **vpp, cred_t *cr)
1112 1116 {
1113 1117 int error = 0;
1114 1118
1115 1119 if (IS_DEVVP(*vpp)) {
1116 1120 struct vnode *svp;
1117 1121
1118 1122 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1119 1123 VN_RELE(*vpp);
1120 1124 if (svp == NULL)
1121 1125 error = ENOSYS;
1122 1126 *vpp = svp;
1123 1127 }
1124 1128 return (error);
1125 1129 }
1126 1130
1127 1131
1128 1132 /*
1129 1133 * Lookup an entry in a directory, or an extended attribute directory.
1130 1134 * If it exists, return a held vnode reference for it.
1131 1135 *
1132 1136 * IN: dvp - vnode of directory to search.
1133 1137 * nm - name of entry to lookup.
1134 1138 * pnp - full pathname to lookup [UNUSED].
1135 1139 * flags - LOOKUP_XATTR set if looking for an attribute.
1136 1140 * rdir - root directory vnode [UNUSED].
1137 1141 * cr - credentials of caller.
1138 1142 * ct - caller context
1139 1143 * direntflags - directory lookup flags
1140 1144 * realpnp - returned pathname.
1141 1145 *
1142 1146 * OUT: vpp - vnode of located entry, NULL if not found.
1143 1147 *
1144 1148 * RETURN: 0 if success
1145 1149 * error code if failure
1146 1150 *
1147 1151 * Timestamps:
1148 1152 * NA
1149 1153 */
1150 1154 /* ARGSUSED */
1151 1155 static int
1152 1156 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1153 1157 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
1154 1158 int *direntflags, pathname_t *realpnp)
1155 1159 {
1156 1160 znode_t *zdp = VTOZ(dvp);
1157 1161 zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1158 1162 int error = 0;
1159 1163
1160 1164 /* fast path */
1161 1165 if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
1162 1166
1163 1167 if (dvp->v_type != VDIR) {
1164 1168 return (ENOTDIR);
1165 1169 } else if (zdp->z_sa_hdl == NULL) {
1166 1170 return (EIO);
1167 1171 }
1168 1172
1169 1173 if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
1170 1174 error = zfs_fastaccesschk_execute(zdp, cr);
1171 1175 if (!error) {
1172 1176 *vpp = dvp;
1173 1177 VN_HOLD(*vpp);
1174 1178 return (0);
1175 1179 }
1176 1180 return (error);
1177 1181 } else {
1178 1182 vnode_t *tvp = dnlc_lookup(dvp, nm);
1179 1183
1180 1184 if (tvp) {
1181 1185 error = zfs_fastaccesschk_execute(zdp, cr);
1182 1186 if (error) {
1183 1187 VN_RELE(tvp);
1184 1188 return (error);
1185 1189 }
1186 1190 if (tvp == DNLC_NO_VNODE) {
1187 1191 VN_RELE(tvp);
1188 1192 return (ENOENT);
1189 1193 } else {
1190 1194 *vpp = tvp;
1191 1195 return (specvp_check(vpp, cr));
1192 1196 }
1193 1197 }
1194 1198 }
1195 1199 }
1196 1200
1197 1201 DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
1198 1202
1199 1203 ZFS_ENTER(zfsvfs);
1200 1204 ZFS_VERIFY_ZP(zdp);
1201 1205
1202 1206 *vpp = NULL;
1203 1207
1204 1208 if (flags & LOOKUP_XATTR) {
1205 1209 /*
1206 1210 * If the xattr property is off, refuse the lookup request.
1207 1211 */
1208 1212 if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1209 1213 ZFS_EXIT(zfsvfs);
1210 1214 return (EINVAL);
1211 1215 }
1212 1216
1213 1217 /*
1214 1218                  * We don't allow recursive attributes ...
1215 1219 * Maybe someday we will.
1216 1220 */
1217 1221 if (zdp->z_pflags & ZFS_XATTR) {
1218 1222 ZFS_EXIT(zfsvfs);
1219 1223 return (EINVAL);
1220 1224 }
1221 1225
1222 1226 if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1223 1227 ZFS_EXIT(zfsvfs);
1224 1228 return (error);
1225 1229 }
1226 1230
1227 1231 /*
1228 1232 * Do we have permission to get into attribute directory?
1229 1233 */
1230 1234
1231 1235 if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1232 1236 B_FALSE, cr)) {
1233 1237 VN_RELE(*vpp);
1234 1238 *vpp = NULL;
1235 1239 }
1236 1240
1237 1241 ZFS_EXIT(zfsvfs);
1238 1242 return (error);
1239 1243 }
1240 1244
1241 1245 if (dvp->v_type != VDIR) {
1242 1246 ZFS_EXIT(zfsvfs);
1243 1247 return (ENOTDIR);
1244 1248 }
1245 1249
1246 1250 /*
1247 1251 * Check accessibility of directory.
1248 1252 */
1249 1253
1250 1254 if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
1251 1255 ZFS_EXIT(zfsvfs);
1252 1256 return (error);
1253 1257 }
1254 1258
1255 1259 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1256 1260 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1257 1261 ZFS_EXIT(zfsvfs);
1258 1262 return (EILSEQ);
1259 1263 }
1260 1264
1261 1265 error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
1262 1266 if (error == 0)
1263 1267 error = specvp_check(vpp, cr);
1264 1268
1265 1269 ZFS_EXIT(zfsvfs);
1266 1270 return (error);
1267 1271 }
1268 1272
1269 1273 /*
1270 1274 * Attempt to create a new entry in a directory. If the entry
1271 1275 * already exists, truncate the file if permissible, else return
1272 1276 * an error. Return the vp of the created or trunc'd file.
1273 1277 *
1274 1278 * IN: dvp - vnode of directory to put new file entry in.
1275 1279 * name - name of new file entry.
1276 1280 * vap - attributes of new file.
1277 1281 * excl - flag indicating exclusive or non-exclusive mode.
1278 1282 * mode - mode to open file with.
1279 1283 * cr - credentials of caller.
1280 1284 * flag - large file flag [UNUSED].
1281 1285 * ct - caller context
1282 1286 * vsecp - ACL to be set
1283 1287 *
1284 1288 * OUT: vpp - vnode of created or trunc'd entry.
1285 1289 *
1286 1290 * RETURN: 0 if success
1287 1291 * error code if failure
1288 1292 *
1289 1293 * Timestamps:
1290 1294 * dvp - ctime|mtime updated if new entry created
1291 1295 * vp - ctime|mtime always, atime if new
1292 1296 */
1293 1297
1294 1298 /* ARGSUSED */
1295 1299 static int
1296 1300 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl,
1297 1301 int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
1298 1302 vsecattr_t *vsecp)
1299 1303 {
1300 1304 znode_t *zp, *dzp = VTOZ(dvp);
1301 1305 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1302 1306 zilog_t *zilog;
1303 1307 objset_t *os;
1304 1308 zfs_dirlock_t *dl;
1305 1309 dmu_tx_t *tx;
1306 1310 int error;
1307 1311 ksid_t *ksid;
1308 1312 uid_t uid;
1309 1313 gid_t gid = crgetgid(cr);
1310 1314 zfs_acl_ids_t acl_ids;
1311 1315 boolean_t fuid_dirtied;
1312 1316 boolean_t have_acl = B_FALSE;
1313 1317
1314 1318 /*
1315 1319 * If we have an ephemeral id, ACL, or XVATTR then
1316 1320 * make sure file system is at proper version
1317 1321 */
1318 1322
1319 1323 ksid = crgetsid(cr, KSID_OWNER);
1320 1324 if (ksid)
1321 1325 uid = ksid_getid(ksid);
1322 1326 else
1323 1327 uid = crgetuid(cr);
1324 1328
1325 1329 if (zfsvfs->z_use_fuids == B_FALSE &&
1326 1330 (vsecp || (vap->va_mask & AT_XVATTR) ||
1327 1331 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1328 1332 return (EINVAL);
1329 1333
1330 1334 ZFS_ENTER(zfsvfs);
1331 1335 ZFS_VERIFY_ZP(dzp);
1332 1336 os = zfsvfs->z_os;
1333 1337 zilog = zfsvfs->z_log;
1334 1338
1335 1339 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1336 1340 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1337 1341 ZFS_EXIT(zfsvfs);
1338 1342 return (EILSEQ);
1339 1343 }
1340 1344
1341 1345 if (vap->va_mask & AT_XVATTR) {
1342 1346 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1343 1347 crgetuid(cr), cr, vap->va_type)) != 0) {
1344 1348 ZFS_EXIT(zfsvfs);
1345 1349 return (error);
1346 1350 }
1347 1351 }
1348 1352 top:
1349 1353 *vpp = NULL;
1350 1354
1351 1355 if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
1352 1356 vap->va_mode &= ~VSVTX;
1353 1357
1354 1358 if (*name == '\0') {
1355 1359 /*
1356 1360 * Null component name refers to the directory itself.
1357 1361 */
1358 1362 VN_HOLD(dvp);
1359 1363 zp = dzp;
1360 1364 dl = NULL;
1361 1365 error = 0;
1362 1366 } else {
1363 1367 /* possible VN_HOLD(zp) */
1364 1368 int zflg = 0;
1365 1369
1366 1370 if (flag & FIGNORECASE)
1367 1371 zflg |= ZCILOOK;
1368 1372
1369 1373 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1370 1374 NULL, NULL);
1371 1375 if (error) {
1372 1376 if (have_acl)
1373 1377 zfs_acl_ids_free(&acl_ids);
1374 1378 if (strcmp(name, "..") == 0)
1375 1379 error = EISDIR;
1376 1380 ZFS_EXIT(zfsvfs);
1377 1381 return (error);
1378 1382 }
1379 1383 }
1380 1384
1381 1385 if (zp == NULL) {
1382 1386 uint64_t txtype;
1383 1387
1384 1388 /*
1385 1389 * Create a new file object and update the directory
1386 1390 * to reference it.
1387 1391 */
1388 1392 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1389 1393 if (have_acl)
1390 1394 zfs_acl_ids_free(&acl_ids);
1391 1395 goto out;
1392 1396 }
1393 1397
1394 1398 /*
1395 1399 * We only support the creation of regular files in
1396 1400 * extended attribute directories.
1397 1401 */
1398 1402
1399 1403 if ((dzp->z_pflags & ZFS_XATTR) &&
1400 1404 (vap->va_type != VREG)) {
1401 1405 if (have_acl)
1402 1406 zfs_acl_ids_free(&acl_ids);
1403 1407 error = EINVAL;
1404 1408 goto out;
1405 1409 }
1406 1410
1407 1411 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
1408 1412 cr, vsecp, &acl_ids)) != 0)
1409 1413 goto out;
1410 1414 have_acl = B_TRUE;
1411 1415
1412 1416 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1413 1417 zfs_acl_ids_free(&acl_ids);
1414 1418 error = EDQUOT;
1415 1419 goto out;
1416 1420 }
1417 1421
1418 1422 tx = dmu_tx_create(os);
1419 1423
1420 1424 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1421 1425 ZFS_SA_BASE_ATTR_SIZE);
1422 1426
1423 1427 fuid_dirtied = zfsvfs->z_fuid_dirty;
1424 1428 if (fuid_dirtied)
1425 1429 zfs_fuid_txhold(zfsvfs, tx);
1426 1430 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1427 1431 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1428 1432 if (!zfsvfs->z_use_sa &&
1429 1433 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1430 1434 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1431 1435 0, acl_ids.z_aclp->z_acl_bytes);
1432 1436 }
1433 1437 error = dmu_tx_assign(tx, TXG_NOWAIT);
1434 1438 if (error) {
1435 1439 zfs_dirent_unlock(dl);
1436 1440 if (error == ERESTART) {
1437 1441 dmu_tx_wait(tx);
1438 1442 dmu_tx_abort(tx);
1439 1443 goto top;
1440 1444 }
1441 1445 zfs_acl_ids_free(&acl_ids);
1442 1446 dmu_tx_abort(tx);
1443 1447 ZFS_EXIT(zfsvfs);
1444 1448 return (error);
1445 1449 }
1446 1450 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1447 1451
1448 1452 if (fuid_dirtied)
1449 1453 zfs_fuid_sync(zfsvfs, tx);
1450 1454
1451 1455 (void) zfs_link_create(dl, zp, tx, ZNEW);
1452 1456 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1453 1457 if (flag & FIGNORECASE)
1454 1458 txtype |= TX_CI;
1455 1459 zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1456 1460 vsecp, acl_ids.z_fuidp, vap);
1457 1461 zfs_acl_ids_free(&acl_ids);
1458 1462 dmu_tx_commit(tx);
1459 1463 } else {
1460 1464 int aflags = (flag & FAPPEND) ? V_APPEND : 0;
1461 1465
1462 1466 if (have_acl)
1463 1467 zfs_acl_ids_free(&acl_ids);
1464 1468 have_acl = B_FALSE;
1465 1469
1466 1470 /*
1467 1471 * A directory entry already exists for this name.
1468 1472 */
1469 1473 /*
1470 1474 * Can't truncate an existing file if in exclusive mode.
1471 1475 */
1472 1476 if (excl == EXCL) {
1473 1477 error = EEXIST;
1474 1478 goto out;
1475 1479 }
1476 1480 /*
1477 1481 * Can't open a directory for writing.
1478 1482 */
1479 1483 if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
1480 1484 error = EISDIR;
1481 1485 goto out;
1482 1486 }
1483 1487 /*
1484 1488 * Verify requested access to file.
1485 1489 */
1486 1490 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
1487 1491 goto out;
1488 1492 }
1489 1493
1490 1494 mutex_enter(&dzp->z_lock);
1491 1495 dzp->z_seq++;
1492 1496 mutex_exit(&dzp->z_lock);
1493 1497
1494 1498 /*
1495 1499 * Truncate regular files if requested.
1496 1500 */
1497 1501 if ((ZTOV(zp)->v_type == VREG) &&
1498 1502 (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
1499 1503 /* we can't hold any locks when calling zfs_freesp() */
1500 1504 zfs_dirent_unlock(dl);
1501 1505 dl = NULL;
1502 1506 error = zfs_freesp(zp, 0, 0, mode, TRUE);
1503 1507 if (error == 0) {
1504 1508 vnevent_create(ZTOV(zp), ct);
1505 1509 }
1506 1510 }
1507 1511 }
1508 1512 out:
1509 1513
1510 1514 if (dl)
1511 1515 zfs_dirent_unlock(dl);
1512 1516
1513 1517 if (error) {
1514 1518 if (zp)
1515 1519 VN_RELE(ZTOV(zp));
1516 1520 } else {
1517 1521 *vpp = ZTOV(zp);
1518 1522 error = specvp_check(vpp, cr);
1519 1523 }
1520 1524
1521 1525 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1522 1526 zil_commit(zilog, 0);
1523 1527
1524 1528 ZFS_EXIT(zfsvfs);
1525 1529 return (error);
1526 1530 }
1527 1531
1528 1532 /*
1529 1533 * Remove an entry from a directory.
1530 1534 *
1531 1535 * IN: dvp - vnode of directory to remove entry from.
1532 1536 * name - name of entry to remove.
1533 1537 * cr - credentials of caller.
1534 1538 * ct - caller context
1535 1539 * flags - case flags
1536 1540 *
1537 1541 * RETURN: 0 if success
1538 1542 * error code if failure
1539 1543 *
1540 1544 * Timestamps:
1541 1545 * dvp - ctime|mtime
1542 1546 * vp - ctime (if nlink > 0)
1543 1547 */
1544 1548
1545 1549 uint64_t null_xattr = 0;
1546 1550
1547 1551 /*ARGSUSED*/
1548 1552 static int
1549 1553 zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
1550 1554 int flags)
1551 1555 {
1552 1556 znode_t *zp, *dzp = VTOZ(dvp);
1553 1557 znode_t *xzp;
1554 1558 vnode_t *vp;
1555 1559 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1556 1560 zilog_t *zilog;
1557 1561 uint64_t acl_obj, xattr_obj;
1558 1562 uint64_t xattr_obj_unlinked = 0;
1559 1563 uint64_t obj = 0;
1560 1564 zfs_dirlock_t *dl;
1561 1565 dmu_tx_t *tx;
1562 1566 boolean_t may_delete_now, delete_now = FALSE;
1563 1567 boolean_t unlinked, toobig = FALSE;
1564 1568 uint64_t txtype;
1565 1569 pathname_t *realnmp = NULL;
1566 1570 pathname_t realnm;
1567 1571 int error;
1568 1572 int zflg = ZEXISTS;
1569 1573
1570 1574 ZFS_ENTER(zfsvfs);
1571 1575 ZFS_VERIFY_ZP(dzp);
1572 1576 zilog = zfsvfs->z_log;
1573 1577
1574 1578 if (flags & FIGNORECASE) {
1575 1579 zflg |= ZCILOOK;
1576 1580 pn_alloc(&realnm);
1577 1581 realnmp = &realnm;
1578 1582 }
1579 1583
1580 1584 top:
1581 1585 xattr_obj = 0;
1582 1586 xzp = NULL;
1583 1587 /*
1584 1588 * Attempt to lock directory; fail if entry doesn't exist.
1585 1589 */
1586 1590 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1587 1591 NULL, realnmp)) {
1588 1592 if (realnmp)
1589 1593 pn_free(realnmp);
1590 1594 ZFS_EXIT(zfsvfs);
1591 1595 return (error);
1592 1596 }
1593 1597
1594 1598 vp = ZTOV(zp);
1595 1599
1596 1600 if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1597 1601 goto out;
1598 1602 }
1599 1603
1600 1604 /*
1601 1605 * Need to use rmdir for removing directories.
1602 1606 */
1603 1607 if (vp->v_type == VDIR) {
1604 1608 error = EPERM;
1605 1609 goto out;
1606 1610 }
1607 1611
1608 1612 vnevent_remove(vp, dvp, name, ct);
1609 1613
1610 1614 if (realnmp)
1611 1615 dnlc_remove(dvp, realnmp->pn_buf);
1612 1616 else
1613 1617 dnlc_remove(dvp, name);
1614 1618
1615 1619 mutex_enter(&vp->v_lock);
1616 1620 may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
1617 1621 mutex_exit(&vp->v_lock);
1618 1622
1619 1623 /*
1620 1624 * We may delete the znode now, or we may put it in the unlinked set;
1621 1625 * it depends on whether we're the last link, and on whether there are
1622 1626 * other holds on the vnode. So we dmu_tx_hold() the right things to
1623 1627 * allow for either case.
1624 1628 */
1625 1629 obj = zp->z_id;
1626 1630 tx = dmu_tx_create(zfsvfs->z_os);
1627 1631 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1628 1632 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1629 1633 zfs_sa_upgrade_txholds(tx, zp);
1630 1634 zfs_sa_upgrade_txholds(tx, dzp);
1631 1635 if (may_delete_now) {
1632 1636 toobig =
1633 1637 zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
1634 1638 /* if the file is too big, only hold_free a token amount */
1635 1639 dmu_tx_hold_free(tx, zp->z_id, 0,
1636 1640 (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1637 1641 }
1638 1642
1639 1643 /* are there any extended attributes? */
1640 1644 error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1641 1645 &xattr_obj, sizeof (xattr_obj));
1642 1646 if (error == 0 && xattr_obj) {
1643 1647 error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1644 - ASSERT3U(error, ==, 0);
1648 + ASSERT0(error);
1645 1649 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1646 1650 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1647 1651 }
1648 1652
1649 1653 mutex_enter(&zp->z_lock);
1650 1654 if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
1651 1655 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1652 1656 mutex_exit(&zp->z_lock);
1653 1657
1654 1658 /* charge as an update -- would be nice not to charge at all */
1655 1659 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1656 1660
1657 1661 error = dmu_tx_assign(tx, TXG_NOWAIT);
1658 1662 if (error) {
1659 1663 zfs_dirent_unlock(dl);
1660 1664 VN_RELE(vp);
1661 1665 if (xzp)
1662 1666 VN_RELE(ZTOV(xzp));
1663 1667 if (error == ERESTART) {
1664 1668 dmu_tx_wait(tx);
1665 1669 dmu_tx_abort(tx);
1666 1670 goto top;
1667 1671 }
1668 1672 if (realnmp)
1669 1673 pn_free(realnmp);
1670 1674 dmu_tx_abort(tx);
1671 1675 ZFS_EXIT(zfsvfs);
1672 1676 return (error);
1673 1677 }
1674 1678
1675 1679 /*
1676 1680 * Remove the directory entry.
1677 1681 */
1678 1682 error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1679 1683
1680 1684 if (error) {
1681 1685 dmu_tx_commit(tx);
1682 1686 goto out;
1683 1687 }
1684 1688
1685 1689 if (unlinked) {
1686 1690
1687 1691 /*
1688 1692 * Hold z_lock so that we can make sure that the ACL obj
1689 1693 * hasn't changed. Could have been deleted due to
1690 1694 * zfs_sa_upgrade().
1691 1695 */
1692 1696 mutex_enter(&zp->z_lock);
1693 1697 mutex_enter(&vp->v_lock);
1694 1698 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1695 1699 &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
1696 1700 delete_now = may_delete_now && !toobig &&
1697 1701 vp->v_count == 1 && !vn_has_cached_data(vp) &&
1698 1702 xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
1699 1703 acl_obj;
1700 1704 mutex_exit(&vp->v_lock);
1701 1705 }
1702 1706
1703 1707 if (delete_now) {
1704 1708 if (xattr_obj_unlinked) {
1705 1709 ASSERT3U(xzp->z_links, ==, 2);
1706 1710 mutex_enter(&xzp->z_lock);
1707 1711 xzp->z_unlinked = 1;
1708 1712 xzp->z_links = 0;
1709 1713 error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
1710 1714 &xzp->z_links, sizeof (xzp->z_links), tx);
1711 1715 ASSERT3U(error, ==, 0);
1712 1716 mutex_exit(&xzp->z_lock);
1713 1717 zfs_unlinked_add(xzp, tx);
1714 1718
1715 1719 if (zp->z_is_sa)
1716 1720 error = sa_remove(zp->z_sa_hdl,
1717 1721 SA_ZPL_XATTR(zfsvfs), tx);
1718 1722 else
1719 1723 error = sa_update(zp->z_sa_hdl,
1720 1724 SA_ZPL_XATTR(zfsvfs), &null_xattr,
1721 1725 sizeof (uint64_t), tx);
1722 - ASSERT3U(error, ==, 0);
1726 + ASSERT0(error);
1723 1727 }
1724 1728 mutex_enter(&vp->v_lock);
1725 1729 vp->v_count--;
1726 - ASSERT3U(vp->v_count, ==, 0);
1730 + ASSERT0(vp->v_count);
1727 1731 mutex_exit(&vp->v_lock);
1728 1732 mutex_exit(&zp->z_lock);
1729 1733 zfs_znode_delete(zp, tx);
1730 1734 } else if (unlinked) {
1731 1735 mutex_exit(&zp->z_lock);
1732 1736 zfs_unlinked_add(zp, tx);
1733 1737 }
1734 1738
1735 1739 txtype = TX_REMOVE;
1736 1740 if (flags & FIGNORECASE)
1737 1741 txtype |= TX_CI;
1738 1742 zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
1739 1743
1740 1744 dmu_tx_commit(tx);
1741 1745 out:
1742 1746 if (realnmp)
1743 1747 pn_free(realnmp);
1744 1748
1745 1749 zfs_dirent_unlock(dl);
1746 1750
1747 1751 if (!delete_now)
1748 1752 VN_RELE(vp);
1749 1753 if (xzp)
1750 1754 VN_RELE(ZTOV(xzp));
1751 1755
1752 1756 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1753 1757 zil_commit(zilog, 0);
1754 1758
1755 1759 ZFS_EXIT(zfsvfs);
1756 1760 return (error);
1757 1761 }
1758 1762
1759 1763 /*
1760 1764 * Create a new directory and insert it into dvp using the name
1761 1765 * provided. Return a pointer to the inserted directory.
1762 1766 *
1763 1767 * IN: dvp - vnode of directory to add subdir to.
1764 1768 * dirname - name of new directory.
1765 1769 * vap - attributes of new directory.
1766 1770 * cr - credentials of caller.
1767 1771 * ct - caller context
1768 1772 * vsecp - ACL to be set
1769 1773 *
1770 1774 * OUT: vpp - vnode of created directory.
1771 1775 *
1772 1776 * RETURN: 0 if success
1773 1777 * error code if failure
1774 1778 *
1775 1779 * Timestamps:
1776 1780 * dvp - ctime|mtime updated
1777 1781 * vp - ctime|mtime|atime updated
1778 1782 */
1779 1783 /*ARGSUSED*/
1780 1784 static int
1781 1785 zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
1782 1786 caller_context_t *ct, int flags, vsecattr_t *vsecp)
1783 1787 {
1784 1788 znode_t *zp, *dzp = VTOZ(dvp);
1785 1789 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1786 1790 zilog_t *zilog;
1787 1791 zfs_dirlock_t *dl;
1788 1792 uint64_t txtype;
1789 1793 dmu_tx_t *tx;
1790 1794 int error;
1791 1795 int zf = ZNEW;
1792 1796 ksid_t *ksid;
1793 1797 uid_t uid;
1794 1798 gid_t gid = crgetgid(cr);
1795 1799 zfs_acl_ids_t acl_ids;
1796 1800 boolean_t fuid_dirtied;
1797 1801
1798 1802 ASSERT(vap->va_type == VDIR);
1799 1803
1800 1804 /*
1801 1805	 * If we have an ephemeral id, ACL, or XVATTR, then
1802 1806	 * make sure the file system is at the proper version.
1803 1807 */
1804 1808
1805 1809 ksid = crgetsid(cr, KSID_OWNER);
1806 1810 if (ksid)
1807 1811 uid = ksid_getid(ksid);
1808 1812 else
1809 1813 uid = crgetuid(cr);
1810 1814 if (zfsvfs->z_use_fuids == B_FALSE &&
1811 1815 (vsecp || (vap->va_mask & AT_XVATTR) ||
1812 1816 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1813 1817 return (EINVAL);
1814 1818
1815 1819 ZFS_ENTER(zfsvfs);
1816 1820 ZFS_VERIFY_ZP(dzp);
1817 1821 zilog = zfsvfs->z_log;
1818 1822
1819 1823 if (dzp->z_pflags & ZFS_XATTR) {
1820 1824 ZFS_EXIT(zfsvfs);
1821 1825 return (EINVAL);
1822 1826 }
1823 1827
1824 1828 if (zfsvfs->z_utf8 && u8_validate(dirname,
1825 1829 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1826 1830 ZFS_EXIT(zfsvfs);
1827 1831 return (EILSEQ);
1828 1832 }
1829 1833 if (flags & FIGNORECASE)
1830 1834 zf |= ZCILOOK;
1831 1835
1832 1836 if (vap->va_mask & AT_XVATTR) {
1833 1837 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1834 1838 crgetuid(cr), cr, vap->va_type)) != 0) {
1835 1839 ZFS_EXIT(zfsvfs);
1836 1840 return (error);
1837 1841 }
1838 1842 }
1839 1843
1840 1844 if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
1841 1845 vsecp, &acl_ids)) != 0) {
1842 1846 ZFS_EXIT(zfsvfs);
1843 1847 return (error);
1844 1848 }
1845 1849 /*
1846 1850 * First make sure the new directory doesn't exist.
1847 1851 *
1848 1852 * Existence is checked first to make sure we don't return
1849 1853	 * EACCES instead of EEXIST, which can cause some applications
1850 1854 * to fail.
1851 1855 */
1852 1856 top:
1853 1857 *vpp = NULL;
1854 1858
1855 1859 if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
1856 1860 NULL, NULL)) {
1857 1861 zfs_acl_ids_free(&acl_ids);
1858 1862 ZFS_EXIT(zfsvfs);
1859 1863 return (error);
1860 1864 }
1861 1865
1862 1866 if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
1863 1867 zfs_acl_ids_free(&acl_ids);
1864 1868 zfs_dirent_unlock(dl);
1865 1869 ZFS_EXIT(zfsvfs);
1866 1870 return (error);
1867 1871 }
1868 1872
1869 1873 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1870 1874 zfs_acl_ids_free(&acl_ids);
1871 1875 zfs_dirent_unlock(dl);
1872 1876 ZFS_EXIT(zfsvfs);
1873 1877 return (EDQUOT);
1874 1878 }
1875 1879
1876 1880 /*
1877 1881 * Add a new entry to the directory.
1878 1882 */
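	/*
	 * Hold the parent directory's ZAP for the entry insert and a
	 * ZAP for the new directory object itself.  The FUID tables
	 * are held if dirty, and when SAs are not in use a large ACL
	 * needs its own write hold.
	 */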
1879 1883 tx = dmu_tx_create(zfsvfs->z_os);
1880 1884 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1881 1885 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1882 1886 fuid_dirtied = zfsvfs->z_fuid_dirty;
1883 1887 if (fuid_dirtied)
1884 1888 zfs_fuid_txhold(zfsvfs, tx);
1885 1889 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1886 1890 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1887 1891 acl_ids.z_aclp->z_acl_bytes);
1888 1892 }
1889 1893
1890 1894 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1891 1895 ZFS_SA_BASE_ATTR_SIZE);
1892 1896
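	/*
	 * TXG_NOWAIT fails with ERESTART instead of blocking when the
	 * transaction cannot join the open txg.  In that case, drop the
	 * dirent lock, wait for the next txg, and retry from "top:" so
	 * the lock ordering is preserved.
	 */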
1893 1897 error = dmu_tx_assign(tx, TXG_NOWAIT);
1894 1898 if (error) {
1895 1899 zfs_dirent_unlock(dl);
1896 1900 if (error == ERESTART) {
1897 1901 dmu_tx_wait(tx);
1898 1902 dmu_tx_abort(tx);
1899 1903 goto top;
1900 1904 }
1901 1905 zfs_acl_ids_free(&acl_ids);
1902 1906 dmu_tx_abort(tx);
1903 1907 ZFS_EXIT(zfsvfs);
1904 1908 return (error);
1905 1909 }
1906 1910
1907 1911 /*
1908 1912 * Create new node.
1909 1913 */
1910 1914 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1911 1915
1912 1916 if (fuid_dirtied)
1913 1917 zfs_fuid_sync(zfsvfs, tx);
1914 1918
1915 1919 /*
1916 1920 * Now put new name in parent dir.
1917 1921 */
1918 1922 (void) zfs_link_create(dl, zp, tx, ZNEW);
1919 1923
1920 1924 *vpp = ZTOV(zp);
1921 1925
1922 1926 txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
1923 1927 if (flags & FIGNORECASE)
1924 1928 txtype |= TX_CI;
1925 1929 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
1926 1930 acl_ids.z_fuidp, vap);
1927 1931
1928 1932 zfs_acl_ids_free(&acl_ids);
1929 1933
1930 1934 dmu_tx_commit(tx);
1931 1935
1932 1936 zfs_dirent_unlock(dl);
1933 1937
1934 1938 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1935 1939 zil_commit(zilog, 0);
1936 1940
1937 1941 ZFS_EXIT(zfsvfs);
1938 1942 return (0);
1939 1943 }
1940 1944
1941 1945 /*
1942 1946 * Remove a directory subdir entry. If the current working
1943 1947 * directory is the same as the subdir to be removed, the
1944 1948 * remove will fail.
1945 1949 *
1946 1950 * IN: dvp - vnode of directory to remove from.
1947 1951 * name - name of directory to be removed.
1948 1952 * cwd - vnode of current working directory.
1949 1953 * cr - credentials of caller.
1950 1954 * ct - caller context
1951 1955 * flags - case flags
1952 1956 *
1953 1957 * RETURN: 0 if success
1954 1958 * error code if failure
1955 1959 *
1956 1960 * Timestamps:
1957 1961 * dvp - ctime|mtime updated
1958 1962 */
1959 1963 /*ARGSUSED*/
1960 1964 static int
1961 1965 zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
1962 1966 caller_context_t *ct, int flags)
1963 1967 {
1964 1968 znode_t *dzp = VTOZ(dvp);
1965 1969 znode_t *zp;
1966 1970 vnode_t *vp;
1967 1971 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1968 1972 zilog_t *zilog;
1969 1973 zfs_dirlock_t *dl;
1970 1974 dmu_tx_t *tx;
1971 1975 int error;
1972 1976 int zflg = ZEXISTS;
1973 1977
1974 1978 ZFS_ENTER(zfsvfs);
1975 1979 ZFS_VERIFY_ZP(dzp);
1976 1980 zilog = zfsvfs->z_log;
1977 1981
1978 1982 if (flags & FIGNORECASE)
1979 1983 zflg |= ZCILOOK;
1980 1984 top:
1981 1985 zp = NULL;
1982 1986
1983 1987 /*
1984 1988 * Attempt to lock directory; fail if entry doesn't exist.
1985 1989 */
1986 1990 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1987 1991 NULL, NULL)) {
1988 1992 ZFS_EXIT(zfsvfs);
1989 1993 return (error);
1990 1994 }
1991 1995
1992 1996 vp = ZTOV(zp);
1993 1997
1994 1998 if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1995 1999 goto out;
1996 2000 }
1997 2001
1998 2002 if (vp->v_type != VDIR) {
1999 2003 error = ENOTDIR;
2000 2004 goto out;
2001 2005 }
2002 2006
2003 2007 if (vp == cwd) {
2004 2008 error = EINVAL;
2005 2009 goto out;
2006 2010 }
2007 2011
2008 2012 vnevent_rmdir(vp, dvp, name, ct);
2009 2013
2010 2014 /*
2011 2015	 * Grab a lock on the directory to make sure that no one is
2012 2016 * trying to add (or lookup) entries while we are removing it.
2013 2017 */
2014 2018 rw_enter(&zp->z_name_lock, RW_WRITER);
2015 2019
2016 2020 /*
2017 2021 * Grab a lock on the parent pointer to make sure we play well
2018 2022 * with the treewalk and directory rename code.
2019 2023 */
2020 2024 rw_enter(&zp->z_parent_lock, RW_WRITER);
2021 2025
2022 2026 tx = dmu_tx_create(zfsvfs->z_os);
2023 2027 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2024 2028 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2025 2029 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2026 2030 zfs_sa_upgrade_txholds(tx, zp);
2027 2031 zfs_sa_upgrade_txholds(tx, dzp);
2028 2032 error = dmu_tx_assign(tx, TXG_NOWAIT);
2029 2033 if (error) {
2030 2034 rw_exit(&zp->z_parent_lock);
2031 2035 rw_exit(&zp->z_name_lock);
2032 2036 zfs_dirent_unlock(dl);
2033 2037 VN_RELE(vp);
2034 2038 if (error == ERESTART) {
2035 2039 dmu_tx_wait(tx);
2036 2040 dmu_tx_abort(tx);
2037 2041 goto top;
2038 2042 }
2039 2043 dmu_tx_abort(tx);
2040 2044 ZFS_EXIT(zfsvfs);
2041 2045 return (error);
2042 2046 }
2043 2047
2044 2048 error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
2045 2049
2046 2050 if (error == 0) {
2047 2051 uint64_t txtype = TX_RMDIR;
2048 2052 if (flags & FIGNORECASE)
2049 2053 txtype |= TX_CI;
2050 2054 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2051 2055 }
2052 2056
2053 2057 dmu_tx_commit(tx);
2054 2058
2055 2059 rw_exit(&zp->z_parent_lock);
2056 2060 rw_exit(&zp->z_name_lock);
2057 2061 out:
2058 2062 zfs_dirent_unlock(dl);
2059 2063
2060 2064 VN_RELE(vp);
2061 2065
2062 2066 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2063 2067 zil_commit(zilog, 0);
2064 2068
2065 2069 ZFS_EXIT(zfsvfs);
2066 2070 return (error);
2067 2071 }
2068 2072
2069 2073 /*
2070 2074 * Read as many directory entries as will fit into the provided
2071 2075 * buffer from the given directory cursor position (specified in
2072 2076	 * the uio structure).
2073 2077 *
2074 2078 * IN: vp - vnode of directory to read.
2075 2079 * uio - structure supplying read location, range info,
2076 2080 * and return buffer.
2077 2081 * cr - credentials of caller.
2078 2082 * ct - caller context
2079 2083 * flags - case flags
2080 2084 *
2081 2085 * OUT: uio - updated offset and range, buffer filled.
2082 2086 * eofp - set to true if end-of-file detected.
2083 2087 *
2084 2088 * RETURN: 0 if success
2085 2089 * error code if failure
2086 2090 *
2087 2091 * Timestamps:
2088 2092 * vp - atime updated
2089 2093 *
2090 2094	 * Note that the low 4 bits of the cookie returned by zap are always zero.
2091 2095 * This allows us to use the low range for "special" directory entries:
2092 2096 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
2093 2097 * we use the offset 2 for the '.zfs' directory.
2094 2098 */
2095 2099 /* ARGSUSED */
2096 2100 static int
2097 2101 zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
2098 2102 caller_context_t *ct, int flags)
2099 2103 {
2100 2104 znode_t *zp = VTOZ(vp);
2101 2105 iovec_t *iovp;
2102 2106 edirent_t *eodp;
2103 2107 dirent64_t *odp;
2104 2108 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2105 2109 objset_t *os;
2106 2110 caddr_t outbuf;
2107 2111 size_t bufsize;
2108 2112 zap_cursor_t zc;
2109 2113 zap_attribute_t zap;
2110 2114 uint_t bytes_wanted;
2111 2115 uint64_t offset; /* must be unsigned; checks for < 1 */
2112 2116 uint64_t parent;
2113 2117 int local_eof;
2114 2118 int outcount;
2115 2119 int error;
2116 2120 uint8_t prefetch;
2117 2121 boolean_t check_sysattrs;
2118 2122
2119 2123 ZFS_ENTER(zfsvfs);
2120 2124 ZFS_VERIFY_ZP(zp);
2121 2125
2122 2126 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2123 2127 &parent, sizeof (parent))) != 0) {
2124 2128 ZFS_EXIT(zfsvfs);
2125 2129 return (error);
2126 2130 }
2127 2131
2128 2132 /*
2129 2133 * If we are not given an eof variable,
2130 2134 * use a local one.
2131 2135 */
2132 2136 if (eofp == NULL)
2133 2137 eofp = &local_eof;
2134 2138
2135 2139 /*
2136 2140 * Check for valid iov_len.
2137 2141 */
2138 2142 if (uio->uio_iov->iov_len <= 0) {
2139 2143 ZFS_EXIT(zfsvfs);
2140 2144 return (EINVAL);
2141 2145 }
2142 2146
2143 2147 /*
2144 2148	 * Quit if the directory has been removed (POSIX).
2145 2149 */
2146 2150 if ((*eofp = zp->z_unlinked) != 0) {
2147 2151 ZFS_EXIT(zfsvfs);
2148 2152 return (0);
2149 2153 }
2150 2154
2151 2155 error = 0;
2152 2156 os = zfsvfs->z_os;
2153 2157 offset = uio->uio_loffset;
2154 2158 prefetch = zp->z_zn_prefetch;
2155 2159
2156 2160 /*
2157 2161 * Initialize the iterator cursor.
2158 2162 */
2159 2163 if (offset <= 3) {
2160 2164 /*
2161 2165 * Start iteration from the beginning of the directory.
2162 2166 */
2163 2167 zap_cursor_init(&zc, os, zp->z_id);
2164 2168 } else {
2165 2169 /*
2166 2170 * The offset is a serialized cursor.
2167 2171 */
2168 2172 zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2169 2173 }
2170 2174
2171 2175 /*
2172 2176	 * Get space to convert directory entries into an fs-independent format.
2173 2177 */
2174 2178 iovp = uio->uio_iov;
2175 2179 bytes_wanted = iovp->iov_len;
2176 2180 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
2177 2181 bufsize = bytes_wanted;
2178 2182 outbuf = kmem_alloc(bufsize, KM_SLEEP);
2179 2183 odp = (struct dirent64 *)outbuf;
2180 2184 } else {
2181 2185 bufsize = bytes_wanted;
2182 2186 odp = (struct dirent64 *)iovp->iov_base;
2183 2187 }
2184 2188 eodp = (struct edirent *)odp;
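	/*
	 * eodp and odp alias the same output buffer; it is filled with
	 * edirent_t records when V_RDDIR_ENTFLAGS is set, and with
	 * plain dirent64_t records otherwise.
	 */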
2185 2189
2186 2190 /*
2187 2191 * If this VFS supports the system attribute view interface; and
2188 2192 * we're looking at an extended attribute directory; and we care
2189 2193 * about normalization conflicts on this vfs; then we must check
2190 2194 * for normalization conflicts with the sysattr name space.
2191 2195 */
2192 2196 check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2193 2197 (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2194 2198 (flags & V_RDDIR_ENTFLAGS);
2195 2199
2196 2200 /*
2197 2201 * Transform to file-system independent format
2198 2202 */
2199 2203 outcount = 0;
2200 2204 while (outcount < bytes_wanted) {
2201 2205 ino64_t objnum;
2202 2206 ushort_t reclen;
2203 2207 off64_t *next = NULL;
2204 2208
2205 2209 /*
2206 2210 * Special case `.', `..', and `.zfs'.
2207 2211 */
2208 2212 if (offset == 0) {
2209 2213 (void) strcpy(zap.za_name, ".");
2210 2214 zap.za_normalization_conflict = 0;
2211 2215 objnum = zp->z_id;
2212 2216 } else if (offset == 1) {
2213 2217 (void) strcpy(zap.za_name, "..");
2214 2218 zap.za_normalization_conflict = 0;
2215 2219 objnum = parent;
2216 2220 } else if (offset == 2 && zfs_show_ctldir(zp)) {
2217 2221 (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2218 2222 zap.za_normalization_conflict = 0;
2219 2223 objnum = ZFSCTL_INO_ROOT;
2220 2224 } else {
2221 2225 /*
2222 2226 * Grab next entry.
2223 2227 */
2224 2228 if (error = zap_cursor_retrieve(&zc, &zap)) {
2225 2229 if ((*eofp = (error == ENOENT)) != 0)
2226 2230 break;
2227 2231 else
2228 2232 goto update;
2229 2233 }
2230 2234
2231 2235 if (zap.za_integer_length != 8 ||
2232 2236 zap.za_num_integers != 1) {
2233 2237 cmn_err(CE_WARN, "zap_readdir: bad directory "
2234 2238 "entry, obj = %lld, offset = %lld\n",
2235 2239 (u_longlong_t)zp->z_id,
2236 2240 (u_longlong_t)offset);
2237 2241 error = ENXIO;
2238 2242 goto update;
2239 2243 }
2240 2244
2241 2245 objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2242 2246 /*
2243 2247	 * Mac OS X can extract the object type here, e.g.:
2244 2248 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2245 2249 */
2246 2250
2247 2251 if (check_sysattrs && !zap.za_normalization_conflict) {
2248 2252 zap.za_normalization_conflict =
2249 2253 xattr_sysattr_casechk(zap.za_name);
2250 2254 }
2251 2255 }
2252 2256
2253 2257 if (flags & V_RDDIR_ACCFILTER) {
2254 2258 /*
2255 2259 * If we have no access at all, don't include
2256 2260 * this entry in the returned information
2257 2261 */
2258 2262 znode_t *ezp;
2259 2263 if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2260 2264 goto skip_entry;
2261 2265 if (!zfs_has_access(ezp, cr)) {
2262 2266 VN_RELE(ZTOV(ezp));
2263 2267 goto skip_entry;
2264 2268 }
2265 2269 VN_RELE(ZTOV(ezp));
2266 2270 }
2267 2271
2268 2272 if (flags & V_RDDIR_ENTFLAGS)
2269 2273 reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2270 2274 else
2271 2275 reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2272 2276
2273 2277 /*
2274 2278 * Will this entry fit in the buffer?
2275 2279 */
2276 2280 if (outcount + reclen > bufsize) {
2277 2281 /*
2278 2282 * Did we manage to fit anything in the buffer?
2279 2283 */
2280 2284 if (!outcount) {
2281 2285 error = EINVAL;
2282 2286 goto update;
2283 2287 }
2284 2288 break;
2285 2289 }
2286 2290 if (flags & V_RDDIR_ENTFLAGS) {
2287 2291 /*
2288 2292 * Add extended flag entry:
2289 2293 */
2290 2294 eodp->ed_ino = objnum;
2291 2295 eodp->ed_reclen = reclen;
2292 2296 /* NOTE: ed_off is the offset for the *next* entry */
2293 2297 next = &(eodp->ed_off);
2294 2298 eodp->ed_eflags = zap.za_normalization_conflict ?
2295 2299 ED_CASE_CONFLICT : 0;
2296 2300 (void) strncpy(eodp->ed_name, zap.za_name,
2297 2301 EDIRENT_NAMELEN(reclen));
2298 2302 eodp = (edirent_t *)((intptr_t)eodp + reclen);
2299 2303 } else {
2300 2304 /*
2301 2305 * Add normal entry:
2302 2306 */
2303 2307 odp->d_ino = objnum;
2304 2308 odp->d_reclen = reclen;
2305 2309 /* NOTE: d_off is the offset for the *next* entry */
2306 2310 next = &(odp->d_off);
2307 2311 (void) strncpy(odp->d_name, zap.za_name,
2308 2312 DIRENT64_NAMELEN(reclen));
2309 2313 odp = (dirent64_t *)((intptr_t)odp + reclen);
2310 2314 }
2311 2315 outcount += reclen;
2312 2316
2313 2317 ASSERT(outcount <= bufsize);
2314 2318
2315 2319 /* Prefetch znode */
2316 2320 if (prefetch)
2317 2321 dmu_prefetch(os, objnum, 0, 0);
2318 2322
2319 2323 skip_entry:
2320 2324 /*
2321 2325 * Move to the next entry, fill in the previous offset.
2322 2326 */
2323 2327 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2324 2328 zap_cursor_advance(&zc);
2325 2329 offset = zap_cursor_serialize(&zc);
2326 2330 } else {
2327 2331 offset += 1;
2328 2332 }
2329 2333 if (next)
2330 2334 *next = offset;
2331 2335 }
2332 2336 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2333 2337
2334 2338 if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
2335 2339 iovp->iov_base += outcount;
2336 2340 iovp->iov_len -= outcount;
2337 2341 uio->uio_resid -= outcount;
2338 2342 } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2339 2343 /*
2340 2344 * Reset the pointer.
2341 2345 */
2342 2346 offset = uio->uio_loffset;
2343 2347 }
2344 2348
2345 2349 update:
2346 2350 zap_cursor_fini(&zc);
2347 2351 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
2348 2352 kmem_free(outbuf, bufsize);
2349 2353
2350 2354 if (error == ENOENT)
2351 2355 error = 0;
2352 2356
2353 2357 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2354 2358
2355 2359 uio->uio_loffset = offset;
2356 2360 ZFS_EXIT(zfsvfs);
2357 2361 return (error);
2358 2362 }
2359 2363
2360 2364 ulong_t zfs_fsync_sync_cnt = 4;
2361 2365
2362 2366 static int
2363 2367 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2364 2368 {
2365 2369 znode_t *zp = VTOZ(vp);
2366 2370 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2367 2371
2368 2372 /*
2369 2373 * Regardless of whether this is required for standards conformance,
2370 2374 * this is the logical behavior when fsync() is called on a file with
2371 2375 * dirty pages. We use B_ASYNC since the ZIL transactions are already
2372 2376 * going to be pushed out as part of the zil_commit().
2373 2377 */
2374 2378 if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) &&
2375 2379 (vp->v_type == VREG) && !(IS_SWAPVP(vp)))
2376 2380 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_ASYNC, cr, ct);
2377 2381
2378 2382 (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2379 2383
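	/*
	 * Unless synchronous semantics are disabled for this dataset,
	 * push the intent log records for this file out to stable
	 * storage before returning.
	 */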
2380 2384 if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2381 2385 ZFS_ENTER(zfsvfs);
2382 2386 ZFS_VERIFY_ZP(zp);
2383 2387 zil_commit(zfsvfs->z_log, zp->z_id);
2384 2388 ZFS_EXIT(zfsvfs);
2385 2389 }
2386 2390 return (0);
2387 2391 }
2388 2392
2389 2393
2390 2394 /*
2391 2395 * Get the requested file attributes and place them in the provided
2392 2396 * vattr structure.
2393 2397 *
2394 2398 * IN: vp - vnode of file.
2395 2399 * vap - va_mask identifies requested attributes.
2396 2400 * If AT_XVATTR set, then optional attrs are requested
2397 2401 * flags - ATTR_NOACLCHECK (CIFS server context)
2398 2402 * cr - credentials of caller.
2399 2403 * ct - caller context
2400 2404 *
2401 2405 * OUT: vap - attribute values.
2402 2406 *
2403 2407 * RETURN: 0 (always succeeds)
2404 2408 */
2405 2409 /* ARGSUSED */
2406 2410 static int
2407 2411 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2408 2412 caller_context_t *ct)
2409 2413 {
2410 2414 znode_t *zp = VTOZ(vp);
2411 2415 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2412 2416 int error = 0;
2413 2417 uint64_t links;
2414 2418 uint64_t mtime[2], ctime[2];
2415 2419 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
2416 2420 xoptattr_t *xoap = NULL;
2417 2421 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2418 2422 sa_bulk_attr_t bulk[2];
2419 2423 int count = 0;
2420 2424
2421 2425 ZFS_ENTER(zfsvfs);
2422 2426 ZFS_VERIFY_ZP(zp);
2423 2427
2424 2428 zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2425 2429
2426 2430 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
2427 2431 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
2428 2432
2429 2433 if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
2430 2434 ZFS_EXIT(zfsvfs);
2431 2435 return (error);
2432 2436 }
2433 2437
2434 2438 /*
2435 2439	 * If the ACL is trivial, don't bother looking for ACE_READ_ATTRIBUTES.
2436 2440	 * Also, if we are the owner, don't bother, since the owner should
2437 2441	 * always be allowed to read the basic attributes of the file.
2438 2442 */
2439 2443 if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
2440 2444 (vap->va_uid != crgetuid(cr))) {
2441 2445 if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2442 2446 skipaclchk, cr)) {
2443 2447 ZFS_EXIT(zfsvfs);
2444 2448 return (error);
2445 2449 }
2446 2450 }
2447 2451
2448 2452 /*
2449 2453 * Return all attributes. It's cheaper to provide the answer
2450 2454 * than to determine whether we were asked the question.
2451 2455 */
2452 2456
2453 2457 mutex_enter(&zp->z_lock);
2454 2458 vap->va_type = vp->v_type;
2455 2459 vap->va_mode = zp->z_mode & MODEMASK;
2456 2460 vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
2457 2461 vap->va_nodeid = zp->z_id;
2458 2462 if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
2459 2463 links = zp->z_links + 1;
2460 2464 else
2461 2465 links = zp->z_links;
2462 2466 vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */
2463 2467 vap->va_size = zp->z_size;
2464 2468 vap->va_rdev = vp->v_rdev;
2465 2469 vap->va_seq = zp->z_seq;
2466 2470
2467 2471 /*
2468 2472 * Add in any requested optional attributes and the create time.
2469 2473 * Also set the corresponding bits in the returned attribute bitmap.
2470 2474 */
2471 2475 if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2472 2476 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2473 2477 xoap->xoa_archive =
2474 2478 ((zp->z_pflags & ZFS_ARCHIVE) != 0);
2475 2479 XVA_SET_RTN(xvap, XAT_ARCHIVE);
2476 2480 }
2477 2481
2478 2482 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2479 2483 xoap->xoa_readonly =
2480 2484 ((zp->z_pflags & ZFS_READONLY) != 0);
2481 2485 XVA_SET_RTN(xvap, XAT_READONLY);
2482 2486 }
2483 2487
2484 2488 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2485 2489 xoap->xoa_system =
2486 2490 ((zp->z_pflags & ZFS_SYSTEM) != 0);
2487 2491 XVA_SET_RTN(xvap, XAT_SYSTEM);
2488 2492 }
2489 2493
2490 2494 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2491 2495 xoap->xoa_hidden =
2492 2496 ((zp->z_pflags & ZFS_HIDDEN) != 0);
2493 2497 XVA_SET_RTN(xvap, XAT_HIDDEN);
2494 2498 }
2495 2499
2496 2500 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2497 2501 xoap->xoa_nounlink =
2498 2502 ((zp->z_pflags & ZFS_NOUNLINK) != 0);
2499 2503 XVA_SET_RTN(xvap, XAT_NOUNLINK);
2500 2504 }
2501 2505
2502 2506 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2503 2507 xoap->xoa_immutable =
2504 2508 ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
2505 2509 XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2506 2510 }
2507 2511
2508 2512 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2509 2513 xoap->xoa_appendonly =
2510 2514 ((zp->z_pflags & ZFS_APPENDONLY) != 0);
2511 2515 XVA_SET_RTN(xvap, XAT_APPENDONLY);
2512 2516 }
2513 2517
2514 2518 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2515 2519 xoap->xoa_nodump =
2516 2520 ((zp->z_pflags & ZFS_NODUMP) != 0);
2517 2521 XVA_SET_RTN(xvap, XAT_NODUMP);
2518 2522 }
2519 2523
2520 2524 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2521 2525 xoap->xoa_opaque =
2522 2526 ((zp->z_pflags & ZFS_OPAQUE) != 0);
2523 2527 XVA_SET_RTN(xvap, XAT_OPAQUE);
2524 2528 }
2525 2529
2526 2530 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2527 2531 xoap->xoa_av_quarantined =
2528 2532 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
2529 2533 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2530 2534 }
2531 2535
2532 2536 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2533 2537 xoap->xoa_av_modified =
2534 2538 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
2535 2539 XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2536 2540 }
2537 2541
2538 2542 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2539 2543 vp->v_type == VREG) {
2540 2544 zfs_sa_get_scanstamp(zp, xvap);
2541 2545 }
2542 2546
2543 2547 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2544 2548 uint64_t times[2];
2545 2549
2546 2550 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
2547 2551 times, sizeof (times));
2548 2552 ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
2549 2553 XVA_SET_RTN(xvap, XAT_CREATETIME);
2550 2554 }
2551 2555
2552 2556 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2553 2557 xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
2554 2558 XVA_SET_RTN(xvap, XAT_REPARSE);
2555 2559 }
2556 2560 if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
2557 2561 xoap->xoa_generation = zp->z_gen;
2558 2562 XVA_SET_RTN(xvap, XAT_GEN);
2559 2563 }
2560 2564
2561 2565 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
2562 2566 xoap->xoa_offline =
2563 2567 ((zp->z_pflags & ZFS_OFFLINE) != 0);
2564 2568 XVA_SET_RTN(xvap, XAT_OFFLINE);
2565 2569 }
2566 2570
2567 2571 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2568 2572 xoap->xoa_sparse =
2569 2573 ((zp->z_pflags & ZFS_SPARSE) != 0);
2570 2574 XVA_SET_RTN(xvap, XAT_SPARSE);
2571 2575 }
2572 2576 }
2573 2577
2574 2578 ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
2575 2579 ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2576 2580 ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2577 2581
2578 2582 mutex_exit(&zp->z_lock);
2579 2583
2580 2584 sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks);
2581 2585
2582 2586 if (zp->z_blksz == 0) {
2583 2587 /*
2584 2588 * Block size hasn't been set; suggest maximal I/O transfers.
2585 2589 */
2586 2590 vap->va_blksize = zfsvfs->z_max_blksz;
2587 2591 }
2588 2592
2589 2593 ZFS_EXIT(zfsvfs);
2590 2594 return (0);
2591 2595 }
2592 2596
2593 2597 /*
2594 2598 * Set the file attributes to the values contained in the
2595 2599 * vattr structure.
2596 2600 *
2597 2601 * IN: vp - vnode of file to be modified.
2598 2602 * vap - new attribute values.
2599 2603 * If AT_XVATTR set, then optional attrs are being set
2600 2604 * flags - ATTR_UTIME set if non-default time values provided.
2601 2605 * - ATTR_NOACLCHECK (CIFS context only).
2602 2606 * cr - credentials of caller.
2603 2607 * ct - caller context
2604 2608 *
2605 2609 * RETURN: 0 if success
2606 2610 * error code if failure
2607 2611 *
2608 2612 * Timestamps:
2609 2613 * vp - ctime updated, mtime updated if size changed.
2610 2614 */
2611 2615 /* ARGSUSED */
2612 2616 static int
2613 2617 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2614 2618 caller_context_t *ct)
2615 2619 {
2616 2620 znode_t *zp = VTOZ(vp);
2617 2621 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2618 2622 zilog_t *zilog;
2619 2623 dmu_tx_t *tx;
2620 2624 vattr_t oldva;
2621 2625 xvattr_t tmpxvattr;
2622 2626 uint_t mask = vap->va_mask;
2623 2627 uint_t saved_mask;
2624 2628 int trim_mask = 0;
2625 2629 uint64_t new_mode;
2626 2630 uint64_t new_uid, new_gid;
2627 2631 uint64_t xattr_obj;
2628 2632 uint64_t mtime[2], ctime[2];
2629 2633 znode_t *attrzp;
2630 2634 int need_policy = FALSE;
2631 2635 int err, err2;
2632 2636 zfs_fuid_info_t *fuidp = NULL;
2633 2637 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
2634 2638 xoptattr_t *xoap;
2635 2639 zfs_acl_t *aclp;
2636 2640 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2637 2641 boolean_t fuid_dirtied = B_FALSE;
2638 2642 sa_bulk_attr_t bulk[7], xattr_bulk[7];
2639 2643 int count = 0, xattr_count = 0;
2640 2644
2641 2645 if (mask == 0)
2642 2646 return (0);
2643 2647
2644 2648 if (mask & AT_NOSET)
2645 2649 return (EINVAL);
2646 2650
2647 2651 ZFS_ENTER(zfsvfs);
2648 2652 ZFS_VERIFY_ZP(zp);
2649 2653
2650 2654 zilog = zfsvfs->z_log;
2651 2655
2652 2656 /*
2653 2657	 * Make sure that if an ephemeral uid/gid or an xvattr is specified,
2654 2658	 * the file system is at the proper version level.
2655 2659 */
2656 2660
2657 2661 if (zfsvfs->z_use_fuids == B_FALSE &&
2658 2662 (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2659 2663 ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2660 2664 (mask & AT_XVATTR))) {
2661 2665 ZFS_EXIT(zfsvfs);
2662 2666 return (EINVAL);
2663 2667 }
2664 2668
2665 2669 if (mask & AT_SIZE && vp->v_type == VDIR) {
2666 2670 ZFS_EXIT(zfsvfs);
2667 2671 return (EISDIR);
2668 2672 }
2669 2673
2670 2674 if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2671 2675 ZFS_EXIT(zfsvfs);
2672 2676 return (EINVAL);
2673 2677 }
2674 2678
2675 2679 /*
2676 2680 * If this is an xvattr_t, then get a pointer to the structure of
2677 2681 * optional attributes. If this is NULL, then we have a vattr_t.
2678 2682 */
2679 2683 xoap = xva_getxoptattr(xvap);
2680 2684
2681 2685 xva_init(&tmpxvattr);
2682 2686
2683 2687 /*
2684 2688	 * For immutable files, only the immutable bit and atime may be altered.
2685 2689 */
2686 2690 if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2687 2691 ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2688 2692 ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2689 2693 ZFS_EXIT(zfsvfs);
2690 2694 return (EPERM);
2691 2695 }
2692 2696
2693 2697 if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
2694 2698 ZFS_EXIT(zfsvfs);
2695 2699 return (EPERM);
2696 2700 }
2697 2701
2698 2702 /*
2699 2703	 * Verify the timestamps don't overflow 32 bits.
2700 2704	 * ZFS can handle large timestamps, but 32-bit syscalls can't
2701 2705 * handle times greater than 2039. This check should be removed
2702 2706 * once large timestamps are fully supported.
2703 2707 */
2704 2708 if (mask & (AT_ATIME | AT_MTIME)) {
2705 2709 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2706 2710 ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2707 2711 ZFS_EXIT(zfsvfs);
2708 2712 return (EOVERFLOW);
2709 2713 }
2710 2714 }
2711 2715
2712 2716 top:
2713 2717 attrzp = NULL;
2714 2718 aclp = NULL;
2715 2719
2716 2720 /* Can this be moved to before the top label? */
2717 2721 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2718 2722 ZFS_EXIT(zfsvfs);
2719 2723 return (EROFS);
2720 2724 }
2721 2725
2722 2726 /*
2723 2727 * First validate permissions
2724 2728 */
2725 2729
2726 2730 if (mask & AT_SIZE) {
2727 2731 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
2728 2732 if (err) {
2729 2733 ZFS_EXIT(zfsvfs);
2730 2734 return (err);
2731 2735 }
2732 2736 /*
2733 2737 * XXX - Note, we are not providing any open
2734 2738 * mode flags here (like FNDELAY), so we may
2735 2739 * block if there are locks present... this
2736 2740 * should be addressed in openat().
2737 2741 */
2738 2742 /* XXX - would it be OK to generate a log record here? */
2739 2743 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2740 2744 if (err) {
2741 2745 ZFS_EXIT(zfsvfs);
2742 2746 return (err);
2743 2747 }
2744 2748 }
2745 2749
2746 2750 if (mask & (AT_ATIME|AT_MTIME) ||
2747 2751 ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2748 2752 XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2749 2753 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2750 2754 XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2751 2755 XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2752 2756 XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2753 2757 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2754 2758 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2755 2759 skipaclchk, cr);
2756 2760 }
2757 2761
2758 2762 if (mask & (AT_UID|AT_GID)) {
2759 2763 int idmask = (mask & (AT_UID|AT_GID));
2760 2764 int take_owner;
2761 2765 int take_group;
2762 2766
2763 2767 /*
2764 2768 * NOTE: even if a new mode is being set,
2765 2769 * we may clear S_ISUID/S_ISGID bits.
2766 2770 */
2767 2771
2768 2772 if (!(mask & AT_MODE))
2769 2773 vap->va_mode = zp->z_mode;
2770 2774
2771 2775 /*
2772 2776	 * Take ownership or chgrp to a group we are a member of.
2773 2777 */
2774 2778
2775 2779 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
2776 2780 take_group = (mask & AT_GID) &&
2777 2781 zfs_groupmember(zfsvfs, vap->va_gid, cr);
2778 2782
2779 2783 /*
2780 2784 * If both AT_UID and AT_GID are set then take_owner and
2781 2785 * take_group must both be set in order to allow taking
2782 2786 * ownership.
2783 2787 *
2784 2788 * Otherwise, send the check through secpolicy_vnode_setattr()
2785 2789 *
2786 2790 */
2787 2791
2788 2792 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
2789 2793 ((idmask == AT_UID) && take_owner) ||
2790 2794 ((idmask == AT_GID) && take_group)) {
2791 2795 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2792 2796 skipaclchk, cr) == 0) {
2793 2797 /*
2794 2798 * Remove setuid/setgid for non-privileged users
2795 2799 */
2796 2800 secpolicy_setid_clear(vap, cr);
2797 2801 trim_mask = (mask & (AT_UID|AT_GID));
2798 2802 } else {
2799 2803 need_policy = TRUE;
2800 2804 }
2801 2805 } else {
2802 2806 need_policy = TRUE;
2803 2807 }
2804 2808 }
2805 2809
2806 2810 mutex_enter(&zp->z_lock);
2807 2811 oldva.va_mode = zp->z_mode;
2808 2812 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2809 2813 if (mask & AT_XVATTR) {
2810 2814 /*
2811 2815 * Update xvattr mask to include only those attributes
2812 2816 * that are actually changing.
2813 2817 *
2814 2818	 * The bits will be restored prior to actually setting
2815 2819	 * the attributes, so the caller thinks they were set.
2816 2820 */
2817 2821 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2818 2822 if (xoap->xoa_appendonly !=
2819 2823 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
2820 2824 need_policy = TRUE;
2821 2825 } else {
2822 2826 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
2823 2827 XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
2824 2828 }
2825 2829 }
2826 2830
2827 2831 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2828 2832 if (xoap->xoa_nounlink !=
2829 2833 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
2830 2834 need_policy = TRUE;
2831 2835 } else {
2832 2836 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
2833 2837 XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
2834 2838 }
2835 2839 }
2836 2840
2837 2841 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2838 2842 if (xoap->xoa_immutable !=
2839 2843 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
2840 2844 need_policy = TRUE;
2841 2845 } else {
2842 2846 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
2843 2847 XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
2844 2848 }
2845 2849 }
2846 2850
2847 2851 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2848 2852 if (xoap->xoa_nodump !=
2849 2853 ((zp->z_pflags & ZFS_NODUMP) != 0)) {
2850 2854 need_policy = TRUE;
2851 2855 } else {
2852 2856 XVA_CLR_REQ(xvap, XAT_NODUMP);
2853 2857 XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
2854 2858 }
2855 2859 }
2856 2860
2857 2861 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2858 2862 if (xoap->xoa_av_modified !=
2859 2863 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
2860 2864 need_policy = TRUE;
2861 2865 } else {
2862 2866 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
2863 2867 XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
2864 2868 }
2865 2869 }
2866 2870
2867 2871 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2868 2872 if ((vp->v_type != VREG &&
2869 2873 xoap->xoa_av_quarantined) ||
2870 2874 xoap->xoa_av_quarantined !=
2871 2875 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
2872 2876 need_policy = TRUE;
2873 2877 } else {
2874 2878 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
2875 2879 XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
2876 2880 }
2877 2881 }
2878 2882
2879 2883 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2880 2884 mutex_exit(&zp->z_lock);
2881 2885 ZFS_EXIT(zfsvfs);
2882 2886 return (EPERM);
2883 2887 }
2884 2888
2885 2889 if (need_policy == FALSE &&
2886 2890 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
2887 2891 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2888 2892 need_policy = TRUE;
2889 2893 }
2890 2894 }
2891 2895
2892 2896 mutex_exit(&zp->z_lock);
2893 2897
2894 2898 if (mask & AT_MODE) {
2895 2899 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
2896 2900 err = secpolicy_setid_setsticky_clear(vp, vap,
2897 2901 &oldva, cr);
2898 2902 if (err) {
2899 2903 ZFS_EXIT(zfsvfs);
2900 2904 return (err);
2901 2905 }
2902 2906 trim_mask |= AT_MODE;
2903 2907 } else {
2904 2908 need_policy = TRUE;
2905 2909 }
2906 2910 }
2907 2911
2908 2912 if (need_policy) {
2909 2913 /*
2910 2914	 * If trim_mask is set, then take-ownership
2911 2915	 * has been granted or write_acl is present and the user
2912 2916	 * has the ability to modify the mode.  In that case remove
2913 2917	 * UID|GID and/or MODE from the mask so that
2914 2918	 * secpolicy_vnode_setattr() doesn't revoke it.
2915 2919 */
2916 2920
2917 2921 if (trim_mask) {
2918 2922 saved_mask = vap->va_mask;
2919 2923 vap->va_mask &= ~trim_mask;
2920 2924 }
2921 2925 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
2922 2926 (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
2923 2927 if (err) {
2924 2928 ZFS_EXIT(zfsvfs);
2925 2929 return (err);
2926 2930 }
2927 2931
2928 2932 if (trim_mask)
2929 2933 vap->va_mask |= saved_mask;
2930 2934 }
2931 2935
2932 2936 /*
2933 2937	 * secpolicy_vnode_setattr() or take-ownership may have
2934 2938	 * changed va_mask.
2935 2939 */
2936 2940 mask = vap->va_mask;
2937 2941
2938 2942 if ((mask & (AT_UID | AT_GID))) {
2939 2943 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
2940 2944 &xattr_obj, sizeof (xattr_obj));
2941 2945
2942 2946 if (err == 0 && xattr_obj) {
2943 2947 err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
2944 2948 if (err)
2945 2949 goto out2;
2946 2950 }
2947 2951 if (mask & AT_UID) {
2948 2952 new_uid = zfs_fuid_create(zfsvfs,
2949 2953 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
2950 2954 if (new_uid != zp->z_uid &&
2951 2955 zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
2952 2956 if (attrzp)
2953 2957 VN_RELE(ZTOV(attrzp));
2954 2958 err = EDQUOT;
2955 2959 goto out2;
2956 2960 }
2957 2961 }
2958 2962
2959 2963 if (mask & AT_GID) {
2960 2964 new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
2961 2965 cr, ZFS_GROUP, &fuidp);
2962 2966 if (new_gid != zp->z_gid &&
2963 2967 zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
2964 2968 if (attrzp)
2965 2969 VN_RELE(ZTOV(attrzp));
2966 2970 err = EDQUOT;
2967 2971 goto out2;
2968 2972 }
2969 2973 }
2970 2974 }
2971 2975 tx = dmu_tx_create(zfsvfs->z_os);
2972 2976
2973 2977 if (mask & AT_MODE) {
2974 2978 uint64_t pmode = zp->z_mode;
2975 2979 uint64_t acl_obj;
2976 2980 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
2977 2981
2978 2982 if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
2979 2983 goto out;
2980 2984
2981 2985 mutex_enter(&zp->z_lock);
2982 2986 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
2983 2987 /*
2984 2988 * Are we upgrading ACL from old V0 format
2985 2989 * to V1 format?
2986 2990 */
2987 2991 if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
2988 2992 zfs_znode_acl_version(zp) ==
2989 2993 ZFS_ACL_VERSION_INITIAL) {
2990 2994 dmu_tx_hold_free(tx, acl_obj, 0,
2991 2995 DMU_OBJECT_END);
2992 2996 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2993 2997 0, aclp->z_acl_bytes);
2994 2998 } else {
2995 2999 dmu_tx_hold_write(tx, acl_obj, 0,
2996 3000 aclp->z_acl_bytes);
2997 3001 }
2998 3002 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2999 3003 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3000 3004 0, aclp->z_acl_bytes);
3001 3005 }
3002 3006 mutex_exit(&zp->z_lock);
3003 3007 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3004 3008 } else {
3005 3009 if ((mask & AT_XVATTR) &&
3006 3010 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3007 3011 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3008 3012 else
3009 3013 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3010 3014 }
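	/*
	 * The last argument to dmu_tx_hold_sa() indicates whether the
	 * SA may grow: a mode or scanstamp change can add attributes
	 * and force a new SA layout, so those paths pass B_TRUE.
	 */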
3011 3015
3012 3016 if (attrzp) {
3013 3017 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3014 3018 }
3015 3019
3016 3020 fuid_dirtied = zfsvfs->z_fuid_dirty;
3017 3021 if (fuid_dirtied)
3018 3022 zfs_fuid_txhold(zfsvfs, tx);
3019 3023
3020 3024 zfs_sa_upgrade_txholds(tx, zp);
3021 3025
3022 3026 err = dmu_tx_assign(tx, TXG_NOWAIT);
3023 3027 if (err) {
3024 3028 if (err == ERESTART)
3025 3029 dmu_tx_wait(tx);
3026 3030 goto out;
3027 3031 }
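	/*
	 * Unlike other callers, on ERESTART we only wait here; the
	 * abort happens at "out:", which releases aclp and attrzp
	 * before retrying at "top:".
	 */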
3028 3032
3029 3033 count = 0;
3030 3034 /*
3031 3035 * Set each attribute requested.
3032 3036 * We group settings according to the locks they need to acquire.
3033 3037 *
3034 3038 * Note: you cannot set ctime directly, although it will be
3035 3039 * updated as a side-effect of calling this function.
3036 3040 */
3037 3041
3038 3042
3039 3043 if (mask & (AT_UID|AT_GID|AT_MODE))
3040 3044 mutex_enter(&zp->z_acl_lock);
3041 3045 mutex_enter(&zp->z_lock);
3042 3046
3043 3047 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3044 3048 &zp->z_pflags, sizeof (zp->z_pflags));
3045 3049
3046 3050 if (attrzp) {
3047 3051 if (mask & (AT_UID|AT_GID|AT_MODE))
3048 3052 mutex_enter(&attrzp->z_acl_lock);
3049 3053 mutex_enter(&attrzp->z_lock);
3050 3054 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3051 3055 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3052 3056 sizeof (attrzp->z_pflags));
3053 3057 }
3054 3058
3055 3059 if (mask & (AT_UID|AT_GID)) {
3056 3060
3057 3061 if (mask & AT_UID) {
3058 3062 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3059 3063 &new_uid, sizeof (new_uid));
3060 3064 zp->z_uid = new_uid;
3061 3065 if (attrzp) {
3062 3066 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3063 3067 SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3064 3068 sizeof (new_uid));
3065 3069 attrzp->z_uid = new_uid;
3066 3070 }
3067 3071 }
3068 3072
3069 3073 if (mask & AT_GID) {
3070 3074 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3071 3075 NULL, &new_gid, sizeof (new_gid));
3072 3076 zp->z_gid = new_gid;
3073 3077 if (attrzp) {
3074 3078 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3075 3079 SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3076 3080 sizeof (new_gid));
3077 3081 attrzp->z_gid = new_gid;
3078 3082 }
3079 3083 }
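		/*
		 * SA_ADD_BULK_ATTR() records a pointer, not a value; the
		 * data is copied when sa_bulk_update() runs, so assigning
		 * new_mode after registering it is safe.  The existing
		 * mode is rewritten unchanged as part of the ownership
		 * change.
		 */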
3080 3084 if (!(mask & AT_MODE)) {
3081 3085 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3082 3086 NULL, &new_mode, sizeof (new_mode));
3083 3087 new_mode = zp->z_mode;
3084 3088 }
3085 3089 err = zfs_acl_chown_setattr(zp);
3086 3090 ASSERT(err == 0);
3087 3091 if (attrzp) {
3088 3092 err = zfs_acl_chown_setattr(attrzp);
3089 3093 ASSERT(err == 0);
3090 3094 }
3091 3095 }
3092 3096
3093 3097 if (mask & AT_MODE) {
3094 3098 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3095 3099 &new_mode, sizeof (new_mode));
3096 3100 zp->z_mode = new_mode;
3097 3101 ASSERT3U((uintptr_t)aclp, !=, NULL);
3098 3102 err = zfs_aclset_common(zp, aclp, cr, tx);
3099 - ASSERT3U(err, ==, 0);
3103 + ASSERT0(err);
3100 3104 if (zp->z_acl_cached)
3101 3105 zfs_acl_free(zp->z_acl_cached);
3102 3106 zp->z_acl_cached = aclp;
3103 3107 aclp = NULL;
3104 3108 }
3105 3109
3106 3110
3107 3111 if (mask & AT_ATIME) {
3108 3112 ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3109 3113 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3110 3114 &zp->z_atime, sizeof (zp->z_atime));
3111 3115 }
3112 3116
3113 3117 if (mask & AT_MTIME) {
3114 3118 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3115 3119 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3116 3120 mtime, sizeof (mtime));
3117 3121 }
3118 3122
3119 3123 /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3120 3124 if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3121 3125 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3122 3126 NULL, mtime, sizeof (mtime));
3123 3127 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3124 3128 &ctime, sizeof (ctime));
3125 3129 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3126 3130 B_TRUE);
3127 3131 } else if (mask != 0) {
3128 3132 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3129 3133 &ctime, sizeof (ctime));
3130 3134 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3131 3135 B_TRUE);
3132 3136 if (attrzp) {
3133 3137 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3134 3138 SA_ZPL_CTIME(zfsvfs), NULL,
3135 3139 &ctime, sizeof (ctime));
3136 3140 zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3137 3141 mtime, ctime, B_TRUE);
3138 3142 }
3139 3143 }
3140 3144 /*
3141 3145 * Do this after setting timestamps to prevent timestamp
3142 3146	 * update from toggling the bit.
3143 3147 */
3144 3148
3145 3149 if (xoap && (mask & AT_XVATTR)) {
3146 3150
3147 3151 /*
3148 3152	 * Restore the trimmed-off masks
3149 3153	 * so that the return masks can be set for the caller.
3150 3154 */
3151 3155
3152 3156 if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3153 3157 XVA_SET_REQ(xvap, XAT_APPENDONLY);
3154 3158 }
3155 3159 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3156 3160 XVA_SET_REQ(xvap, XAT_NOUNLINK);
3157 3161 }
3158 3162 if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3159 3163 XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3160 3164 }
3161 3165 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3162 3166 XVA_SET_REQ(xvap, XAT_NODUMP);
3163 3167 }
3164 3168 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3165 3169 XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3166 3170 }
3167 3171 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3168 3172 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3169 3173 }
3170 3174
3171 3175 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3172 3176 ASSERT(vp->v_type == VREG);
3173 3177
3174 3178 zfs_xvattr_set(zp, xvap, tx);
3175 3179 }
3176 3180
3177 3181 if (fuid_dirtied)
3178 3182 zfs_fuid_sync(zfsvfs, tx);
3179 3183
3180 3184 if (mask != 0)
3181 3185 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3182 3186
3183 3187 mutex_exit(&zp->z_lock);
3184 3188 if (mask & (AT_UID|AT_GID|AT_MODE))
3185 3189 mutex_exit(&zp->z_acl_lock);
3186 3190
3187 3191 if (attrzp) {
3188 3192 if (mask & (AT_UID|AT_GID|AT_MODE))
3189 3193 mutex_exit(&attrzp->z_acl_lock);
3190 3194 mutex_exit(&attrzp->z_lock);
3191 3195 }
3192 3196 out:
3193 3197 if (err == 0 && attrzp) {
3194 3198 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3195 3199 xattr_count, tx);
3196 3200 ASSERT(err2 == 0);
3197 3201 }
3198 3202
3199 3203 if (attrzp)
3200 3204 VN_RELE(ZTOV(attrzp));
3201 3205 if (aclp)
3202 3206 zfs_acl_free(aclp);
3203 3207
3204 3208 if (fuidp) {
3205 3209 zfs_fuid_info_free(fuidp);
3206 3210 fuidp = NULL;
3207 3211 }
3208 3212
3209 3213 if (err) {
3210 3214 dmu_tx_abort(tx);
3211 3215 if (err == ERESTART)
3212 3216 goto top;
3213 3217 } else {
3214 3218 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3215 3219 dmu_tx_commit(tx);
3216 3220 }
3217 3221
3218 3222 out2:
3219 3223 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3220 3224 zil_commit(zilog, 0);
3221 3225
3222 3226 ZFS_EXIT(zfsvfs);
3223 3227 return (err);
3224 3228 }
3225 3229
3226 3230 typedef struct zfs_zlock {
3227 3231 krwlock_t *zl_rwlock; /* lock we acquired */
3228 3232 znode_t *zl_znode; /* znode we held */
3229 3233 struct zfs_zlock *zl_next; /* next in list */
3230 3234 } zfs_zlock_t;
3231 3235
3232 3236 /*
3233 3237 * Drop locks and release vnodes that were held by zfs_rename_lock().
3234 3238 */
3235 3239 static void
3236 3240 zfs_rename_unlock(zfs_zlock_t **zlpp)
3237 3241 {
3238 3242 zfs_zlock_t *zl;
3239 3243
3240 3244 while ((zl = *zlpp) != NULL) {
3241 3245 if (zl->zl_znode != NULL)
3242 3246 VN_RELE(ZTOV(zl->zl_znode));
3243 3247 rw_exit(zl->zl_rwlock);
3244 3248 *zlpp = zl->zl_next;
3245 3249 kmem_free(zl, sizeof (*zl));
3246 3250 }
3247 3251 }
3248 3252
3249 3253 /*
3250 3254 * Search back through the directory tree, using the ".." entries.
3251 3255 * Lock each directory in the chain to prevent concurrent renames.
3252 3256 * Fail any attempt to move a directory into one of its own descendants.
3253 3257 * XXX - z_parent_lock can overlap with map or grow locks
3254 3258 */
3255 3259 static int
3256 3260 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
3257 3261 {
3258 3262 zfs_zlock_t *zl;
3259 3263 znode_t *zp = tdzp;
3260 3264 uint64_t rootid = zp->z_zfsvfs->z_root;
3261 3265 uint64_t oidp = zp->z_id;
3262 3266 krwlock_t *rwlp = &szp->z_parent_lock;
3263 3267 krw_t rw = RW_WRITER;
3264 3268
3265 3269 /*
3266 3270 * First pass write-locks szp and compares to zp->z_id.
3267 3271 * Later passes read-lock zp and compare to zp->z_parent.
3268 3272 */
3269 3273 do {
3270 3274 if (!rw_tryenter(rwlp, rw)) {
3271 3275 /*
3272 3276 * Another thread is renaming in this path.
3273 3277 * Note that if we are a WRITER, we don't have any
3274 3278 * parent_locks held yet.
3275 3279 */
3276 3280 if (rw == RW_READER && zp->z_id > szp->z_id) {
3277 3281 /*
3278 3282 * Drop our locks and restart
3279 3283 */
3280 3284 zfs_rename_unlock(&zl);
3281 3285 *zlpp = NULL;
3282 3286 zp = tdzp;
3283 3287 oidp = zp->z_id;
3284 3288 rwlp = &szp->z_parent_lock;
3285 3289 rw = RW_WRITER;
3286 3290 continue;
3287 3291 } else {
3288 3292 /*
3289 3293	 * Wait for the other thread to drop its locks.
3290 3294 */
3291 3295 rw_enter(rwlp, rw);
3292 3296 }
3293 3297 }
3294 3298
3295 3299 zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
3296 3300 zl->zl_rwlock = rwlp;
3297 3301 zl->zl_znode = NULL;
3298 3302 zl->zl_next = *zlpp;
3299 3303 *zlpp = zl;
3300 3304
3301 3305 if (oidp == szp->z_id) /* We're a descendant of szp */
3302 3306 return (EINVAL);
3303 3307
3304 3308 if (oidp == rootid) /* We've hit the top */
3305 3309 return (0);
3306 3310
3307 3311 if (rw == RW_READER) { /* i.e. not the first pass */
3308 3312 int error = zfs_zget(zp->z_zfsvfs, oidp, &zp);
3309 3313 if (error)
3310 3314 return (error);
3311 3315 zl->zl_znode = zp;
3312 3316 }
3313 3317 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
3314 3318 &oidp, sizeof (oidp));
3315 3319 rwlp = &zp->z_parent_lock;
3316 3320 rw = RW_READER;
3317 3321
3318 3322 } while (zp->z_id != sdzp->z_id);
3319 3323
3320 3324 return (0);
3321 3325 }
3322 3326
3323 3327 /*
3324 3328 * Move an entry from the provided source directory to the target
3325 3329 * directory. Change the entry name as indicated.
3326 3330 *
3327 3331 * IN: sdvp - Source directory containing the "old entry".
3328 3332 * snm - Old entry name.
3329 3333 * tdvp - Target directory to contain the "new entry".
3330 3334 * tnm - New entry name.
3331 3335 * cr - credentials of caller.
3332 3336 * ct - caller context
3333 3337 * flags - case flags
3334 3338 *
3335 3339 * RETURN: 0 if success
3336 3340 * error code if failure
3337 3341 *
3338 3342 * Timestamps:
3339 3343 * sdvp,tdvp - ctime|mtime updated
3340 3344 */
3341 3345 /*ARGSUSED*/
3342 3346 static int
3343 3347 zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
3344 3348 caller_context_t *ct, int flags)
3345 3349 {
3346 3350 znode_t *tdzp, *szp, *tzp;
3347 3351 znode_t *sdzp = VTOZ(sdvp);
3348 3352 zfsvfs_t *zfsvfs = sdzp->z_zfsvfs;
3349 3353 zilog_t *zilog;
3350 3354 vnode_t *realvp;
3351 3355 zfs_dirlock_t *sdl, *tdl;
3352 3356 dmu_tx_t *tx;
3353 3357 zfs_zlock_t *zl;
3354 3358 int cmp, serr, terr;
3355 3359 int error = 0;
3356 3360 int zflg = 0;
3357 3361
3358 3362 ZFS_ENTER(zfsvfs);
3359 3363 ZFS_VERIFY_ZP(sdzp);
3360 3364 zilog = zfsvfs->z_log;
3361 3365
3362 3366 /*
3363 3367 * Make sure we have the real vp for the target directory.
3364 3368 */
3365 3369 if (VOP_REALVP(tdvp, &realvp, ct) == 0)
3366 3370 tdvp = realvp;
3367 3371
3368 3372 if (tdvp->v_vfsp != sdvp->v_vfsp || zfsctl_is_node(tdvp)) {
3369 3373 ZFS_EXIT(zfsvfs);
3370 3374 return (EXDEV);
3371 3375 }
3372 3376
3373 3377 tdzp = VTOZ(tdvp);
3374 3378 ZFS_VERIFY_ZP(tdzp);
3375 3379 if (zfsvfs->z_utf8 && u8_validate(tnm,
3376 3380 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3377 3381 ZFS_EXIT(zfsvfs);
3378 3382 return (EILSEQ);
3379 3383 }
3380 3384
3381 3385 if (flags & FIGNORECASE)
3382 3386 zflg |= ZCILOOK;
3383 3387
3384 3388 top:
3385 3389 szp = NULL;
3386 3390 tzp = NULL;
3387 3391 zl = NULL;
3388 3392
3389 3393 /*
3390 3394 * This is to prevent the creation of links into attribute space
3391 3395	 * by renaming a linked file into/out of an attribute directory.
3392 3396 * See the comment in zfs_link() for why this is considered bad.
3393 3397 */
3394 3398 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
3395 3399 ZFS_EXIT(zfsvfs);
3396 3400 return (EINVAL);
3397 3401 }
3398 3402
3399 3403 /*
3400 3404 * Lock source and target directory entries. To prevent deadlock,
3401 3405 * a lock ordering must be defined. We lock the directory with
3402 3406 * the smallest object id first, or if it's a tie, the one with
3403 3407 * the lexically first name.
3404 3408 */
3405 3409 if (sdzp->z_id < tdzp->z_id) {
3406 3410 cmp = -1;
3407 3411 } else if (sdzp->z_id > tdzp->z_id) {
3408 3412 cmp = 1;
3409 3413 } else {
3410 3414 /*
3411 3415 * First compare the two name arguments without
3412 3416 * considering any case folding.
3413 3417 */
3414 3418 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
3415 3419
3416 3420 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
3417 3421 ASSERT(error == 0 || !zfsvfs->z_utf8);
3418 3422 if (cmp == 0) {
3419 3423 /*
3420 3424 * POSIX: "If the old argument and the new argument
3421 3425 * both refer to links to the same existing file,
3422 3426 * the rename() function shall return successfully
3423 3427 * and perform no other action."
3424 3428 */
3425 3429 ZFS_EXIT(zfsvfs);
3426 3430 return (0);
3427 3431 }
3428 3432 /*
3429 3433 * If the file system is case-folding, then we may
3430 3434	 * system either supports mixed case sensitivity
3431 3435	 * access or is completely case-insensitive.  Note
3432 3436	 * that the file system is always case-preserving.
3433 3437 * that the file system is always case preserving.
3434 3438 *
3435 3439 * In mixed sensitivity mode case sensitive behavior
3436 3440 * is the default. FIGNORECASE must be used to
3437 3441 * explicitly request case insensitive behavior.
3438 3442 *
3439 3443 * If the source and target names provided differ only
3440 3444 * by case (e.g., a request to rename 'tim' to 'Tim'),
3441 3445 * we will treat this as a special case in the
3442 3446 * case-insensitive mode: as long as the source name
3443 3447 * is an exact match, we will allow this to proceed as
3444 3448 * a name-change request.
3445 3449 */
3446 3450 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
3447 3451 (zfsvfs->z_case == ZFS_CASE_MIXED &&
3448 3452 flags & FIGNORECASE)) &&
3449 3453 u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
3450 3454 &error) == 0) {
3451 3455 /*
3452 3456	 * Case-preserving rename request; require exact
3453 3457	 * name matches.
3454 3458 */
3455 3459 zflg |= ZCIEXACT;
3456 3460 zflg &= ~ZCILOOK;
3457 3461 }
3458 3462 }
3459 3463
3460 3464 /*
3461 3465 * If the source and destination directories are the same, we should
3462 3466 * grab the z_name_lock of that directory only once.
3463 3467 */
3464 3468 if (sdzp == tdzp) {
3465 3469 zflg |= ZHAVELOCK;
3466 3470 rw_enter(&sdzp->z_name_lock, RW_READER);
3467 3471 }
3468 3472
3469 3473 if (cmp < 0) {
3470 3474 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
3471 3475 ZEXISTS | zflg, NULL, NULL);
3472 3476 terr = zfs_dirent_lock(&tdl,
3473 3477 tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
3474 3478 } else {
3475 3479 terr = zfs_dirent_lock(&tdl,
3476 3480 tdzp, tnm, &tzp, zflg, NULL, NULL);
3477 3481 serr = zfs_dirent_lock(&sdl,
3478 3482 sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
3479 3483 NULL, NULL);
3480 3484 }
3481 3485
3482 3486 if (serr) {
3483 3487 /*
3484 3488 * Source entry invalid or not there.
3485 3489 */
3486 3490 if (!terr) {
3487 3491 zfs_dirent_unlock(tdl);
3488 3492 if (tzp)
3489 3493 VN_RELE(ZTOV(tzp));
3490 3494 }
3491 3495
3492 3496 if (sdzp == tdzp)
3493 3497 rw_exit(&sdzp->z_name_lock);
3494 3498
3495 3499 if (strcmp(snm, "..") == 0)
3496 3500 serr = EINVAL;
3497 3501 ZFS_EXIT(zfsvfs);
3498 3502 return (serr);
3499 3503 }
3500 3504 if (terr) {
3501 3505 zfs_dirent_unlock(sdl);
3502 3506 VN_RELE(ZTOV(szp));
3503 3507
3504 3508 if (sdzp == tdzp)
3505 3509 rw_exit(&sdzp->z_name_lock);
3506 3510
3507 3511 if (strcmp(tnm, "..") == 0)
3508 3512 terr = EINVAL;
3509 3513 ZFS_EXIT(zfsvfs);
3510 3514 return (terr);
3511 3515 }
3512 3516
3513 3517 /*
3514 3518 * Must have write access at the source to remove the old entry
3515 3519 * and write access at the target to create the new entry.
3516 3520 * Note that if target and source are the same, this can be
3517 3521 * done in a single check.
3518 3522 */
3519 3523
3520 3524 if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
3521 3525 goto out;
3522 3526
3523 3527 if (ZTOV(szp)->v_type == VDIR) {
3524 3528 /*
3525 3529 * Check to make sure rename is valid.
3526 3530 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3527 3531 */
3528 3532 if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
3529 3533 goto out;
3530 3534 }
3531 3535
3532 3536 /*
3533 3537 * Does target exist?
3534 3538 */
3535 3539 if (tzp) {
3536 3540 /*
3537 3541 * Source and target must be the same type.
3538 3542 */
3539 3543 if (ZTOV(szp)->v_type == VDIR) {
3540 3544 if (ZTOV(tzp)->v_type != VDIR) {
3541 3545 error = ENOTDIR;
3542 3546 goto out;
3543 3547 }
3544 3548 } else {
3545 3549 if (ZTOV(tzp)->v_type == VDIR) {
3546 3550 error = EISDIR;
3547 3551 goto out;
3548 3552 }
3549 3553 }
3550 3554 /*
3551 3555 * POSIX dictates that when the source and target
3552 3556 * entries refer to the same file object, rename
3553 3557 * must do nothing and exit without error.
3554 3558 */
3555 3559 if (szp->z_id == tzp->z_id) {
3556 3560 error = 0;
3557 3561 goto out;
3558 3562 }
3559 3563 }
3560 3564
3561 3565 vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
3562 3566 if (tzp)
3563 3567 vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
3564 3568
3565 3569 /*
3566 3570 * notify the target directory if it is not the same
3567 3571 * as source directory.
3568 3572 */
3569 3573 if (tdvp != sdvp) {
3570 3574 vnevent_rename_dest_dir(tdvp, ct);
3571 3575 }
3572 3576
3573 3577 tx = dmu_tx_create(zfsvfs->z_os);
3574 3578 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3575 3579 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3576 3580 dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3577 3581 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3578 3582 if (sdzp != tdzp) {
3579 3583 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3580 3584 zfs_sa_upgrade_txholds(tx, tdzp);
3581 3585 }
3582 3586 if (tzp) {
3583 3587 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3584 3588 zfs_sa_upgrade_txholds(tx, tzp);
3585 3589 }
3586 3590
3587 3591 zfs_sa_upgrade_txholds(tx, szp);
3588 3592 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3589 3593 error = dmu_tx_assign(tx, TXG_NOWAIT);
3590 3594 if (error) {
3591 3595 if (zl != NULL)
3592 3596 zfs_rename_unlock(&zl);
3593 3597 zfs_dirent_unlock(sdl);
3594 3598 zfs_dirent_unlock(tdl);
3595 3599
3596 3600 if (sdzp == tdzp)
3597 3601 rw_exit(&sdzp->z_name_lock);
3598 3602
3599 3603 VN_RELE(ZTOV(szp));
3600 3604 if (tzp)
3601 3605 VN_RELE(ZTOV(tzp));
3602 3606 if (error == ERESTART) {
3603 3607 dmu_tx_wait(tx);
3604 3608 dmu_tx_abort(tx);
3605 3609 goto top;
3606 3610 }
3607 3611 dmu_tx_abort(tx);
3608 3612 ZFS_EXIT(zfsvfs);
3609 3613 return (error);
3610 3614 }
3611 3615
3612 3616 if (tzp) /* Attempt to remove the existing target */
3613 3617 error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
3614 3618
3615 3619 if (error == 0) {
3616 3620 error = zfs_link_create(tdl, szp, tx, ZRENAMING);
3617 3621 if (error == 0) {
3618 3622 szp->z_pflags |= ZFS_AV_MODIFIED;
3619 3623
3620 3624 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3621 3625 (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3622 - ASSERT3U(error, ==, 0);
3626 + ASSERT0(error);
3623 3627
3624 3628 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
3625 3629 if (error == 0) {
3626 3630 zfs_log_rename(zilog, tx, TX_RENAME |
3627 3631 (flags & FIGNORECASE ? TX_CI : 0), sdzp,
3628 3632 sdl->dl_name, tdzp, tdl->dl_name, szp);
3629 3633
3630 3634 /*
3631 3635 * Update path information for the target vnode
3632 3636 */
3633 3637 vn_renamepath(tdvp, ZTOV(szp), tnm,
3634 3638 strlen(tnm));
3635 3639 } else {
3636 3640 /*
3637 3641 * At this point, we have successfully created
3638 3642 * the target name, but have failed to remove
3639 3643 * the source name. Since the create was done
3640 3644 * with the ZRENAMING flag, there are
3641 3645 * complications; for one, the link count is
3642 3646 * wrong. The easiest way to deal with this
3643 3647 * is to remove the newly created target, and
3644 3648 * return the original error. This must
3645 3649 * succeed; fortunately, it is very unlikely to
3646 3650 * fail, since we just created it.
3647 3651 */
3648 3652 VERIFY3U(zfs_link_destroy(tdl, szp, tx,
3649 3653 ZRENAMING, NULL), ==, 0);
3650 3654 }
3651 3655 }
3652 3656 }
3653 3657
3654 3658 dmu_tx_commit(tx);
3655 3659 out:
3656 3660 if (zl != NULL)
3657 3661 zfs_rename_unlock(&zl);
3658 3662
3659 3663 zfs_dirent_unlock(sdl);
3660 3664 zfs_dirent_unlock(tdl);
3661 3665
3662 3666 if (sdzp == tdzp)
3663 3667 rw_exit(&sdzp->z_name_lock);
3664 3668
3665 3669
3666 3670 VN_RELE(ZTOV(szp));
3667 3671 if (tzp)
3668 3672 VN_RELE(ZTOV(tzp));
3669 3673
3670 3674 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3671 3675 zil_commit(zilog, 0);
3672 3676
3673 3677 ZFS_EXIT(zfsvfs);
3674 3678 return (error);
3675 3679 }
3676 3680
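The hunk above collapses the open-coded zero check ASSERT3U(error, ==, 0) into the newer ASSERT0(error). As a hedged sketch (the authoritative definitions belong in <sys/debug.h> and may differ in detail), the zero-check forms can be layered directly on the existing three-argument family, so call sites shrink without losing the value-reporting panic message:

	/* Sketch only; assumed formulation, modeled on the ASSERT3/VERIFY3 family. */
	#define	VERIFY0(x)	VERIFY3U((x), ==, 0)
	#ifdef DEBUG
	#define	ASSERT0(x)	VERIFY0(x)
	#else
	#define	ASSERT0(x)	((void)0)	/* compiled out in non-DEBUG, like ASSERT3U */
	#endif

By the same reasoning, the VERIFY3U(zfs_link_destroy(...), ==, 0) call in this hunk is the natural candidate for a matching VERIFY0() form.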
3677 3681 /*
3678 3682 * Insert the indicated symbolic reference entry into the directory.
3679 3683 *
3680 3684 * IN: dvp - Directory to contain new symbolic link.
3681 3685 	 *		name - Name for new symlink entry.
3682 3686 	 *		vap - Attributes of new entry.
3683 3687 	 *		link - Target path of new symlink.
3684 3688 * cr - credentials of caller.
3685 3689 * ct - caller context
3686 3690 * flags - case flags
3687 3691 *
3688 3692 * RETURN: 0 if success
3689 3693 * error code if failure
3690 3694 *
3691 3695 * Timestamps:
3692 3696 * dvp - ctime|mtime updated
3693 3697 */
3694 3698 /*ARGSUSED*/
3695 3699 static int
3696 3700 zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr,
3697 3701 caller_context_t *ct, int flags)
3698 3702 {
3699 3703 znode_t *zp, *dzp = VTOZ(dvp);
3700 3704 zfs_dirlock_t *dl;
3701 3705 dmu_tx_t *tx;
3702 3706 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
3703 3707 zilog_t *zilog;
3704 3708 uint64_t len = strlen(link);
3705 3709 int error;
3706 3710 int zflg = ZNEW;
3707 3711 zfs_acl_ids_t acl_ids;
3708 3712 boolean_t fuid_dirtied;
3709 3713 uint64_t txtype = TX_SYMLINK;
3710 3714
3711 3715 ASSERT(vap->va_type == VLNK);
3712 3716
3713 3717 ZFS_ENTER(zfsvfs);
3714 3718 ZFS_VERIFY_ZP(dzp);
3715 3719 zilog = zfsvfs->z_log;
3716 3720
3717 3721 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3718 3722 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3719 3723 ZFS_EXIT(zfsvfs);
3720 3724 return (EILSEQ);
3721 3725 }
3722 3726 if (flags & FIGNORECASE)
3723 3727 zflg |= ZCILOOK;
3724 3728
3725 3729 if (len > MAXPATHLEN) {
3726 3730 ZFS_EXIT(zfsvfs);
3727 3731 return (ENAMETOOLONG);
3728 3732 }
3729 3733
3730 3734 if ((error = zfs_acl_ids_create(dzp, 0,
3731 3735 vap, cr, NULL, &acl_ids)) != 0) {
3732 3736 ZFS_EXIT(zfsvfs);
3733 3737 return (error);
3734 3738 }
3735 3739 top:
3736 3740 /*
3737 3741 * Attempt to lock directory; fail if entry already exists.
3738 3742 */
3739 3743 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
3740 3744 if (error) {
3741 3745 zfs_acl_ids_free(&acl_ids);
3742 3746 ZFS_EXIT(zfsvfs);
3743 3747 return (error);
3744 3748 }
3745 3749
3746 3750 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
3747 3751 zfs_acl_ids_free(&acl_ids);
3748 3752 zfs_dirent_unlock(dl);
3749 3753 ZFS_EXIT(zfsvfs);
3750 3754 return (error);
3751 3755 }
3752 3756
3753 3757 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
3754 3758 zfs_acl_ids_free(&acl_ids);
3755 3759 zfs_dirent_unlock(dl);
3756 3760 ZFS_EXIT(zfsvfs);
3757 3761 return (EDQUOT);
3758 3762 }
3759 3763 tx = dmu_tx_create(zfsvfs->z_os);
3760 3764 fuid_dirtied = zfsvfs->z_fuid_dirty;
3761 3765 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3762 3766 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3763 3767 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3764 3768 ZFS_SA_BASE_ATTR_SIZE + len);
3765 3769 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
3766 3770 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3767 3771 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3768 3772 acl_ids.z_aclp->z_acl_bytes);
3769 3773 }
3770 3774 if (fuid_dirtied)
3771 3775 zfs_fuid_txhold(zfsvfs, tx);
3772 3776 error = dmu_tx_assign(tx, TXG_NOWAIT);
3773 3777 if (error) {
3774 3778 zfs_dirent_unlock(dl);
3775 3779 if (error == ERESTART) {
3776 3780 dmu_tx_wait(tx);
3777 3781 dmu_tx_abort(tx);
3778 3782 goto top;
3779 3783 }
3780 3784 zfs_acl_ids_free(&acl_ids);
3781 3785 dmu_tx_abort(tx);
3782 3786 ZFS_EXIT(zfsvfs);
3783 3787 return (error);
3784 3788 }
3785 3789
3786 3790 /*
3787 3791 * Create a new object for the symlink.
3788 3792 	 * For version 4 ZPL datasets the symlink will be an SA attribute.
3789 3793 */
3790 3794 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
3791 3795
3792 3796 if (fuid_dirtied)
3793 3797 zfs_fuid_sync(zfsvfs, tx);
3794 3798
3795 3799 mutex_enter(&zp->z_lock);
3796 3800 if (zp->z_is_sa)
3797 3801 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
3798 3802 link, len, tx);
3799 3803 else
3800 3804 zfs_sa_symlink(zp, link, len, tx);
3801 3805 mutex_exit(&zp->z_lock);
3802 3806
3803 3807 zp->z_size = len;
3804 3808 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
3805 3809 &zp->z_size, sizeof (zp->z_size), tx);
3806 3810 /*
3807 3811 * Insert the new object into the directory.
3808 3812 */
3809 3813 (void) zfs_link_create(dl, zp, tx, ZNEW);
3810 3814
3811 3815 if (flags & FIGNORECASE)
3812 3816 txtype |= TX_CI;
3813 3817 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3814 3818
3815 3819 zfs_acl_ids_free(&acl_ids);
3816 3820
3817 3821 dmu_tx_commit(tx);
3818 3822
3819 3823 zfs_dirent_unlock(dl);
3820 3824
3821 3825 VN_RELE(ZTOV(zp));
3822 3826
3823 3827 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3824 3828 zil_commit(zilog, 0);
3825 3829
3826 3830 ZFS_EXIT(zfsvfs);
3827 3831 return (error);
3828 3832 }
3829 3833
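zfs_symlink() shows the same transaction shape used by zfs_rename() above and by most write-side vops in this file: create the tx, declare holds, assign with TXG_NOWAIT, and on ERESTART wait out the throttled txg before jumping back to top:. A distilled sketch of the idiom, with the per-vop holds and work elided:

	static int
	zfs_tx_retry_sketch(objset_t *os)	/* illustrative only */
	{
		dmu_tx_t *tx;
		int error;
	top:
		tx = dmu_tx_create(os);
		/* per-vop dmu_tx_hold_*() calls go here, before assign */
		error = dmu_tx_assign(tx, TXG_NOWAIT);
		if (error) {
			if (error == ERESTART) {
				dmu_tx_wait(tx);	/* stall until a txg can accept us */
				dmu_tx_abort(tx);
				goto top;		/* rebuild holds and retry */
			}
			dmu_tx_abort(tx);		/* hard failure, e.g. ENOSPC */
			return (error);
		}
		/* modify state under tx, log to the ZIL, then commit */
		dmu_tx_commit(tx);
		return (0);
	}

Note the order: dmu_tx_wait() is called on the still-live tx before dmu_tx_abort(), exactly as in the vops above, so the wait can key off the txg the assignment just failed against.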
3830 3834 /*
3831 3835 * Return, in the buffer contained in the provided uio structure,
3832 3836 * the symbolic path referred to by vp.
3833 3837 *
3834 3838 * IN: vp - vnode of symbolic link.
3835 3839 	 *		uio - structure to contain the link path.
3836 3840 * cr - credentials of caller.
3837 3841 * ct - caller context
3838 3842 *
3839 3843 * OUT: uio - structure to contain the link path.
3840 3844 *
3841 3845 * RETURN: 0 if success
3842 3846 * error code if failure
3843 3847 *
3844 3848 * Timestamps:
3845 3849 * vp - atime updated
3846 3850 */
3847 3851 /* ARGSUSED */
3848 3852 static int
3849 3853 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
3850 3854 {
3851 3855 znode_t *zp = VTOZ(vp);
3852 3856 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3853 3857 int error;
3854 3858
3855 3859 ZFS_ENTER(zfsvfs);
3856 3860 ZFS_VERIFY_ZP(zp);
3857 3861
3858 3862 mutex_enter(&zp->z_lock);
3859 3863 if (zp->z_is_sa)
3860 3864 error = sa_lookup_uio(zp->z_sa_hdl,
3861 3865 SA_ZPL_SYMLINK(zfsvfs), uio);
3862 3866 else
3863 3867 error = zfs_sa_readlink(zp, uio);
3864 3868 mutex_exit(&zp->z_lock);
3865 3869
3866 3870 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
3867 3871
3868 3872 ZFS_EXIT(zfsvfs);
3869 3873 return (error);
3870 3874 }
3871 3875
3872 3876 /*
3873 3877 * Insert a new entry into directory tdvp referencing svp.
3874 3878 *
3875 3879 * IN: tdvp - Directory to contain new entry.
3876 3880 * svp - vnode of new entry.
3877 3881 * name - name of new entry.
3878 3882 * cr - credentials of caller.
3879 3883 * ct - caller context
3880 3884 *
3881 3885 * RETURN: 0 if success
3882 3886 * error code if failure
3883 3887 *
3884 3888 * Timestamps:
3885 3889 * tdvp - ctime|mtime updated
3886 3890 * svp - ctime updated
3887 3891 */
3888 3892 /* ARGSUSED */
3889 3893 static int
3890 3894 zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
3891 3895 caller_context_t *ct, int flags)
3892 3896 {
3893 3897 znode_t *dzp = VTOZ(tdvp);
3894 3898 znode_t *tzp, *szp;
3895 3899 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
3896 3900 zilog_t *zilog;
3897 3901 zfs_dirlock_t *dl;
3898 3902 dmu_tx_t *tx;
3899 3903 vnode_t *realvp;
3900 3904 int error;
3901 3905 int zf = ZNEW;
3902 3906 uint64_t parent;
3903 3907 uid_t owner;
3904 3908
3905 3909 ASSERT(tdvp->v_type == VDIR);
3906 3910
3907 3911 ZFS_ENTER(zfsvfs);
3908 3912 ZFS_VERIFY_ZP(dzp);
3909 3913 zilog = zfsvfs->z_log;
3910 3914
3911 3915 if (VOP_REALVP(svp, &realvp, ct) == 0)
3912 3916 svp = realvp;
3913 3917
3914 3918 /*
3915 3919 * POSIX dictates that we return EPERM here.
3916 3920 * Better choices include ENOTSUP or EISDIR.
3917 3921 */
3918 3922 if (svp->v_type == VDIR) {
3919 3923 ZFS_EXIT(zfsvfs);
3920 3924 return (EPERM);
3921 3925 }
3922 3926
3923 3927 if (svp->v_vfsp != tdvp->v_vfsp || zfsctl_is_node(svp)) {
3924 3928 ZFS_EXIT(zfsvfs);
3925 3929 return (EXDEV);
3926 3930 }
3927 3931
3928 3932 szp = VTOZ(svp);
3929 3933 ZFS_VERIFY_ZP(szp);
3930 3934
3931 3935 /* Prevent links to .zfs/shares files */
3932 3936
3933 3937 if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
3934 3938 &parent, sizeof (uint64_t))) != 0) {
3935 3939 ZFS_EXIT(zfsvfs);
3936 3940 return (error);
3937 3941 }
3938 3942 if (parent == zfsvfs->z_shares_dir) {
3939 3943 ZFS_EXIT(zfsvfs);
3940 3944 return (EPERM);
3941 3945 }
3942 3946
3943 3947 if (zfsvfs->z_utf8 && u8_validate(name,
3944 3948 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3945 3949 ZFS_EXIT(zfsvfs);
3946 3950 return (EILSEQ);
3947 3951 }
3948 3952 if (flags & FIGNORECASE)
3949 3953 zf |= ZCILOOK;
3950 3954
3951 3955 /*
3952 3956 * We do not support links between attributes and non-attributes
3953 3957 * because of the potential security risk of creating links
3954 3958 * into "normal" file space in order to circumvent restrictions
3955 3959 * imposed in attribute space.
3956 3960 */
3957 3961 if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
3958 3962 ZFS_EXIT(zfsvfs);
3959 3963 return (EINVAL);
3960 3964 }
3961 3965
3962 3966
3963 3967 owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
3964 3968 if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
3965 3969 ZFS_EXIT(zfsvfs);
3966 3970 return (EPERM);
3967 3971 }
3968 3972
3969 3973 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
3970 3974 ZFS_EXIT(zfsvfs);
3971 3975 return (error);
3972 3976 }
3973 3977
3974 3978 top:
3975 3979 /*
3976 3980 * Attempt to lock directory; fail if entry already exists.
3977 3981 */
3978 3982 error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
3979 3983 if (error) {
3980 3984 ZFS_EXIT(zfsvfs);
3981 3985 return (error);
3982 3986 }
3983 3987
3984 3988 tx = dmu_tx_create(zfsvfs->z_os);
3985 3989 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3986 3990 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3987 3991 zfs_sa_upgrade_txholds(tx, szp);
3988 3992 zfs_sa_upgrade_txholds(tx, dzp);
3989 3993 error = dmu_tx_assign(tx, TXG_NOWAIT);
3990 3994 if (error) {
3991 3995 zfs_dirent_unlock(dl);
3992 3996 if (error == ERESTART) {
3993 3997 dmu_tx_wait(tx);
3994 3998 dmu_tx_abort(tx);
3995 3999 goto top;
3996 4000 }
3997 4001 dmu_tx_abort(tx);
3998 4002 ZFS_EXIT(zfsvfs);
3999 4003 return (error);
4000 4004 }
4001 4005
4002 4006 error = zfs_link_create(dl, szp, tx, 0);
4003 4007
4004 4008 if (error == 0) {
4005 4009 uint64_t txtype = TX_LINK;
4006 4010 if (flags & FIGNORECASE)
4007 4011 txtype |= TX_CI;
4008 4012 zfs_log_link(zilog, tx, txtype, dzp, szp, name);
4009 4013 }
4010 4014
4011 4015 dmu_tx_commit(tx);
4012 4016
4013 4017 zfs_dirent_unlock(dl);
4014 4018
4015 4019 if (error == 0) {
4016 4020 vnevent_link(svp, ct);
4017 4021 }
4018 4022
4019 4023 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4020 4024 zil_commit(zilog, 0);
4021 4025
4022 4026 ZFS_EXIT(zfsvfs);
4023 4027 return (error);
4024 4028 }
4025 4029
4026 4030 /*
4027 4031 * zfs_null_putapage() is used when the file system has been force
4028 4032 * unmounted. It just drops the pages.
4029 4033 */
4030 4034 /* ARGSUSED */
4031 4035 static int
4032 4036 zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4033 4037 size_t *lenp, int flags, cred_t *cr)
4034 4038 {
4035 4039 pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
4036 4040 return (0);
4037 4041 }
4038 4042
4039 4043 /*
4040 4044 * Push a page out to disk, klustering if possible.
4041 4045 *
4042 4046 * IN: vp - file to push page to.
4043 4047 * pp - page to push.
4044 4048 * flags - additional flags.
4045 4049 * cr - credentials of caller.
4046 4050 *
4047 4051 * OUT: offp - start of range pushed.
4048 4052 * lenp - len of range pushed.
4049 4053 *
4050 4054 * RETURN: 0 if success
4051 4055 * error code if failure
4052 4056 *
4053 4057 * NOTE: callers must have locked the page to be pushed. On
4054 4058 * exit, the page (and all other pages in the kluster) must be
4055 4059 * unlocked.
4056 4060 */
4057 4061 /* ARGSUSED */
4058 4062 static int
4059 4063 zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4060 4064 size_t *lenp, int flags, cred_t *cr)
4061 4065 {
4062 4066 znode_t *zp = VTOZ(vp);
4063 4067 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4064 4068 dmu_tx_t *tx;
4065 4069 u_offset_t off, koff;
4066 4070 size_t len, klen;
4067 4071 int err;
4068 4072
4069 4073 off = pp->p_offset;
4070 4074 len = PAGESIZE;
4071 4075 /*
4072 4076 * If our blocksize is bigger than the page size, try to kluster
4073 4077 * multiple pages so that we write a full block (thus avoiding
4074 4078 * a read-modify-write).
4075 4079 */
4076 4080 if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
4077 4081 klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
4078 4082 koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
4079 4083 ASSERT(koff <= zp->z_size);
4080 4084 if (koff + klen > zp->z_size)
4081 4085 klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
4082 4086 pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
4083 4087 }
4084 4088 ASSERT3U(btop(len), ==, btopr(len));
4085 4089
4086 4090 /*
4087 4091 * Can't push pages past end-of-file.
4088 4092 */
4089 4093 if (off >= zp->z_size) {
4090 4094 /* ignore all pages */
4091 4095 err = 0;
4092 4096 goto out;
4093 4097 } else if (off + len > zp->z_size) {
4094 4098 int npages = btopr(zp->z_size - off);
4095 4099 page_t *trunc;
4096 4100
4097 4101 page_list_break(&pp, &trunc, npages);
4098 4102 /* ignore pages past end of file */
4099 4103 if (trunc)
4100 4104 pvn_write_done(trunc, flags);
4101 4105 len = zp->z_size - off;
4102 4106 }
4103 4107
4104 4108 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4105 4109 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4106 4110 err = EDQUOT;
4107 4111 goto out;
4108 4112 }
4109 4113 top:
4110 4114 tx = dmu_tx_create(zfsvfs->z_os);
4111 4115 dmu_tx_hold_write(tx, zp->z_id, off, len);
4112 4116
4113 4117 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4114 4118 zfs_sa_upgrade_txholds(tx, zp);
4115 4119 err = dmu_tx_assign(tx, TXG_NOWAIT);
4116 4120 if (err != 0) {
4117 4121 if (err == ERESTART) {
4118 4122 dmu_tx_wait(tx);
4119 4123 dmu_tx_abort(tx);
4120 4124 goto top;
4121 4125 }
4122 4126 dmu_tx_abort(tx);
4123 4127 goto out;
4124 4128 }
4125 4129
4126 4130 if (zp->z_blksz <= PAGESIZE) {
4127 4131 caddr_t va = zfs_map_page(pp, S_READ);
4128 4132 ASSERT3U(len, <=, PAGESIZE);
4129 4133 dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
4130 4134 zfs_unmap_page(pp, va);
4131 4135 } else {
4132 4136 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
4133 4137 }
4134 4138
4135 4139 if (err == 0) {
4136 4140 uint64_t mtime[2], ctime[2];
4137 4141 sa_bulk_attr_t bulk[3];
4138 4142 int count = 0;
4139 4143
4140 4144 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4141 4145 &mtime, 16);
4142 4146 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4143 4147 &ctime, 16);
4144 4148 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4145 4149 &zp->z_pflags, 8);
4146 4150 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
4147 4151 B_TRUE);
4148 4152 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
4149 4153 }
4150 4154 dmu_tx_commit(tx);
4151 4155
4152 4156 out:
4153 4157 pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
4154 4158 if (offp)
4155 4159 *offp = off;
4156 4160 if (lenp)
4157 4161 *lenp = len;
4158 4162
4159 4163 return (err);
4160 4164 }
4161 4165
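The kluster arithmetic above (and the alignment logic in zfs_putpage() below) leans on the power-of-two helpers from <sys/sysmacros.h>. Paraphrased here for reference, on the assumption that align is a power of two:

	#define	ISP2(x)			(((x) & ((x) - 1)) == 0)
	#define	P2ALIGN(x, align)	((x) & -(align))	/* round down */
	#define	P2PHASE(x, align)	((x) & ((align) - 1))	/* offset within chunk */
	#define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))	/* round up */

For example, with a 128K (0x20000) block size and 4K pages, a dirty page at offset 0x21000 klusters to koff = P2ALIGN(0x21000, 0x20000) = 0x20000 and klen = 0x20000, so the whole block goes out in one write and the read-modify-write is avoided.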
4162 4166 /*
4163 4167 * Copy the portion of the file indicated from pages into the file.
4164 4168 	 * The pages are stored in a page list attached to the file's vnode.
4165 4169 *
4166 4170 * IN: vp - vnode of file to push page data to.
4167 4171 * off - position in file to put data.
4168 4172 * len - amount of data to write.
4169 4173 * flags - flags to control the operation.
4170 4174 * cr - credentials of caller.
4171 4175 * ct - caller context.
4172 4176 *
4173 4177 * RETURN: 0 if success
4174 4178 * error code if failure
4175 4179 *
4176 4180 * Timestamps:
4177 4181 * vp - ctime|mtime updated
4178 4182 */
4179 4183 /*ARGSUSED*/
4180 4184 static int
4181 4185 zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4182 4186 caller_context_t *ct)
4183 4187 {
4184 4188 znode_t *zp = VTOZ(vp);
4185 4189 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4186 4190 page_t *pp;
4187 4191 size_t io_len;
4188 4192 u_offset_t io_off;
4189 4193 uint_t blksz;
4190 4194 rl_t *rl;
4191 4195 int error = 0;
4192 4196
4193 4197 ZFS_ENTER(zfsvfs);
4194 4198 ZFS_VERIFY_ZP(zp);
4195 4199
4196 4200 /*
4197 4201 * There's nothing to do if no data is cached.
4198 4202 */
4199 4203 if (!vn_has_cached_data(vp)) {
4200 4204 ZFS_EXIT(zfsvfs);
4201 4205 return (0);
4202 4206 }
4203 4207
4204 4208 /*
4205 4209 * Align this request to the file block size in case we kluster.
4206 4210 	 * XXX - this can result in pretty aggressive locking, which can
4207 4211 	 * impact simultaneous read/write access. One option might be
4208 4212 * to break up long requests (len == 0) into block-by-block
4209 4213 * operations to get narrower locking.
4210 4214 */
4211 4215 blksz = zp->z_blksz;
4212 4216 if (ISP2(blksz))
4213 4217 io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
4214 4218 else
4215 4219 io_off = 0;
4216 4220 if (len > 0 && ISP2(blksz))
4217 4221 io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
4218 4222 else
4219 4223 io_len = 0;
4220 4224
4221 4225 if (io_len == 0) {
4222 4226 /*
4223 4227 * Search the entire vp list for pages >= io_off.
4224 4228 */
4225 4229 rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
4226 4230 error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
4227 4231 goto out;
4228 4232 }
4229 4233 rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
4230 4234
4231 4235 if (off > zp->z_size) {
4232 4236 /* past end of file */
4233 4237 zfs_range_unlock(rl);
4234 4238 ZFS_EXIT(zfsvfs);
4235 4239 return (0);
4236 4240 }
4237 4241
4238 4242 len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
4239 4243
4240 4244 for (off = io_off; io_off < off + len; io_off += io_len) {
4241 4245 if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
4242 4246 pp = page_lookup(vp, io_off,
4243 4247 (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
4244 4248 } else {
4245 4249 pp = page_lookup_nowait(vp, io_off,
4246 4250 (flags & B_FREE) ? SE_EXCL : SE_SHARED);
4247 4251 }
4248 4252
4249 4253 if (pp != NULL && pvn_getdirty(pp, flags)) {
4250 4254 int err;
4251 4255
4252 4256 /*
4253 4257 * Found a dirty page to push
4254 4258 */
4255 4259 err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
4256 4260 if (err)
4257 4261 error = err;
4258 4262 } else {
4259 4263 io_len = PAGESIZE;
4260 4264 }
4261 4265 }
4262 4266 out:
4263 4267 zfs_range_unlock(rl);
4264 4268 if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4265 4269 zil_commit(zfsvfs->z_log, zp->z_id);
4266 4270 ZFS_EXIT(zfsvfs);
4267 4271 return (error);
4268 4272 }
4269 4273
4270 4274 /*ARGSUSED*/
4271 4275 void
4272 4276 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4273 4277 {
4274 4278 znode_t *zp = VTOZ(vp);
4275 4279 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4276 4280 int error;
4277 4281
4278 4282 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4279 4283 if (zp->z_sa_hdl == NULL) {
4280 4284 /*
4281 4285 * The fs has been unmounted, or we did a
4282 4286 * suspend/resume and this file no longer exists.
4283 4287 */
4284 4288 if (vn_has_cached_data(vp)) {
4285 4289 (void) pvn_vplist_dirty(vp, 0, zfs_null_putapage,
4286 4290 B_INVAL, cr);
4287 4291 }
4288 4292
4289 4293 mutex_enter(&zp->z_lock);
4290 4294 mutex_enter(&vp->v_lock);
4291 4295 ASSERT(vp->v_count == 1);
4292 4296 vp->v_count = 0;
4293 4297 mutex_exit(&vp->v_lock);
4294 4298 mutex_exit(&zp->z_lock);
4295 4299 rw_exit(&zfsvfs->z_teardown_inactive_lock);
4296 4300 zfs_znode_free(zp);
4297 4301 return;
4298 4302 }
4299 4303
4300 4304 /*
4301 4305 * Attempt to push any data in the page cache. If this fails
4302 4306 * we will get kicked out later in zfs_zinactive().
4303 4307 */
4304 4308 if (vn_has_cached_data(vp)) {
4305 4309 (void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC,
4306 4310 cr);
4307 4311 }
4308 4312
4309 4313 if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4310 4314 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4311 4315
4312 4316 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4313 4317 zfs_sa_upgrade_txholds(tx, zp);
4314 4318 error = dmu_tx_assign(tx, TXG_WAIT);
4315 4319 if (error) {
4316 4320 dmu_tx_abort(tx);
4317 4321 } else {
4318 4322 mutex_enter(&zp->z_lock);
4319 4323 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4320 4324 (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4321 4325 zp->z_atime_dirty = 0;
4322 4326 mutex_exit(&zp->z_lock);
4323 4327 dmu_tx_commit(tx);
4324 4328 }
4325 4329 }
4326 4330
4327 4331 zfs_zinactive(zp);
4328 4332 rw_exit(&zfsvfs->z_teardown_inactive_lock);
4329 4333 }
4330 4334
4331 4335 /*
4332 4336 * Bounds-check the seek operation.
4333 4337 *
4334 4338 * IN: vp - vnode seeking within
4335 4339 * ooff - old file offset
4336 4340 * noffp - pointer to new file offset
4337 4341 * ct - caller context
4338 4342 *
4339 4343 * RETURN: 0 if success
4340 4344 * EINVAL if new offset invalid
4341 4345 */
4342 4346 /* ARGSUSED */
4343 4347 static int
4344 4348 zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
4345 4349 caller_context_t *ct)
4346 4350 {
4347 4351 if (vp->v_type == VDIR)
4348 4352 return (0);
4349 4353 return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
4350 4354 }
4351 4355
4352 4356 /*
4353 4357 * Pre-filter the generic locking function to trap attempts to place
4354 4358 * a mandatory lock on a memory mapped file.
4355 4359 */
4356 4360 static int
4357 4361 zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
4358 4362 flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
4359 4363 {
4360 4364 znode_t *zp = VTOZ(vp);
4361 4365 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4362 4366
4363 4367 ZFS_ENTER(zfsvfs);
4364 4368 ZFS_VERIFY_ZP(zp);
4365 4369
4366 4370 /*
4367 4371 * We are following the UFS semantics with respect to mapcnt
4368 4372 * here: If we see that the file is mapped already, then we will
4369 4373 * return an error, but we don't worry about races between this
4370 4374 * function and zfs_map().
4371 4375 */
4372 4376 if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
4373 4377 ZFS_EXIT(zfsvfs);
4374 4378 return (EAGAIN);
4375 4379 }
4376 4380 ZFS_EXIT(zfsvfs);
4377 4381 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
4378 4382 }
4379 4383
4380 4384 /*
4381 4385 * If we can't find a page in the cache, we will create a new page
4382 4386 * and fill it with file data. For efficiency, we may try to fill
4383 4387 * multiple pages at once (klustering) to fill up the supplied page
4384 4388 * list. Note that the pages to be filled are held with an exclusive
4385 4389 * lock to prevent access by other threads while they are being filled.
4386 4390 */
4387 4391 static int
4388 4392 zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
4389 4393 caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
4390 4394 {
4391 4395 znode_t *zp = VTOZ(vp);
4392 4396 page_t *pp, *cur_pp;
4393 4397 objset_t *os = zp->z_zfsvfs->z_os;
4394 4398 u_offset_t io_off, total;
4395 4399 size_t io_len;
4396 4400 int err;
4397 4401
4398 4402 if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
4399 4403 /*
4400 4404 * We only have a single page, don't bother klustering
4401 4405 */
4402 4406 io_off = off;
4403 4407 io_len = PAGESIZE;
4404 4408 pp = page_create_va(vp, io_off, io_len,
4405 4409 PG_EXCL | PG_WAIT, seg, addr);
4406 4410 } else {
4407 4411 /*
4408 4412 * Try to find enough pages to fill the page list
4409 4413 */
4410 4414 pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
4411 4415 &io_len, off, plsz, 0);
4412 4416 }
4413 4417 if (pp == NULL) {
4414 4418 /*
4415 4419 * The page already exists, nothing to do here.
4416 4420 */
4417 4421 *pl = NULL;
4418 4422 return (0);
4419 4423 }
4420 4424
4421 4425 /*
4422 4426 * Fill the pages in the kluster.
4423 4427 */
4424 4428 cur_pp = pp;
4425 4429 for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
4426 4430 caddr_t va;
4427 4431
4428 4432 ASSERT3U(io_off, ==, cur_pp->p_offset);
4429 4433 va = zfs_map_page(cur_pp, S_WRITE);
4430 4434 err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
4431 4435 DMU_READ_PREFETCH);
4432 4436 zfs_unmap_page(cur_pp, va);
4433 4437 if (err) {
4434 4438 /* On error, toss the entire kluster */
4435 4439 pvn_read_done(pp, B_ERROR);
4436 4440 /* convert checksum errors into IO errors */
4437 4441 if (err == ECKSUM)
4438 4442 err = EIO;
4439 4443 return (err);
4440 4444 }
4441 4445 cur_pp = cur_pp->p_next;
4442 4446 }
4443 4447
4444 4448 /*
4445 4449 * Fill in the page list array from the kluster starting
4446 4450 * from the desired offset `off'.
4447 4451 * NOTE: the page list will always be null terminated.
4448 4452 */
4449 4453 pvn_plist_init(pp, pl, plsz, off, io_len, rw);
4450 4454 ASSERT(pl == NULL || (*pl)->p_offset == off);
4451 4455
4452 4456 return (0);
4453 4457 }
4454 4458
4455 4459 /*
4456 4460 * Return pointers to the pages for the file region [off, off + len]
4457 4461 * in the pl array. If plsz is greater than len, this function may
4458 4462 * also return page pointers from after the specified region
4459 4463 * (i.e. the region [off, off + plsz]). These additional pages are
4460 4464 * only returned if they are already in the cache, or were created as
4461 4465 * part of a klustered read.
4462 4466 *
4463 4467 * IN: vp - vnode of file to get data from.
4464 4468 * off - position in file to get data from.
4465 4469 * len - amount of data to retrieve.
4466 4470 * plsz - length of provided page list.
4467 4471 * seg - segment to obtain pages for.
4468 4472 * addr - virtual address of fault.
4469 4473 * rw - mode of created pages.
4470 4474 * cr - credentials of caller.
4471 4475 * ct - caller context.
4472 4476 *
4473 4477 * OUT: protp - protection mode of created pages.
4474 4478 * pl - list of pages created.
4475 4479 *
4476 4480 * RETURN: 0 if success
4477 4481 * error code if failure
4478 4482 *
4479 4483 * Timestamps:
4480 4484 * vp - atime updated
4481 4485 */
4482 4486 /* ARGSUSED */
4483 4487 static int
4484 4488 zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
4485 4489 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4486 4490 enum seg_rw rw, cred_t *cr, caller_context_t *ct)
4487 4491 {
4488 4492 znode_t *zp = VTOZ(vp);
4489 4493 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4490 4494 page_t **pl0 = pl;
4491 4495 int err = 0;
4492 4496
4493 4497 /* we do our own caching, faultahead is unnecessary */
4494 4498 if (pl == NULL)
4495 4499 return (0);
4496 4500 else if (len > plsz)
4497 4501 len = plsz;
4498 4502 else
4499 4503 len = P2ROUNDUP(len, PAGESIZE);
4500 4504 ASSERT(plsz >= len);
4501 4505
4502 4506 ZFS_ENTER(zfsvfs);
4503 4507 ZFS_VERIFY_ZP(zp);
4504 4508
4505 4509 if (protp)
4506 4510 *protp = PROT_ALL;
4507 4511
4508 4512 /*
4509 4513 * Loop through the requested range [off, off + len) looking
4510 4514 * for pages. If we don't find a page, we will need to create
4511 4515 * a new page and fill it with data from the file.
4512 4516 */
4513 4517 while (len > 0) {
4514 4518 if (*pl = page_lookup(vp, off, SE_SHARED))
4515 4519 *(pl+1) = NULL;
4516 4520 else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
4517 4521 goto out;
4518 4522 while (*pl) {
4519 4523 ASSERT3U((*pl)->p_offset, ==, off);
4520 4524 off += PAGESIZE;
4521 4525 addr += PAGESIZE;
4522 4526 if (len > 0) {
4523 4527 ASSERT3U(len, >=, PAGESIZE);
4524 4528 len -= PAGESIZE;
4525 4529 }
4526 4530 ASSERT3U(plsz, >=, PAGESIZE);
4527 4531 plsz -= PAGESIZE;
4528 4532 pl++;
4529 4533 }
4530 4534 }
4531 4535
4532 4536 /*
4533 4537 * Fill out the page array with any pages already in the cache.
4534 4538 */
4535 4539 while (plsz > 0 &&
4536 4540 (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
4537 4541 off += PAGESIZE;
4538 4542 plsz -= PAGESIZE;
4539 4543 }
4540 4544 out:
4541 4545 if (err) {
4542 4546 /*
4543 4547 * Release any pages we have previously locked.
4544 4548 */
4545 4549 while (pl > pl0)
4546 4550 page_unlock(*--pl);
4547 4551 } else {
4548 4552 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4549 4553 }
4550 4554
4551 4555 *pl = NULL;
4552 4556
4553 4557 ZFS_EXIT(zfsvfs);
4554 4558 return (err);
4555 4559 }
4556 4560
4557 4561 /*
4558 4562 * Request a memory map for a section of a file. This code interacts
4559 4563 * with common code and the VM system as follows:
4560 4564 *
4561 4565 * common code calls mmap(), which ends up in smmap_common()
4562 4566 *
4563 4567 * this calls VOP_MAP(), which takes you into (say) zfs
4564 4568 *
4565 4569 * zfs_map() calls as_map(), passing segvn_create() as the callback
4566 4570 *
4567 4571 * segvn_create() creates the new segment and calls VOP_ADDMAP()
4568 4572 *
4569 4573 * zfs_addmap() updates z_mapcnt
4570 4574 */
4571 4575 /*ARGSUSED*/
4572 4576 static int
4573 4577 zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
4574 4578 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4575 4579 caller_context_t *ct)
4576 4580 {
4577 4581 znode_t *zp = VTOZ(vp);
4578 4582 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4579 4583 segvn_crargs_t vn_a;
4580 4584 int error;
4581 4585
4582 4586 ZFS_ENTER(zfsvfs);
4583 4587 ZFS_VERIFY_ZP(zp);
4584 4588
4585 4589 if ((prot & PROT_WRITE) && (zp->z_pflags &
4586 4590 (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
4587 4591 ZFS_EXIT(zfsvfs);
4588 4592 return (EPERM);
4589 4593 }
4590 4594
4591 4595 if ((prot & (PROT_READ | PROT_EXEC)) &&
4592 4596 (zp->z_pflags & ZFS_AV_QUARANTINED)) {
4593 4597 ZFS_EXIT(zfsvfs);
4594 4598 return (EACCES);
4595 4599 }
4596 4600
4597 4601 if (vp->v_flag & VNOMAP) {
4598 4602 ZFS_EXIT(zfsvfs);
4599 4603 return (ENOSYS);
4600 4604 }
4601 4605
4602 4606 if (off < 0 || len > MAXOFFSET_T - off) {
4603 4607 ZFS_EXIT(zfsvfs);
4604 4608 return (ENXIO);
4605 4609 }
4606 4610
4607 4611 if (vp->v_type != VREG) {
4608 4612 ZFS_EXIT(zfsvfs);
4609 4613 return (ENODEV);
4610 4614 }
4611 4615
4612 4616 /*
4613 4617 * If file is locked, disallow mapping.
4614 4618 */
4615 4619 if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
4616 4620 ZFS_EXIT(zfsvfs);
4617 4621 return (EAGAIN);
4618 4622 }
4619 4623
4620 4624 as_rangelock(as);
4621 4625 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
4622 4626 if (error != 0) {
4623 4627 as_rangeunlock(as);
4624 4628 ZFS_EXIT(zfsvfs);
4625 4629 return (error);
4626 4630 }
4627 4631
4628 4632 vn_a.vp = vp;
4629 4633 vn_a.offset = (u_offset_t)off;
4630 4634 vn_a.type = flags & MAP_TYPE;
4631 4635 vn_a.prot = prot;
4632 4636 vn_a.maxprot = maxprot;
4633 4637 vn_a.cred = cr;
4634 4638 vn_a.amp = NULL;
4635 4639 vn_a.flags = flags & ~MAP_TYPE;
4636 4640 vn_a.szc = 0;
4637 4641 vn_a.lgrp_mem_policy_flags = 0;
4638 4642
4639 4643 error = as_map(as, *addrp, len, segvn_create, &vn_a);
4640 4644
4641 4645 as_rangeunlock(as);
4642 4646 ZFS_EXIT(zfsvfs);
4643 4647 return (error);
4644 4648 }
4645 4649
4646 4650 /* ARGSUSED */
4647 4651 static int
4648 4652 zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4649 4653 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4650 4654 caller_context_t *ct)
4651 4655 {
4652 4656 uint64_t pages = btopr(len);
4653 4657
4654 4658 atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
4655 4659 return (0);
4656 4660 }
4657 4661
4658 4662 /*
4659 4663 * The reason we push dirty pages as part of zfs_delmap() is so that we get a
4660 4664 * more accurate mtime for the associated file. Since we don't have a way of
4661 4665 * detecting when the data was actually modified, we have to resort to
4662 4666 * heuristics. If an explicit msync() is done, then we mark the mtime when the
4663 4667 * last page is pushed. The problem occurs when the msync() call is omitted,
4664 4668 	 * which is by far the most common case:
4665 4669 *
4666 4670 * open()
4667 4671 * mmap()
4668 4672 * <modify memory>
4669 4673 * munmap()
4670 4674 * close()
4671 4675 * <time lapse>
4672 4676 * putpage() via fsflush
4673 4677 *
4674 4678 * If we wait until fsflush to come along, we can have a modification time that
4675 4679 * is some arbitrary point in the future. In order to prevent this in the
4676 4680 * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
4677 4681 * torn down.
4678 4682 */
4679 4683 /* ARGSUSED */
4680 4684 static int
4681 4685 zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4682 4686 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
4683 4687 caller_context_t *ct)
4684 4688 {
4685 4689 uint64_t pages = btopr(len);
4686 4690
4687 4691 ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
4688 4692 atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
4689 4693
4690 4694 if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
4691 4695 vn_has_cached_data(vp))
4692 4696 (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
4693 4697
4694 4698 return (0);
4695 4699 }
4696 4700
4697 4701 /*
4698 4702 * Free or allocate space in a file. Currently, this function only
4699 4703 * supports the `F_FREESP' command. However, this command is somewhat
4700 4704 * misnamed, as its functionality includes the ability to allocate as
4701 4705 * well as free space.
4702 4706 *
4703 4707 * IN: vp - vnode of file to free data in.
4704 4708 * cmd - action to take (only F_FREESP supported).
4705 4709 * bfp - section of file to free/alloc.
4706 4710 * flag - current file open mode flags.
4707 4711 * offset - current file offset.
4708 4712 * cr - credentials of caller [UNUSED].
4709 4713 * ct - caller context.
4710 4714 *
4711 4715 * RETURN: 0 if success
4712 4716 * error code if failure
4713 4717 *
4714 4718 * Timestamps:
4715 4719 * vp - ctime|mtime updated
4716 4720 */
4717 4721 /* ARGSUSED */
4718 4722 static int
4719 4723 zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
4720 4724 offset_t offset, cred_t *cr, caller_context_t *ct)
4721 4725 {
4722 4726 znode_t *zp = VTOZ(vp);
4723 4727 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4724 4728 uint64_t off, len;
4725 4729 int error;
4726 4730
4727 4731 ZFS_ENTER(zfsvfs);
4728 4732 ZFS_VERIFY_ZP(zp);
4729 4733
4730 4734 if (cmd != F_FREESP) {
4731 4735 ZFS_EXIT(zfsvfs);
4732 4736 return (EINVAL);
4733 4737 }
4734 4738
4735 4739 if (error = convoff(vp, bfp, 0, offset)) {
4736 4740 ZFS_EXIT(zfsvfs);
4737 4741 return (error);
4738 4742 }
4739 4743
4740 4744 if (bfp->l_len < 0) {
4741 4745 ZFS_EXIT(zfsvfs);
4742 4746 return (EINVAL);
4743 4747 }
4744 4748
4745 4749 off = bfp->l_start;
4746 4750 len = bfp->l_len; /* 0 means from off to end of file */
4747 4751
4748 4752 error = zfs_freesp(zp, off, len, flag, TRUE);
4749 4753
4750 4754 ZFS_EXIT(zfsvfs);
4751 4755 return (error);
4752 4756 }
4753 4757
4754 4758 /*ARGSUSED*/
4755 4759 static int
4756 4760 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4757 4761 {
4758 4762 znode_t *zp = VTOZ(vp);
4759 4763 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4760 4764 uint32_t gen;
4761 4765 uint64_t gen64;
4762 4766 uint64_t object = zp->z_id;
4763 4767 zfid_short_t *zfid;
4764 4768 int size, i, error;
4765 4769
4766 4770 ZFS_ENTER(zfsvfs);
4767 4771 ZFS_VERIFY_ZP(zp);
4768 4772
4769 4773 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4770 4774 &gen64, sizeof (uint64_t))) != 0) {
4771 4775 ZFS_EXIT(zfsvfs);
4772 4776 return (error);
4773 4777 }
4774 4778
4775 4779 gen = (uint32_t)gen64;
4776 4780
4777 4781 size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
4778 4782 if (fidp->fid_len < size) {
4779 4783 fidp->fid_len = size;
4780 4784 ZFS_EXIT(zfsvfs);
4781 4785 return (ENOSPC);
4782 4786 }
4783 4787
4784 4788 zfid = (zfid_short_t *)fidp;
4785 4789
4786 4790 zfid->zf_len = size;
4787 4791
4788 4792 for (i = 0; i < sizeof (zfid->zf_object); i++)
4789 4793 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4790 4794
4791 4795 /* Must have a non-zero generation number to distinguish from .zfs */
4792 4796 if (gen == 0)
4793 4797 gen = 1;
4794 4798 for (i = 0; i < sizeof (zfid->zf_gen); i++)
4795 4799 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4796 4800
4797 4801 if (size == LONG_FID_LEN) {
4798 4802 uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
4799 4803 zfid_long_t *zlfid;
4800 4804
4801 4805 zlfid = (zfid_long_t *)fidp;
4802 4806
4803 4807 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
4804 4808 zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
4805 4809
4806 4810 /* XXX - this should be the generation number for the objset */
4807 4811 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
4808 4812 zlfid->zf_setgen[i] = 0;
4809 4813 }
4810 4814
4811 4815 ZFS_EXIT(zfsvfs);
4812 4816 return (0);
4813 4817 }
4814 4818
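zfs_fid() packs the object number and generation into the fid byte arrays least-significant byte first, so the encoding is endian-neutral. A minimal sketch of the inverse transform (the helper name is hypothetical; the in-tree decode sits on the zfs_vget() path):

	static uint64_t
	zfid_object_sketch(const zfid_short_t *zfid)	/* illustrative only */
	{
		uint64_t object = 0;
		int i;

		/* reassemble the little-endian byte array written by zfs_fid() */
		for (i = 0; i < sizeof (zfid->zf_object); i++)
			object |= (uint64_t)zfid->zf_object[i] << (8 * i);
		return (object);
	}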
4815 4819 static int
4816 4820 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4817 4821 caller_context_t *ct)
4818 4822 {
4819 4823 znode_t *zp, *xzp;
4820 4824 zfsvfs_t *zfsvfs;
4821 4825 zfs_dirlock_t *dl;
4822 4826 int error;
4823 4827
4824 4828 switch (cmd) {
4825 4829 case _PC_LINK_MAX:
4826 4830 *valp = ULONG_MAX;
4827 4831 return (0);
4828 4832
4829 4833 case _PC_FILESIZEBITS:
4830 4834 *valp = 64;
4831 4835 return (0);
4832 4836
4833 4837 case _PC_XATTR_EXISTS:
4834 4838 zp = VTOZ(vp);
4835 4839 zfsvfs = zp->z_zfsvfs;
4836 4840 ZFS_ENTER(zfsvfs);
4837 4841 ZFS_VERIFY_ZP(zp);
4838 4842 *valp = 0;
4839 4843 error = zfs_dirent_lock(&dl, zp, "", &xzp,
4840 4844 ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
4841 4845 if (error == 0) {
4842 4846 zfs_dirent_unlock(dl);
4843 4847 if (!zfs_dirempty(xzp))
4844 4848 *valp = 1;
4845 4849 VN_RELE(ZTOV(xzp));
4846 4850 } else if (error == ENOENT) {
4847 4851 /*
4848 4852 * If there aren't extended attributes, it's the
4849 4853 * same as having zero of them.
4850 4854 */
4851 4855 error = 0;
4852 4856 }
4853 4857 ZFS_EXIT(zfsvfs);
4854 4858 return (error);
4855 4859
4856 4860 case _PC_SATTR_ENABLED:
4857 4861 case _PC_SATTR_EXISTS:
4858 4862 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
4859 4863 (vp->v_type == VREG || vp->v_type == VDIR);
4860 4864 return (0);
4861 4865
4862 4866 case _PC_ACCESS_FILTERING:
4863 4867 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
4864 4868 vp->v_type == VDIR;
4865 4869 return (0);
4866 4870
4867 4871 case _PC_ACL_ENABLED:
4868 4872 *valp = _ACL_ACE_ENABLED;
4869 4873 return (0);
4870 4874
4871 4875 case _PC_MIN_HOLE_SIZE:
4872 4876 *valp = (ulong_t)SPA_MINBLOCKSIZE;
4873 4877 return (0);
4874 4878
4875 4879 case _PC_TIMESTAMP_RESOLUTION:
4876 4880 /* nanosecond timestamp resolution */
4877 4881 *valp = 1L;
4878 4882 return (0);
4879 4883
4880 4884 default:
4881 4885 return (fs_pathconf(vp, cmd, valp, cr, ct));
4882 4886 }
4883 4887 }
4884 4888
4885 4889 /*ARGSUSED*/
4886 4890 static int
4887 4891 zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4888 4892 caller_context_t *ct)
4889 4893 {
4890 4894 znode_t *zp = VTOZ(vp);
4891 4895 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4892 4896 int error;
4893 4897 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4894 4898
4895 4899 ZFS_ENTER(zfsvfs);
4896 4900 ZFS_VERIFY_ZP(zp);
4897 4901 error = zfs_getacl(zp, vsecp, skipaclchk, cr);
4898 4902 ZFS_EXIT(zfsvfs);
4899 4903
4900 4904 return (error);
4901 4905 }
4902 4906
4903 4907 /*ARGSUSED*/
4904 4908 static int
4905 4909 zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4906 4910 caller_context_t *ct)
4907 4911 {
4908 4912 znode_t *zp = VTOZ(vp);
4909 4913 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4910 4914 int error;
4911 4915 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4912 4916 zilog_t *zilog = zfsvfs->z_log;
4913 4917
4914 4918 ZFS_ENTER(zfsvfs);
4915 4919 ZFS_VERIFY_ZP(zp);
4916 4920
4917 4921 error = zfs_setacl(zp, vsecp, skipaclchk, cr);
4918 4922
4919 4923 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4920 4924 zil_commit(zilog, 0);
4921 4925
4922 4926 ZFS_EXIT(zfsvfs);
4923 4927 return (error);
4924 4928 }
4925 4929
4926 4930 /*
4927 4931 	 * Tunables; both must be a power of 2.
4928 4932 *
4929 4933 * zcr_blksz_min: the smallest read we may consider to loan out an arcbuf
4930 4934 * zcr_blksz_max: if set to less than the file block size, allow loaning out of
4931 4935 * an arcbuf for a partial block read
4932 4936 */
4933 4937 int zcr_blksz_min = (1 << 10); /* 1K */
4934 4938 int zcr_blksz_max = (1 << 17); /* 128K */
4935 4939
4936 4940 /*ARGSUSED*/
4937 4941 static int
4938 4942 zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
4939 4943 caller_context_t *ct)
4940 4944 {
4941 4945 znode_t *zp = VTOZ(vp);
4942 4946 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4943 4947 int max_blksz = zfsvfs->z_max_blksz;
4944 4948 uio_t *uio = &xuio->xu_uio;
4945 4949 ssize_t size = uio->uio_resid;
4946 4950 offset_t offset = uio->uio_loffset;
4947 4951 int blksz;
4948 4952 int fullblk, i;
4949 4953 arc_buf_t *abuf;
4950 4954 ssize_t maxsize;
4951 4955 int preamble, postamble;
4952 4956
4953 4957 if (xuio->xu_type != UIOTYPE_ZEROCOPY)
4954 4958 return (EINVAL);
4955 4959
4956 4960 ZFS_ENTER(zfsvfs);
4957 4961 ZFS_VERIFY_ZP(zp);
4958 4962 switch (ioflag) {
4959 4963 case UIO_WRITE:
4960 4964 /*
4961 4965 * Loan out an arc_buf for write if write size is bigger than
4962 4966 * max_blksz, and the file's block size is also max_blksz.
4963 4967 */
4964 4968 blksz = max_blksz;
4965 4969 if (size < blksz || zp->z_blksz != blksz) {
4966 4970 ZFS_EXIT(zfsvfs);
4967 4971 return (EINVAL);
4968 4972 }
4969 4973 /*
4970 4974 * Caller requests buffers for write before knowing where the
4971 4975 * write offset might be (e.g. NFS TCP write).
4972 4976 */
4973 4977 if (offset == -1) {
4974 4978 preamble = 0;
4975 4979 } else {
4976 4980 preamble = P2PHASE(offset, blksz);
4977 4981 if (preamble) {
4978 4982 preamble = blksz - preamble;
4979 4983 size -= preamble;
4980 4984 }
4981 4985 }
4982 4986
4983 4987 postamble = P2PHASE(size, blksz);
4984 4988 size -= postamble;
4985 4989
4986 4990 fullblk = size / blksz;
4987 4991 (void) dmu_xuio_init(xuio,
4988 4992 (preamble != 0) + fullblk + (postamble != 0));
4989 4993 DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
4990 4994 int, postamble, int,
4991 4995 (preamble != 0) + fullblk + (postamble != 0));
4992 4996
4993 4997 /*
4994 4998 * Have to fix iov base/len for partial buffers. They
4995 4999 * currently represent full arc_buf's.
4996 5000 */
4997 5001 if (preamble) {
4998 5002 /* data begins in the middle of the arc_buf */
4999 5003 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5000 5004 blksz);
5001 5005 ASSERT(abuf);
5002 5006 (void) dmu_xuio_add(xuio, abuf,
5003 5007 blksz - preamble, preamble);
5004 5008 }
5005 5009
5006 5010 for (i = 0; i < fullblk; i++) {
5007 5011 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5008 5012 blksz);
5009 5013 ASSERT(abuf);
5010 5014 (void) dmu_xuio_add(xuio, abuf, 0, blksz);
5011 5015 }
5012 5016
5013 5017 if (postamble) {
5014 5018 /* data ends in the middle of the arc_buf */
5015 5019 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5016 5020 blksz);
5017 5021 ASSERT(abuf);
5018 5022 (void) dmu_xuio_add(xuio, abuf, 0, postamble);
5019 5023 }
5020 5024 break;
5021 5025 case UIO_READ:
5022 5026 /*
5023 5027 * Loan out an arc_buf for read if the read size is larger than
5024 5028 * the current file block size. Block alignment is not
5025 5029 * considered. Partial arc_buf will be loaned out for read.
5026 5030 */
5027 5031 blksz = zp->z_blksz;
5028 5032 if (blksz < zcr_blksz_min)
5029 5033 blksz = zcr_blksz_min;
5030 5034 if (blksz > zcr_blksz_max)
5031 5035 blksz = zcr_blksz_max;
5032 5036 /* avoid potential complexity of dealing with it */
5033 5037 if (blksz > max_blksz) {
5034 5038 ZFS_EXIT(zfsvfs);
5035 5039 return (EINVAL);
5036 5040 }
5037 5041
5038 5042 maxsize = zp->z_size - uio->uio_loffset;
5039 5043 if (size > maxsize)
5040 5044 size = maxsize;
5041 5045
5042 5046 if (size < blksz || vn_has_cached_data(vp)) {
5043 5047 ZFS_EXIT(zfsvfs);
5044 5048 return (EINVAL);
5045 5049 }
5046 5050 break;
5047 5051 default:
5048 5052 ZFS_EXIT(zfsvfs);
5049 5053 return (EINVAL);
5050 5054 }
5051 5055
5052 5056 uio->uio_extflg = UIO_XUIO;
5053 5057 XUIO_XUZC_RW(xuio) = ioflag;
5054 5058 ZFS_EXIT(zfsvfs);
5055 5059 return (0);
5056 5060 }
5057 5061
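In the UIO_WRITE arm above, the request is carved into an optional preamble (the uncovered tail of the first block), whole blocks, and an optional postamble (the covered head of the last block). A worked example with assumed numbers:

	/*
	 * Assume blksz = 128K, offset = 20K, size = 300K:
	 *
	 *	preamble  = blksz - P2PHASE(20K, 128K) = 108K	(rest of block 0)
	 *	size	 -= 108K, leaving 192K
	 *	postamble = P2PHASE(192K, 128K)        = 64K	(head of last block)
	 *	fullblk   = (192K - 64K) / 128K        = 1
	 *
	 * dmu_xuio_init() is therefore asked for 1 + 1 + 1 = 3 loaned arc_bufs.
	 */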
5058 5062 /*ARGSUSED*/
5059 5063 static int
5060 5064 zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
5061 5065 {
5062 5066 int i;
5063 5067 arc_buf_t *abuf;
5064 5068 int ioflag = XUIO_XUZC_RW(xuio);
5065 5069
5066 5070 ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
5067 5071
5068 5072 i = dmu_xuio_cnt(xuio);
5069 5073 while (i-- > 0) {
5070 5074 abuf = dmu_xuio_arcbuf(xuio, i);
5071 5075 /*
5072 5076 * if abuf == NULL, it must be a write buffer
5073 5077 * that has been returned in zfs_write().
5074 5078 */
5075 5079 if (abuf)
5076 5080 dmu_return_arcbuf(abuf);
5077 5081 ASSERT(abuf || ioflag == UIO_WRITE);
5078 5082 }
5079 5083
5080 5084 dmu_xuio_fini(xuio);
5081 5085 return (0);
5082 5086 }
5083 5087
5084 5088 /*
5085 5089 * Predeclare these here so that the compiler assumes that
5086 5090 * this is an "old style" function declaration that does
5087 5091 * not include arguments => we won't get type mismatch errors
5088 5092 * in the initializations that follow.
5089 5093 */
5090 5094 static int zfs_inval();
5091 5095 static int zfs_isdir();
5092 5096
5093 5097 static int
5094 5098 zfs_inval()
5095 5099 {
5096 5100 return (EINVAL);
5097 5101 }
5098 5102
5099 5103 static int
5100 5104 zfs_isdir()
5101 5105 {
5102 5106 return (EISDIR);
5103 5107 }
5104 5108 /*
5105 5109 * Directory vnode operations template
5106 5110 */
5107 5111 vnodeops_t *zfs_dvnodeops;
5108 5112 const fs_operation_def_t zfs_dvnodeops_template[] = {
5109 5113 VOPNAME_OPEN, { .vop_open = zfs_open },
5110 5114 VOPNAME_CLOSE, { .vop_close = zfs_close },
5111 5115 VOPNAME_READ, { .error = zfs_isdir },
5112 5116 VOPNAME_WRITE, { .error = zfs_isdir },
5113 5117 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
5114 5118 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
5115 5119 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
5116 5120 VOPNAME_ACCESS, { .vop_access = zfs_access },
5117 5121 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
5118 5122 VOPNAME_CREATE, { .vop_create = zfs_create },
5119 5123 VOPNAME_REMOVE, { .vop_remove = zfs_remove },
5120 5124 VOPNAME_LINK, { .vop_link = zfs_link },
5121 5125 VOPNAME_RENAME, { .vop_rename = zfs_rename },
5122 5126 VOPNAME_MKDIR, { .vop_mkdir = zfs_mkdir },
5123 5127 VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir },
5124 5128 VOPNAME_READDIR, { .vop_readdir = zfs_readdir },
5125 5129 VOPNAME_SYMLINK, { .vop_symlink = zfs_symlink },
5126 5130 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
5127 5131 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5128 5132 VOPNAME_FID, { .vop_fid = zfs_fid },
5129 5133 VOPNAME_SEEK, { .vop_seek = zfs_seek },
5130 5134 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5131 5135 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
5132 5136 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
5133 5137 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
5134 5138 NULL, NULL
5135 5139 };
5136 5140
5137 5141 /*
5138 5142 * Regular file vnode operations template
5139 5143 */
5140 5144 vnodeops_t *zfs_fvnodeops;
5141 5145 const fs_operation_def_t zfs_fvnodeops_template[] = {
5142 5146 VOPNAME_OPEN, { .vop_open = zfs_open },
5143 5147 VOPNAME_CLOSE, { .vop_close = zfs_close },
5144 5148 VOPNAME_READ, { .vop_read = zfs_read },
5145 5149 VOPNAME_WRITE, { .vop_write = zfs_write },
5146 5150 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
5147 5151 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
5148 5152 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
5149 5153 VOPNAME_ACCESS, { .vop_access = zfs_access },
5150 5154 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
5151 5155 VOPNAME_RENAME, { .vop_rename = zfs_rename },
5152 5156 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
5153 5157 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5154 5158 VOPNAME_FID, { .vop_fid = zfs_fid },
5155 5159 VOPNAME_SEEK, { .vop_seek = zfs_seek },
5156 5160 VOPNAME_FRLOCK, { .vop_frlock = zfs_frlock },
5157 5161 VOPNAME_SPACE, { .vop_space = zfs_space },
5158 5162 VOPNAME_GETPAGE, { .vop_getpage = zfs_getpage },
5159 5163 VOPNAME_PUTPAGE, { .vop_putpage = zfs_putpage },
5160 5164 VOPNAME_MAP, { .vop_map = zfs_map },
5161 5165 VOPNAME_ADDMAP, { .vop_addmap = zfs_addmap },
5162 5166 VOPNAME_DELMAP, { .vop_delmap = zfs_delmap },
5163 5167 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5164 5168 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
5165 5169 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
5166 5170 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
5167 5171 VOPNAME_REQZCBUF, { .vop_reqzcbuf = zfs_reqzcbuf },
5168 5172 VOPNAME_RETZCBUF, { .vop_retzcbuf = zfs_retzcbuf },
5169 5173 NULL, NULL
5170 5174 };
5171 5175
5172 5176 /*
5173 5177 * Symbolic link vnode operations template
5174 5178 */
5175 5179 vnodeops_t *zfs_symvnodeops;
5176 5180 const fs_operation_def_t zfs_symvnodeops_template[] = {
5177 5181 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
5178 5182 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
5179 5183 VOPNAME_ACCESS, { .vop_access = zfs_access },
5180 5184 VOPNAME_RENAME, { .vop_rename = zfs_rename },
5181 5185 VOPNAME_READLINK, { .vop_readlink = zfs_readlink },
5182 5186 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5183 5187 VOPNAME_FID, { .vop_fid = zfs_fid },
5184 5188 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5185 5189 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
5186 5190 NULL, NULL
5187 5191 };
5188 5192
5189 5193 /*
5190 5194 	 * Special share hidden files vnode operations template
5191 5195 */
5192 5196 vnodeops_t *zfs_sharevnodeops;
5193 5197 const fs_operation_def_t zfs_sharevnodeops_template[] = {
5194 5198 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
5195 5199 VOPNAME_ACCESS, { .vop_access = zfs_access },
5196 5200 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5197 5201 VOPNAME_FID, { .vop_fid = zfs_fid },
5198 5202 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5199 5203 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
5200 5204 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
5201 5205 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
5202 5206 NULL, NULL
5203 5207 };
5204 5208
5205 5209 /*
5206 5210 * Extended attribute directory vnode operations template
5207 5211 * This template is identical to the directory vnodes
5208 5212 * operation template except for restricted operations:
5209 5213 * VOP_MKDIR()
5210 5214 * VOP_SYMLINK()
5211 5215 * Note that there are other restrictions embedded in:
5212 5216 * zfs_create() - restrict type to VREG
5213 5217 * zfs_link() - no links into/out of attribute space
5214 5218 * zfs_rename() - no moves into/out of attribute space
5215 5219 */
5216 5220 vnodeops_t *zfs_xdvnodeops;
5217 5221 const fs_operation_def_t zfs_xdvnodeops_template[] = {
5218 5222 VOPNAME_OPEN, { .vop_open = zfs_open },
5219 5223 VOPNAME_CLOSE, { .vop_close = zfs_close },
5220 5224 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
5221 5225 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
5222 5226 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
5223 5227 VOPNAME_ACCESS, { .vop_access = zfs_access },
5224 5228 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
5225 5229 VOPNAME_CREATE, { .vop_create = zfs_create },
5226 5230 VOPNAME_REMOVE, { .vop_remove = zfs_remove },
5227 5231 VOPNAME_LINK, { .vop_link = zfs_link },
5228 5232 VOPNAME_RENAME, { .vop_rename = zfs_rename },
5229 5233 VOPNAME_MKDIR, { .error = zfs_inval },
5230 5234 VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir },
5231 5235 VOPNAME_READDIR, { .vop_readdir = zfs_readdir },
5232 5236 VOPNAME_SYMLINK, { .error = zfs_inval },
5233 5237 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
5234 5238 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5235 5239 VOPNAME_FID, { .vop_fid = zfs_fid },
5236 5240 VOPNAME_SEEK, { .vop_seek = zfs_seek },
5237 5241 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5238 5242 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
5239 5243 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
5240 5244 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
5241 5245 NULL, NULL
5242 5246 };
5243 5247
5244 5248 /*
5245 5249 * Error vnode operations template
5246 5250 */
5247 5251 vnodeops_t *zfs_evnodeops;
5248 5252 const fs_operation_def_t zfs_evnodeops_template[] = {
5249 5253 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5250 5254 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5251 5255 NULL, NULL
5252 5256 };
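These templates are consumed rather than used directly: registration turns each fs_operation_def_t array into a live vnodeops_t via vn_make_ops() (that step happens outside this hunk, in zfs_create_op_tables()). A hedged sketch of one such registration:

	/* sketch; error handling trimmed */
	error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
	    &zfs_dvnodeops);
	if (error != 0)
		cmn_err(CE_WARN, "zfs: bad directory vnode ops template");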