1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 /* 27 * Copyright 2015, Joyent, Inc. 28 */ 29 30 #include <sys/types.h> 31 #include <sys/t_lock.h> 32 #include <sys/param.h> 33 #include <sys/time.h> 34 #include <sys/systm.h> 35 #include <sys/sysmacros.h> 36 #include <sys/resource.h> 37 #include <sys/signal.h> 38 #include <sys/cred.h> 39 #include <sys/user.h> 40 #include <sys/buf.h> 41 #include <sys/vfs.h> 42 #include <sys/vfs_opreg.h> 43 #include <sys/stat.h> 44 #include <sys/vnode.h> 45 #include <sys/mode.h> 46 #include <sys/proc.h> 47 #include <sys/disp.h> 48 #include <sys/file.h> 49 #include <sys/fcntl.h> 50 #include <sys/flock.h> 51 #include <sys/kmem.h> 52 #include <sys/uio.h> 53 #include <sys/dnlc.h> 54 #include <sys/conf.h> 55 #include <sys/errno.h> 56 #include <sys/mman.h> 57 #include <sys/fbuf.h> 58 #include <sys/pathname.h> 59 #include <sys/debug.h> 60 #include <sys/vmsystm.h> 61 #include <sys/cmn_err.h> 62 #include <sys/dirent.h> 63 #include <sys/errno.h> 64 #include <sys/modctl.h> 65 #include <sys/statvfs.h> 66 #include <sys/mount.h> 67 #include <sys/sunddi.h> 68 #include <sys/bootconf.h> 69 #include <sys/policy.h> 70 71 #include <vm/hat.h> 72 #include <vm/page.h> 73 #include <vm/pvn.h> 74 #include <vm/as.h> 75 #include <vm/seg.h> 76 #include <vm/seg_map.h> 77 #include <vm/seg_kmem.h> 78 #include <vm/seg_vn.h> 79 #include <vm/rm.h> 80 #include <vm/page.h> 81 #include <sys/swap.h> 82 83 #include <fs/fs_subr.h> 84 85 #include <sys/fs/udf_volume.h> 86 #include <sys/fs/udf_inode.h> 87 88 static int32_t udf_open(struct vnode **, 89 int32_t, struct cred *, caller_context_t *); 90 static int32_t udf_close(struct vnode *, 91 int32_t, int32_t, offset_t, struct cred *, caller_context_t *); 92 static int32_t udf_read(struct vnode *, 93 struct uio *, int32_t, struct cred *, caller_context_t *); 94 static int32_t udf_write(struct vnode *, 95 struct uio *, int32_t, struct cred *, caller_context_t *); 96 static int32_t udf_ioctl(struct vnode *, 97 int32_t, intptr_t, int32_t, struct cred *, int32_t *, 98 caller_context_t *); 99 static int32_t udf_getattr(struct vnode *, 100 struct vattr *, int32_t, struct cred *, caller_context_t *); 101 static int32_t udf_setattr(struct vnode *, 102 struct vattr *, int32_t, struct cred *, caller_context_t *); 103 static int32_t udf_access(struct vnode *, 104 int32_t, int32_t, struct cred *, caller_context_t *); 105 static int32_t udf_lookup(struct vnode *, 106 char *, struct vnode **, struct pathname *, 107 int32_t, struct vnode *, struct cred *, 108 caller_context_t *, int *, pathname_t *); 109 static int32_t udf_create(struct vnode *, 110 char *, struct vattr *, enum vcexcl, 111 int32_t, struct vnode **, struct cred *, int32_t, 112 caller_context_t *, vsecattr_t *); 113 static int32_t udf_remove(struct vnode *, 114 char *, struct cred *, caller_context_t *, int); 115 static int32_t udf_link(struct vnode *, 116 struct vnode *, char *, struct cred *, caller_context_t *, int); 117 static int32_t udf_rename(struct vnode *, 118 char *, struct vnode *, char *, struct cred *, caller_context_t *, int); 119 static int32_t udf_mkdir(struct vnode *, 120 char *, struct vattr *, struct vnode **, struct cred *, 121 caller_context_t *, int, vsecattr_t *); 122 static int32_t udf_rmdir(struct vnode *, 123 char *, struct vnode *, struct cred *, caller_context_t *, int); 124 static int32_t udf_readdir(struct vnode *, 125 struct uio *, struct cred *, int32_t *, caller_context_t *, int); 126 static int32_t udf_symlink(struct vnode *, 127 char *, struct vattr *, char *, struct cred *, caller_context_t *, int); 128 static int32_t udf_readlink(struct vnode *, 129 struct uio *, struct cred *, caller_context_t *); 130 static int32_t udf_fsync(struct vnode *, 131 int32_t, struct cred *, caller_context_t *); 132 static void udf_inactive(struct vnode *, 133 struct cred *, caller_context_t *); 134 static int32_t udf_fid(struct vnode *, struct fid *, caller_context_t *); 135 static int udf_rwlock(struct vnode *, int32_t, caller_context_t *); 136 static void udf_rwunlock(struct vnode *, int32_t, caller_context_t *); 137 static int32_t udf_seek(struct vnode *, offset_t, offset_t *, 138 caller_context_t *); 139 static int32_t udf_frlock(struct vnode *, int32_t, 140 struct flock64 *, int32_t, offset_t, struct flk_callback *, cred_t *, 141 caller_context_t *); 142 static int32_t udf_space(struct vnode *, int32_t, 143 struct flock64 *, int32_t, offset_t, cred_t *, caller_context_t *); 144 static int32_t udf_getpage(struct vnode *, offset_t, 145 size_t, uint32_t *, struct page **, size_t, 146 struct seg *, caddr_t, enum seg_rw, struct cred *, caller_context_t *); 147 static int32_t udf_putpage(struct vnode *, offset_t, 148 size_t, int32_t, struct cred *, caller_context_t *); 149 static int32_t udf_map(struct vnode *, offset_t, struct as *, 150 caddr_t *, size_t, uint8_t, uint8_t, uint32_t, struct cred *, 151 caller_context_t *); 152 static int32_t udf_addmap(struct vnode *, offset_t, struct as *, 153 caddr_t, size_t, uint8_t, uint8_t, uint32_t, struct cred *, 154 caller_context_t *); 155 static int32_t udf_delmap(struct vnode *, offset_t, struct as *, 156 caddr_t, size_t, uint32_t, uint32_t, uint32_t, struct cred *, 157 caller_context_t *); 158 static int32_t udf_l_pathconf(struct vnode *, int32_t, 159 ulong_t *, struct cred *, caller_context_t *); 160 static int32_t udf_pageio(struct vnode *, struct page *, 161 u_offset_t, size_t, int32_t, struct cred *, caller_context_t *); 162 163 int32_t ud_getpage_miss(struct vnode *, u_offset_t, 164 size_t, struct seg *, caddr_t, page_t *pl[], 165 size_t, enum seg_rw, int32_t); 166 void ud_getpage_ra(struct vnode *, u_offset_t, struct seg *, caddr_t); 167 int32_t ud_putpages(struct vnode *, offset_t, size_t, int32_t, struct cred *); 168 int32_t ud_page_fill(struct ud_inode *, page_t *, 169 u_offset_t, uint32_t, u_offset_t *); 170 int32_t ud_iodone(struct buf *); 171 int32_t ud_rdip(struct ud_inode *, struct uio *, int32_t, cred_t *); 172 int32_t ud_wrip(struct ud_inode *, struct uio *, int32_t, cred_t *); 173 int32_t ud_multi_strat(struct ud_inode *, page_t *, struct buf *, u_offset_t); 174 int32_t ud_slave_done(struct buf *); 175 176 /* 177 * Structures to control multiple IO operations to get or put pages 178 * that are backed by discontiguous blocks. The master struct is 179 * a dummy that holds the original bp from pageio_setup. The 180 * slave struct holds the working bp's to do the actual IO. Once 181 * all the slave IOs complete. The master is processed as if a single 182 * IO op has completed. 183 */ 184 uint32_t master_index = 0; 185 typedef struct mio_master { 186 kmutex_t mm_mutex; /* protect the fields below */ 187 int32_t mm_size; 188 buf_t *mm_bp; /* original bp */ 189 int32_t mm_resid; /* bytes remaining to transfer */ 190 int32_t mm_error; /* accumulated error from slaves */ 191 int32_t mm_index; /* XXX debugging */ 192 } mio_master_t; 193 194 typedef struct mio_slave { 195 buf_t ms_buf; /* working buffer for this IO chunk */ 196 mio_master_t *ms_ptr; /* pointer to master */ 197 } mio_slave_t; 198 199 struct vnodeops *udf_vnodeops; 200 201 const fs_operation_def_t udf_vnodeops_template[] = { 202 VOPNAME_OPEN, { .vop_open = udf_open }, 203 VOPNAME_CLOSE, { .vop_close = udf_close }, 204 VOPNAME_READ, { .vop_read = udf_read }, 205 VOPNAME_WRITE, { .vop_write = udf_write }, 206 VOPNAME_IOCTL, { .vop_ioctl = udf_ioctl }, 207 VOPNAME_GETATTR, { .vop_getattr = udf_getattr }, 208 VOPNAME_SETATTR, { .vop_setattr = udf_setattr }, 209 VOPNAME_ACCESS, { .vop_access = udf_access }, 210 VOPNAME_LOOKUP, { .vop_lookup = udf_lookup }, 211 VOPNAME_CREATE, { .vop_create = udf_create }, 212 VOPNAME_REMOVE, { .vop_remove = udf_remove }, 213 VOPNAME_LINK, { .vop_link = udf_link }, 214 VOPNAME_RENAME, { .vop_rename = udf_rename }, 215 VOPNAME_MKDIR, { .vop_mkdir = udf_mkdir }, 216 VOPNAME_RMDIR, { .vop_rmdir = udf_rmdir }, 217 VOPNAME_READDIR, { .vop_readdir = udf_readdir }, 218 VOPNAME_SYMLINK, { .vop_symlink = udf_symlink }, 219 VOPNAME_READLINK, { .vop_readlink = udf_readlink }, 220 VOPNAME_FSYNC, { .vop_fsync = udf_fsync }, 221 VOPNAME_INACTIVE, { .vop_inactive = udf_inactive }, 222 VOPNAME_FID, { .vop_fid = udf_fid }, 223 VOPNAME_RWLOCK, { .vop_rwlock = udf_rwlock }, 224 VOPNAME_RWUNLOCK, { .vop_rwunlock = udf_rwunlock }, 225 VOPNAME_SEEK, { .vop_seek = udf_seek }, 226 VOPNAME_FRLOCK, { .vop_frlock = udf_frlock }, 227 VOPNAME_SPACE, { .vop_space = udf_space }, 228 VOPNAME_GETPAGE, { .vop_getpage = udf_getpage }, 229 VOPNAME_PUTPAGE, { .vop_putpage = udf_putpage }, 230 VOPNAME_MAP, { .vop_map = udf_map }, 231 VOPNAME_ADDMAP, { .vop_addmap = udf_addmap }, 232 VOPNAME_DELMAP, { .vop_delmap = udf_delmap }, 233 VOPNAME_PATHCONF, { .vop_pathconf = udf_l_pathconf }, 234 VOPNAME_PAGEIO, { .vop_pageio = udf_pageio }, 235 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 236 NULL, NULL 237 }; 238 239 /* ARGSUSED */ 240 static int32_t 241 udf_open( 242 struct vnode **vpp, 243 int32_t flag, 244 struct cred *cr, 245 caller_context_t *ct) 246 { 247 ud_printf("udf_open\n"); 248 249 return (0); 250 } 251 252 /* ARGSUSED */ 253 static int32_t 254 udf_close( 255 struct vnode *vp, 256 int32_t flag, 257 int32_t count, 258 offset_t offset, 259 struct cred *cr, 260 caller_context_t *ct) 261 { 262 struct ud_inode *ip = VTOI(vp); 263 264 ud_printf("udf_close\n"); 265 266 ITIMES(ip); 267 268 cleanlocks(vp, ttoproc(curthread)->p_pid, 0); 269 cleanshares(vp, ttoproc(curthread)->p_pid); 270 271 /* 272 * Push partially filled cluster at last close. 273 * ``last close'' is approximated because the dnlc 274 * may have a hold on the vnode. 275 */ 276 if (vp->v_count <= 2 && vp->v_type != VBAD) { 277 struct ud_inode *ip = VTOI(vp); 278 if (ip->i_delaylen) { 279 (void) ud_putpages(vp, ip->i_delayoff, ip->i_delaylen, 280 B_ASYNC | B_FREE, cr); 281 ip->i_delaylen = 0; 282 } 283 } 284 285 return (0); 286 } 287 288 /* ARGSUSED */ 289 static int32_t 290 udf_read( 291 struct vnode *vp, 292 struct uio *uiop, 293 int32_t ioflag, 294 struct cred *cr, 295 caller_context_t *ct) 296 { 297 struct ud_inode *ip = VTOI(vp); 298 int32_t error; 299 300 ud_printf("udf_read\n"); 301 302 ASSERT(RW_READ_HELD(&ip->i_rwlock)); 303 304 if (MANDLOCK(vp, ip->i_char)) { 305 /* 306 * udf_getattr ends up being called by chklock 307 */ 308 error = chklock(vp, FREAD, uiop->uio_loffset, 309 uiop->uio_resid, uiop->uio_fmode, ct); 310 if (error) { 311 goto end; 312 } 313 } 314 315 rw_enter(&ip->i_contents, RW_READER); 316 error = ud_rdip(ip, uiop, ioflag, cr); 317 rw_exit(&ip->i_contents); 318 319 end: 320 return (error); 321 } 322 323 324 int32_t ud_WRITES = 1; 325 int32_t ud_HW = 96 * 1024; 326 int32_t ud_LW = 64 * 1024; 327 int32_t ud_throttles = 0; 328 329 /* ARGSUSED */ 330 static int32_t 331 udf_write( 332 struct vnode *vp, 333 struct uio *uiop, 334 int32_t ioflag, 335 struct cred *cr, 336 caller_context_t *ct) 337 { 338 struct ud_inode *ip = VTOI(vp); 339 int32_t error = 0; 340 341 ud_printf("udf_write\n"); 342 343 ASSERT(RW_WRITE_HELD(&ip->i_rwlock)); 344 345 if (MANDLOCK(vp, ip->i_char)) { 346 /* 347 * ud_getattr ends up being called by chklock 348 */ 349 error = chklock(vp, FWRITE, uiop->uio_loffset, 350 uiop->uio_resid, uiop->uio_fmode, ct); 351 if (error) { 352 goto end; 353 } 354 } 355 /* 356 * Throttle writes. 357 */ 358 mutex_enter(&ip->i_tlock); 359 if (ud_WRITES && (ip->i_writes > ud_HW)) { 360 while (ip->i_writes > ud_HW) { 361 ud_throttles++; 362 cv_wait(&ip->i_wrcv, &ip->i_tlock); 363 } 364 } 365 mutex_exit(&ip->i_tlock); 366 367 /* 368 * Write to the file 369 */ 370 rw_enter(&ip->i_contents, RW_WRITER); 371 if ((ioflag & FAPPEND) != 0 && (ip->i_type == VREG)) { 372 /* 373 * In append mode start at end of file. 374 */ 375 uiop->uio_loffset = ip->i_size; 376 } 377 error = ud_wrip(ip, uiop, ioflag, cr); 378 rw_exit(&ip->i_contents); 379 380 end: 381 return (error); 382 } 383 384 /* ARGSUSED */ 385 static int32_t 386 udf_ioctl( 387 struct vnode *vp, 388 int32_t cmd, 389 intptr_t arg, 390 int32_t flag, 391 struct cred *cr, 392 int32_t *rvalp, 393 caller_context_t *ct) 394 { 395 return (ENOTTY); 396 } 397 398 /* ARGSUSED */ 399 static int32_t 400 udf_getattr( 401 struct vnode *vp, 402 struct vattr *vap, 403 int32_t flags, 404 struct cred *cr, 405 caller_context_t *ct) 406 { 407 struct ud_inode *ip = VTOI(vp); 408 409 ud_printf("udf_getattr\n"); 410 411 if (vap->va_mask == AT_SIZE) { 412 /* 413 * for performance, if only the size is requested don't bother 414 * with anything else. 415 */ 416 vap->va_size = ip->i_size; 417 return (0); 418 } 419 420 rw_enter(&ip->i_contents, RW_READER); 421 422 vap->va_type = vp->v_type; 423 vap->va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char; 424 425 vap->va_uid = ip->i_uid; 426 vap->va_gid = ip->i_gid; 427 vap->va_fsid = ip->i_dev; 428 vap->va_nodeid = ip->i_icb_lbano; 429 vap->va_nlink = ip->i_nlink; 430 vap->va_size = ip->i_size; 431 vap->va_seq = ip->i_seq; 432 if (vp->v_type == VCHR || vp->v_type == VBLK) { 433 vap->va_rdev = ip->i_rdev; 434 } else { 435 vap->va_rdev = 0; 436 } 437 438 mutex_enter(&ip->i_tlock); 439 ITIMES_NOLOCK(ip); /* mark correct time in inode */ 440 vap->va_atime.tv_sec = (time_t)ip->i_atime.tv_sec; 441 vap->va_atime.tv_nsec = ip->i_atime.tv_nsec; 442 vap->va_mtime.tv_sec = (time_t)ip->i_mtime.tv_sec; 443 vap->va_mtime.tv_nsec = ip->i_mtime.tv_nsec; 444 vap->va_ctime.tv_sec = (time_t)ip->i_ctime.tv_sec; 445 vap->va_ctime.tv_nsec = ip->i_ctime.tv_nsec; 446 mutex_exit(&ip->i_tlock); 447 448 switch (ip->i_type) { 449 case VBLK: 450 vap->va_blksize = MAXBSIZE; 451 break; 452 case VCHR: 453 vap->va_blksize = MAXBSIZE; 454 break; 455 default: 456 vap->va_blksize = ip->i_udf->udf_lbsize; 457 break; 458 } 459 vap->va_nblocks = ip->i_lbr << ip->i_udf->udf_l2d_shift; 460 461 rw_exit(&ip->i_contents); 462 463 return (0); 464 } 465 466 static int 467 ud_iaccess_vmode(void *ip, int mode, struct cred *cr) 468 { 469 return (ud_iaccess(ip, UD_UPERM2DPERM(mode), cr, 0)); 470 } 471 472 /*ARGSUSED4*/ 473 static int32_t 474 udf_setattr( 475 struct vnode *vp, 476 struct vattr *vap, 477 int32_t flags, 478 struct cred *cr, 479 caller_context_t *ct) 480 { 481 int32_t error = 0; 482 uint32_t mask = vap->va_mask; 483 struct ud_inode *ip; 484 timestruc_t now; 485 struct vattr ovap; 486 487 ud_printf("udf_setattr\n"); 488 489 ip = VTOI(vp); 490 491 /* 492 * not updates allowed to 4096 files 493 */ 494 if (ip->i_astrat == STRAT_TYPE4096) { 495 return (EINVAL); 496 } 497 498 /* 499 * Cannot set these attributes 500 */ 501 if (mask & AT_NOSET) { 502 return (EINVAL); 503 } 504 505 rw_enter(&ip->i_rwlock, RW_WRITER); 506 rw_enter(&ip->i_contents, RW_WRITER); 507 508 ovap.va_uid = ip->i_uid; 509 ovap.va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char; 510 error = secpolicy_vnode_setattr(cr, vp, vap, &ovap, flags, 511 ud_iaccess_vmode, ip); 512 if (error) 513 goto update_inode; 514 515 mask = vap->va_mask; 516 /* 517 * Change file access modes. 518 */ 519 if (mask & AT_MODE) { 520 ip->i_perm = VA2UD_PERM(vap->va_mode); 521 ip->i_char = vap->va_mode & (VSUID | VSGID | VSVTX); 522 mutex_enter(&ip->i_tlock); 523 ip->i_flag |= ICHG; 524 mutex_exit(&ip->i_tlock); 525 } 526 if (mask & (AT_UID|AT_GID)) { 527 if (mask & AT_UID) { 528 ip->i_uid = vap->va_uid; 529 } 530 if (mask & AT_GID) { 531 ip->i_gid = vap->va_gid; 532 } 533 mutex_enter(&ip->i_tlock); 534 ip->i_flag |= ICHG; 535 mutex_exit(&ip->i_tlock); 536 } 537 /* 538 * Truncate file. Must have write permission and not be a directory. 539 */ 540 if (mask & AT_SIZE) { 541 if (vp->v_type == VDIR) { 542 error = EISDIR; 543 goto update_inode; 544 } 545 if (error = ud_iaccess(ip, IWRITE, cr, 0)) { 546 goto update_inode; 547 } 548 if (vap->va_size > MAXOFFSET_T) { 549 error = EFBIG; 550 goto update_inode; 551 } 552 if (error = ud_itrunc(ip, vap->va_size, 0, cr)) { 553 goto update_inode; 554 } 555 556 if (vap->va_size == 0) 557 vnevent_truncate(vp, ct); 558 } 559 /* 560 * Change file access or modified times. 561 */ 562 if (mask & (AT_ATIME|AT_MTIME)) { 563 mutex_enter(&ip->i_tlock); 564 if (mask & AT_ATIME) { 565 ip->i_atime.tv_sec = vap->va_atime.tv_sec; 566 ip->i_atime.tv_nsec = vap->va_atime.tv_nsec; 567 ip->i_flag &= ~IACC; 568 } 569 if (mask & AT_MTIME) { 570 ip->i_mtime.tv_sec = vap->va_mtime.tv_sec; 571 ip->i_mtime.tv_nsec = vap->va_mtime.tv_nsec; 572 gethrestime(&now); 573 ip->i_ctime.tv_sec = now.tv_sec; 574 ip->i_ctime.tv_nsec = now.tv_nsec; 575 ip->i_flag &= ~(IUPD|ICHG); 576 ip->i_flag |= IMODTIME; 577 } 578 ip->i_flag |= IMOD; 579 mutex_exit(&ip->i_tlock); 580 } 581 582 update_inode: 583 if (curthread->t_flag & T_DONTPEND) { 584 ud_iupdat(ip, 1); 585 } else { 586 ITIMES_NOLOCK(ip); 587 } 588 rw_exit(&ip->i_contents); 589 rw_exit(&ip->i_rwlock); 590 591 return (error); 592 } 593 594 /* ARGSUSED */ 595 static int32_t 596 udf_access( 597 struct vnode *vp, 598 int32_t mode, 599 int32_t flags, 600 struct cred *cr, 601 caller_context_t *ct) 602 { 603 struct ud_inode *ip = VTOI(vp); 604 605 ud_printf("udf_access\n"); 606 607 if (ip->i_udf == NULL) { 608 return (EIO); 609 } 610 611 return (ud_iaccess(ip, UD_UPERM2DPERM(mode), cr, 1)); 612 } 613 614 int32_t udfs_stickyhack = 1; 615 616 /* ARGSUSED */ 617 static int32_t 618 udf_lookup( 619 struct vnode *dvp, 620 char *nm, 621 struct vnode **vpp, 622 struct pathname *pnp, 623 int32_t flags, 624 struct vnode *rdir, 625 struct cred *cr, 626 caller_context_t *ct, 627 int *direntflags, 628 pathname_t *realpnp) 629 { 630 int32_t error; 631 struct vnode *vp; 632 struct ud_inode *ip, *xip; 633 634 ud_printf("udf_lookup\n"); 635 /* 636 * Null component name is a synonym for directory being searched. 637 */ 638 if (*nm == '\0') { 639 VN_HOLD(dvp); 640 *vpp = dvp; 641 error = 0; 642 goto out; 643 } 644 645 /* 646 * Fast path: Check the directory name lookup cache. 647 */ 648 ip = VTOI(dvp); 649 if (vp = dnlc_lookup(dvp, nm)) { 650 /* 651 * Check accessibility of directory. 652 */ 653 if ((error = ud_iaccess(ip, IEXEC, cr, 1)) != 0) { 654 VN_RELE(vp); 655 } 656 xip = VTOI(vp); 657 } else { 658 error = ud_dirlook(ip, nm, &xip, cr, 1); 659 ITIMES(ip); 660 } 661 662 if (error == 0) { 663 ip = xip; 664 *vpp = ITOV(ip); 665 if ((ip->i_type != VDIR) && 666 (ip->i_char & ISVTX) && 667 ((ip->i_perm & IEXEC) == 0) && 668 udfs_stickyhack) { 669 mutex_enter(&(*vpp)->v_lock); 670 (*vpp)->v_flag |= VISSWAP; 671 mutex_exit(&(*vpp)->v_lock); 672 } 673 ITIMES(ip); 674 /* 675 * If vnode is a device return special vnode instead. 676 */ 677 if (IS_DEVVP(*vpp)) { 678 struct vnode *newvp; 679 newvp = specvp(*vpp, (*vpp)->v_rdev, 680 (*vpp)->v_type, cr); 681 VN_RELE(*vpp); 682 if (newvp == NULL) { 683 error = ENOSYS; 684 } else { 685 *vpp = newvp; 686 } 687 } 688 } 689 out: 690 return (error); 691 } 692 693 /* ARGSUSED */ 694 static int32_t 695 udf_create( 696 struct vnode *dvp, 697 char *name, 698 struct vattr *vap, 699 enum vcexcl excl, 700 int32_t mode, 701 struct vnode **vpp, 702 struct cred *cr, 703 int32_t flag, 704 caller_context_t *ct, 705 vsecattr_t *vsecp) 706 { 707 int32_t error; 708 struct ud_inode *ip = VTOI(dvp), *xip; 709 710 ud_printf("udf_create\n"); 711 712 if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr) != 0) 713 vap->va_mode &= ~VSVTX; 714 715 if (*name == '\0') { 716 /* 717 * Null component name refers to the directory itself. 718 */ 719 VN_HOLD(dvp); 720 ITIMES(ip); 721 error = EEXIST; 722 } else { 723 xip = NULL; 724 rw_enter(&ip->i_rwlock, RW_WRITER); 725 error = ud_direnter(ip, name, DE_CREATE, 726 (struct ud_inode *)0, (struct ud_inode *)0, 727 vap, &xip, cr, ct); 728 rw_exit(&ip->i_rwlock); 729 ITIMES(ip); 730 ip = xip; 731 } 732 if (ip != NULL) { 733 rw_enter(&ip->i_contents, RW_WRITER); 734 } 735 736 /* 737 * If the file already exists and this is a non-exclusive create, 738 * check permissions and allow access for non-directories. 739 * Read-only create of an existing directory is also allowed. 740 * We fail an exclusive create of anything which already exists. 741 */ 742 if (error == EEXIST) { 743 if (excl == NONEXCL) { 744 if ((ip->i_type == VDIR) && (mode & VWRITE)) { 745 error = EISDIR; 746 } else if (mode) { 747 error = ud_iaccess(ip, 748 UD_UPERM2DPERM(mode), cr, 0); 749 } else { 750 error = 0; 751 } 752 } 753 if (error) { 754 rw_exit(&ip->i_contents); 755 VN_RELE(ITOV(ip)); 756 goto out; 757 } else if ((ip->i_type == VREG) && 758 (vap->va_mask & AT_SIZE) && vap->va_size == 0) { 759 /* 760 * Truncate regular files, if requested by caller. 761 * Grab i_rwlock to make sure no one else is 762 * currently writing to the file (we promised 763 * bmap we would do this). 764 * Must get the locks in the correct order. 765 */ 766 if (ip->i_size == 0) { 767 ip->i_flag |= ICHG | IUPD; 768 } else { 769 rw_exit(&ip->i_contents); 770 rw_enter(&ip->i_rwlock, RW_WRITER); 771 rw_enter(&ip->i_contents, RW_WRITER); 772 (void) ud_itrunc(ip, 0, 0, cr); 773 rw_exit(&ip->i_rwlock); 774 } 775 vnevent_create(ITOV(ip), ct); 776 } 777 } 778 779 if (error == 0) { 780 *vpp = ITOV(ip); 781 ITIMES(ip); 782 } 783 if (ip != NULL) { 784 rw_exit(&ip->i_contents); 785 } 786 if (error) { 787 goto out; 788 } 789 790 /* 791 * If vnode is a device return special vnode instead. 792 */ 793 if (!error && IS_DEVVP(*vpp)) { 794 struct vnode *newvp; 795 796 newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); 797 VN_RELE(*vpp); 798 if (newvp == NULL) { 799 error = ENOSYS; 800 goto out; 801 } 802 *vpp = newvp; 803 } 804 out: 805 return (error); 806 } 807 808 /* ARGSUSED */ 809 static int32_t 810 udf_remove( 811 struct vnode *vp, 812 char *nm, 813 struct cred *cr, 814 caller_context_t *ct, 815 int flags) 816 { 817 int32_t error; 818 struct ud_inode *ip = VTOI(vp); 819 820 ud_printf("udf_remove\n"); 821 822 rw_enter(&ip->i_rwlock, RW_WRITER); 823 error = ud_dirremove(ip, nm, 824 (struct ud_inode *)0, (struct vnode *)0, DR_REMOVE, cr, ct); 825 rw_exit(&ip->i_rwlock); 826 ITIMES(ip); 827 828 return (error); 829 } 830 831 /* ARGSUSED */ 832 static int32_t 833 udf_link( 834 struct vnode *tdvp, 835 struct vnode *svp, 836 char *tnm, 837 struct cred *cr, 838 caller_context_t *ct, 839 int flags) 840 { 841 int32_t error; 842 struct vnode *realvp; 843 struct ud_inode *sip; 844 struct ud_inode *tdp; 845 846 ud_printf("udf_link\n"); 847 if (VOP_REALVP(svp, &realvp, ct) == 0) { 848 svp = realvp; 849 } 850 851 /* 852 * Do not allow links to directories 853 */ 854 if (svp->v_type == VDIR) { 855 return (EPERM); 856 } 857 858 sip = VTOI(svp); 859 860 if (sip->i_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0) 861 return (EPERM); 862 863 tdp = VTOI(tdvp); 864 865 rw_enter(&tdp->i_rwlock, RW_WRITER); 866 error = ud_direnter(tdp, tnm, DE_LINK, (struct ud_inode *)0, 867 sip, (struct vattr *)0, (struct ud_inode **)0, cr, ct); 868 rw_exit(&tdp->i_rwlock); 869 ITIMES(sip); 870 ITIMES(tdp); 871 872 if (error == 0) { 873 vnevent_link(svp, ct); 874 } 875 876 return (error); 877 } 878 879 /* ARGSUSED */ 880 static int32_t 881 udf_rename( 882 struct vnode *sdvp, 883 char *snm, 884 struct vnode *tdvp, 885 char *tnm, 886 struct cred *cr, 887 caller_context_t *ct, 888 int flags) 889 { 890 int32_t error = 0; 891 struct udf_vfs *udf_vfsp; 892 struct ud_inode *sip; /* source inode */ 893 struct ud_inode *tip; /* target inode */ 894 struct ud_inode *sdp, *tdp; /* source and target parent inode */ 895 struct vnode *realvp; 896 897 ud_printf("udf_rename\n"); 898 899 if (VOP_REALVP(tdvp, &realvp, ct) == 0) { 900 tdvp = realvp; 901 } 902 903 sdp = VTOI(sdvp); 904 tdp = VTOI(tdvp); 905 906 udf_vfsp = sdp->i_udf; 907 908 mutex_enter(&udf_vfsp->udf_rename_lck); 909 /* 910 * Look up inode of file we're supposed to rename. 911 */ 912 if (error = ud_dirlook(sdp, snm, &sip, cr, 0)) { 913 mutex_exit(&udf_vfsp->udf_rename_lck); 914 return (error); 915 } 916 /* 917 * be sure this is not a directory with another file system mounted 918 * over it. If it is just give up the locks, and return with 919 * EBUSY 920 */ 921 if (vn_mountedvfs(ITOV(sip)) != NULL) { 922 error = EBUSY; 923 goto errout; 924 } 925 /* 926 * Make sure we can delete the source entry. This requires 927 * write permission on the containing directory. If that 928 * directory is "sticky" it further requires (except for 929 * privileged users) that the user own the directory or the 930 * source entry, or else have permission to write the source 931 * entry. 932 */ 933 rw_enter(&sdp->i_contents, RW_READER); 934 rw_enter(&sip->i_contents, RW_READER); 935 if ((error = ud_iaccess(sdp, IWRITE, cr, 0)) != 0 || 936 (error = ud_sticky_remove_access(sdp, sip, cr)) != 0) { 937 rw_exit(&sip->i_contents); 938 rw_exit(&sdp->i_contents); 939 ITIMES(sip); 940 goto errout; 941 } 942 943 /* 944 * Check for renaming '.' or '..' or alias of '.' 945 */ 946 if ((strcmp(snm, ".") == 0) || 947 (strcmp(snm, "..") == 0) || 948 (sdp == sip)) { 949 error = EINVAL; 950 rw_exit(&sip->i_contents); 951 rw_exit(&sdp->i_contents); 952 goto errout; 953 } 954 955 rw_exit(&sip->i_contents); 956 rw_exit(&sdp->i_contents); 957 958 if (ud_dirlook(tdp, tnm, &tip, cr, 0) == 0) { 959 vnevent_pre_rename_dest(ITOV(tip), tdvp, tnm, ct); 960 VN_RELE(ITOV(tip)); 961 } 962 963 /* Notify the target dir. if not the same as the source dir. */ 964 if (sdvp != tdvp) 965 vnevent_pre_rename_dest_dir(tdvp, ITOV(sip), tnm, ct); 966 967 vnevent_pre_rename_src(ITOV(sip), sdvp, snm, ct); 968 969 /* 970 * Link source to the target. 971 */ 972 rw_enter(&tdp->i_rwlock, RW_WRITER); 973 if (error = ud_direnter(tdp, tnm, DE_RENAME, sdp, sip, 974 (struct vattr *)0, (struct ud_inode **)0, cr, ct)) { 975 /* 976 * ESAME isn't really an error; it indicates that the 977 * operation should not be done because the source and target 978 * are the same file, but that no error should be reported. 979 */ 980 if (error == ESAME) { 981 error = 0; 982 } 983 rw_exit(&tdp->i_rwlock); 984 goto errout; 985 } 986 rw_exit(&tdp->i_rwlock); 987 988 rw_enter(&sdp->i_rwlock, RW_WRITER); 989 /* 990 * Unlink the source. 991 * Remove the source entry. ud_dirremove() checks that the entry 992 * still reflects sip, and returns an error if it doesn't. 993 * If the entry has changed just forget about it. Release 994 * the source inode. 995 */ 996 if ((error = ud_dirremove(sdp, snm, sip, (struct vnode *)0, 997 DR_RENAME, cr, ct)) == ENOENT) { 998 error = 0; 999 } 1000 rw_exit(&sdp->i_rwlock); 1001 1002 if (error == 0) { 1003 vnevent_rename_src(ITOV(sip), sdvp, snm, ct); 1004 /* 1005 * vnevent_rename_dest and vnevent_rename_dest_dir are called 1006 * in ud_direnter(). 1007 */ 1008 } 1009 1010 errout: 1011 ITIMES(sdp); 1012 ITIMES(tdp); 1013 VN_RELE(ITOV(sip)); 1014 mutex_exit(&udf_vfsp->udf_rename_lck); 1015 1016 return (error); 1017 } 1018 1019 /* ARGSUSED */ 1020 static int32_t 1021 udf_mkdir( 1022 struct vnode *dvp, 1023 char *dirname, 1024 struct vattr *vap, 1025 struct vnode **vpp, 1026 struct cred *cr, 1027 caller_context_t *ct, 1028 int flags, 1029 vsecattr_t *vsecp) 1030 { 1031 int32_t error; 1032 struct ud_inode *ip; 1033 struct ud_inode *xip; 1034 1035 ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); 1036 1037 ud_printf("udf_mkdir\n"); 1038 1039 ip = VTOI(dvp); 1040 rw_enter(&ip->i_rwlock, RW_WRITER); 1041 error = ud_direnter(ip, dirname, DE_MKDIR, 1042 (struct ud_inode *)0, (struct ud_inode *)0, vap, &xip, cr, ct); 1043 rw_exit(&ip->i_rwlock); 1044 ITIMES(ip); 1045 if (error == 0) { 1046 ip = xip; 1047 *vpp = ITOV(ip); 1048 ITIMES(ip); 1049 } else if (error == EEXIST) { 1050 ITIMES(xip); 1051 VN_RELE(ITOV(xip)); 1052 } 1053 1054 return (error); 1055 } 1056 1057 /* ARGSUSED */ 1058 static int32_t 1059 udf_rmdir( 1060 struct vnode *vp, 1061 char *nm, 1062 struct vnode *cdir, 1063 struct cred *cr, 1064 caller_context_t *ct, 1065 int flags) 1066 { 1067 int32_t error; 1068 struct ud_inode *ip = VTOI(vp); 1069 1070 ud_printf("udf_rmdir\n"); 1071 1072 rw_enter(&ip->i_rwlock, RW_WRITER); 1073 error = ud_dirremove(ip, nm, (struct ud_inode *)0, cdir, DR_RMDIR, 1074 cr, ct); 1075 rw_exit(&ip->i_rwlock); 1076 ITIMES(ip); 1077 1078 return (error); 1079 } 1080 1081 /* ARGSUSED */ 1082 static int32_t 1083 udf_readdir( 1084 struct vnode *vp, 1085 struct uio *uiop, 1086 struct cred *cr, 1087 int32_t *eofp, 1088 caller_context_t *ct, 1089 int flags) 1090 { 1091 struct ud_inode *ip; 1092 struct dirent64 *nd; 1093 struct udf_vfs *udf_vfsp; 1094 int32_t error = 0, len, outcount = 0; 1095 uint32_t dirsiz, offset; 1096 uint32_t bufsize, ndlen, dummy; 1097 caddr_t outbuf; 1098 caddr_t outb, end_outb; 1099 struct iovec *iovp; 1100 1101 uint8_t *dname; 1102 int32_t length; 1103 1104 uint8_t *buf = NULL; 1105 1106 struct fbuf *fbp = NULL; 1107 struct file_id *fid; 1108 uint8_t *name; 1109 1110 1111 ud_printf("udf_readdir\n"); 1112 1113 ip = VTOI(vp); 1114 udf_vfsp = ip->i_udf; 1115 1116 dirsiz = ip->i_size; 1117 if ((uiop->uio_offset >= dirsiz) || 1118 (ip->i_nlink <= 0)) { 1119 if (eofp) { 1120 *eofp = 1; 1121 } 1122 return (0); 1123 } 1124 1125 offset = uiop->uio_offset; 1126 iovp = uiop->uio_iov; 1127 bufsize = iovp->iov_len; 1128 1129 outb = outbuf = (char *)kmem_alloc((uint32_t)bufsize, KM_SLEEP); 1130 end_outb = outb + bufsize; 1131 nd = (struct dirent64 *)outbuf; 1132 1133 dname = (uint8_t *)kmem_zalloc(1024, KM_SLEEP); 1134 buf = (uint8_t *)kmem_zalloc(udf_vfsp->udf_lbsize, KM_SLEEP); 1135 1136 if (offset == 0) { 1137 len = DIRENT64_RECLEN(1); 1138 if (((caddr_t)nd + len) >= end_outb) { 1139 error = EINVAL; 1140 goto end; 1141 } 1142 nd->d_ino = ip->i_icb_lbano; 1143 nd->d_reclen = (uint16_t)len; 1144 nd->d_off = 0x10; 1145 nd->d_name[0] = '.'; 1146 bzero(&nd->d_name[1], DIRENT64_NAMELEN(len) - 1); 1147 nd = (struct dirent64 *)((char *)nd + nd->d_reclen); 1148 outcount++; 1149 } else if (offset == 0x10) { 1150 offset = 0; 1151 } 1152 1153 while (offset < dirsiz) { 1154 error = ud_get_next_fid(ip, &fbp, 1155 offset, &fid, &name, buf); 1156 if (error != 0) { 1157 break; 1158 } 1159 1160 if ((fid->fid_flags & FID_DELETED) == 0) { 1161 if (fid->fid_flags & FID_PARENT) { 1162 1163 len = DIRENT64_RECLEN(2); 1164 if (((caddr_t)nd + len) >= end_outb) { 1165 error = EINVAL; 1166 break; 1167 } 1168 1169 nd->d_ino = ip->i_icb_lbano; 1170 nd->d_reclen = (uint16_t)len; 1171 nd->d_off = offset + FID_LEN(fid); 1172 nd->d_name[0] = '.'; 1173 nd->d_name[1] = '.'; 1174 bzero(&nd->d_name[2], 1175 DIRENT64_NAMELEN(len) - 2); 1176 nd = (struct dirent64 *) 1177 ((char *)nd + nd->d_reclen); 1178 } else { 1179 if ((error = ud_uncompress(fid->fid_idlen, 1180 &length, name, dname)) != 0) { 1181 break; 1182 } 1183 if (length == 0) { 1184 offset += FID_LEN(fid); 1185 continue; 1186 } 1187 len = DIRENT64_RECLEN(length); 1188 if (((caddr_t)nd + len) >= end_outb) { 1189 if (!outcount) { 1190 error = EINVAL; 1191 } 1192 break; 1193 } 1194 (void) strncpy(nd->d_name, 1195 (caddr_t)dname, length); 1196 bzero(&nd->d_name[length], 1197 DIRENT64_NAMELEN(len) - length); 1198 nd->d_ino = ud_xlate_to_daddr(udf_vfsp, 1199 SWAP_16(fid->fid_icb.lad_ext_prn), 1200 SWAP_32(fid->fid_icb.lad_ext_loc), 1, 1201 &dummy); 1202 nd->d_reclen = (uint16_t)len; 1203 nd->d_off = offset + FID_LEN(fid); 1204 nd = (struct dirent64 *) 1205 ((char *)nd + nd->d_reclen); 1206 } 1207 outcount++; 1208 } 1209 1210 offset += FID_LEN(fid); 1211 } 1212 1213 end: 1214 if (fbp != NULL) { 1215 fbrelse(fbp, S_OTHER); 1216 } 1217 ndlen = ((char *)nd - outbuf); 1218 /* 1219 * In case of error do not call uiomove. 1220 * Return the error to the caller. 1221 */ 1222 if ((error == 0) && (ndlen != 0)) { 1223 error = uiomove(outbuf, (long)ndlen, UIO_READ, uiop); 1224 uiop->uio_offset = offset; 1225 } 1226 kmem_free((caddr_t)buf, udf_vfsp->udf_lbsize); 1227 kmem_free((caddr_t)dname, 1024); 1228 kmem_free(outbuf, (uint32_t)bufsize); 1229 if (eofp && error == 0) { 1230 *eofp = (uiop->uio_offset >= dirsiz); 1231 } 1232 return (error); 1233 } 1234 1235 /* ARGSUSED */ 1236 static int32_t 1237 udf_symlink( 1238 struct vnode *dvp, 1239 char *linkname, 1240 struct vattr *vap, 1241 char *target, 1242 struct cred *cr, 1243 caller_context_t *ct, 1244 int flags) 1245 { 1246 int32_t error = 0, outlen; 1247 uint32_t ioflag = 0; 1248 struct ud_inode *ip, *dip = VTOI(dvp); 1249 1250 struct path_comp *pc; 1251 int8_t *dname = NULL, *uname = NULL, *sp; 1252 1253 ud_printf("udf_symlink\n"); 1254 1255 ip = (struct ud_inode *)0; 1256 vap->va_type = VLNK; 1257 vap->va_rdev = 0; 1258 1259 rw_enter(&dip->i_rwlock, RW_WRITER); 1260 error = ud_direnter(dip, linkname, DE_CREATE, 1261 (struct ud_inode *)0, (struct ud_inode *)0, vap, &ip, cr, ct); 1262 rw_exit(&dip->i_rwlock); 1263 if (error == 0) { 1264 dname = kmem_zalloc(1024, KM_SLEEP); 1265 uname = kmem_zalloc(PAGESIZE, KM_SLEEP); 1266 1267 pc = (struct path_comp *)uname; 1268 /* 1269 * If the first character in target is "/" 1270 * then skip it and create entry for it 1271 */ 1272 if (*target == '/') { 1273 pc->pc_type = 2; 1274 pc->pc_len = 0; 1275 pc = (struct path_comp *)(((char *)pc) + 4); 1276 while (*target == '/') { 1277 target++; 1278 } 1279 } 1280 1281 while (*target != NULL) { 1282 sp = target; 1283 while ((*target != '/') && (*target != '\0')) { 1284 target ++; 1285 } 1286 /* 1287 * We got the next component of the 1288 * path name. Create path_comp of 1289 * appropriate type 1290 */ 1291 if (((target - sp) == 1) && (*sp == '.')) { 1292 /* 1293 * Dot entry. 1294 */ 1295 pc->pc_type = 4; 1296 pc = (struct path_comp *)(((char *)pc) + 4); 1297 } else if (((target - sp) == 2) && 1298 (*sp == '.') && ((*(sp + 1)) == '.')) { 1299 /* 1300 * DotDot entry. 1301 */ 1302 pc->pc_type = 3; 1303 pc = (struct path_comp *)(((char *)pc) + 4); 1304 } else { 1305 /* 1306 * convert the user given name 1307 * into appropriate form to be put 1308 * on the media 1309 */ 1310 outlen = 1024; /* set to size of dname */ 1311 if (error = ud_compress(target - sp, &outlen, 1312 (uint8_t *)sp, (uint8_t *)dname)) { 1313 break; 1314 } 1315 pc->pc_type = 5; 1316 /* LINTED */ 1317 pc->pc_len = outlen; 1318 dname[outlen] = '\0'; 1319 (void) strcpy((char *)pc->pc_id, dname); 1320 pc = (struct path_comp *) 1321 (((char *)pc) + 4 + outlen); 1322 } 1323 while (*target == '/') { 1324 target++; 1325 } 1326 if (*target == NULL) { 1327 break; 1328 } 1329 } 1330 1331 rw_enter(&ip->i_contents, RW_WRITER); 1332 if (error == 0) { 1333 ioflag = FWRITE; 1334 if (curthread->t_flag & T_DONTPEND) { 1335 ioflag |= FDSYNC; 1336 } 1337 error = ud_rdwri(UIO_WRITE, ioflag, ip, 1338 uname, ((int8_t *)pc) - uname, 1339 (offset_t)0, UIO_SYSSPACE, (int32_t *)0, cr); 1340 } 1341 if (error) { 1342 ud_idrop(ip); 1343 rw_exit(&ip->i_contents); 1344 rw_enter(&dip->i_rwlock, RW_WRITER); 1345 (void) ud_dirremove(dip, linkname, (struct ud_inode *)0, 1346 (struct vnode *)0, DR_REMOVE, cr, ct); 1347 rw_exit(&dip->i_rwlock); 1348 goto update_inode; 1349 } 1350 rw_exit(&ip->i_contents); 1351 } 1352 1353 if ((error == 0) || (error == EEXIST)) { 1354 VN_RELE(ITOV(ip)); 1355 } 1356 1357 update_inode: 1358 ITIMES(VTOI(dvp)); 1359 if (uname != NULL) { 1360 kmem_free(uname, PAGESIZE); 1361 } 1362 if (dname != NULL) { 1363 kmem_free(dname, 1024); 1364 } 1365 1366 return (error); 1367 } 1368 1369 /* ARGSUSED */ 1370 static int32_t 1371 udf_readlink( 1372 struct vnode *vp, 1373 struct uio *uiop, 1374 struct cred *cr, 1375 caller_context_t *ct) 1376 { 1377 int32_t error = 0, off, id_len, size, len; 1378 int8_t *dname = NULL, *uname = NULL; 1379 struct ud_inode *ip; 1380 struct fbuf *fbp = NULL; 1381 struct path_comp *pc; 1382 1383 ud_printf("udf_readlink\n"); 1384 1385 if (vp->v_type != VLNK) { 1386 return (EINVAL); 1387 } 1388 1389 ip = VTOI(vp); 1390 size = ip->i_size; 1391 if (size > PAGESIZE) { 1392 return (EIO); 1393 } 1394 1395 if (size == 0) { 1396 return (0); 1397 } 1398 1399 dname = kmem_zalloc(1024, KM_SLEEP); 1400 uname = kmem_zalloc(PAGESIZE, KM_SLEEP); 1401 1402 rw_enter(&ip->i_contents, RW_READER); 1403 1404 if ((error = fbread(vp, 0, size, S_READ, &fbp)) != 0) { 1405 goto end; 1406 } 1407 1408 off = 0; 1409 1410 while (off < size) { 1411 pc = (struct path_comp *)(fbp->fb_addr + off); 1412 switch (pc->pc_type) { 1413 case 1 : 1414 (void) strcpy(uname, ip->i_udf->udf_fsmnt); 1415 (void) strcat(uname, "/"); 1416 break; 1417 case 2 : 1418 if (pc->pc_len != 0) { 1419 goto end; 1420 } 1421 uname[0] = '/'; 1422 uname[1] = '\0'; 1423 break; 1424 case 3 : 1425 (void) strcat(uname, "../"); 1426 break; 1427 case 4 : 1428 (void) strcat(uname, "./"); 1429 break; 1430 case 5 : 1431 if ((error = ud_uncompress(pc->pc_len, &id_len, 1432 pc->pc_id, (uint8_t *)dname)) != 0) { 1433 break; 1434 } 1435 dname[id_len] = '\0'; 1436 (void) strcat(uname, dname); 1437 (void) strcat(uname, "/"); 1438 break; 1439 default : 1440 error = EINVAL; 1441 goto end; 1442 } 1443 off += 4 + pc->pc_len; 1444 } 1445 len = strlen(uname) - 1; 1446 if (uname[len] == '/') { 1447 if (len == 0) { 1448 /* 1449 * special case link to / 1450 */ 1451 len = 1; 1452 } else { 1453 uname[len] = '\0'; 1454 } 1455 } 1456 1457 error = uiomove(uname, len, UIO_READ, uiop); 1458 1459 ITIMES(ip); 1460 1461 end: 1462 if (fbp != NULL) { 1463 fbrelse(fbp, S_OTHER); 1464 } 1465 rw_exit(&ip->i_contents); 1466 if (uname != NULL) { 1467 kmem_free(uname, PAGESIZE); 1468 } 1469 if (dname != NULL) { 1470 kmem_free(dname, 1024); 1471 } 1472 return (error); 1473 } 1474 1475 /* ARGSUSED */ 1476 static int32_t 1477 udf_fsync( 1478 struct vnode *vp, 1479 int32_t syncflag, 1480 struct cred *cr, 1481 caller_context_t *ct) 1482 { 1483 int32_t error = 0; 1484 struct ud_inode *ip = VTOI(vp); 1485 1486 ud_printf("udf_fsync\n"); 1487 1488 rw_enter(&ip->i_contents, RW_WRITER); 1489 if (!(IS_SWAPVP(vp))) { 1490 error = ud_syncip(ip, 0, I_SYNC); /* Do synchronous writes */ 1491 } 1492 if (error == 0) { 1493 error = ud_sync_indir(ip); 1494 } 1495 ITIMES(ip); /* XXX: is this necessary ??? */ 1496 rw_exit(&ip->i_contents); 1497 1498 return (error); 1499 } 1500 1501 /* ARGSUSED */ 1502 static void 1503 udf_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct) 1504 { 1505 ud_printf("udf_iinactive\n"); 1506 1507 ud_iinactive(VTOI(vp), cr); 1508 } 1509 1510 /* ARGSUSED */ 1511 static int32_t 1512 udf_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct) 1513 { 1514 struct udf_fid *udfidp; 1515 struct ud_inode *ip = VTOI(vp); 1516 1517 ud_printf("udf_fid\n"); 1518 1519 if (fidp->fid_len < (sizeof (struct udf_fid) - sizeof (uint16_t))) { 1520 fidp->fid_len = sizeof (struct udf_fid) - sizeof (uint16_t); 1521 return (ENOSPC); 1522 } 1523 1524 udfidp = (struct udf_fid *)fidp; 1525 bzero((char *)udfidp, sizeof (struct udf_fid)); 1526 rw_enter(&ip->i_contents, RW_READER); 1527 udfidp->udfid_len = sizeof (struct udf_fid) - sizeof (uint16_t); 1528 udfidp->udfid_uinq_lo = ip->i_uniqid & 0xffffffff; 1529 udfidp->udfid_prn = ip->i_icb_prn; 1530 udfidp->udfid_icb_lbn = ip->i_icb_block; 1531 rw_exit(&ip->i_contents); 1532 1533 return (0); 1534 } 1535 1536 /* ARGSUSED2 */ 1537 static int 1538 udf_rwlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp) 1539 { 1540 struct ud_inode *ip = VTOI(vp); 1541 1542 ud_printf("udf_rwlock\n"); 1543 1544 if (write_lock) { 1545 rw_enter(&ip->i_rwlock, RW_WRITER); 1546 } else { 1547 rw_enter(&ip->i_rwlock, RW_READER); 1548 } 1549 return (write_lock); 1550 } 1551 1552 /* ARGSUSED */ 1553 static void 1554 udf_rwunlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp) 1555 { 1556 struct ud_inode *ip = VTOI(vp); 1557 1558 ud_printf("udf_rwunlock\n"); 1559 1560 rw_exit(&ip->i_rwlock); 1561 1562 } 1563 1564 /* ARGSUSED */ 1565 static int32_t 1566 udf_seek(struct vnode *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct) 1567 { 1568 return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); 1569 } 1570 1571 static int32_t 1572 udf_frlock( 1573 struct vnode *vp, 1574 int32_t cmd, 1575 struct flock64 *bfp, 1576 int32_t flag, 1577 offset_t offset, 1578 struct flk_callback *flk_cbp, 1579 cred_t *cr, 1580 caller_context_t *ct) 1581 { 1582 struct ud_inode *ip = VTOI(vp); 1583 1584 ud_printf("udf_frlock\n"); 1585 1586 /* 1587 * If file is being mapped, disallow frlock. 1588 * XXX I am not holding tlock while checking i_mapcnt because the 1589 * current locking strategy drops all locks before calling fs_frlock. 1590 * So, mapcnt could change before we enter fs_frlock making is 1591 * meaningless to have held tlock in the first place. 1592 */ 1593 if ((ip->i_mapcnt > 0) && 1594 (MANDLOCK(vp, ip->i_char))) { 1595 return (EAGAIN); 1596 } 1597 1598 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); 1599 } 1600 1601 /*ARGSUSED6*/ 1602 static int32_t 1603 udf_space( 1604 struct vnode *vp, 1605 int32_t cmd, 1606 struct flock64 *bfp, 1607 int32_t flag, 1608 offset_t offset, 1609 cred_t *cr, 1610 caller_context_t *ct) 1611 { 1612 int32_t error = 0; 1613 1614 ud_printf("udf_space\n"); 1615 1616 if (cmd != F_FREESP) { 1617 error = EINVAL; 1618 } else if ((error = convoff(vp, bfp, 0, offset)) == 0) { 1619 error = ud_freesp(vp, bfp, flag, cr); 1620 1621 if (error == 0 && bfp->l_start == 0) 1622 vnevent_truncate(vp, ct); 1623 } 1624 1625 return (error); 1626 } 1627 1628 /* ARGSUSED */ 1629 static int32_t 1630 udf_getpage( 1631 struct vnode *vp, 1632 offset_t off, 1633 size_t len, 1634 uint32_t *protp, 1635 struct page **plarr, 1636 size_t plsz, 1637 struct seg *seg, 1638 caddr_t addr, 1639 enum seg_rw rw, 1640 struct cred *cr, 1641 caller_context_t *ct) 1642 { 1643 struct ud_inode *ip = VTOI(vp); 1644 int32_t error, has_holes, beyond_eof, seqmode, dolock; 1645 int32_t pgsize = PAGESIZE; 1646 struct udf_vfs *udf_vfsp = ip->i_udf; 1647 page_t **pl; 1648 u_offset_t pgoff, eoff, uoff; 1649 krw_t rwtype; 1650 caddr_t pgaddr; 1651 1652 ud_printf("udf_getpage\n"); 1653 1654 uoff = (u_offset_t)off; /* type conversion */ 1655 if (protp) { 1656 *protp = PROT_ALL; 1657 } 1658 if (vp->v_flag & VNOMAP) { 1659 return (ENOSYS); 1660 } 1661 seqmode = ip->i_nextr == uoff && rw != S_CREATE; 1662 1663 rwtype = RW_READER; 1664 dolock = (rw_owner(&ip->i_contents) != curthread); 1665 retrylock: 1666 if (dolock) { 1667 rw_enter(&ip->i_contents, rwtype); 1668 } 1669 1670 /* 1671 * We may be getting called as a side effect of a bmap using 1672 * fbread() when the blocks might be being allocated and the 1673 * size has not yet been up'ed. In this case we want to be 1674 * able to return zero pages if we get back UDF_HOLE from 1675 * calling bmap for a non write case here. We also might have 1676 * to read some frags from the disk into a page if we are 1677 * extending the number of frags for a given lbn in bmap(). 1678 */ 1679 beyond_eof = uoff + len > ip->i_size + PAGEOFFSET; 1680 if (beyond_eof && seg != segkmap) { 1681 if (dolock) { 1682 rw_exit(&ip->i_contents); 1683 } 1684 return (EFAULT); 1685 } 1686 1687 /* 1688 * Must hold i_contents lock throughout the call to pvn_getpages 1689 * since locked pages are returned from each call to ud_getapage. 1690 * Must *not* return locked pages and then try for contents lock 1691 * due to lock ordering requirements (inode > page) 1692 */ 1693 1694 has_holes = ud_bmap_has_holes(ip); 1695 1696 if ((rw == S_WRITE || rw == S_CREATE) && (has_holes || beyond_eof)) { 1697 int32_t blk_size, count; 1698 u_offset_t offset; 1699 1700 /* 1701 * We must acquire the RW_WRITER lock in order to 1702 * call bmap_write(). 1703 */ 1704 if (dolock && rwtype == RW_READER) { 1705 rwtype = RW_WRITER; 1706 1707 if (!rw_tryupgrade(&ip->i_contents)) { 1708 1709 rw_exit(&ip->i_contents); 1710 1711 goto retrylock; 1712 } 1713 } 1714 1715 /* 1716 * May be allocating disk blocks for holes here as 1717 * a result of mmap faults. write(2) does the bmap_write 1718 * in rdip/wrip, not here. We are not dealing with frags 1719 * in this case. 1720 */ 1721 offset = uoff; 1722 while ((offset < uoff + len) && 1723 (offset < ip->i_size)) { 1724 /* 1725 * the variable "bnp" is to simplify the expression for 1726 * the compiler; * just passing in &bn to bmap_write 1727 * causes a compiler "loop" 1728 */ 1729 1730 blk_size = udf_vfsp->udf_lbsize; 1731 if ((offset + blk_size) > ip->i_size) { 1732 count = ip->i_size - offset; 1733 } else { 1734 count = blk_size; 1735 } 1736 error = ud_bmap_write(ip, offset, count, 0, cr); 1737 if (error) { 1738 goto update_inode; 1739 } 1740 offset += count; /* XXX - make this contig */ 1741 } 1742 } 1743 1744 /* 1745 * Can be a reader from now on. 1746 */ 1747 if (dolock && rwtype == RW_WRITER) { 1748 rw_downgrade(&ip->i_contents); 1749 } 1750 1751 /* 1752 * We remove PROT_WRITE in cases when the file has UDF holes 1753 * because we don't want to call bmap_read() to check each 1754 * page if it is backed with a disk block. 1755 */ 1756 if (protp && has_holes && rw != S_WRITE && rw != S_CREATE) { 1757 *protp &= ~PROT_WRITE; 1758 } 1759 1760 error = 0; 1761 1762 /* 1763 * The loop looks up pages in the range <off, off + len). 1764 * For each page, we first check if we should initiate an asynchronous 1765 * read ahead before we call page_lookup (we may sleep in page_lookup 1766 * for a previously initiated disk read). 1767 */ 1768 eoff = (uoff + len); 1769 for (pgoff = uoff, pgaddr = addr, pl = plarr; 1770 pgoff < eoff; /* empty */) { 1771 page_t *pp; 1772 u_offset_t nextrio; 1773 se_t se; 1774 1775 se = ((rw == S_CREATE) ? SE_EXCL : SE_SHARED); 1776 1777 /* 1778 * Handle async getpage (faultahead) 1779 */ 1780 if (plarr == NULL) { 1781 ip->i_nextrio = pgoff; 1782 ud_getpage_ra(vp, pgoff, seg, pgaddr); 1783 pgoff += pgsize; 1784 pgaddr += pgsize; 1785 continue; 1786 } 1787 1788 /* 1789 * Check if we should initiate read ahead of next cluster. 1790 * We call page_exists only when we need to confirm that 1791 * we have the current page before we initiate the read ahead. 1792 */ 1793 nextrio = ip->i_nextrio; 1794 if (seqmode && 1795 pgoff + RD_CLUSTSZ(ip) >= nextrio && pgoff <= nextrio && 1796 nextrio < ip->i_size && page_exists(vp, pgoff)) 1797 ud_getpage_ra(vp, pgoff, seg, pgaddr); 1798 1799 if ((pp = page_lookup(vp, pgoff, se)) != NULL) { 1800 1801 /* 1802 * We found the page in the page cache. 1803 */ 1804 *pl++ = pp; 1805 pgoff += pgsize; 1806 pgaddr += pgsize; 1807 len -= pgsize; 1808 plsz -= pgsize; 1809 } else { 1810 1811 /* 1812 * We have to create the page, or read it from disk. 1813 */ 1814 if (error = ud_getpage_miss(vp, pgoff, len, 1815 seg, pgaddr, pl, plsz, rw, seqmode)) { 1816 goto error_out; 1817 } 1818 1819 while (*pl != NULL) { 1820 pl++; 1821 pgoff += pgsize; 1822 pgaddr += pgsize; 1823 len -= pgsize; 1824 plsz -= pgsize; 1825 } 1826 } 1827 } 1828 1829 /* 1830 * Return pages up to plsz if they are in the page cache. 1831 * We cannot return pages if there is a chance that they are 1832 * backed with a UDF hole and rw is S_WRITE or S_CREATE. 1833 */ 1834 if (plarr && !(has_holes && (rw == S_WRITE || rw == S_CREATE))) { 1835 1836 ASSERT((protp == NULL) || 1837 !(has_holes && (*protp & PROT_WRITE))); 1838 1839 eoff = pgoff + plsz; 1840 while (pgoff < eoff) { 1841 page_t *pp; 1842 1843 if ((pp = page_lookup_nowait(vp, pgoff, 1844 SE_SHARED)) == NULL) 1845 break; 1846 1847 *pl++ = pp; 1848 pgoff += pgsize; 1849 plsz -= pgsize; 1850 } 1851 } 1852 1853 if (plarr) 1854 *pl = NULL; /* Terminate page list */ 1855 ip->i_nextr = pgoff; 1856 1857 error_out: 1858 if (error && plarr) { 1859 /* 1860 * Release any pages we have locked. 1861 */ 1862 while (pl > &plarr[0]) 1863 page_unlock(*--pl); 1864 1865 plarr[0] = NULL; 1866 } 1867 1868 update_inode: 1869 if (dolock) { 1870 rw_exit(&ip->i_contents); 1871 } 1872 1873 /* 1874 * If the inode is not already marked for IACC (in rwip() for read) 1875 * and the inode is not marked for no access time update (in rwip() 1876 * for write) then update the inode access time and mod time now. 1877 */ 1878 mutex_enter(&ip->i_tlock); 1879 if ((ip->i_flag & (IACC | INOACC)) == 0) { 1880 if ((rw != S_OTHER) && (ip->i_type != VDIR)) { 1881 ip->i_flag |= IACC; 1882 } 1883 if (rw == S_WRITE) { 1884 ip->i_flag |= IUPD; 1885 } 1886 ITIMES_NOLOCK(ip); 1887 } 1888 mutex_exit(&ip->i_tlock); 1889 1890 return (error); 1891 } 1892 1893 int32_t ud_delay = 1; 1894 1895 /* ARGSUSED */ 1896 static int32_t 1897 udf_putpage( 1898 struct vnode *vp, 1899 offset_t off, 1900 size_t len, 1901 int32_t flags, 1902 struct cred *cr, 1903 caller_context_t *ct) 1904 { 1905 struct ud_inode *ip; 1906 int32_t error = 0; 1907 1908 ud_printf("udf_putpage\n"); 1909 1910 ip = VTOI(vp); 1911 1912 if (vp->v_count == 0) { 1913 cmn_err(CE_WARN, "ud_putpage : bad v_count"); 1914 error = EINVAL; 1915 goto out; 1916 } 1917 1918 if (vp->v_flag & VNOMAP) { 1919 error = ENOSYS; 1920 goto out; 1921 } 1922 1923 if (flags & B_ASYNC) { 1924 if (ud_delay && len && 1925 (flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0) { 1926 mutex_enter(&ip->i_tlock); 1927 1928 /* 1929 * If nobody stalled, start a new cluster. 1930 */ 1931 if (ip->i_delaylen == 0) { 1932 ip->i_delayoff = off; 1933 ip->i_delaylen = len; 1934 mutex_exit(&ip->i_tlock); 1935 goto out; 1936 } 1937 1938 /* 1939 * If we have a full cluster or they are not contig, 1940 * then push last cluster and start over. 1941 */ 1942 if (ip->i_delaylen >= WR_CLUSTSZ(ip) || 1943 ip->i_delayoff + ip->i_delaylen != off) { 1944 u_offset_t doff; 1945 size_t dlen; 1946 1947 doff = ip->i_delayoff; 1948 dlen = ip->i_delaylen; 1949 ip->i_delayoff = off; 1950 ip->i_delaylen = len; 1951 mutex_exit(&ip->i_tlock); 1952 error = ud_putpages(vp, doff, dlen, flags, cr); 1953 /* LMXXX - flags are new val, not old */ 1954 goto out; 1955 } 1956 1957 /* 1958 * There is something there, it's not full, and 1959 * it is contig. 1960 */ 1961 ip->i_delaylen += len; 1962 mutex_exit(&ip->i_tlock); 1963 goto out; 1964 } 1965 1966 /* 1967 * Must have weird flags or we are not clustering. 1968 */ 1969 } 1970 1971 error = ud_putpages(vp, off, len, flags, cr); 1972 1973 out: 1974 return (error); 1975 } 1976 1977 /* ARGSUSED */ 1978 static int32_t 1979 udf_map( 1980 struct vnode *vp, 1981 offset_t off, 1982 struct as *as, 1983 caddr_t *addrp, 1984 size_t len, 1985 uint8_t prot, 1986 uint8_t maxprot, 1987 uint32_t flags, 1988 struct cred *cr, 1989 caller_context_t *ct) 1990 { 1991 struct segvn_crargs vn_a; 1992 int32_t error = 0; 1993 1994 ud_printf("udf_map\n"); 1995 1996 if (vp->v_flag & VNOMAP) { 1997 error = ENOSYS; 1998 goto end; 1999 } 2000 2001 if ((off < (offset_t)0) || 2002 ((off + len) < (offset_t)0)) { 2003 error = EINVAL; 2004 goto end; 2005 } 2006 2007 if (vp->v_type != VREG) { 2008 error = ENODEV; 2009 goto end; 2010 } 2011 2012 /* 2013 * If file is being locked, disallow mapping. 2014 */ 2015 if (vn_has_mandatory_locks(vp, VTOI(vp)->i_char)) { 2016 error = EAGAIN; 2017 goto end; 2018 } 2019 2020 as_rangelock(as); 2021 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); 2022 if (error != 0) { 2023 as_rangeunlock(as); 2024 goto end; 2025 } 2026 2027 vn_a.vp = vp; 2028 vn_a.offset = off; 2029 vn_a.type = flags & MAP_TYPE; 2030 vn_a.prot = prot; 2031 vn_a.maxprot = maxprot; 2032 vn_a.cred = cr; 2033 vn_a.amp = NULL; 2034 vn_a.flags = flags & ~MAP_TYPE; 2035 vn_a.szc = 0; 2036 vn_a.lgrp_mem_policy_flags = 0; 2037 2038 error = as_map(as, *addrp, len, segvn_create, (caddr_t)&vn_a); 2039 as_rangeunlock(as); 2040 2041 end: 2042 return (error); 2043 } 2044 2045 /* ARGSUSED */ 2046 static int32_t 2047 udf_addmap(struct vnode *vp, 2048 offset_t off, 2049 struct as *as, 2050 caddr_t addr, 2051 size_t len, 2052 uint8_t prot, 2053 uint8_t maxprot, 2054 uint32_t flags, 2055 struct cred *cr, 2056 caller_context_t *ct) 2057 { 2058 struct ud_inode *ip = VTOI(vp); 2059 2060 ud_printf("udf_addmap\n"); 2061 2062 if (vp->v_flag & VNOMAP) { 2063 return (ENOSYS); 2064 } 2065 2066 mutex_enter(&ip->i_tlock); 2067 ip->i_mapcnt += btopr(len); 2068 mutex_exit(&ip->i_tlock); 2069 2070 return (0); 2071 } 2072 2073 /* ARGSUSED */ 2074 static int32_t 2075 udf_delmap( 2076 struct vnode *vp, offset_t off, 2077 struct as *as, 2078 caddr_t addr, 2079 size_t len, 2080 uint32_t prot, 2081 uint32_t maxprot, 2082 uint32_t flags, 2083 struct cred *cr, 2084 caller_context_t *ct) 2085 { 2086 struct ud_inode *ip = VTOI(vp); 2087 2088 ud_printf("udf_delmap\n"); 2089 2090 if (vp->v_flag & VNOMAP) { 2091 return (ENOSYS); 2092 } 2093 2094 mutex_enter(&ip->i_tlock); 2095 ip->i_mapcnt -= btopr(len); /* Count released mappings */ 2096 ASSERT(ip->i_mapcnt >= 0); 2097 mutex_exit(&ip->i_tlock); 2098 2099 return (0); 2100 } 2101 2102 /* ARGSUSED */ 2103 static int32_t 2104 udf_l_pathconf( 2105 struct vnode *vp, 2106 int32_t cmd, 2107 ulong_t *valp, 2108 struct cred *cr, 2109 caller_context_t *ct) 2110 { 2111 int32_t error = 0; 2112 2113 ud_printf("udf_l_pathconf\n"); 2114 2115 if (cmd == _PC_FILESIZEBITS) { 2116 /* 2117 * udf supports 64 bits as file size 2118 * but there are several other restrictions 2119 * it only supports 32-bit block numbers and 2120 * daddr32_t is only and int32_t so taking these 2121 * into account we can stay just as where ufs is 2122 */ 2123 *valp = 41; 2124 } else if (cmd == _PC_TIMESTAMP_RESOLUTION) { 2125 /* nanosecond timestamp resolution */ 2126 *valp = 1L; 2127 } else { 2128 error = fs_pathconf(vp, cmd, valp, cr, ct); 2129 } 2130 2131 return (error); 2132 } 2133 2134 uint32_t ud_pageio_reads = 0, ud_pageio_writes = 0; 2135 2136 /* 2137 * Assumption is that there will not be a pageio request 2138 * to a enbedded file 2139 */ 2140 /* ARGSUSED */ 2141 static int32_t 2142 udf_pageio( 2143 struct vnode *vp, 2144 struct page *pp, 2145 u_offset_t io_off, 2146 size_t io_len, 2147 int32_t flags, 2148 struct cred *cr, 2149 caller_context_t *ct) 2150 { 2151 daddr_t bn; 2152 struct buf *bp; 2153 struct ud_inode *ip = VTOI(vp); 2154 int32_t dolock, error = 0, contig, multi_io; 2155 size_t done_len = 0, cur_len = 0; 2156 page_t *npp = NULL, *opp = NULL, *cpp = pp; 2157 2158 if (pp == NULL) { 2159 return (EINVAL); 2160 } 2161 2162 dolock = (rw_owner(&ip->i_contents) != curthread); 2163 2164 /* 2165 * We need a better check. Ideally, we would use another 2166 * vnodeops so that hlocked and forcibly unmounted file 2167 * systems would return EIO where appropriate and w/o the 2168 * need for these checks. 2169 */ 2170 if (ip->i_udf == NULL) { 2171 return (EIO); 2172 } 2173 2174 if (dolock) { 2175 rw_enter(&ip->i_contents, RW_READER); 2176 } 2177 2178 /* 2179 * Break the io request into chunks, one for each contiguous 2180 * stretch of disk blocks in the target file. 2181 */ 2182 while (done_len < io_len) { 2183 ASSERT(cpp); 2184 bp = NULL; 2185 contig = 0; 2186 if (error = ud_bmap_read(ip, (u_offset_t)(io_off + done_len), 2187 &bn, &contig)) { 2188 break; 2189 } 2190 2191 if (bn == UDF_HOLE) { /* No holey swapfiles */ 2192 cmn_err(CE_WARN, "SWAP file has HOLES"); 2193 error = EINVAL; 2194 break; 2195 } 2196 2197 cur_len = MIN(io_len - done_len, contig); 2198 2199 /* 2200 * Check if more than one I/O is 2201 * required to complete the given 2202 * I/O operation 2203 */ 2204 if (ip->i_udf->udf_lbsize < PAGESIZE) { 2205 if (cur_len >= PAGESIZE) { 2206 multi_io = 0; 2207 cur_len &= PAGEMASK; 2208 } else { 2209 multi_io = 1; 2210 cur_len = MIN(io_len - done_len, PAGESIZE); 2211 } 2212 } 2213 page_list_break(&cpp, &npp, btop(cur_len)); 2214 2215 bp = pageio_setup(cpp, cur_len, ip->i_devvp, flags); 2216 ASSERT(bp != NULL); 2217 2218 bp->b_edev = ip->i_dev; 2219 bp->b_dev = cmpdev(ip->i_dev); 2220 bp->b_blkno = bn; 2221 bp->b_un.b_addr = (caddr_t)0; 2222 bp->b_file = vp; 2223 bp->b_offset = (offset_t)(io_off + done_len); 2224 2225 /* 2226 * ub.ub_pageios.value.ul++; 2227 */ 2228 if (multi_io == 0) { 2229 (void) bdev_strategy(bp); 2230 } else { 2231 error = ud_multi_strat(ip, cpp, bp, 2232 (u_offset_t)(io_off + done_len)); 2233 if (error != 0) { 2234 pageio_done(bp); 2235 break; 2236 } 2237 } 2238 if (flags & B_READ) { 2239 ud_pageio_reads++; 2240 } else { 2241 ud_pageio_writes++; 2242 } 2243 2244 /* 2245 * If the request is not B_ASYNC, wait for i/o to complete 2246 * and re-assemble the page list to return to the caller. 2247 * If it is B_ASYNC we leave the page list in pieces and 2248 * cleanup() will dispose of them. 2249 */ 2250 if ((flags & B_ASYNC) == 0) { 2251 error = biowait(bp); 2252 pageio_done(bp); 2253 if (error) { 2254 break; 2255 } 2256 page_list_concat(&opp, &cpp); 2257 } 2258 cpp = npp; 2259 npp = NULL; 2260 done_len += cur_len; 2261 } 2262 2263 ASSERT(error || (cpp == NULL && npp == NULL && done_len == io_len)); 2264 if (error) { 2265 if (flags & B_ASYNC) { 2266 /* Cleanup unprocessed parts of list */ 2267 page_list_concat(&cpp, &npp); 2268 if (flags & B_READ) { 2269 pvn_read_done(cpp, B_ERROR); 2270 } else { 2271 pvn_write_done(cpp, B_ERROR); 2272 } 2273 } else { 2274 /* Re-assemble list and let caller clean up */ 2275 page_list_concat(&opp, &cpp); 2276 page_list_concat(&opp, &npp); 2277 } 2278 } 2279 2280 if (dolock) { 2281 rw_exit(&ip->i_contents); 2282 } 2283 2284 return (error); 2285 } 2286 2287 2288 2289 2290 /* -------------------- local functions --------------------------- */ 2291 2292 2293 2294 int32_t 2295 ud_rdwri(enum uio_rw rw, int32_t ioflag, 2296 struct ud_inode *ip, caddr_t base, int32_t len, 2297 offset_t offset, enum uio_seg seg, int32_t *aresid, struct cred *cr) 2298 { 2299 int32_t error; 2300 struct uio auio; 2301 struct iovec aiov; 2302 2303 ud_printf("ud_rdwri\n"); 2304 2305 bzero((caddr_t)&auio, sizeof (uio_t)); 2306 bzero((caddr_t)&aiov, sizeof (iovec_t)); 2307 2308 aiov.iov_base = base; 2309 aiov.iov_len = len; 2310 auio.uio_iov = &aiov; 2311 auio.uio_iovcnt = 1; 2312 auio.uio_loffset = offset; 2313 auio.uio_segflg = (int16_t)seg; 2314 auio.uio_resid = len; 2315 2316 if (rw == UIO_WRITE) { 2317 auio.uio_fmode = FWRITE; 2318 auio.uio_extflg = UIO_COPY_DEFAULT; 2319 auio.uio_llimit = curproc->p_fsz_ctl; 2320 error = ud_wrip(ip, &auio, ioflag, cr); 2321 } else { 2322 auio.uio_fmode = FREAD; 2323 auio.uio_extflg = UIO_COPY_CACHED; 2324 auio.uio_llimit = MAXOFFSET_T; 2325 error = ud_rdip(ip, &auio, ioflag, cr); 2326 } 2327 2328 if (aresid) { 2329 *aresid = auio.uio_resid; 2330 } else if (auio.uio_resid) { 2331 error = EIO; 2332 } 2333 return (error); 2334 } 2335 2336 /* 2337 * Free behind hacks. The pager is busted. 2338 * XXX - need to pass the information down to writedone() in a flag like B_SEQ 2339 * or B_FREE_IF_TIGHT_ON_MEMORY. 2340 */ 2341 int32_t ud_freebehind = 1; 2342 int32_t ud_smallfile = 32 * 1024; 2343 2344 /* ARGSUSED */ 2345 int32_t 2346 ud_getpage_miss(struct vnode *vp, u_offset_t off, 2347 size_t len, struct seg *seg, caddr_t addr, page_t *pl[], 2348 size_t plsz, enum seg_rw rw, int32_t seq) 2349 { 2350 struct ud_inode *ip = VTOI(vp); 2351 int32_t err = 0; 2352 size_t io_len; 2353 u_offset_t io_off; 2354 u_offset_t pgoff; 2355 page_t *pp; 2356 2357 pl[0] = NULL; 2358 2359 /* 2360 * Figure out whether the page can be created, or must be 2361 * read from the disk 2362 */ 2363 if (rw == S_CREATE) { 2364 if ((pp = page_create_va(vp, off, 2365 PAGESIZE, PG_WAIT, seg, addr)) == NULL) { 2366 cmn_err(CE_WARN, "ud_getpage_miss: page_create"); 2367 return (EINVAL); 2368 } 2369 io_len = PAGESIZE; 2370 } else { 2371 pp = pvn_read_kluster(vp, off, seg, addr, &io_off, 2372 &io_len, off, PAGESIZE, 0); 2373 2374 /* 2375 * Some other thread has entered the page. 2376 * ud_getpage will retry page_lookup. 2377 */ 2378 if (pp == NULL) { 2379 return (0); 2380 } 2381 2382 /* 2383 * Fill the page with as much data as we can from the file. 2384 */ 2385 err = ud_page_fill(ip, pp, off, B_READ, &pgoff); 2386 if (err) { 2387 pvn_read_done(pp, B_ERROR); 2388 return (err); 2389 } 2390 2391 /* 2392 * XXX ??? ufs has io_len instead of pgoff below 2393 */ 2394 ip->i_nextrio = off + ((pgoff + PAGESIZE - 1) & PAGEMASK); 2395 2396 /* 2397 * If the file access is sequential, initiate read ahead 2398 * of the next cluster. 2399 */ 2400 if (seq && ip->i_nextrio < ip->i_size) { 2401 ud_getpage_ra(vp, off, seg, addr); 2402 } 2403 } 2404 2405 outmiss: 2406 pvn_plist_init(pp, pl, plsz, (offset_t)off, io_len, rw); 2407 return (err); 2408 } 2409 2410 /* ARGSUSED */ 2411 void 2412 ud_getpage_ra(struct vnode *vp, 2413 u_offset_t off, struct seg *seg, caddr_t addr) 2414 { 2415 page_t *pp; 2416 size_t io_len; 2417 struct ud_inode *ip = VTOI(vp); 2418 u_offset_t io_off = ip->i_nextrio, pgoff; 2419 caddr_t addr2 = addr + (io_off - off); 2420 daddr_t bn; 2421 int32_t contig = 0; 2422 2423 /* 2424 * Is this test needed? 2425 */ 2426 2427 if (addr2 >= seg->s_base + seg->s_size) { 2428 return; 2429 } 2430 2431 contig = 0; 2432 if (ud_bmap_read(ip, io_off, &bn, &contig) != 0 || bn == UDF_HOLE) { 2433 return; 2434 } 2435 2436 pp = pvn_read_kluster(vp, io_off, seg, addr2, 2437 &io_off, &io_len, io_off, PAGESIZE, 1); 2438 2439 /* 2440 * Some other thread has entered the page. 2441 * So no read head done here (ie we will have to and wait 2442 * for the read when needed). 2443 */ 2444 2445 if (pp == NULL) { 2446 return; 2447 } 2448 2449 (void) ud_page_fill(ip, pp, io_off, (B_READ|B_ASYNC), &pgoff); 2450 ip->i_nextrio = io_off + ((pgoff + PAGESIZE - 1) & PAGEMASK); 2451 } 2452 2453 int 2454 ud_page_fill(struct ud_inode *ip, page_t *pp, u_offset_t off, 2455 uint32_t bflgs, u_offset_t *pg_off) 2456 { 2457 daddr_t bn; 2458 struct buf *bp; 2459 caddr_t kaddr, caddr; 2460 int32_t error = 0, contig = 0, multi_io = 0; 2461 int32_t lbsize = ip->i_udf->udf_lbsize; 2462 int32_t lbmask = ip->i_udf->udf_lbmask; 2463 uint64_t isize; 2464 2465 isize = (ip->i_size + lbmask) & (~lbmask); 2466 if (ip->i_desc_type == ICB_FLAG_ONE_AD) { 2467 2468 /* 2469 * Embedded file read file_entry 2470 * from buffer cache and copy the required 2471 * portions 2472 */ 2473 bp = ud_bread(ip->i_dev, 2474 ip->i_icb_lbano << ip->i_udf->udf_l2d_shift, lbsize); 2475 if ((bp->b_error == 0) && 2476 (bp->b_resid == 0)) { 2477 2478 caddr = bp->b_un.b_addr + ip->i_data_off; 2479 2480 /* 2481 * mapin to kvm 2482 */ 2483 kaddr = (caddr_t)ppmapin(pp, 2484 PROT_READ | PROT_WRITE, (caddr_t)-1); 2485 (void) kcopy(caddr, kaddr, ip->i_size); 2486 2487 /* 2488 * mapout of kvm 2489 */ 2490 ppmapout(kaddr); 2491 } 2492 brelse(bp); 2493 contig = ip->i_size; 2494 } else { 2495 2496 /* 2497 * Get the continuous size and block number 2498 * at offset "off" 2499 */ 2500 if (error = ud_bmap_read(ip, off, &bn, &contig)) 2501 goto out; 2502 contig = MIN(contig, PAGESIZE); 2503 contig = (contig + lbmask) & (~lbmask); 2504 2505 /* 2506 * Zero part of the page which we are not 2507 * going to read from the disk. 2508 */ 2509 2510 if (bn == UDF_HOLE) { 2511 2512 /* 2513 * This is a HOLE. Just zero out 2514 * the page 2515 */ 2516 if (((off + contig) == isize) || 2517 (contig == PAGESIZE)) { 2518 pagezero(pp->p_prev, 0, PAGESIZE); 2519 goto out; 2520 } 2521 } 2522 2523 if (contig < PAGESIZE) { 2524 uint64_t count; 2525 2526 count = isize - off; 2527 if (contig != count) { 2528 multi_io = 1; 2529 contig = (int32_t)(MIN(count, PAGESIZE)); 2530 } else { 2531 pagezero(pp->p_prev, contig, PAGESIZE - contig); 2532 } 2533 } 2534 2535 /* 2536 * Get a bp and initialize it 2537 */ 2538 bp = pageio_setup(pp, contig, ip->i_devvp, bflgs); 2539 ASSERT(bp != NULL); 2540 2541 bp->b_edev = ip->i_dev; 2542 bp->b_dev = cmpdev(ip->i_dev); 2543 bp->b_blkno = bn; 2544 bp->b_un.b_addr = 0; 2545 bp->b_file = ip->i_vnode; 2546 2547 /* 2548 * Start I/O 2549 */ 2550 if (multi_io == 0) { 2551 2552 /* 2553 * Single I/O is sufficient for this page 2554 */ 2555 (void) bdev_strategy(bp); 2556 } else { 2557 2558 /* 2559 * We need to do the I/O in 2560 * piece's 2561 */ 2562 error = ud_multi_strat(ip, pp, bp, off); 2563 if (error != 0) { 2564 goto out; 2565 } 2566 } 2567 if ((bflgs & B_ASYNC) == 0) { 2568 2569 /* 2570 * Wait for i/o to complete. 2571 */ 2572 2573 error = biowait(bp); 2574 pageio_done(bp); 2575 if (error) { 2576 goto out; 2577 } 2578 } 2579 } 2580 if ((off + contig) >= ip->i_size) { 2581 contig = ip->i_size - off; 2582 } 2583 2584 out: 2585 *pg_off = contig; 2586 return (error); 2587 } 2588 2589 int32_t 2590 ud_putpages(struct vnode *vp, offset_t off, 2591 size_t len, int32_t flags, struct cred *cr) 2592 { 2593 struct ud_inode *ip; 2594 page_t *pp; 2595 u_offset_t io_off; 2596 size_t io_len; 2597 u_offset_t eoff; 2598 int32_t err = 0; 2599 int32_t dolock; 2600 2601 ud_printf("ud_putpages\n"); 2602 2603 if (vp->v_count == 0) { 2604 cmn_err(CE_WARN, "ud_putpages: bad v_count"); 2605 return (EINVAL); 2606 } 2607 2608 ip = VTOI(vp); 2609 2610 /* 2611 * Acquire the readers/write inode lock before locking 2612 * any pages in this inode. 2613 * The inode lock is held during i/o. 2614 */ 2615 if (len == 0) { 2616 mutex_enter(&ip->i_tlock); 2617 ip->i_delayoff = ip->i_delaylen = 0; 2618 mutex_exit(&ip->i_tlock); 2619 } 2620 dolock = (rw_owner(&ip->i_contents) != curthread); 2621 if (dolock) { 2622 rw_enter(&ip->i_contents, RW_READER); 2623 } 2624 2625 if (!vn_has_cached_data(vp)) { 2626 if (dolock) { 2627 rw_exit(&ip->i_contents); 2628 } 2629 return (0); 2630 } 2631 2632 if (len == 0) { 2633 /* 2634 * Search the entire vp list for pages >= off. 2635 */ 2636 err = pvn_vplist_dirty(vp, (u_offset_t)off, ud_putapage, 2637 flags, cr); 2638 } else { 2639 /* 2640 * Loop over all offsets in the range looking for 2641 * pages to deal with. 2642 */ 2643 if ((eoff = blkroundup(ip->i_udf, ip->i_size)) != 0) { 2644 eoff = MIN(off + len, eoff); 2645 } else { 2646 eoff = off + len; 2647 } 2648 2649 for (io_off = off; io_off < eoff; io_off += io_len) { 2650 /* 2651 * If we are not invalidating, synchronously 2652 * freeing or writing pages, use the routine 2653 * page_lookup_nowait() to prevent reclaiming 2654 * them from the free list. 2655 */ 2656 if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) { 2657 pp = page_lookup(vp, io_off, 2658 (flags & (B_INVAL | B_FREE)) ? 2659 SE_EXCL : SE_SHARED); 2660 } else { 2661 pp = page_lookup_nowait(vp, io_off, 2662 (flags & B_FREE) ? SE_EXCL : SE_SHARED); 2663 } 2664 2665 if (pp == NULL || pvn_getdirty(pp, flags) == 0) { 2666 io_len = PAGESIZE; 2667 } else { 2668 2669 err = ud_putapage(vp, pp, 2670 &io_off, &io_len, flags, cr); 2671 if (err != 0) { 2672 break; 2673 } 2674 /* 2675 * "io_off" and "io_len" are returned as 2676 * the range of pages we actually wrote. 2677 * This allows us to skip ahead more quickly 2678 * since several pages may've been dealt 2679 * with by this iteration of the loop. 2680 */ 2681 } 2682 } 2683 } 2684 if (err == 0 && off == 0 && (len == 0 || len >= ip->i_size)) { 2685 /* 2686 * We have just sync'ed back all the pages on 2687 * the inode, turn off the IMODTIME flag. 2688 */ 2689 mutex_enter(&ip->i_tlock); 2690 ip->i_flag &= ~IMODTIME; 2691 mutex_exit(&ip->i_tlock); 2692 } 2693 if (dolock) { 2694 rw_exit(&ip->i_contents); 2695 } 2696 return (err); 2697 } 2698 2699 /* ARGSUSED */ 2700 int32_t 2701 ud_putapage(struct vnode *vp, 2702 page_t *pp, u_offset_t *offp, 2703 size_t *lenp, int32_t flags, struct cred *cr) 2704 { 2705 daddr_t bn; 2706 size_t io_len; 2707 struct ud_inode *ip; 2708 int32_t error = 0, contig, multi_io = 0; 2709 struct udf_vfs *udf_vfsp; 2710 u_offset_t off, io_off; 2711 caddr_t kaddr, caddr; 2712 struct buf *bp = NULL; 2713 int32_t lbmask; 2714 uint64_t isize; 2715 uint16_t crc_len; 2716 struct file_entry *fe; 2717 2718 ud_printf("ud_putapage\n"); 2719 2720 ip = VTOI(vp); 2721 ASSERT(ip); 2722 ASSERT(RW_LOCK_HELD(&ip->i_contents)); 2723 lbmask = ip->i_udf->udf_lbmask; 2724 isize = (ip->i_size + lbmask) & (~lbmask); 2725 2726 udf_vfsp = ip->i_udf; 2727 ASSERT(udf_vfsp->udf_flags & UDF_FL_RW); 2728 2729 /* 2730 * If the modified time on the inode has not already been 2731 * set elsewhere (e.g. for write/setattr) we set the time now. 2732 * This gives us approximate modified times for mmap'ed files 2733 * which are modified via stores in the user address space. 2734 */ 2735 if (((ip->i_flag & IMODTIME) == 0) || (flags & B_FORCE)) { 2736 mutex_enter(&ip->i_tlock); 2737 ip->i_flag |= IUPD; 2738 ITIMES_NOLOCK(ip); 2739 mutex_exit(&ip->i_tlock); 2740 } 2741 2742 2743 /* 2744 * Align the request to a block boundry (for old file systems), 2745 * and go ask bmap() how contiguous things are for this file. 2746 */ 2747 off = pp->p_offset & ~(offset_t)lbmask; 2748 /* block align it */ 2749 2750 2751 if (ip->i_desc_type == ICB_FLAG_ONE_AD) { 2752 ASSERT(ip->i_size <= ip->i_max_emb); 2753 2754 pp = pvn_write_kluster(vp, pp, &io_off, 2755 &io_len, off, PAGESIZE, flags); 2756 if (io_len == 0) { 2757 io_len = PAGESIZE; 2758 } 2759 2760 bp = ud_bread(ip->i_dev, 2761 ip->i_icb_lbano << udf_vfsp->udf_l2d_shift, 2762 udf_vfsp->udf_lbsize); 2763 fe = (struct file_entry *)bp->b_un.b_addr; 2764 if ((bp->b_flags & B_ERROR) || 2765 (ud_verify_tag_and_desc(&fe->fe_tag, UD_FILE_ENTRY, 2766 ip->i_icb_block, 2767 1, udf_vfsp->udf_lbsize) != 0)) { 2768 if (pp != NULL) 2769 pvn_write_done(pp, B_ERROR | B_WRITE | flags); 2770 if (bp->b_flags & B_ERROR) { 2771 error = EIO; 2772 } else { 2773 error = EINVAL; 2774 } 2775 brelse(bp); 2776 return (error); 2777 } 2778 if ((bp->b_error == 0) && 2779 (bp->b_resid == 0)) { 2780 2781 caddr = bp->b_un.b_addr + ip->i_data_off; 2782 kaddr = (caddr_t)ppmapin(pp, 2783 PROT_READ | PROT_WRITE, (caddr_t)-1); 2784 (void) kcopy(kaddr, caddr, ip->i_size); 2785 ppmapout(kaddr); 2786 } 2787 crc_len = offsetof(struct file_entry, fe_spec) + 2788 SWAP_32(fe->fe_len_ear); 2789 crc_len += ip->i_size; 2790 ud_make_tag(ip->i_udf, &fe->fe_tag, 2791 UD_FILE_ENTRY, ip->i_icb_block, crc_len); 2792 2793 bwrite(bp); 2794 2795 if (flags & B_ASYNC) { 2796 pvn_write_done(pp, flags); 2797 } 2798 contig = ip->i_size; 2799 } else { 2800 2801 if (error = ud_bmap_read(ip, off, &bn, &contig)) { 2802 goto out; 2803 } 2804 contig = MIN(contig, PAGESIZE); 2805 contig = (contig + lbmask) & (~lbmask); 2806 2807 if (contig < PAGESIZE) { 2808 uint64_t count; 2809 2810 count = isize - off; 2811 if (contig != count) { 2812 multi_io = 1; 2813 contig = (int32_t)(MIN(count, PAGESIZE)); 2814 } 2815 } 2816 2817 if ((off + contig) > isize) { 2818 contig = isize - off; 2819 } 2820 2821 if (contig > PAGESIZE) { 2822 if (contig & PAGEOFFSET) { 2823 contig &= PAGEMASK; 2824 } 2825 } 2826 2827 pp = pvn_write_kluster(vp, pp, &io_off, 2828 &io_len, off, contig, flags); 2829 if (io_len == 0) { 2830 io_len = PAGESIZE; 2831 } 2832 2833 bp = pageio_setup(pp, contig, ip->i_devvp, B_WRITE | flags); 2834 ASSERT(bp != NULL); 2835 2836 bp->b_edev = ip->i_dev; 2837 bp->b_dev = cmpdev(ip->i_dev); 2838 bp->b_blkno = bn; 2839 bp->b_un.b_addr = 0; 2840 bp->b_file = vp; 2841 bp->b_offset = (offset_t)off; 2842 2843 2844 /* 2845 * write throttle 2846 */ 2847 ASSERT(bp->b_iodone == NULL); 2848 bp->b_iodone = ud_iodone; 2849 mutex_enter(&ip->i_tlock); 2850 ip->i_writes += bp->b_bcount; 2851 mutex_exit(&ip->i_tlock); 2852 2853 if (multi_io == 0) { 2854 2855 (void) bdev_strategy(bp); 2856 } else { 2857 error = ud_multi_strat(ip, pp, bp, off); 2858 if (error != 0) { 2859 goto out; 2860 } 2861 } 2862 2863 if ((flags & B_ASYNC) == 0) { 2864 /* 2865 * Wait for i/o to complete. 2866 */ 2867 error = biowait(bp); 2868 pageio_done(bp); 2869 } 2870 } 2871 2872 if ((flags & B_ASYNC) == 0) { 2873 pvn_write_done(pp, ((error) ? B_ERROR : 0) | B_WRITE | flags); 2874 } 2875 2876 pp = NULL; 2877 2878 out: 2879 if (error != 0 && pp != NULL) { 2880 pvn_write_done(pp, B_ERROR | B_WRITE | flags); 2881 } 2882 2883 if (offp) { 2884 *offp = io_off; 2885 } 2886 if (lenp) { 2887 *lenp = io_len; 2888 } 2889 2890 return (error); 2891 } 2892 2893 2894 int32_t 2895 ud_iodone(struct buf *bp) 2896 { 2897 struct ud_inode *ip; 2898 2899 ASSERT((bp->b_pages->p_vnode != NULL) && !(bp->b_flags & B_READ)); 2900 2901 bp->b_iodone = NULL; 2902 2903 ip = VTOI(bp->b_pages->p_vnode); 2904 2905 mutex_enter(&ip->i_tlock); 2906 if (ip->i_writes >= ud_LW) { 2907 if ((ip->i_writes -= bp->b_bcount) <= ud_LW) { 2908 if (ud_WRITES) { 2909 cv_broadcast(&ip->i_wrcv); /* wake all up */ 2910 } 2911 } 2912 } else { 2913 ip->i_writes -= bp->b_bcount; 2914 } 2915 mutex_exit(&ip->i_tlock); 2916 iodone(bp); 2917 return (0); 2918 } 2919 2920 /* ARGSUSED3 */ 2921 int32_t 2922 ud_rdip(struct ud_inode *ip, struct uio *uio, int32_t ioflag, cred_t *cr) 2923 { 2924 struct vnode *vp; 2925 struct udf_vfs *udf_vfsp; 2926 krw_t rwtype; 2927 caddr_t base; 2928 uint32_t flags; 2929 int32_t error, n, on, mapon, dofree; 2930 u_offset_t off; 2931 long oresid = uio->uio_resid; 2932 2933 ASSERT(RW_LOCK_HELD(&ip->i_contents)); 2934 if ((ip->i_type != VREG) && 2935 (ip->i_type != VDIR) && 2936 (ip->i_type != VLNK)) { 2937 return (EIO); 2938 } 2939 2940 if (uio->uio_loffset > MAXOFFSET_T) { 2941 return (0); 2942 } 2943 2944 if ((uio->uio_loffset < (offset_t)0) || 2945 ((uio->uio_loffset + uio->uio_resid) < 0)) { 2946 return (EINVAL); 2947 } 2948 if (uio->uio_resid == 0) { 2949 return (0); 2950 } 2951 2952 vp = ITOV(ip); 2953 udf_vfsp = ip->i_udf; 2954 mutex_enter(&ip->i_tlock); 2955 ip->i_flag |= IACC; 2956 mutex_exit(&ip->i_tlock); 2957 2958 rwtype = (rw_write_held(&ip->i_contents)?RW_WRITER:RW_READER); 2959 2960 do { 2961 offset_t diff; 2962 u_offset_t uoff = uio->uio_loffset; 2963 off = uoff & (offset_t)MAXBMASK; 2964 mapon = (int)(uoff & (offset_t)MAXBOFFSET); 2965 on = (int)blkoff(udf_vfsp, uoff); 2966 n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid); 2967 2968 diff = ip->i_size - uoff; 2969 2970 if (diff <= (offset_t)0) { 2971 error = 0; 2972 goto out; 2973 } 2974 if (diff < (offset_t)n) { 2975 n = (int)diff; 2976 } 2977 dofree = ud_freebehind && 2978 ip->i_nextr == (off & PAGEMASK) && 2979 off > ud_smallfile; 2980 2981 if (rwtype == RW_READER) { 2982 rw_exit(&ip->i_contents); 2983 } 2984 2985 base = segmap_getmapflt(segkmap, vp, (off + mapon), 2986 (uint32_t)n, 1, S_READ); 2987 error = uiomove(base + mapon, (long)n, UIO_READ, uio); 2988 2989 flags = 0; 2990 if (!error) { 2991 /* 2992 * If read a whole block, or read to eof, 2993 * won't need this buffer again soon. 2994 */ 2995 if (n + on == MAXBSIZE && ud_freebehind && dofree && 2996 freemem < lotsfree + pages_before_pager) { 2997 flags = SM_FREE | SM_DONTNEED |SM_ASYNC; 2998 } 2999 /* 3000 * In POSIX SYNC (FSYNC and FDSYNC) read mode, 3001 * we want to make sure that the page which has 3002 * been read, is written on disk if it is dirty. 3003 * And corresponding indirect blocks should also 3004 * be flushed out. 3005 */ 3006 if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) { 3007 flags &= ~SM_ASYNC; 3008 flags |= SM_WRITE; 3009 } 3010 error = segmap_release(segkmap, base, flags); 3011 } else { 3012 (void) segmap_release(segkmap, base, flags); 3013 } 3014 3015 if (rwtype == RW_READER) { 3016 rw_enter(&ip->i_contents, rwtype); 3017 } 3018 } while (error == 0 && uio->uio_resid > 0 && n != 0); 3019 out: 3020 /* 3021 * Inode is updated according to this table if FRSYNC is set. 3022 * 3023 * FSYNC FDSYNC(posix.4) 3024 * -------------------------- 3025 * always IATTCHG|IBDWRITE 3026 */ 3027 if (ioflag & FRSYNC) { 3028 if ((ioflag & FSYNC) || 3029 ((ioflag & FDSYNC) && 3030 (ip->i_flag & (IATTCHG|IBDWRITE)))) { 3031 rw_exit(&ip->i_contents); 3032 rw_enter(&ip->i_contents, RW_WRITER); 3033 ud_iupdat(ip, 1); 3034 } 3035 } 3036 /* 3037 * If we've already done a partial read, terminate 3038 * the read but return no error. 3039 */ 3040 if (oresid != uio->uio_resid) { 3041 error = 0; 3042 } 3043 ITIMES(ip); 3044 3045 return (error); 3046 } 3047 3048 int32_t 3049 ud_wrip(struct ud_inode *ip, struct uio *uio, int ioflag, struct cred *cr) 3050 { 3051 caddr_t base; 3052 struct vnode *vp; 3053 struct udf_vfs *udf_vfsp; 3054 uint32_t flags; 3055 int32_t error = 0, iupdat_flag, n, on, mapon, i_size_changed = 0; 3056 int32_t pagecreate, newpage; 3057 uint64_t old_i_size; 3058 u_offset_t off; 3059 long start_resid = uio->uio_resid, premove_resid; 3060 rlim64_t limit = uio->uio_limit; 3061 3062 3063 ASSERT(RW_WRITE_HELD(&ip->i_contents)); 3064 if ((ip->i_type != VREG) && 3065 (ip->i_type != VDIR) && 3066 (ip->i_type != VLNK)) { 3067 return (EIO); 3068 } 3069 3070 if (uio->uio_loffset >= MAXOFFSET_T) { 3071 return (EFBIG); 3072 } 3073 /* 3074 * see udf_l_pathconf 3075 */ 3076 if (limit > (((uint64_t)1 << 40) - 1)) { 3077 limit = ((uint64_t)1 << 40) - 1; 3078 } 3079 if (uio->uio_loffset >= limit) { 3080 proc_t *p = ttoproc(curthread); 3081 3082 mutex_enter(&p->p_lock); 3083 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls, 3084 p, RCA_UNSAFE_SIGINFO); 3085 mutex_exit(&p->p_lock); 3086 return (EFBIG); 3087 } 3088 if ((uio->uio_loffset < (offset_t)0) || 3089 ((uio->uio_loffset + uio->uio_resid) < 0)) { 3090 return (EINVAL); 3091 } 3092 if (uio->uio_resid == 0) { 3093 return (0); 3094 } 3095 3096 mutex_enter(&ip->i_tlock); 3097 ip->i_flag |= INOACC; 3098 3099 if (ioflag & (FSYNC | FDSYNC)) { 3100 ip->i_flag |= ISYNC; 3101 iupdat_flag = 1; 3102 } 3103 mutex_exit(&ip->i_tlock); 3104 3105 udf_vfsp = ip->i_udf; 3106 vp = ITOV(ip); 3107 3108 do { 3109 u_offset_t uoff = uio->uio_loffset; 3110 off = uoff & (offset_t)MAXBMASK; 3111 mapon = (int)(uoff & (offset_t)MAXBOFFSET); 3112 on = (int)blkoff(udf_vfsp, uoff); 3113 n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid); 3114 3115 if (ip->i_type == VREG && uoff + n >= limit) { 3116 if (uoff >= limit) { 3117 error = EFBIG; 3118 goto out; 3119 } 3120 n = (int)(limit - (rlim64_t)uoff); 3121 } 3122 if (uoff + n > ip->i_size) { 3123 /* 3124 * We are extending the length of the file. 3125 * bmap is used so that we are sure that 3126 * if we need to allocate new blocks, that it 3127 * is done here before we up the file size. 3128 */ 3129 error = ud_bmap_write(ip, uoff, 3130 (int)(on + n), mapon == 0, cr); 3131 if (error) { 3132 break; 3133 } 3134 i_size_changed = 1; 3135 old_i_size = ip->i_size; 3136 ip->i_size = uoff + n; 3137 /* 3138 * If we are writing from the beginning of 3139 * the mapping, we can just create the 3140 * pages without having to read them. 3141 */ 3142 pagecreate = (mapon == 0); 3143 } else if (n == MAXBSIZE) { 3144 /* 3145 * Going to do a whole mappings worth, 3146 * so we can just create the pages w/o 3147 * having to read them in. But before 3148 * we do that, we need to make sure any 3149 * needed blocks are allocated first. 3150 */ 3151 error = ud_bmap_write(ip, uoff, 3152 (int)(on + n), 1, cr); 3153 if (error) { 3154 break; 3155 } 3156 pagecreate = 1; 3157 } else { 3158 pagecreate = 0; 3159 } 3160 3161 rw_exit(&ip->i_contents); 3162 3163 /* 3164 * Touch the page and fault it in if it is not in 3165 * core before segmap_getmapflt can lock it. This 3166 * is to avoid the deadlock if the buffer is mapped 3167 * to the same file through mmap which we want to 3168 * write to. 3169 */ 3170 uio_prefaultpages((long)n, uio); 3171 3172 base = segmap_getmapflt(segkmap, vp, (off + mapon), 3173 (uint32_t)n, !pagecreate, S_WRITE); 3174 3175 /* 3176 * segmap_pagecreate() returns 1 if it calls 3177 * page_create_va() to allocate any pages. 3178 */ 3179 newpage = 0; 3180 if (pagecreate) { 3181 newpage = segmap_pagecreate(segkmap, base, 3182 (size_t)n, 0); 3183 } 3184 3185 premove_resid = uio->uio_resid; 3186 error = uiomove(base + mapon, (long)n, UIO_WRITE, uio); 3187 3188 if (pagecreate && 3189 uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) { 3190 /* 3191 * We created pages w/o initializing them completely, 3192 * thus we need to zero the part that wasn't set up. 3193 * This happens on most EOF write cases and if 3194 * we had some sort of error during the uiomove. 3195 */ 3196 int nzero, nmoved; 3197 3198 nmoved = (int)(uio->uio_loffset - (off + mapon)); 3199 ASSERT(nmoved >= 0 && nmoved <= n); 3200 nzero = roundup(on + n, PAGESIZE) - nmoved; 3201 ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE); 3202 (void) kzero(base + mapon + nmoved, (uint32_t)nzero); 3203 } 3204 3205 /* 3206 * Unlock the pages allocated by page_create_va() 3207 * in segmap_pagecreate() 3208 */ 3209 if (newpage) { 3210 segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE); 3211 } 3212 3213 if (error) { 3214 /* 3215 * If we failed on a write, we may have already 3216 * allocated file blocks as well as pages. It's 3217 * hard to undo the block allocation, but we must 3218 * be sure to invalidate any pages that may have 3219 * been allocated. 3220 */ 3221 (void) segmap_release(segkmap, base, SM_INVAL); 3222 } else { 3223 flags = 0; 3224 /* 3225 * Force write back for synchronous write cases. 3226 */ 3227 if ((ioflag & (FSYNC|FDSYNC)) || ip->i_type == VDIR) { 3228 /* 3229 * If the sticky bit is set but the 3230 * execute bit is not set, we do a 3231 * synchronous write back and free 3232 * the page when done. We set up swap 3233 * files to be handled this way to 3234 * prevent servers from keeping around 3235 * the client's swap pages too long. 3236 * XXX - there ought to be a better way. 3237 */ 3238 if (IS_SWAPVP(vp)) { 3239 flags = SM_WRITE | SM_FREE | 3240 SM_DONTNEED; 3241 iupdat_flag = 0; 3242 } else { 3243 flags = SM_WRITE; 3244 } 3245 } else if (((mapon + n) == MAXBSIZE) || 3246 IS_SWAPVP(vp)) { 3247 /* 3248 * Have written a whole block. 3249 * Start an asynchronous write and 3250 * mark the buffer to indicate that 3251 * it won't be needed again soon. 3252 */ 3253 flags = SM_WRITE |SM_ASYNC | SM_DONTNEED; 3254 } 3255 error = segmap_release(segkmap, base, flags); 3256 3257 /* 3258 * If the operation failed and is synchronous, 3259 * then we need to unwind what uiomove() last 3260 * did so we can potentially return an error to 3261 * the caller. If this write operation was 3262 * done in two pieces and the first succeeded, 3263 * then we won't return an error for the second 3264 * piece that failed. However, we only want to 3265 * return a resid value that reflects what was 3266 * really done. 3267 * 3268 * Failures for non-synchronous operations can 3269 * be ignored since the page subsystem will 3270 * retry the operation until it succeeds or the 3271 * file system is unmounted. 3272 */ 3273 if (error) { 3274 if ((ioflag & (FSYNC | FDSYNC)) || 3275 ip->i_type == VDIR) { 3276 uio->uio_resid = premove_resid; 3277 } else { 3278 error = 0; 3279 } 3280 } 3281 } 3282 3283 /* 3284 * Re-acquire contents lock. 3285 */ 3286 rw_enter(&ip->i_contents, RW_WRITER); 3287 /* 3288 * If the uiomove() failed or if a synchronous 3289 * page push failed, fix up i_size. 3290 */ 3291 if (error) { 3292 if (i_size_changed) { 3293 /* 3294 * The uiomove failed, and we 3295 * allocated blocks,so get rid 3296 * of them. 3297 */ 3298 (void) ud_itrunc(ip, old_i_size, 0, cr); 3299 } 3300 } else { 3301 /* 3302 * XXX - Can this be out of the loop? 3303 */ 3304 ip->i_flag |= IUPD | ICHG; 3305 if (i_size_changed) { 3306 ip->i_flag |= IATTCHG; 3307 } 3308 if ((ip->i_perm & (IEXEC | (IEXEC >> 5) | 3309 (IEXEC >> 10))) != 0 && 3310 (ip->i_char & (ISUID | ISGID)) != 0 && 3311 secpolicy_vnode_setid_retain(cr, 3312 (ip->i_char & ISUID) != 0 && ip->i_uid == 0) != 0) { 3313 /* 3314 * Clear Set-UID & Set-GID bits on 3315 * successful write if not privileged 3316 * and at least one of the execute bits 3317 * is set. If we always clear Set-GID, 3318 * mandatory file and record locking is 3319 * unuseable. 3320 */ 3321 ip->i_char &= ~(ISUID | ISGID); 3322 } 3323 } 3324 } while (error == 0 && uio->uio_resid > 0 && n != 0); 3325 3326 out: 3327 /* 3328 * Inode is updated according to this table - 3329 * 3330 * FSYNC FDSYNC(posix.4) 3331 * -------------------------- 3332 * always@ IATTCHG|IBDWRITE 3333 * 3334 * @ - If we are doing synchronous write the only time we should 3335 * not be sync'ing the ip here is if we have the stickyhack 3336 * activated, the file is marked with the sticky bit and 3337 * no exec bit, the file length has not been changed and 3338 * no new blocks have been allocated during this write. 3339 */ 3340 if ((ip->i_flag & ISYNC) != 0) { 3341 /* 3342 * we have eliminated nosync 3343 */ 3344 if ((ip->i_flag & (IATTCHG|IBDWRITE)) || 3345 ((ioflag & FSYNC) && iupdat_flag)) { 3346 ud_iupdat(ip, 1); 3347 } 3348 } 3349 3350 /* 3351 * If we've already done a partial-write, terminate 3352 * the write but return no error. 3353 */ 3354 if (start_resid != uio->uio_resid) { 3355 error = 0; 3356 } 3357 ip->i_flag &= ~(INOACC | ISYNC); 3358 ITIMES_NOLOCK(ip); 3359 3360 return (error); 3361 } 3362 3363 int32_t 3364 ud_multi_strat(struct ud_inode *ip, 3365 page_t *pp, struct buf *bp, u_offset_t start) 3366 { 3367 daddr_t bn; 3368 int32_t error = 0, io_count, contig, alloc_sz, i; 3369 uint32_t io_off; 3370 mio_master_t *mm = NULL; 3371 mio_slave_t *ms = NULL; 3372 struct buf *rbp; 3373 3374 ASSERT(!(start & PAGEOFFSET)); 3375 3376 /* 3377 * Figure out how many buffers to allocate 3378 */ 3379 io_count = 0; 3380 for (io_off = 0; io_off < bp->b_bcount; io_off += contig) { 3381 contig = 0; 3382 if (error = ud_bmap_read(ip, (u_offset_t)(start + io_off), 3383 &bn, &contig)) { 3384 goto end; 3385 } 3386 if (contig == 0) { 3387 goto end; 3388 } 3389 contig = MIN(contig, PAGESIZE - io_off); 3390 if (bn != UDF_HOLE) { 3391 io_count ++; 3392 } else { 3393 /* 3394 * HOLE 3395 */ 3396 if (bp->b_flags & B_READ) { 3397 3398 /* 3399 * This is a hole and is read 3400 * it should be filled with 0's 3401 */ 3402 pagezero(pp, io_off, contig); 3403 } 3404 } 3405 } 3406 3407 3408 if (io_count != 0) { 3409 3410 /* 3411 * Allocate memory for all the 3412 * required number of buffers 3413 */ 3414 alloc_sz = sizeof (mio_master_t) + 3415 (sizeof (mio_slave_t) * io_count); 3416 mm = (mio_master_t *)kmem_zalloc(alloc_sz, KM_SLEEP); 3417 if (mm == NULL) { 3418 error = ENOMEM; 3419 goto end; 3420 } 3421 3422 /* 3423 * initialize master 3424 */ 3425 mutex_init(&mm->mm_mutex, NULL, MUTEX_DEFAULT, NULL); 3426 mm->mm_size = alloc_sz; 3427 mm->mm_bp = bp; 3428 mm->mm_resid = 0; 3429 mm->mm_error = 0; 3430 mm->mm_index = master_index++; 3431 3432 ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t)); 3433 3434 /* 3435 * Initialize buffers 3436 */ 3437 io_count = 0; 3438 for (io_off = 0; io_off < bp->b_bcount; io_off += contig) { 3439 contig = 0; 3440 if (error = ud_bmap_read(ip, 3441 (u_offset_t)(start + io_off), 3442 &bn, &contig)) { 3443 goto end; 3444 } 3445 ASSERT(contig); 3446 if ((io_off + contig) > bp->b_bcount) { 3447 contig = bp->b_bcount - io_off; 3448 } 3449 if (bn != UDF_HOLE) { 3450 /* 3451 * Clone the buffer 3452 * and prepare to start I/O 3453 */ 3454 ms->ms_ptr = mm; 3455 bioinit(&ms->ms_buf); 3456 rbp = bioclone(bp, io_off, (size_t)contig, 3457 bp->b_edev, bn, ud_slave_done, 3458 &ms->ms_buf, KM_NOSLEEP); 3459 ASSERT(rbp == &ms->ms_buf); 3460 mm->mm_resid += contig; 3461 io_count++; 3462 ms ++; 3463 } 3464 } 3465 3466 /* 3467 * Start I/O's 3468 */ 3469 ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t)); 3470 for (i = 0; i < io_count; i++) { 3471 (void) bdev_strategy(&ms->ms_buf); 3472 ms ++; 3473 } 3474 } 3475 3476 end: 3477 if (error != 0) { 3478 bp->b_flags |= B_ERROR; 3479 bp->b_error = error; 3480 if (mm != NULL) { 3481 mutex_destroy(&mm->mm_mutex); 3482 kmem_free(mm, mm->mm_size); 3483 } 3484 } 3485 return (error); 3486 } 3487 3488 int32_t 3489 ud_slave_done(struct buf *bp) 3490 { 3491 mio_master_t *mm; 3492 int32_t resid; 3493 3494 ASSERT(SEMA_HELD(&bp->b_sem)); 3495 ASSERT((bp->b_flags & B_DONE) == 0); 3496 3497 mm = ((mio_slave_t *)bp)->ms_ptr; 3498 3499 /* 3500 * Propagate error and byte count info from slave struct to 3501 * the master struct 3502 */ 3503 mutex_enter(&mm->mm_mutex); 3504 if (bp->b_flags & B_ERROR) { 3505 3506 /* 3507 * If multiple slave buffers get 3508 * error we forget the old errors 3509 * this is ok because we any way 3510 * cannot return multiple errors 3511 */ 3512 mm->mm_error = bp->b_error; 3513 } 3514 mm->mm_resid -= bp->b_bcount; 3515 resid = mm->mm_resid; 3516 mutex_exit(&mm->mm_mutex); 3517 3518 /* 3519 * free up the resources allocated to cloned buffers. 3520 */ 3521 bp_mapout(bp); 3522 biofini(bp); 3523 3524 if (resid == 0) { 3525 3526 /* 3527 * This is the last I/O operation 3528 * clean up and return the original buffer 3529 */ 3530 if (mm->mm_error) { 3531 mm->mm_bp->b_flags |= B_ERROR; 3532 mm->mm_bp->b_error = mm->mm_error; 3533 } 3534 biodone(mm->mm_bp); 3535 mutex_destroy(&mm->mm_mutex); 3536 kmem_free(mm, mm->mm_size); 3537 } 3538 return (0); 3539 }