1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 /*
27 * Copyright 2015, Joyent, Inc.
28 */
29
30 #include <sys/types.h>
31 #include <sys/t_lock.h>
32 #include <sys/param.h>
33 #include <sys/time.h>
34 #include <sys/systm.h>
35 #include <sys/sysmacros.h>
36 #include <sys/resource.h>
37 #include <sys/signal.h>
38 #include <sys/cred.h>
39 #include <sys/user.h>
40 #include <sys/buf.h>
41 #include <sys/vfs.h>
42 #include <sys/vfs_opreg.h>
43 #include <sys/stat.h>
44 #include <sys/vnode.h>
45 #include <sys/mode.h>
46 #include <sys/proc.h>
47 #include <sys/disp.h>
48 #include <sys/file.h>
49 #include <sys/fcntl.h>
50 #include <sys/flock.h>
51 #include <sys/kmem.h>
52 #include <sys/uio.h>
53 #include <sys/dnlc.h>
54 #include <sys/conf.h>
55 #include <sys/errno.h>
56 #include <sys/mman.h>
57 #include <sys/fbuf.h>
58 #include <sys/pathname.h>
59 #include <sys/debug.h>
60 #include <sys/vmsystm.h>
61 #include <sys/cmn_err.h>
62 #include <sys/dirent.h>
63 #include <sys/errno.h>
64 #include <sys/modctl.h>
65 #include <sys/statvfs.h>
66 #include <sys/mount.h>
67 #include <sys/sunddi.h>
68 #include <sys/bootconf.h>
69 #include <sys/policy.h>
70
71 #include <vm/hat.h>
72 #include <vm/page.h>
73 #include <vm/pvn.h>
74 #include <vm/as.h>
75 #include <vm/seg.h>
76 #include <vm/seg_map.h>
77 #include <vm/seg_kmem.h>
78 #include <vm/seg_vn.h>
79 #include <vm/rm.h>
80 #include <vm/page.h>
81 #include <sys/swap.h>
82
83 #include <fs/fs_subr.h>
84
85 #include <sys/fs/udf_volume.h>
86 #include <sys/fs/udf_inode.h>
87
/*
 * Forward declarations of the static UDF vnode operation entry points.
 * These are the functions referenced by udf_vnodeops_template[] below.
 */
static int32_t udf_open(struct vnode **,
    int32_t, struct cred *, caller_context_t *);
static int32_t udf_close(struct vnode *,
    int32_t, int32_t, offset_t, struct cred *, caller_context_t *);
static int32_t udf_read(struct vnode *,
    struct uio *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_write(struct vnode *,
    struct uio *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_ioctl(struct vnode *,
    int32_t, intptr_t, int32_t, struct cred *, int32_t *,
    caller_context_t *);
static int32_t udf_getattr(struct vnode *,
    struct vattr *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_setattr(struct vnode *,
    struct vattr *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_access(struct vnode *,
    int32_t, int32_t, struct cred *, caller_context_t *);
static int32_t udf_lookup(struct vnode *,
    char *, struct vnode **, struct pathname *,
    int32_t, struct vnode *, struct cred *,
    caller_context_t *, int *, pathname_t *);
static int32_t udf_create(struct vnode *,
    char *, struct vattr *, enum vcexcl,
    int32_t, struct vnode **, struct cred *, int32_t,
    caller_context_t *, vsecattr_t *);
static int32_t udf_remove(struct vnode *,
    char *, struct cred *, caller_context_t *, int);
static int32_t udf_link(struct vnode *,
    struct vnode *, char *, struct cred *, caller_context_t *, int);
static int32_t udf_rename(struct vnode *,
    char *, struct vnode *, char *, struct cred *, caller_context_t *, int);
static int32_t udf_mkdir(struct vnode *,
    char *, struct vattr *, struct vnode **, struct cred *,
    caller_context_t *, int, vsecattr_t *);
static int32_t udf_rmdir(struct vnode *,
    char *, struct vnode *, struct cred *, caller_context_t *, int);
static int32_t udf_readdir(struct vnode *,
    struct uio *, struct cred *, int32_t *, caller_context_t *, int);
static int32_t udf_symlink(struct vnode *,
    char *, struct vattr *, char *, struct cred *, caller_context_t *, int);
static int32_t udf_readlink(struct vnode *,
    struct uio *, struct cred *, caller_context_t *);
static int32_t udf_fsync(struct vnode *,
    int32_t, struct cred *, caller_context_t *);
static void udf_inactive(struct vnode *,
    struct cred *, caller_context_t *);
static int32_t udf_fid(struct vnode *, struct fid *, caller_context_t *);
static int udf_rwlock(struct vnode *, int32_t, caller_context_t *);
static void udf_rwunlock(struct vnode *, int32_t, caller_context_t *);
static int32_t udf_seek(struct vnode *, offset_t, offset_t *,
    caller_context_t *);
static int32_t udf_frlock(struct vnode *, int32_t,
    struct flock64 *, int32_t, offset_t, struct flk_callback *, cred_t *,
    caller_context_t *);
static int32_t udf_space(struct vnode *, int32_t,
    struct flock64 *, int32_t, offset_t, cred_t *, caller_context_t *);
static int32_t udf_getpage(struct vnode *, offset_t,
    size_t, uint32_t *, struct page **, size_t,
    struct seg *, caddr_t, enum seg_rw, struct cred *, caller_context_t *);
static int32_t udf_putpage(struct vnode *, offset_t,
    size_t, int32_t, struct cred *, caller_context_t *);
static int32_t udf_map(struct vnode *, offset_t, struct as *,
    caddr_t *, size_t, uint8_t, uint8_t, uint32_t, struct cred *,
    caller_context_t *);
static int32_t udf_addmap(struct vnode *, offset_t, struct as *,
    caddr_t, size_t, uint8_t, uint8_t, uint32_t, struct cred *,
    caller_context_t *);
static int32_t udf_delmap(struct vnode *, offset_t, struct as *,
    caddr_t, size_t, uint32_t, uint32_t, uint32_t, struct cred *,
    caller_context_t *);
static int32_t udf_l_pathconf(struct vnode *, int32_t,
    ulong_t *, struct cred *, caller_context_t *);
static int32_t udf_pageio(struct vnode *, struct page *,
    u_offset_t, size_t, int32_t, struct cred *, caller_context_t *);
162
/*
 * Paging and I/O helper routines.  They are declared without a storage
 * class, so unlike the vnode entry points above they have external
 * linkage.
 */
int32_t ud_getpage_miss(struct vnode *, u_offset_t,
    size_t, struct seg *, caddr_t, page_t *pl[],
    size_t, enum seg_rw, int32_t);
void ud_getpage_ra(struct vnode *, u_offset_t, struct seg *, caddr_t);
int32_t ud_putpages(struct vnode *, offset_t, size_t, int32_t, struct cred *);
int32_t ud_page_fill(struct ud_inode *, page_t *,
    u_offset_t, uint32_t, u_offset_t *);
int32_t ud_iodone(struct buf *);
int32_t ud_rdip(struct ud_inode *, struct uio *, int32_t, cred_t *);
int32_t ud_wrip(struct ud_inode *, struct uio *, int32_t, cred_t *);
int32_t ud_multi_strat(struct ud_inode *, page_t *, struct buf *, u_offset_t);
int32_t ud_slave_done(struct buf *);
175
/*
 * Structures to control multiple IO operations to get or put pages
 * that are backed by discontiguous blocks. The master struct is
 * a dummy that holds the original bp from pageio_setup. The
 * slave struct holds the working bp's to do the actual IO. Once
 * all the slave IOs complete, the master is processed as if a single
 * IO op has completed.
 */
/*
 * NOTE(review): presumably the source of the mm_index debugging values
 * below; its users are not visible in this part of the file - confirm.
 */
uint32_t master_index = 0;

/* Bookkeeping shared by all slave IOs that belong to one original buf. */
typedef struct mio_master {
	kmutex_t	mm_mutex;	/* protect the fields below */
	int32_t		mm_size;	/* NOTE(review): meaning not shown here; confirm */
	buf_t		*mm_bp;		/* original bp */
	int32_t		mm_resid;	/* bytes remaining to transfer */
	int32_t		mm_error;	/* accumulated error from slaves */
	int32_t		mm_index;	/* XXX debugging */
} mio_master_t;

/* One working buf plus a back pointer to the master it reports to. */
typedef struct mio_slave {
	buf_t		ms_buf;		/* working buffer for this IO chunk */
	mio_master_t	*ms_ptr;	/* pointer to master */
} mio_slave_t;
198
/*
 * Vnode operations vector for UDF.
 * NOTE(review): presumably built from udf_vnodeops_template[] during
 * file system initialization; that code is not in this file section.
 */
struct vnodeops *udf_vnodeops;

/*
 * Table pairing each VOPNAME_* operation name with the UDF routine that
 * implements it.  VOPNAME_VNEVENT uses the generic fs_vnevent_support.
 * The table ends with a NULL, NULL entry.
 */
const fs_operation_def_t udf_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = udf_open },
	VOPNAME_CLOSE,		{ .vop_close = udf_close },
	VOPNAME_READ,		{ .vop_read = udf_read },
	VOPNAME_WRITE,		{ .vop_write = udf_write },
	VOPNAME_IOCTL,		{ .vop_ioctl = udf_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = udf_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = udf_setattr },
	VOPNAME_ACCESS,		{ .vop_access = udf_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = udf_lookup },
	VOPNAME_CREATE,		{ .vop_create = udf_create },
	VOPNAME_REMOVE,		{ .vop_remove = udf_remove },
	VOPNAME_LINK,		{ .vop_link = udf_link },
	VOPNAME_RENAME,		{ .vop_rename = udf_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = udf_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = udf_rmdir },
	VOPNAME_READDIR,	{ .vop_readdir = udf_readdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = udf_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = udf_readlink },
	VOPNAME_FSYNC,		{ .vop_fsync = udf_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = udf_inactive },
	VOPNAME_FID,		{ .vop_fid = udf_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = udf_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = udf_rwunlock },
	VOPNAME_SEEK,		{ .vop_seek = udf_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = udf_frlock },
	VOPNAME_SPACE,		{ .vop_space = udf_space },
	VOPNAME_GETPAGE,	{ .vop_getpage = udf_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = udf_putpage },
	VOPNAME_MAP,		{ .vop_map = udf_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = udf_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = udf_delmap },
	VOPNAME_PATHCONF,	{ .vop_pathconf = udf_l_pathconf },
	VOPNAME_PAGEIO,		{ .vop_pageio = udf_pageio },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL,			NULL
};
238
239 /* ARGSUSED */
240 static int32_t
241 udf_open(
242 struct vnode **vpp,
243 int32_t flag,
244 struct cred *cr,
245 caller_context_t *ct)
246 {
247 ud_printf("udf_open\n");
248
249 return (0);
250 }
251
252 /* ARGSUSED */
253 static int32_t
254 udf_close(
255 struct vnode *vp,
256 int32_t flag,
257 int32_t count,
258 offset_t offset,
259 struct cred *cr,
260 caller_context_t *ct)
261 {
262 struct ud_inode *ip = VTOI(vp);
263
264 ud_printf("udf_close\n");
265
266 ITIMES(ip);
267
268 cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
269 cleanshares(vp, ttoproc(curthread)->p_pid);
270
271 /*
272 * Push partially filled cluster at last close.
273 * ``last close'' is approximated because the dnlc
274 * may have a hold on the vnode.
275 */
276 if (vp->v_count <= 2 && vp->v_type != VBAD) {
277 struct ud_inode *ip = VTOI(vp);
278 if (ip->i_delaylen) {
279 (void) ud_putpages(vp, ip->i_delayoff, ip->i_delaylen,
280 B_ASYNC | B_FREE, cr);
281 ip->i_delaylen = 0;
282 }
283 }
284
285 return (0);
286 }
287
288 /* ARGSUSED */
289 static int32_t
290 udf_read(
291 struct vnode *vp,
292 struct uio *uiop,
293 int32_t ioflag,
294 struct cred *cr,
295 caller_context_t *ct)
296 {
297 struct ud_inode *ip = VTOI(vp);
298 int32_t error;
299
300 ud_printf("udf_read\n");
301
302 #ifdef __lock_lint
303 rw_enter(&ip->i_rwlock, RW_READER);
304 #endif
305
306 ASSERT(RW_READ_HELD(&ip->i_rwlock));
307
308 if (MANDLOCK(vp, ip->i_char)) {
309 /*
310 * udf_getattr ends up being called by chklock
311 */
312 error = chklock(vp, FREAD, uiop->uio_loffset,
313 uiop->uio_resid, uiop->uio_fmode, ct);
314 if (error) {
315 goto end;
316 }
317 }
318
319 rw_enter(&ip->i_contents, RW_READER);
320 error = ud_rdip(ip, uiop, ioflag, cr);
321 rw_exit(&ip->i_contents);
322
323 end:
324 #ifdef __lock_lint
325 rw_exit(&ip->i_rwlock);
326 #endif
327
328 return (error);
329 }
330
331
/*
 * Write throttling tunables used by udf_write().
 */
/* non-zero enables the throttle check in udf_write() */
int32_t ud_WRITES = 1;
/* udf_write() waits while ip->i_writes is greater than this value */
int32_t ud_HW = 96 * 1024;
/*
 * NOTE(review): presumably the low-water mark at which throttled writers
 * are woken; it is not referenced by udf_write() itself - confirm.
 */
int32_t ud_LW = 64 * 1024;
/* incremented each time udf_write() is about to wait on i_wrcv */
int32_t ud_throttles = 0;
336
337 /* ARGSUSED */
338 static int32_t
339 udf_write(
340 struct vnode *vp,
341 struct uio *uiop,
342 int32_t ioflag,
343 struct cred *cr,
344 caller_context_t *ct)
345 {
346 struct ud_inode *ip = VTOI(vp);
347 int32_t error = 0;
348
349 ud_printf("udf_write\n");
350
351 #ifdef __lock_lint
352 rw_enter(&ip->i_rwlock, RW_WRITER);
353 #endif
354
355 ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
356
357 if (MANDLOCK(vp, ip->i_char)) {
358 /*
359 * ud_getattr ends up being called by chklock
360 */
361 error = chklock(vp, FWRITE, uiop->uio_loffset,
362 uiop->uio_resid, uiop->uio_fmode, ct);
363 if (error) {
364 goto end;
365 }
366 }
367 /*
368 * Throttle writes.
369 */
370 mutex_enter(&ip->i_tlock);
371 if (ud_WRITES && (ip->i_writes > ud_HW)) {
372 while (ip->i_writes > ud_HW) {
373 ud_throttles++;
374 cv_wait(&ip->i_wrcv, &ip->i_tlock);
375 }
376 }
377 mutex_exit(&ip->i_tlock);
378
379 /*
380 * Write to the file
381 */
382 rw_enter(&ip->i_contents, RW_WRITER);
383 if ((ioflag & FAPPEND) != 0 && (ip->i_type == VREG)) {
384 /*
385 * In append mode start at end of file.
386 */
387 uiop->uio_loffset = ip->i_size;
388 }
389 error = ud_wrip(ip, uiop, ioflag, cr);
390 rw_exit(&ip->i_contents);
391
392 end:
393 #ifdef __lock_lint
394 rw_exit(&ip->i_rwlock);
395 #endif
396
397 return (error);
398 }
399
400 /* ARGSUSED */
401 static int32_t
402 udf_ioctl(
403 struct vnode *vp,
404 int32_t cmd,
405 intptr_t arg,
406 int32_t flag,
407 struct cred *cr,
408 int32_t *rvalp,
409 caller_context_t *ct)
410 {
411 return (ENOTTY);
412 }
413
414 /* ARGSUSED */
415 static int32_t
416 udf_getattr(
417 struct vnode *vp,
418 struct vattr *vap,
419 int32_t flags,
420 struct cred *cr,
421 caller_context_t *ct)
422 {
423 struct ud_inode *ip = VTOI(vp);
424
425 ud_printf("udf_getattr\n");
426
427 if (vap->va_mask == AT_SIZE) {
428 /*
429 * for performance, if only the size is requested don't bother
430 * with anything else.
431 */
432 vap->va_size = ip->i_size;
433 return (0);
434 }
435
436 rw_enter(&ip->i_contents, RW_READER);
437
438 vap->va_type = vp->v_type;
439 vap->va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char;
440
441 vap->va_uid = ip->i_uid;
442 vap->va_gid = ip->i_gid;
443 vap->va_fsid = ip->i_dev;
444 vap->va_nodeid = ip->i_icb_lbano;
445 vap->va_nlink = ip->i_nlink;
446 vap->va_size = ip->i_size;
447 vap->va_seq = ip->i_seq;
448 if (vp->v_type == VCHR || vp->v_type == VBLK) {
449 vap->va_rdev = ip->i_rdev;
450 } else {
451 vap->va_rdev = 0;
452 }
453
454 mutex_enter(&ip->i_tlock);
455 ITIMES_NOLOCK(ip); /* mark correct time in inode */
456 vap->va_atime.tv_sec = (time_t)ip->i_atime.tv_sec;
457 vap->va_atime.tv_nsec = ip->i_atime.tv_nsec;
458 vap->va_mtime.tv_sec = (time_t)ip->i_mtime.tv_sec;
459 vap->va_mtime.tv_nsec = ip->i_mtime.tv_nsec;
460 vap->va_ctime.tv_sec = (time_t)ip->i_ctime.tv_sec;
461 vap->va_ctime.tv_nsec = ip->i_ctime.tv_nsec;
462 mutex_exit(&ip->i_tlock);
463
464 switch (ip->i_type) {
465 case VBLK:
466 vap->va_blksize = MAXBSIZE;
467 break;
468 case VCHR:
469 vap->va_blksize = MAXBSIZE;
470 break;
471 default:
472 vap->va_blksize = ip->i_udf->udf_lbsize;
473 break;
474 }
475 vap->va_nblocks = ip->i_lbr << ip->i_udf->udf_l2d_shift;
476
477 rw_exit(&ip->i_contents);
478
479 return (0);
480 }
481
482 static int
483 ud_iaccess_vmode(void *ip, int mode, struct cred *cr)
484 {
485 return (ud_iaccess(ip, UD_UPERM2DPERM(mode), cr, 0));
486 }
487
488 /*ARGSUSED4*/
489 static int32_t
490 udf_setattr(
491 struct vnode *vp,
492 struct vattr *vap,
493 int32_t flags,
494 struct cred *cr,
495 caller_context_t *ct)
496 {
497 int32_t error = 0;
498 uint32_t mask = vap->va_mask;
499 struct ud_inode *ip;
500 timestruc_t now;
501 struct vattr ovap;
502
503 ud_printf("udf_setattr\n");
504
505 ip = VTOI(vp);
506
507 /*
508 * not updates allowed to 4096 files
509 */
510 if (ip->i_astrat == STRAT_TYPE4096) {
511 return (EINVAL);
512 }
513
514 /*
515 * Cannot set these attributes
516 */
517 if (mask & AT_NOSET) {
518 return (EINVAL);
519 }
520
521 rw_enter(&ip->i_rwlock, RW_WRITER);
522 rw_enter(&ip->i_contents, RW_WRITER);
523
524 ovap.va_uid = ip->i_uid;
525 ovap.va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char;
526 error = secpolicy_vnode_setattr(cr, vp, vap, &ovap, flags,
527 ud_iaccess_vmode, ip);
528 if (error)
529 goto update_inode;
530
531 mask = vap->va_mask;
532 /*
533 * Change file access modes.
534 */
535 if (mask & AT_MODE) {
536 ip->i_perm = VA2UD_PERM(vap->va_mode);
537 ip->i_char = vap->va_mode & (VSUID | VSGID | VSVTX);
538 mutex_enter(&ip->i_tlock);
539 ip->i_flag |= ICHG;
540 mutex_exit(&ip->i_tlock);
541 }
542 if (mask & (AT_UID|AT_GID)) {
543 if (mask & AT_UID) {
544 ip->i_uid = vap->va_uid;
545 }
546 if (mask & AT_GID) {
547 ip->i_gid = vap->va_gid;
548 }
549 mutex_enter(&ip->i_tlock);
550 ip->i_flag |= ICHG;
551 mutex_exit(&ip->i_tlock);
552 }
553 /*
554 * Truncate file. Must have write permission and not be a directory.
555 */
556 if (mask & AT_SIZE) {
557 if (vp->v_type == VDIR) {
558 error = EISDIR;
559 goto update_inode;
560 }
561 if (error = ud_iaccess(ip, IWRITE, cr, 0)) {
562 goto update_inode;
563 }
564 if (vap->va_size > MAXOFFSET_T) {
565 error = EFBIG;
566 goto update_inode;
567 }
568 if (error = ud_itrunc(ip, vap->va_size, 0, cr)) {
569 goto update_inode;
570 }
571
572 if (vap->va_size == 0)
573 vnevent_truncate(vp, ct);
574 }
575 /*
576 * Change file access or modified times.
577 */
578 if (mask & (AT_ATIME|AT_MTIME)) {
579 mutex_enter(&ip->i_tlock);
580 if (mask & AT_ATIME) {
581 ip->i_atime.tv_sec = vap->va_atime.tv_sec;
582 ip->i_atime.tv_nsec = vap->va_atime.tv_nsec;
583 ip->i_flag &= ~IACC;
584 }
585 if (mask & AT_MTIME) {
586 ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
587 ip->i_mtime.tv_nsec = vap->va_mtime.tv_nsec;
588 gethrestime(&now);
589 ip->i_ctime.tv_sec = now.tv_sec;
590 ip->i_ctime.tv_nsec = now.tv_nsec;
591 ip->i_flag &= ~(IUPD|ICHG);
592 ip->i_flag |= IMODTIME;
593 }
594 ip->i_flag |= IMOD;
595 mutex_exit(&ip->i_tlock);
596 }
597
598 update_inode:
599 if (curthread->t_flag & T_DONTPEND) {
600 ud_iupdat(ip, 1);
601 } else {
602 ITIMES_NOLOCK(ip);
603 }
604 rw_exit(&ip->i_contents);
605 rw_exit(&ip->i_rwlock);
606
607 return (error);
608 }
609
610 /* ARGSUSED */
611 static int32_t
612 udf_access(
613 struct vnode *vp,
614 int32_t mode,
615 int32_t flags,
616 struct cred *cr,
617 caller_context_t *ct)
618 {
619 struct ud_inode *ip = VTOI(vp);
620
621 ud_printf("udf_access\n");
622
623 if (ip->i_udf == NULL) {
624 return (EIO);
625 }
626
627 return (ud_iaccess(ip, UD_UPERM2DPERM(mode), cr, 1));
628 }
629
/*
 * When non-zero, udf_lookup() sets VISSWAP on any non-directory vnode
 * whose inode has ISVTX set in i_char and IEXEC clear in i_perm.
 */
int32_t udfs_stickyhack = 1;
631
632 /* ARGSUSED */
633 static int32_t
634 udf_lookup(
635 struct vnode *dvp,
636 char *nm,
637 struct vnode **vpp,
638 struct pathname *pnp,
639 int32_t flags,
640 struct vnode *rdir,
641 struct cred *cr,
642 caller_context_t *ct,
643 int *direntflags,
644 pathname_t *realpnp)
645 {
646 int32_t error;
647 struct vnode *vp;
648 struct ud_inode *ip, *xip;
649
650 ud_printf("udf_lookup\n");
651 /*
652 * Null component name is a synonym for directory being searched.
653 */
654 if (*nm == '\0') {
655 VN_HOLD(dvp);
656 *vpp = dvp;
657 error = 0;
658 goto out;
659 }
660
661 /*
662 * Fast path: Check the directory name lookup cache.
663 */
664 ip = VTOI(dvp);
665 if (vp = dnlc_lookup(dvp, nm)) {
666 /*
667 * Check accessibility of directory.
668 */
669 if ((error = ud_iaccess(ip, IEXEC, cr, 1)) != 0) {
670 VN_RELE(vp);
671 }
672 xip = VTOI(vp);
673 } else {
674 error = ud_dirlook(ip, nm, &xip, cr, 1);
675 ITIMES(ip);
676 }
677
678 if (error == 0) {
679 ip = xip;
680 *vpp = ITOV(ip);
681 if ((ip->i_type != VDIR) &&
682 (ip->i_char & ISVTX) &&
683 ((ip->i_perm & IEXEC) == 0) &&
684 udfs_stickyhack) {
685 mutex_enter(&(*vpp)->v_lock);
686 (*vpp)->v_flag |= VISSWAP;
687 mutex_exit(&(*vpp)->v_lock);
688 }
689 ITIMES(ip);
690 /*
691 * If vnode is a device return special vnode instead.
692 */
693 if (IS_DEVVP(*vpp)) {
694 struct vnode *newvp;
695 newvp = specvp(*vpp, (*vpp)->v_rdev,
696 (*vpp)->v_type, cr);
697 VN_RELE(*vpp);
698 if (newvp == NULL) {
699 error = ENOSYS;
700 } else {
701 *vpp = newvp;
702 }
703 }
704 }
705 out:
706 return (error);
707 }
708
/*
 * Create entry point: create `name' in directory dvp, or open the
 * existing entry when the create is non-exclusive.
 *
 * The sticky bit is stripped from vap->va_mode when
 * secpolicy_vnode_stky_modify() returns non-zero.  An empty name refers
 * to dvp itself and is treated as an already-existing entry.  For an
 * existing entry (EEXIST) with excl == NONEXCL: a write-mode create of a
 * directory fails with EISDIR, a non-zero mode is checked with
 * ud_iaccess(), and a regular file is truncated when the caller passed
 * AT_SIZE with a size of 0.
 *
 * On success *vpp is the resulting vnode, replaced by the specvp() vnode
 * for device nodes.  Returns 0 or an error; ENOSYS when specvp() fails.
 */
/* ARGSUSED */
static int32_t
udf_create(
	struct vnode *dvp,
	char *name,
	struct vattr *vap,
	enum vcexcl excl,
	int32_t mode,
	struct vnode **vpp,
	struct cred *cr,
	int32_t flag,
	caller_context_t *ct,
	vsecattr_t *vsecp)
{
	int32_t error;
	/* ip starts as the directory inode and is later the target inode */
	struct ud_inode *ip = VTOI(dvp), *xip;

	ud_printf("udf_create\n");

	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr) != 0)
		vap->va_mode &= ~VSVTX;

	if (*name == '\0') {
		/*
		 * Null component name refers to the directory itself.
		 */
		VN_HOLD(dvp);
		ITIMES(ip);
		error = EEXIST;
	} else {
		xip = NULL;
		rw_enter(&ip->i_rwlock, RW_WRITER);
		error = ud_direnter(ip, name, DE_CREATE,
		    (struct ud_inode *)0, (struct ud_inode *)0,
		    vap, &xip, cr, ct);
		rw_exit(&ip->i_rwlock);
		ITIMES(ip);
		/* xip stays NULL unless ud_direnter() stored an inode in it */
		ip = xip;
	}
#ifdef	__lock_lint
	rw_enter(&ip->i_contents, RW_WRITER);
#else
	if (ip != NULL) {
		rw_enter(&ip->i_contents, RW_WRITER);
	}
#endif

	/*
	 * If the file already exists and this is a non-exclusive create,
	 * check permissions and allow access for non-directories.
	 * Read-only create of an existing directory is also allowed.
	 * We fail an exclusive create of anything which already exists.
	 */
	if (error == EEXIST) {
		if (excl == NONEXCL) {
			if ((ip->i_type == VDIR) && (mode & VWRITE)) {
				error = EISDIR;
			} else if (mode) {
				error = ud_iaccess(ip,
				    UD_UPERM2DPERM(mode), cr, 0);
			} else {
				error = 0;
			}
		}
		if (error) {
			/* drop both the lock and the hold on the existing node */
			rw_exit(&ip->i_contents);
			VN_RELE(ITOV(ip));
			goto out;
		} else if ((ip->i_type == VREG) &&
		    (vap->va_mask & AT_SIZE) && vap->va_size == 0) {
			/*
			 * Truncate regular files, if requested by caller.
			 * Grab i_rwlock to make sure no one else is
			 * currently writing to the file (we promised
			 * bmap we would do this).
			 * Must get the locks in the correct order.
			 */
			if (ip->i_size == 0) {
				ip->i_flag |= ICHG | IUPD;
			} else {
				/* re-take in i_rwlock, i_contents order */
				rw_exit(&ip->i_contents);
				rw_enter(&ip->i_rwlock, RW_WRITER);
				rw_enter(&ip->i_contents, RW_WRITER);
				(void) ud_itrunc(ip, 0, 0, cr);
				rw_exit(&ip->i_rwlock);
			}
			vnevent_create(ITOV(ip), ct);
		}
	}

	if (error == 0) {
		*vpp = ITOV(ip);
		ITIMES(ip);
	}
#ifdef	__lock_lint
	rw_exit(&ip->i_contents);
#else
	if (ip != NULL) {
		rw_exit(&ip->i_contents);
	}
#endif
	if (error) {
		goto out;
	}

	/*
	 * If vnode is a device return special vnode instead.
	 */
	/* NOTE(review): the !error test is redundant after the goto above */
	if (!error && IS_DEVVP(*vpp)) {
		struct vnode *newvp;

		newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
		VN_RELE(*vpp);
		if (newvp == NULL) {
			error = ENOSYS;
			goto out;
		}
		*vpp = newvp;
	}
out:
	return (error);
}
831
832 /* ARGSUSED */
833 static int32_t
834 udf_remove(
835 struct vnode *vp,
836 char *nm,
837 struct cred *cr,
838 caller_context_t *ct,
839 int flags)
840 {
841 int32_t error;
842 struct ud_inode *ip = VTOI(vp);
843
844 ud_printf("udf_remove\n");
845
846 rw_enter(&ip->i_rwlock, RW_WRITER);
847 error = ud_dirremove(ip, nm,
848 (struct ud_inode *)0, (struct vnode *)0, DR_REMOVE, cr, ct);
849 rw_exit(&ip->i_rwlock);
850 ITIMES(ip);
851
852 return (error);
853 }
854
855 /* ARGSUSED */
856 static int32_t
857 udf_link(
858 struct vnode *tdvp,
859 struct vnode *svp,
860 char *tnm,
861 struct cred *cr,
862 caller_context_t *ct,
863 int flags)
864 {
865 int32_t error;
866 struct vnode *realvp;
867 struct ud_inode *sip;
868 struct ud_inode *tdp;
869
870 ud_printf("udf_link\n");
871 if (VOP_REALVP(svp, &realvp, ct) == 0) {
872 svp = realvp;
873 }
874
875 /*
876 * Do not allow links to directories
877 */
878 if (svp->v_type == VDIR) {
879 return (EPERM);
880 }
881
882 sip = VTOI(svp);
883
884 if (sip->i_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0)
885 return (EPERM);
886
887 tdp = VTOI(tdvp);
888
889 rw_enter(&tdp->i_rwlock, RW_WRITER);
890 error = ud_direnter(tdp, tnm, DE_LINK, (struct ud_inode *)0,
891 sip, (struct vattr *)0, (struct ud_inode **)0, cr, ct);
892 rw_exit(&tdp->i_rwlock);
893 ITIMES(sip);
894 ITIMES(tdp);
895
896 if (error == 0) {
897 vnevent_link(svp, ct);
898 }
899
900 return (error);
901 }
902
/*
 * Rename entry point: rename snm in directory sdvp to tnm in tdvp.
 *
 * The whole operation runs under the per-file-system udf_rename_lck
 * mutex.  After looking up the source and checking mount state,
 * permissions and the '.'/'..' cases, the pre-rename vnevents are
 * posted, the source is entered under the new name with
 * ud_direnter(DE_RENAME), and the old entry is removed with
 * ud_dirremove(DR_RENAME).
 *
 * Returns 0 on success (ESAME from ud_direnter() and ENOENT from
 * ud_dirremove() are mapped to 0), EBUSY if something is mounted on the
 * source, EINVAL for '.'/'..', or the error from the failing step.
 */
/* ARGSUSED */
static int32_t
udf_rename(
	struct vnode *sdvp,
	char *snm,
	struct vnode *tdvp,
	char *tnm,
	struct cred *cr,
	caller_context_t *ct,
	int flags)
{
	int32_t error = 0;
	struct udf_vfs *udf_vfsp;
	struct ud_inode *sip;		/* source inode */
	struct ud_inode *tip;		/* target inode */
	struct ud_inode *sdp, *tdp;	/* source and target parent inode */
	struct vnode *realvp;

	ud_printf("udf_rename\n");

	if (VOP_REALVP(tdvp, &realvp, ct) == 0) {
		tdvp = realvp;
	}

	sdp = VTOI(sdvp);
	tdp = VTOI(tdvp);

	udf_vfsp = sdp->i_udf;

	mutex_enter(&udf_vfsp->udf_rename_lck);
	/*
	 * Look up inode of file we're supposed to rename.
	 */
	if (error = ud_dirlook(sdp, snm, &sip, cr, 0)) {
		mutex_exit(&udf_vfsp->udf_rename_lck);
		return (error);
	}
	/*
	 * be sure this is not a directory with another file system mounted
	 * over it.  If it is just give up the locks, and return with
	 * EBUSY
	 */
	if (vn_mountedvfs(ITOV(sip)) != NULL) {
		error = EBUSY;
		goto errout;
	}
	/*
	 * Make sure we can delete the source entry.  This requires
	 * write permission on the containing directory.  If that
	 * directory is "sticky" it further requires (except for
	 * privileged users) that the user own the directory or the
	 * source entry, or else have permission to write the source
	 * entry.
	 */
	rw_enter(&sdp->i_contents, RW_READER);
	rw_enter(&sip->i_contents, RW_READER);
	if ((error = ud_iaccess(sdp, IWRITE, cr, 0)) != 0 ||
	    (error = ud_sticky_remove_access(sdp, sip, cr)) != 0) {
		rw_exit(&sip->i_contents);
		rw_exit(&sdp->i_contents);
		ITIMES(sip);
		goto errout;
	}

	/*
	 * Check for renaming '.' or '..' or alias of '.'
	 */
	if ((strcmp(snm, ".") == 0) ||
	    (strcmp(snm, "..") == 0) ||
	    (sdp == sip)) {
		error = EINVAL;
		rw_exit(&sip->i_contents);
		rw_exit(&sdp->i_contents);
		goto errout;
	}

	rw_exit(&sip->i_contents);
	rw_exit(&sdp->i_contents);

	/*
	 * If the target name already exists, tell its watchers before it
	 * is replaced; the hold from the lookup is dropped right away.
	 */
	if (ud_dirlook(tdp, tnm, &tip, cr, 0) == 0) {
		vnevent_pre_rename_dest(ITOV(tip), tdvp, tnm, ct);
		VN_RELE(ITOV(tip));
	}

	/* Notify the target dir. if not the same as the source dir. */
	if (sdvp != tdvp)
		vnevent_pre_rename_dest_dir(tdvp, ITOV(sip), tnm, ct);

	vnevent_pre_rename_src(ITOV(sip), sdvp, snm, ct);

	/*
	 * Link source to the target.
	 */
	rw_enter(&tdp->i_rwlock, RW_WRITER);
	if (error = ud_direnter(tdp, tnm, DE_RENAME, sdp, sip,
	    (struct vattr *)0, (struct ud_inode **)0, cr, ct)) {
		/*
		 * ESAME isn't really an error; it indicates that the
		 * operation should not be done because the source and target
		 * are the same file, but that no error should be reported.
		 */
		if (error == ESAME) {
			error = 0;
		}
		rw_exit(&tdp->i_rwlock);
		goto errout;
	}
	rw_exit(&tdp->i_rwlock);

	rw_enter(&sdp->i_rwlock, RW_WRITER);
	/*
	 * Unlink the source.
	 * Remove the source entry.  ud_dirremove() checks that the entry
	 * still reflects sip, and returns an error if it doesn't.
	 * If the entry has changed just forget about it.  Release
	 * the source inode.
	 */
	if ((error = ud_dirremove(sdp, snm, sip, (struct vnode *)0,
	    DR_RENAME, cr, ct)) == ENOENT) {
		error = 0;
	}
	rw_exit(&sdp->i_rwlock);

	if (error == 0) {
		vnevent_rename_src(ITOV(sip), sdvp, snm, ct);
		/*
		 * vnevent_rename_dest and vnevent_rename_dest_dir are called
		 * in ud_direnter().
		 */
	}

errout:
	/* common exit: refresh times, drop the source hold, then the mutex */
	ITIMES(sdp);
	ITIMES(tdp);
	VN_RELE(ITOV(sip));
	mutex_exit(&udf_vfsp->udf_rename_lck);

	return (error);
}
1042
1043 /* ARGSUSED */
1044 static int32_t
1045 udf_mkdir(
1046 struct vnode *dvp,
1047 char *dirname,
1048 struct vattr *vap,
1049 struct vnode **vpp,
1050 struct cred *cr,
1051 caller_context_t *ct,
1052 int flags,
1053 vsecattr_t *vsecp)
1054 {
1055 int32_t error;
1056 struct ud_inode *ip;
1057 struct ud_inode *xip;
1058
1059 ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
1060
1061 ud_printf("udf_mkdir\n");
1062
1063 ip = VTOI(dvp);
1064 rw_enter(&ip->i_rwlock, RW_WRITER);
1065 error = ud_direnter(ip, dirname, DE_MKDIR,
1066 (struct ud_inode *)0, (struct ud_inode *)0, vap, &xip, cr, ct);
1067 rw_exit(&ip->i_rwlock);
1068 ITIMES(ip);
1069 if (error == 0) {
1070 ip = xip;
1071 *vpp = ITOV(ip);
1072 ITIMES(ip);
1073 } else if (error == EEXIST) {
1074 ITIMES(xip);
1075 VN_RELE(ITOV(xip));
1076 }
1077
1078 return (error);
1079 }
1080
1081 /* ARGSUSED */
1082 static int32_t
1083 udf_rmdir(
1084 struct vnode *vp,
1085 char *nm,
1086 struct vnode *cdir,
1087 struct cred *cr,
1088 caller_context_t *ct,
1089 int flags)
1090 {
1091 int32_t error;
1092 struct ud_inode *ip = VTOI(vp);
1093
1094 ud_printf("udf_rmdir\n");
1095
1096 rw_enter(&ip->i_rwlock, RW_WRITER);
1097 error = ud_dirremove(ip, nm, (struct ud_inode *)0, cdir, DR_RMDIR,
1098 cr, ct);
1099 rw_exit(&ip->i_rwlock);
1100 ITIMES(ip);
1101
1102 return (error);
1103 }
1104
/*
 * Readdir entry point: return directory entries as dirent64 records.
 *
 * Records are packed into a kernel buffer the size of the first iovec
 * and copied out with a single uiomove().  At offset 0 a "." entry is
 * synthesized with d_off 0x10; a resume offset of 0x10 is mapped back to
 * 0 so that scanning starts at the first file identifier.  File
 * identifiers flagged FID_DELETED are skipped, FID_PARENT becomes "..",
 * and all other names are expanded with ud_uncompress().
 *
 * If eofp is non-NULL it is set to 1 when the starting offset is at or
 * past the directory size or the link count is not positive, and on
 * success to whether the final offset reached the directory size.
 *
 * Returns 0, EINVAL when the buffer cannot hold the entry, or the error
 * from ud_get_next_fid(), ud_uncompress() or uiomove().
 */
/* ARGSUSED */
static int32_t
udf_readdir(
	struct vnode *vp,
	struct uio *uiop,
	struct cred *cr,
	int32_t *eofp,
	caller_context_t *ct,
	int flags)
{
	struct ud_inode *ip;
	struct dirent64 *nd;		/* next record slot in outbuf */
	struct udf_vfs *udf_vfsp;
	int32_t error = 0, len, outcount = 0;
	uint32_t dirsiz, offset;
	uint32_t bufsize, ndlen, dummy;
	caddr_t outbuf;
	caddr_t outb, end_outb;
	struct iovec *iovp;

	uint8_t *dname;			/* uncompressed name scratch buffer */
	int32_t length;

	uint8_t *buf = NULL;

	struct fbuf *fbp = NULL;
	struct file_id *fid;
	uint8_t *name;


	ud_printf("udf_readdir\n");

	ip = VTOI(vp);
	udf_vfsp = ip->i_udf;

	dirsiz = ip->i_size;
	if ((uiop->uio_offset >= dirsiz) ||
	    (ip->i_nlink <= 0)) {
		if (eofp) {
			*eofp = 1;
		}
		return (0);
	}

	offset = uiop->uio_offset;
	iovp = uiop->uio_iov;
	bufsize = iovp->iov_len;

	outb = outbuf = (char *)kmem_alloc((uint32_t)bufsize, KM_SLEEP);
	end_outb = outb + bufsize;
	nd = (struct dirent64 *)outbuf;

	/*
	 * NOTE(review): the 1024-byte size is hard-coded; presumably large
	 * enough for any name ud_uncompress() can produce - confirm.
	 */
	dname = (uint8_t *)kmem_zalloc(1024, KM_SLEEP);
	buf = (uint8_t *)kmem_zalloc(udf_vfsp->udf_lbsize, KM_SLEEP);

	if (offset == 0) {
		/* synthesize the "." entry */
		len = DIRENT64_RECLEN(1);
		if (((caddr_t)nd + len) >= end_outb) {
			error = EINVAL;
			goto end;
		}
		nd->d_ino = ip->i_icb_lbano;
		nd->d_reclen = (uint16_t)len;
		nd->d_off = 0x10;
		nd->d_name[0] = '.';
		bzero(&nd->d_name[1], DIRENT64_NAMELEN(len) - 1);
		nd = (struct dirent64 *)((char *)nd + nd->d_reclen);
		outcount++;
	} else if (offset == 0x10) {
		/* resuming right after the synthetic "." entry */
		offset = 0;
	}

	while (offset < dirsiz) {
		error = ud_get_next_fid(ip, &fbp,
		    offset, &fid, &name, buf);
		if (error != 0) {
			break;
		}

		if ((fid->fid_flags & FID_DELETED) == 0) {
			if (fid->fid_flags & FID_PARENT) {

				len = DIRENT64_RECLEN(2);
				/*
				 * NOTE(review): a full buffer yields EINVAL
				 * here even when outcount is non-zero,
				 * unlike the branch below - confirm intent.
				 */
				if (((caddr_t)nd + len) >= end_outb) {
					error = EINVAL;
					break;
				}

				nd->d_ino = ip->i_icb_lbano;
				nd->d_reclen = (uint16_t)len;
				nd->d_off = offset + FID_LEN(fid);
				nd->d_name[0] = '.';
				nd->d_name[1] = '.';
				bzero(&nd->d_name[2],
				    DIRENT64_NAMELEN(len) - 2);
				nd = (struct dirent64 *)
				    ((char *)nd + nd->d_reclen);
			} else {
				if ((error = ud_uncompress(fid->fid_idlen,
				    &length, name, dname)) != 0) {
					break;
				}
				/* skip entries whose name expands to nothing */
				if (length == 0) {
					offset += FID_LEN(fid);
					continue;
				}
				len = DIRENT64_RECLEN(length);
				/*
				 * Out of room: an error only if nothing has
				 * been packed yet; otherwise return what fits.
				 */
				if (((caddr_t)nd + len) >= end_outb) {
					if (!outcount) {
						error = EINVAL;
					}
					break;
				}
				(void) strncpy(nd->d_name,
				    (caddr_t)dname, length);
				bzero(&nd->d_name[length],
				    DIRENT64_NAMELEN(len) - length);
				nd->d_ino = ud_xlate_to_daddr(udf_vfsp,
				    SWAP_16(fid->fid_icb.lad_ext_prn),
				    SWAP_32(fid->fid_icb.lad_ext_loc), 1,
				    &dummy);
				nd->d_reclen = (uint16_t)len;
				nd->d_off = offset + FID_LEN(fid);
				nd = (struct dirent64 *)
				    ((char *)nd + nd->d_reclen);
			}
			outcount++;
		}

		offset += FID_LEN(fid);
	}

end:
	if (fbp != NULL) {
		fbrelse(fbp, S_OTHER);
	}
	/* number of bytes of packed records */
	ndlen = ((char *)nd - outbuf);
	/*
	 * In case of error do not call uiomove.
	 * Return the error to the caller.
	 */
	if ((error == 0) && (ndlen != 0)) {
		error = uiomove(outbuf, (long)ndlen, UIO_READ, uiop);
		/*
		 * NOTE(review): if only the synthetic "." was packed,
		 * offset is still 0 here rather than 0x10 - confirm that
		 * this case cannot occur.
		 */
		uiop->uio_offset = offset;
	}
	kmem_free((caddr_t)buf, udf_vfsp->udf_lbsize);
	kmem_free((caddr_t)dname, 1024);
	kmem_free(outbuf, (uint32_t)bufsize);
	if (eofp && error == 0) {
		*eofp = (uiop->uio_offset >= dirsiz);
	}
	return (error);
}
1258
1259 /* ARGSUSED */
1260 static int32_t
1261 udf_symlink(
1262 struct vnode *dvp,
1263 char *linkname,
1264 struct vattr *vap,
1265 char *target,
1266 struct cred *cr,
1267 caller_context_t *ct,
1268 int flags)
1269 {
1270 int32_t error = 0, outlen;
1271 uint32_t ioflag = 0;
1272 struct ud_inode *ip, *dip = VTOI(dvp);
1273
1274 struct path_comp *pc;
1275 int8_t *dname = NULL, *uname = NULL, *sp;
1276
1277 ud_printf("udf_symlink\n");
1278
1279 ip = (struct ud_inode *)0;
1280 vap->va_type = VLNK;
1281 vap->va_rdev = 0;
1282
1283 rw_enter(&dip->i_rwlock, RW_WRITER);
1284 error = ud_direnter(dip, linkname, DE_CREATE,
1285 (struct ud_inode *)0, (struct ud_inode *)0, vap, &ip, cr, ct);
1286 rw_exit(&dip->i_rwlock);
1287 if (error == 0) {
1288 dname = kmem_zalloc(1024, KM_SLEEP);
1289 uname = kmem_zalloc(PAGESIZE, KM_SLEEP);
1290
1291 pc = (struct path_comp *)uname;
1292 /*
1293 * If the first character in target is "/"
1294 * then skip it and create entry for it
1295 */
1296 if (*target == '/') {
1297 pc->pc_type = 2;
1298 pc->pc_len = 0;
1299 pc = (struct path_comp *)(((char *)pc) + 4);
1300 while (*target == '/') {
1301 target++;
1302 }
1303 }
1304
1305 while (*target != NULL) {
1306 sp = target;
1307 while ((*target != '/') && (*target != '\0')) {
1308 target ++;
1309 }
1310 /*
1311 * We got the next component of the
1312 * path name. Create path_comp of
1313 * appropriate type
1314 */
1315 if (((target - sp) == 1) && (*sp == '.')) {
1316 /*
1317 * Dot entry.
1318 */
1319 pc->pc_type = 4;
1320 pc = (struct path_comp *)(((char *)pc) + 4);
1321 } else if (((target - sp) == 2) &&
1322 (*sp == '.') && ((*(sp + 1)) == '.')) {
1323 /*
1324 * DotDot entry.
1325 */
1326 pc->pc_type = 3;
1327 pc = (struct path_comp *)(((char *)pc) + 4);
1328 } else {
1329 /*
1330 * convert the user given name
1331 * into appropriate form to be put
1332 * on the media
1333 */
1334 outlen = 1024; /* set to size of dname */
1335 if (error = ud_compress(target - sp, &outlen,
1336 (uint8_t *)sp, (uint8_t *)dname)) {
1337 break;
1338 }
1339 pc->pc_type = 5;
1340 /* LINTED */
1341 pc->pc_len = outlen;
1342 dname[outlen] = '\0';
1343 (void) strcpy((char *)pc->pc_id, dname);
1344 pc = (struct path_comp *)
1345 (((char *)pc) + 4 + outlen);
1346 }
1347 while (*target == '/') {
1348 target++;
1349 }
1350 if (*target == NULL) {
1351 break;
1352 }
1353 }
1354
1355 rw_enter(&ip->i_contents, RW_WRITER);
1356 if (error == 0) {
1357 ioflag = FWRITE;
1358 if (curthread->t_flag & T_DONTPEND) {
1359 ioflag |= FDSYNC;
1360 }
1361 error = ud_rdwri(UIO_WRITE, ioflag, ip,
1362 uname, ((int8_t *)pc) - uname,
1363 (offset_t)0, UIO_SYSSPACE, (int32_t *)0, cr);
1364 }
1365 if (error) {
1366 ud_idrop(ip);
1367 rw_exit(&ip->i_contents);
1368 rw_enter(&dip->i_rwlock, RW_WRITER);
1369 (void) ud_dirremove(dip, linkname, (struct ud_inode *)0,
1370 (struct vnode *)0, DR_REMOVE, cr, ct);
1371 rw_exit(&dip->i_rwlock);
1372 goto update_inode;
1373 }
1374 rw_exit(&ip->i_contents);
1375 }
1376
1377 if ((error == 0) || (error == EEXIST)) {
1378 VN_RELE(ITOV(ip));
1379 }
1380
1381 update_inode:
1382 ITIMES(VTOI(dvp));
1383 if (uname != NULL) {
1384 kmem_free(uname, PAGESIZE);
1385 }
1386 if (dname != NULL) {
1387 kmem_free(dname, 1024);
1388 }
1389
1390 return (error);
1391 }
1392
1393 /* ARGSUSED */
1394 static int32_t
1395 udf_readlink(
1396 struct vnode *vp,
1397 struct uio *uiop,
1398 struct cred *cr,
1399 caller_context_t *ct)
1400 {
1401 int32_t error = 0, off, id_len, size, len;
1402 int8_t *dname = NULL, *uname = NULL;
1403 struct ud_inode *ip;
1404 struct fbuf *fbp = NULL;
1405 struct path_comp *pc;
1406
1407 ud_printf("udf_readlink\n");
1408
1409 if (vp->v_type != VLNK) {
1410 return (EINVAL);
1411 }
1412
1413 ip = VTOI(vp);
1414 size = ip->i_size;
1415 if (size > PAGESIZE) {
1416 return (EIO);
1417 }
1418
1419 if (size == 0) {
1420 return (0);
1421 }
1422
1423 dname = kmem_zalloc(1024, KM_SLEEP);
1424 uname = kmem_zalloc(PAGESIZE, KM_SLEEP);
1425
1426 rw_enter(&ip->i_contents, RW_READER);
1427
1428 if ((error = fbread(vp, 0, size, S_READ, &fbp)) != 0) {
1429 goto end;
1430 }
1431
1432 off = 0;
1433
1434 while (off < size) {
1435 pc = (struct path_comp *)(fbp->fb_addr + off);
1436 switch (pc->pc_type) {
1437 case 1 :
1438 (void) strcpy(uname, ip->i_udf->udf_fsmnt);
1439 (void) strcat(uname, "/");
1440 break;
1441 case 2 :
1442 if (pc->pc_len != 0) {
1443 goto end;
1444 }
1445 uname[0] = '/';
1446 uname[1] = '\0';
1447 break;
1448 case 3 :
1449 (void) strcat(uname, "../");
1450 break;
1451 case 4 :
1452 (void) strcat(uname, "./");
1453 break;
1454 case 5 :
1455 if ((error = ud_uncompress(pc->pc_len, &id_len,
1456 pc->pc_id, (uint8_t *)dname)) != 0) {
1457 break;
1458 }
1459 dname[id_len] = '\0';
1460 (void) strcat(uname, dname);
1461 (void) strcat(uname, "/");
1462 break;
1463 default :
1464 error = EINVAL;
1465 goto end;
1466 }
1467 off += 4 + pc->pc_len;
1468 }
1469 len = strlen(uname) - 1;
1470 if (uname[len] == '/') {
1471 if (len == 0) {
1472 /*
1473 * special case link to /
1474 */
1475 len = 1;
1476 } else {
1477 uname[len] = '\0';
1478 }
1479 }
1480
1481 error = uiomove(uname, len, UIO_READ, uiop);
1482
1483 ITIMES(ip);
1484
1485 end:
1486 if (fbp != NULL) {
1487 fbrelse(fbp, S_OTHER);
1488 }
1489 rw_exit(&ip->i_contents);
1490 if (uname != NULL) {
1491 kmem_free(uname, PAGESIZE);
1492 }
1493 if (dname != NULL) {
1494 kmem_free(dname, 1024);
1495 }
1496 return (error);
1497 }
1498
1499 /* ARGSUSED */
1500 static int32_t
1501 udf_fsync(
1502 struct vnode *vp,
1503 int32_t syncflag,
1504 struct cred *cr,
1505 caller_context_t *ct)
1506 {
1507 int32_t error = 0;
1508 struct ud_inode *ip = VTOI(vp);
1509
1510 ud_printf("udf_fsync\n");
1511
1512 rw_enter(&ip->i_contents, RW_WRITER);
1513 if (!(IS_SWAPVP(vp))) {
1514 error = ud_syncip(ip, 0, I_SYNC); /* Do synchronous writes */
1515 }
1516 if (error == 0) {
1517 error = ud_sync_indir(ip);
1518 }
1519 ITIMES(ip); /* XXX: is this necessary ??? */
1520 rw_exit(&ip->i_contents);
1521
1522 return (error);
1523 }
1524
1525 /* ARGSUSED */
1526 static void
1527 udf_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct)
1528 {
1529 ud_printf("udf_iinactive\n");
1530
1531 ud_iinactive(VTOI(vp), cr);
1532 }
1533
1534 /* ARGSUSED */
1535 static int32_t
1536 udf_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
1537 {
1538 struct udf_fid *udfidp;
1539 struct ud_inode *ip = VTOI(vp);
1540
1541 ud_printf("udf_fid\n");
1542
1543 if (fidp->fid_len < (sizeof (struct udf_fid) - sizeof (uint16_t))) {
1544 fidp->fid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
1545 return (ENOSPC);
1546 }
1547
1548 udfidp = (struct udf_fid *)fidp;
1549 bzero((char *)udfidp, sizeof (struct udf_fid));
1550 rw_enter(&ip->i_contents, RW_READER);
1551 udfidp->udfid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
1552 udfidp->udfid_uinq_lo = ip->i_uniqid & 0xffffffff;
1553 udfidp->udfid_prn = ip->i_icb_prn;
1554 udfidp->udfid_icb_lbn = ip->i_icb_block;
1555 rw_exit(&ip->i_contents);
1556
1557 return (0);
1558 }
1559
1560 /* ARGSUSED2 */
1561 static int
1562 udf_rwlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
1563 {
1564 struct ud_inode *ip = VTOI(vp);
1565
1566 ud_printf("udf_rwlock\n");
1567
1568 if (write_lock) {
1569 rw_enter(&ip->i_rwlock, RW_WRITER);
1570 } else {
1571 rw_enter(&ip->i_rwlock, RW_READER);
1572 }
1573 #ifdef __lock_lint
1574 rw_exit(&ip->i_rwlock);
1575 #endif
1576 return (write_lock);
1577 }
1578
1579 /* ARGSUSED */
1580 static void
1581 udf_rwunlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
1582 {
1583 struct ud_inode *ip = VTOI(vp);
1584
1585 ud_printf("udf_rwunlock\n");
1586
1587 #ifdef __lock_lint
1588 rw_enter(&ip->i_rwlock, RW_WRITER);
1589 #endif
1590
1591 rw_exit(&ip->i_rwlock);
1592
1593 }
1594
1595 /* ARGSUSED */
1596 static int32_t
1597 udf_seek(struct vnode *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
1598 {
1599 return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
1600 }
1601
1602 static int32_t
1603 udf_frlock(
1604 struct vnode *vp,
1605 int32_t cmd,
1606 struct flock64 *bfp,
1607 int32_t flag,
1608 offset_t offset,
1609 struct flk_callback *flk_cbp,
1610 cred_t *cr,
1611 caller_context_t *ct)
1612 {
1613 struct ud_inode *ip = VTOI(vp);
1614
1615 ud_printf("udf_frlock\n");
1616
1617 /*
1618 * If file is being mapped, disallow frlock.
1619 * XXX I am not holding tlock while checking i_mapcnt because the
1620 * current locking strategy drops all locks before calling fs_frlock.
1621 * So, mapcnt could change before we enter fs_frlock making is
1622 * meaningless to have held tlock in the first place.
1623 */
1624 if ((ip->i_mapcnt > 0) &&
1625 (MANDLOCK(vp, ip->i_char))) {
1626 return (EAGAIN);
1627 }
1628
1629 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
1630 }
1631
1632 /*ARGSUSED6*/
1633 static int32_t
1634 udf_space(
1635 struct vnode *vp,
1636 int32_t cmd,
1637 struct flock64 *bfp,
1638 int32_t flag,
1639 offset_t offset,
1640 cred_t *cr,
1641 caller_context_t *ct)
1642 {
1643 int32_t error = 0;
1644
1645 ud_printf("udf_space\n");
1646
1647 if (cmd != F_FREESP) {
1648 error = EINVAL;
1649 } else if ((error = convoff(vp, bfp, 0, offset)) == 0) {
1650 error = ud_freesp(vp, bfp, flag, cr);
1651
1652 if (error == 0 && bfp->l_start == 0)
1653 vnevent_truncate(vp, ct);
1654 }
1655
1656 return (error);
1657 }
1658
/*
 * udf_getpage - return the pages backing [off, off + len) of vp.
 *
 * vp/off/len	file and byte range requested
 * protp	if non-NULL, receives the protections the caller may use;
 *		starts as PROT_ALL and loses PROT_WRITE for a non-write
 *		request on a file that has holes
 * plarr/plsz	output page list and its size in bytes; a NULL plarr means
 *		an asynchronous (fault-ahead) request that only starts
 *		read-ahead
 * seg/addr	segment and address the fault is for
 * rw		access type (S_CREATE, S_WRITE, ...)
 *
 * i_contents is taken here unless the calling thread already owns it
 * (dolock).  For S_WRITE/S_CREATE on a file with holes, or past EOF, the
 * lock is upgraded to writer so blocks can be allocated with
 * ud_bmap_write(), then downgraded again before pages are looked up.
 *
 * Returns 0 on success; ENOSYS if the vnode is VNOMAP; EFAULT if the range
 * lies beyond EOF and the request is not from segkmap; otherwise the error
 * from ud_bmap_write() or ud_getpage_miss().  On error any pages already
 * placed in plarr are unlocked and plarr[0] is set to NULL.
 */
/* ARGSUSED */
static int32_t
udf_getpage(
	struct vnode *vp,
	offset_t off,
	size_t len,
	uint32_t *protp,
	struct page **plarr,
	size_t plsz,
	struct seg *seg,
	caddr_t addr,
	enum seg_rw rw,
	struct cred *cr,
	caller_context_t *ct)
{
	struct ud_inode *ip = VTOI(vp);
	int32_t error, has_holes, beyond_eof, seqmode, dolock;
	int32_t pgsize = PAGESIZE;
	struct udf_vfs *udf_vfsp = ip->i_udf;
	page_t **pl;
	u_offset_t pgoff, eoff, uoff;
	krw_t rwtype;
	caddr_t pgaddr;

	ud_printf("udf_getpage\n");

	uoff = (u_offset_t)off; /* type conversion */
	if (protp) {
		*protp = PROT_ALL;
	}
	if (vp->v_flag & VNOMAP) {
		return (ENOSYS);
	}
	/* Sequential if this request starts where the last one ended. */
	seqmode = ip->i_nextr == uoff && rw != S_CREATE;

	rwtype = RW_READER;
	/* Only lock if this thread does not already own i_contents. */
	dolock = (rw_owner(&ip->i_contents) != curthread);
retrylock:
#ifdef	__lock_lint
	rw_enter(&ip->i_contents, rwtype);
#else
	if (dolock) {
		rw_enter(&ip->i_contents, rwtype);
	}
#endif

	/*
	 * We may be getting called as a side effect of a bmap using
	 * fbread() when the blocks might be being allocated and the
	 * size has not yet been up'ed.  In this case we want to be
	 * able to return zero pages if we get back UDF_HOLE from
	 * calling bmap for a non write case here.  We also might have
	 * to read some frags from the disk into a page if we are
	 * extending the number of frags for a given lbn in bmap().
	 */
	beyond_eof = uoff + len > ip->i_size + PAGEOFFSET;
	if (beyond_eof && seg != segkmap) {
#ifdef	__lock_lint
		rw_exit(&ip->i_contents);
#else
		if (dolock) {
			rw_exit(&ip->i_contents);
		}
#endif
		return (EFAULT);
	}

	/*
	 * Must hold i_contents lock throughout the call to pvn_getpages
	 * since locked pages are returned from each call to ud_getapage.
	 * Must *not* return locked pages and then try for contents lock
	 * due to lock ordering requirements (inode > page)
	 */

	has_holes = ud_bmap_has_holes(ip);

	if ((rw == S_WRITE || rw == S_CREATE) && (has_holes || beyond_eof)) {
		int32_t	blk_size, count;
		u_offset_t offset;

		/*
		 * We must acquire the RW_WRITER lock in order to
		 * call bmap_write().
		 */
		if (dolock && rwtype == RW_READER) {
			rwtype = RW_WRITER;

			if (!rw_tryupgrade(&ip->i_contents)) {

				/*
				 * Upgrade failed: drop the lock and start
				 * over, this time entering as writer.
				 */
				rw_exit(&ip->i_contents);

				goto retrylock;
			}
		}

		/*
		 * May be allocating disk blocks for holes here as
		 * a result of mmap faults. write(2) does the bmap_write
		 * in rdip/wrip, not here. We are not dealing with frags
		 * in this case.
		 */
		offset = uoff;
		while ((offset < uoff + len) &&
		    (offset < ip->i_size)) {
			/*
			 * Allocate one logical block at a time, trimming
			 * the final chunk to the end of the file.
			 * NOTE(review): an earlier comment here described a
			 * variable "bnp" that no longer appears in this code.
			 */

			blk_size = udf_vfsp->udf_lbsize;
			if ((offset + blk_size) > ip->i_size) {
				count = ip->i_size - offset;
			} else {
				count = blk_size;
			}
			error = ud_bmap_write(ip, offset, count, 0, cr);
			if (error) {
				goto update_inode;
			}
			offset += count; /* XXX - make this contig */
		}
	}

	/*
	 * Can be a reader from now on.
	 */
#ifdef	__lock_lint
	if (rwtype == RW_WRITER) {
		rw_downgrade(&ip->i_contents);
	}
#else
	if (dolock && rwtype == RW_WRITER) {
		rw_downgrade(&ip->i_contents);
	}
#endif

	/*
	 * We remove PROT_WRITE in cases when the file has UDF holes
	 * because we don't want to call bmap_read() to check each
	 * page if it is backed with a disk block.
	 */
	if (protp && has_holes && rw != S_WRITE && rw != S_CREATE) {
		*protp &= ~PROT_WRITE;
	}

	error = 0;

	/*
	 * The loop looks up pages in the range <off, off + len).
	 * For each page, we first check if we should initiate an asynchronous
	 * read ahead before we call page_lookup (we may sleep in page_lookup
	 * for a previously initiated disk read).
	 */
	eoff = (uoff + len);
	for (pgoff = uoff, pgaddr = addr, pl = plarr;
	    pgoff < eoff; /* empty */) {
		page_t	*pp;
		u_offset_t	nextrio;
		se_t	se;

		/* S_CREATE takes the page exclusively; everything else shares. */
		se = ((rw == S_CREATE) ? SE_EXCL : SE_SHARED);

		/*
		 * Handle async getpage (faultahead)
		 */
		if (plarr == NULL) {
			ip->i_nextrio = pgoff;
			ud_getpage_ra(vp, pgoff, seg, pgaddr);
			pgoff += pgsize;
			pgaddr += pgsize;
			continue;
		}

		/*
		 * Check if we should initiate read ahead of next cluster.
		 * We call page_exists only when we need to confirm that
		 * we have the current page before we initiate the read ahead.
		 */
		nextrio = ip->i_nextrio;
		if (seqmode &&
		    pgoff + RD_CLUSTSZ(ip) >= nextrio && pgoff <= nextrio &&
		    nextrio < ip->i_size && page_exists(vp, pgoff))
			ud_getpage_ra(vp, pgoff, seg, pgaddr);

		if ((pp = page_lookup(vp, pgoff, se)) != NULL) {

			/*
			 * We found the page in the page cache.
			 */
			*pl++ = pp;
			pgoff += pgsize;
			pgaddr += pgsize;
			len -= pgsize;
			plsz -= pgsize;
		} else {

			/*
			 * We have to create the page, or read it from disk.
			 */
			if (error = ud_getpage_miss(vp, pgoff, len,
			    seg, pgaddr, pl, plsz, rw, seqmode)) {
				goto error_out;
			}

			/*
			 * Step over whatever ud_getpage_miss() put in the
			 * list.  If it added nothing (*pl is NULL), pgoff
			 * is unchanged and the lookup above is retried.
			 */
			while (*pl != NULL) {
				pl++;
				pgoff += pgsize;
				pgaddr += pgsize;
				len -= pgsize;
				plsz -= pgsize;
			}
		}
	}

	/*
	 * Return pages up to plsz if they are in the page cache.
	 * We cannot return pages if there is a chance that they are
	 * backed with a UDF hole and rw is S_WRITE or S_CREATE.
	 */
	if (plarr && !(has_holes && (rw == S_WRITE || rw == S_CREATE))) {

		ASSERT((protp == NULL) ||
		    !(has_holes && (*protp & PROT_WRITE)));

		eoff = pgoff + plsz;
		while (pgoff < eoff) {
			page_t		*pp;

			/* Opportunistic only: stop at the first miss. */
			if ((pp = page_lookup_nowait(vp, pgoff,
			    SE_SHARED)) == NULL)
				break;

			*pl++ = pp;
			pgoff += pgsize;
			plsz -= pgsize;
		}
	}

	if (plarr)
		*pl = NULL;		/* Terminate page list */
	/* Remember where this request ended for the next seqmode check. */
	ip->i_nextr = pgoff;

error_out:
	if (error && plarr) {
		/*
		 * Release any pages we have locked.
		 */
		while (pl > &plarr[0])
			page_unlock(*--pl);

		plarr[0] = NULL;
	}

update_inode:
#ifdef	__lock_lint
	rw_exit(&ip->i_contents);
#else
	if (dolock) {
		rw_exit(&ip->i_contents);
	}
#endif

	/*
	 * If the inode is not already marked for IACC (in rwip() for read)
	 * and the inode is not marked for no access time update (in rwip()
	 * for write) then update the inode access time and mod time now.
	 */
	mutex_enter(&ip->i_tlock);
	if ((ip->i_flag & (IACC | INOACC)) == 0) {
		if ((rw != S_OTHER) && (ip->i_type != VDIR)) {
			ip->i_flag |= IACC;
		}
		if (rw == S_WRITE) {
			ip->i_flag |= IUPD;
		}
		ITIMES_NOLOCK(ip);
	}
	mutex_exit(&ip->i_tlock);

	return (error);
}
1941
1942 int32_t ud_delay = 1;
1943
1944 /* ARGSUSED */
1945 static int32_t
1946 udf_putpage(
1947 struct vnode *vp,
1948 offset_t off,
1949 size_t len,
1950 int32_t flags,
1951 struct cred *cr,
1952 caller_context_t *ct)
1953 {
1954 struct ud_inode *ip;
1955 int32_t error = 0;
1956
1957 ud_printf("udf_putpage\n");
1958
1959 ip = VTOI(vp);
1960 #ifdef __lock_lint
1961 rw_enter(&ip->i_contents, RW_WRITER);
1962 #endif
1963
1964 if (vp->v_count == 0) {
1965 cmn_err(CE_WARN, "ud_putpage : bad v_count");
1966 error = EINVAL;
1967 goto out;
1968 }
1969
1970 if (vp->v_flag & VNOMAP) {
1971 error = ENOSYS;
1972 goto out;
1973 }
1974
1975 if (flags & B_ASYNC) {
1976 if (ud_delay && len &&
1977 (flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0) {
1978 mutex_enter(&ip->i_tlock);
1979
1980 /*
1981 * If nobody stalled, start a new cluster.
1982 */
1983 if (ip->i_delaylen == 0) {
1984 ip->i_delayoff = off;
1985 ip->i_delaylen = len;
1986 mutex_exit(&ip->i_tlock);
1987 goto out;
1988 }
1989
1990 /*
1991 * If we have a full cluster or they are not contig,
1992 * then push last cluster and start over.
1993 */
1994 if (ip->i_delaylen >= WR_CLUSTSZ(ip) ||
1995 ip->i_delayoff + ip->i_delaylen != off) {
1996 u_offset_t doff;
1997 size_t dlen;
1998
1999 doff = ip->i_delayoff;
2000 dlen = ip->i_delaylen;
2001 ip->i_delayoff = off;
2002 ip->i_delaylen = len;
2003 mutex_exit(&ip->i_tlock);
2004 error = ud_putpages(vp, doff, dlen, flags, cr);
2005 /* LMXXX - flags are new val, not old */
2006 goto out;
2007 }
2008
2009 /*
2010 * There is something there, it's not full, and
2011 * it is contig.
2012 */
2013 ip->i_delaylen += len;
2014 mutex_exit(&ip->i_tlock);
2015 goto out;
2016 }
2017
2018 /*
2019 * Must have weird flags or we are not clustering.
2020 */
2021 }
2022
2023 error = ud_putpages(vp, off, len, flags, cr);
2024
2025 out:
2026 #ifdef __lock_lint
2027 rw_exit(&ip->i_contents);
2028 #endif
2029 return (error);
2030 }
2031
2032 /* ARGSUSED */
2033 static int32_t
2034 udf_map(
2035 struct vnode *vp,
2036 offset_t off,
2037 struct as *as,
2038 caddr_t *addrp,
2039 size_t len,
2040 uint8_t prot,
2041 uint8_t maxprot,
2042 uint32_t flags,
2043 struct cred *cr,
2044 caller_context_t *ct)
2045 {
2046 struct segvn_crargs vn_a;
2047 int32_t error = 0;
2048
2049 ud_printf("udf_map\n");
2050
2051 if (vp->v_flag & VNOMAP) {
2052 error = ENOSYS;
2053 goto end;
2054 }
2055
2056 if ((off < (offset_t)0) ||
2057 ((off + len) < (offset_t)0)) {
2058 error = EINVAL;
2059 goto end;
2060 }
2061
2062 if (vp->v_type != VREG) {
2063 error = ENODEV;
2064 goto end;
2065 }
2066
2067 /*
2068 * If file is being locked, disallow mapping.
2069 */
2070 if (vn_has_mandatory_locks(vp, VTOI(vp)->i_char)) {
2071 error = EAGAIN;
2072 goto end;
2073 }
2074
2075 as_rangelock(as);
2076 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
2077 if (error != 0) {
2078 as_rangeunlock(as);
2079 goto end;
2080 }
2081
2082 vn_a.vp = vp;
2083 vn_a.offset = off;
2084 vn_a.type = flags & MAP_TYPE;
2085 vn_a.prot = prot;
2086 vn_a.maxprot = maxprot;
2087 vn_a.cred = cr;
2088 vn_a.amp = NULL;
2089 vn_a.flags = flags & ~MAP_TYPE;
2090 vn_a.szc = 0;
2091 vn_a.lgrp_mem_policy_flags = 0;
2092
2093 error = as_map(as, *addrp, len, segvn_create, (caddr_t)&vn_a);
2094 as_rangeunlock(as);
2095
2096 end:
2097 return (error);
2098 }
2099
2100 /* ARGSUSED */
2101 static int32_t
2102 udf_addmap(struct vnode *vp,
2103 offset_t off,
2104 struct as *as,
2105 caddr_t addr,
2106 size_t len,
2107 uint8_t prot,
2108 uint8_t maxprot,
2109 uint32_t flags,
2110 struct cred *cr,
2111 caller_context_t *ct)
2112 {
2113 struct ud_inode *ip = VTOI(vp);
2114
2115 ud_printf("udf_addmap\n");
2116
2117 if (vp->v_flag & VNOMAP) {
2118 return (ENOSYS);
2119 }
2120
2121 mutex_enter(&ip->i_tlock);
2122 ip->i_mapcnt += btopr(len);
2123 mutex_exit(&ip->i_tlock);
2124
2125 return (0);
2126 }
2127
2128 /* ARGSUSED */
2129 static int32_t
2130 udf_delmap(
2131 struct vnode *vp, offset_t off,
2132 struct as *as,
2133 caddr_t addr,
2134 size_t len,
2135 uint32_t prot,
2136 uint32_t maxprot,
2137 uint32_t flags,
2138 struct cred *cr,
2139 caller_context_t *ct)
2140 {
2141 struct ud_inode *ip = VTOI(vp);
2142
2143 ud_printf("udf_delmap\n");
2144
2145 if (vp->v_flag & VNOMAP) {
2146 return (ENOSYS);
2147 }
2148
2149 mutex_enter(&ip->i_tlock);
2150 ip->i_mapcnt -= btopr(len); /* Count released mappings */
2151 ASSERT(ip->i_mapcnt >= 0);
2152 mutex_exit(&ip->i_tlock);
2153
2154 return (0);
2155 }
2156
2157 /* ARGSUSED */
2158 static int32_t
2159 udf_l_pathconf(
2160 struct vnode *vp,
2161 int32_t cmd,
2162 ulong_t *valp,
2163 struct cred *cr,
2164 caller_context_t *ct)
2165 {
2166 int32_t error = 0;
2167
2168 ud_printf("udf_l_pathconf\n");
2169
2170 if (cmd == _PC_FILESIZEBITS) {
2171 /*
2172 * udf supports 64 bits as file size
2173 * but there are several other restrictions
2174 * it only supports 32-bit block numbers and
2175 * daddr32_t is only and int32_t so taking these
2176 * into account we can stay just as where ufs is
2177 */
2178 *valp = 41;
2179 } else if (cmd == _PC_TIMESTAMP_RESOLUTION) {
2180 /* nanosecond timestamp resolution */
2181 *valp = 1L;
2182 } else {
2183 error = fs_pathconf(vp, cmd, valp, cr, ct);
2184 }
2185
2186 return (error);
2187 }
2188
2189 uint32_t ud_pageio_reads = 0, ud_pageio_writes = 0;
2190 #ifndef __lint
2191 _NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_reads))
2192 _NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_writes))
2193 #endif
2194 /*
2195 * Assumption is that there will not be a pageio request
2196 * to a enbedded file
2197 */
2198 /* ARGSUSED */
2199 static int32_t
2200 udf_pageio(
2201 struct vnode *vp,
2202 struct page *pp,
2203 u_offset_t io_off,
2204 size_t io_len,
2205 int32_t flags,
2206 struct cred *cr,
2207 caller_context_t *ct)
2208 {
2209 daddr_t bn;
2210 struct buf *bp;
2211 struct ud_inode *ip = VTOI(vp);
2212 int32_t dolock, error = 0, contig, multi_io;
2213 size_t done_len = 0, cur_len = 0;
2214 page_t *npp = NULL, *opp = NULL, *cpp = pp;
2215
2216 if (pp == NULL) {
2217 return (EINVAL);
2218 }
2219
2220 dolock = (rw_owner(&ip->i_contents) != curthread);
2221
2222 /*
2223 * We need a better check. Ideally, we would use another
2224 * vnodeops so that hlocked and forcibly unmounted file
2225 * systems would return EIO where appropriate and w/o the
2226 * need for these checks.
2227 */
2228 if (ip->i_udf == NULL) {
2229 return (EIO);
2230 }
2231
2232 #ifdef __lock_lint
2233 rw_enter(&ip->i_contents, RW_READER);
2234 #else
2235 if (dolock) {
2236 rw_enter(&ip->i_contents, RW_READER);
2237 }
2238 #endif
2239
2240 /*
2241 * Break the io request into chunks, one for each contiguous
2242 * stretch of disk blocks in the target file.
2243 */
2244 while (done_len < io_len) {
2245 ASSERT(cpp);
2246 bp = NULL;
2247 contig = 0;
2248 if (error = ud_bmap_read(ip, (u_offset_t)(io_off + done_len),
2249 &bn, &contig)) {
2250 break;
2251 }
2252
2253 if (bn == UDF_HOLE) { /* No holey swapfiles */
2254 cmn_err(CE_WARN, "SWAP file has HOLES");
2255 error = EINVAL;
2256 break;
2257 }
2258
2259 cur_len = MIN(io_len - done_len, contig);
2260
2261 /*
2262 * Check if more than one I/O is
2263 * required to complete the given
2264 * I/O operation
2265 */
2266 if (ip->i_udf->udf_lbsize < PAGESIZE) {
2267 if (cur_len >= PAGESIZE) {
2268 multi_io = 0;
2269 cur_len &= PAGEMASK;
2270 } else {
2271 multi_io = 1;
2272 cur_len = MIN(io_len - done_len, PAGESIZE);
2273 }
2274 }
2275 page_list_break(&cpp, &npp, btop(cur_len));
2276
2277 bp = pageio_setup(cpp, cur_len, ip->i_devvp, flags);
2278 ASSERT(bp != NULL);
2279
2280 bp->b_edev = ip->i_dev;
2281 bp->b_dev = cmpdev(ip->i_dev);
2282 bp->b_blkno = bn;
2283 bp->b_un.b_addr = (caddr_t)0;
2284 bp->b_file = vp;
2285 bp->b_offset = (offset_t)(io_off + done_len);
2286
2287 /*
2288 * ub.ub_pageios.value.ul++;
2289 */
2290 if (multi_io == 0) {
2291 (void) bdev_strategy(bp);
2292 } else {
2293 error = ud_multi_strat(ip, cpp, bp,
2294 (u_offset_t)(io_off + done_len));
2295 if (error != 0) {
2296 pageio_done(bp);
2297 break;
2298 }
2299 }
2300 if (flags & B_READ) {
2301 ud_pageio_reads++;
2302 } else {
2303 ud_pageio_writes++;
2304 }
2305
2306 /*
2307 * If the request is not B_ASYNC, wait for i/o to complete
2308 * and re-assemble the page list to return to the caller.
2309 * If it is B_ASYNC we leave the page list in pieces and
2310 * cleanup() will dispose of them.
2311 */
2312 if ((flags & B_ASYNC) == 0) {
2313 error = biowait(bp);
2314 pageio_done(bp);
2315 if (error) {
2316 break;
2317 }
2318 page_list_concat(&opp, &cpp);
2319 }
2320 cpp = npp;
2321 npp = NULL;
2322 done_len += cur_len;
2323 }
2324
2325 ASSERT(error || (cpp == NULL && npp == NULL && done_len == io_len));
2326 if (error) {
2327 if (flags & B_ASYNC) {
2328 /* Cleanup unprocessed parts of list */
2329 page_list_concat(&cpp, &npp);
2330 if (flags & B_READ) {
2331 pvn_read_done(cpp, B_ERROR);
2332 } else {
2333 pvn_write_done(cpp, B_ERROR);
2334 }
2335 } else {
2336 /* Re-assemble list and let caller clean up */
2337 page_list_concat(&opp, &cpp);
2338 page_list_concat(&opp, &npp);
2339 }
2340 }
2341
2342 #ifdef __lock_lint
2343 rw_exit(&ip->i_contents);
2344 #else
2345 if (dolock) {
2346 rw_exit(&ip->i_contents);
2347 }
2348 #endif
2349 return (error);
2350 }
2351
2352
2353
2354
2355 /* -------------------- local functions --------------------------- */
2356
2357
2358
2359 int32_t
2360 ud_rdwri(enum uio_rw rw, int32_t ioflag,
2361 struct ud_inode *ip, caddr_t base, int32_t len,
2362 offset_t offset, enum uio_seg seg, int32_t *aresid, struct cred *cr)
2363 {
2364 int32_t error;
2365 struct uio auio;
2366 struct iovec aiov;
2367
2368 ud_printf("ud_rdwri\n");
2369
2370 bzero((caddr_t)&auio, sizeof (uio_t));
2371 bzero((caddr_t)&aiov, sizeof (iovec_t));
2372
2373 aiov.iov_base = base;
2374 aiov.iov_len = len;
2375 auio.uio_iov = &aiov;
2376 auio.uio_iovcnt = 1;
2377 auio.uio_loffset = offset;
2378 auio.uio_segflg = (int16_t)seg;
2379 auio.uio_resid = len;
2380
2381 if (rw == UIO_WRITE) {
2382 auio.uio_fmode = FWRITE;
2383 auio.uio_extflg = UIO_COPY_DEFAULT;
2384 auio.uio_llimit = curproc->p_fsz_ctl;
2385 error = ud_wrip(ip, &auio, ioflag, cr);
2386 } else {
2387 auio.uio_fmode = FREAD;
2388 auio.uio_extflg = UIO_COPY_CACHED;
2389 auio.uio_llimit = MAXOFFSET_T;
2390 error = ud_rdip(ip, &auio, ioflag, cr);
2391 }
2392
2393 if (aresid) {
2394 *aresid = auio.uio_resid;
2395 } else if (auio.uio_resid) {
2396 error = EIO;
2397 }
2398 return (error);
2399 }
2400
2401 /*
2402 * Free behind hacks. The pager is busted.
2403 * XXX - need to pass the information down to writedone() in a flag like B_SEQ
2404 * or B_FREE_IF_TIGHT_ON_MEMORY.
2405 */
2406 int32_t ud_freebehind = 1;
2407 int32_t ud_smallfile = 32 * 1024;
2408
2409 /* ARGSUSED */
2410 int32_t
2411 ud_getpage_miss(struct vnode *vp, u_offset_t off,
2412 size_t len, struct seg *seg, caddr_t addr, page_t *pl[],
2413 size_t plsz, enum seg_rw rw, int32_t seq)
2414 {
2415 struct ud_inode *ip = VTOI(vp);
2416 int32_t err = 0;
2417 size_t io_len;
2418 u_offset_t io_off;
2419 u_offset_t pgoff;
2420 page_t *pp;
2421
2422 pl[0] = NULL;
2423
2424 /*
2425 * Figure out whether the page can be created, or must be
2426 * read from the disk
2427 */
2428 if (rw == S_CREATE) {
2429 if ((pp = page_create_va(vp, off,
2430 PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
2431 cmn_err(CE_WARN, "ud_getpage_miss: page_create");
2432 return (EINVAL);
2433 }
2434 io_len = PAGESIZE;
2435 } else {
2436 pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
2437 &io_len, off, PAGESIZE, 0);
2438
2439 /*
2440 * Some other thread has entered the page.
2441 * ud_getpage will retry page_lookup.
2442 */
2443 if (pp == NULL) {
2444 return (0);
2445 }
2446
2447 /*
2448 * Fill the page with as much data as we can from the file.
2449 */
2450 err = ud_page_fill(ip, pp, off, B_READ, &pgoff);
2451 if (err) {
2452 pvn_read_done(pp, B_ERROR);
2453 return (err);
2454 }
2455
2456 /*
2457 * XXX ??? ufs has io_len instead of pgoff below
2458 */
2459 ip->i_nextrio = off + ((pgoff + PAGESIZE - 1) & PAGEMASK);
2460
2461 /*
2462 * If the file access is sequential, initiate read ahead
2463 * of the next cluster.
2464 */
2465 if (seq && ip->i_nextrio < ip->i_size) {
2466 ud_getpage_ra(vp, off, seg, addr);
2467 }
2468 }
2469
2470 outmiss:
2471 pvn_plist_init(pp, pl, plsz, (offset_t)off, io_len, rw);
2472 return (err);
2473 }
2474
2475 /* ARGSUSED */
2476 void
2477 ud_getpage_ra(struct vnode *vp,
2478 u_offset_t off, struct seg *seg, caddr_t addr)
2479 {
2480 page_t *pp;
2481 size_t io_len;
2482 struct ud_inode *ip = VTOI(vp);
2483 u_offset_t io_off = ip->i_nextrio, pgoff;
2484 caddr_t addr2 = addr + (io_off - off);
2485 daddr_t bn;
2486 int32_t contig = 0;
2487
2488 /*
2489 * Is this test needed?
2490 */
2491
2492 if (addr2 >= seg->s_base + seg->s_size) {
2493 return;
2494 }
2495
2496 contig = 0;
2497 if (ud_bmap_read(ip, io_off, &bn, &contig) != 0 || bn == UDF_HOLE) {
2498 return;
2499 }
2500
2501 pp = pvn_read_kluster(vp, io_off, seg, addr2,
2502 &io_off, &io_len, io_off, PAGESIZE, 1);
2503
2504 /*
2505 * Some other thread has entered the page.
2506 * So no read head done here (ie we will have to and wait
2507 * for the read when needed).
2508 */
2509
2510 if (pp == NULL) {
2511 return;
2512 }
2513
2514 (void) ud_page_fill(ip, pp, io_off, (B_READ|B_ASYNC), &pgoff);
2515 ip->i_nextrio = io_off + ((pgoff + PAGESIZE - 1) & PAGEMASK);
2516 }
2517
2518 int
2519 ud_page_fill(struct ud_inode *ip, page_t *pp, u_offset_t off,
2520 uint32_t bflgs, u_offset_t *pg_off)
2521 {
2522 daddr_t bn;
2523 struct buf *bp;
2524 caddr_t kaddr, caddr;
2525 int32_t error = 0, contig = 0, multi_io = 0;
2526 int32_t lbsize = ip->i_udf->udf_lbsize;
2527 int32_t lbmask = ip->i_udf->udf_lbmask;
2528 uint64_t isize;
2529
2530 isize = (ip->i_size + lbmask) & (~lbmask);
2531 if (ip->i_desc_type == ICB_FLAG_ONE_AD) {
2532
2533 /*
2534 * Embedded file read file_entry
2535 * from buffer cache and copy the required
2536 * portions
2537 */
2538 bp = ud_bread(ip->i_dev,
2539 ip->i_icb_lbano << ip->i_udf->udf_l2d_shift, lbsize);
2540 if ((bp->b_error == 0) &&
2541 (bp->b_resid == 0)) {
2542
2543 caddr = bp->b_un.b_addr + ip->i_data_off;
2544
2545 /*
2546 * mapin to kvm
2547 */
2548 kaddr = (caddr_t)ppmapin(pp,
2549 PROT_READ | PROT_WRITE, (caddr_t)-1);
2550 (void) kcopy(caddr, kaddr, ip->i_size);
2551
2552 /*
2553 * mapout of kvm
2554 */
2555 ppmapout(kaddr);
2556 }
2557 brelse(bp);
2558 contig = ip->i_size;
2559 } else {
2560
2561 /*
2562 * Get the continuous size and block number
2563 * at offset "off"
2564 */
2565 if (error = ud_bmap_read(ip, off, &bn, &contig))
2566 goto out;
2567 contig = MIN(contig, PAGESIZE);
2568 contig = (contig + lbmask) & (~lbmask);
2569
2570 /*
2571 * Zero part of the page which we are not
2572 * going to read from the disk.
2573 */
2574
2575 if (bn == UDF_HOLE) {
2576
2577 /*
2578 * This is a HOLE. Just zero out
2579 * the page
2580 */
2581 if (((off + contig) == isize) ||
2582 (contig == PAGESIZE)) {
2583 pagezero(pp->p_prev, 0, PAGESIZE);
2584 goto out;
2585 }
2586 }
2587
2588 if (contig < PAGESIZE) {
2589 uint64_t count;
2590
2591 count = isize - off;
2592 if (contig != count) {
2593 multi_io = 1;
2594 contig = (int32_t)(MIN(count, PAGESIZE));
2595 } else {
2596 pagezero(pp->p_prev, contig, PAGESIZE - contig);
2597 }
2598 }
2599
2600 /*
2601 * Get a bp and initialize it
2602 */
2603 bp = pageio_setup(pp, contig, ip->i_devvp, bflgs);
2604 ASSERT(bp != NULL);
2605
2606 bp->b_edev = ip->i_dev;
2607 bp->b_dev = cmpdev(ip->i_dev);
2608 bp->b_blkno = bn;
2609 bp->b_un.b_addr = 0;
2610 bp->b_file = ip->i_vnode;
2611
2612 /*
2613 * Start I/O
2614 */
2615 if (multi_io == 0) {
2616
2617 /*
2618 * Single I/O is sufficient for this page
2619 */
2620 (void) bdev_strategy(bp);
2621 } else {
2622
2623 /*
2624 * We need to do the I/O in
2625 * piece's
2626 */
2627 error = ud_multi_strat(ip, pp, bp, off);
2628 if (error != 0) {
2629 goto out;
2630 }
2631 }
2632 if ((bflgs & B_ASYNC) == 0) {
2633
2634 /*
2635 * Wait for i/o to complete.
2636 */
2637
2638 error = biowait(bp);
2639 pageio_done(bp);
2640 if (error) {
2641 goto out;
2642 }
2643 }
2644 }
2645 if ((off + contig) >= ip->i_size) {
2646 contig = ip->i_size - off;
2647 }
2648
2649 out:
2650 *pg_off = contig;
2651 return (error);
2652 }
2653
2654 int32_t
2655 ud_putpages(struct vnode *vp, offset_t off,
2656 size_t len, int32_t flags, struct cred *cr)
2657 {
2658 struct ud_inode *ip;
2659 page_t *pp;
2660 u_offset_t io_off;
2661 size_t io_len;
2662 u_offset_t eoff;
2663 int32_t err = 0;
2664 int32_t dolock;
2665
2666 ud_printf("ud_putpages\n");
2667
2668 if (vp->v_count == 0) {
2669 cmn_err(CE_WARN, "ud_putpages: bad v_count");
2670 return (EINVAL);
2671 }
2672
2673 ip = VTOI(vp);
2674
2675 /*
2676 * Acquire the readers/write inode lock before locking
2677 * any pages in this inode.
2678 * The inode lock is held during i/o.
2679 */
2680 if (len == 0) {
2681 mutex_enter(&ip->i_tlock);
2682 ip->i_delayoff = ip->i_delaylen = 0;
2683 mutex_exit(&ip->i_tlock);
2684 }
2685 #ifdef __lock_lint
2686 rw_enter(&ip->i_contents, RW_READER);
2687 #else
2688 dolock = (rw_owner(&ip->i_contents) != curthread);
2689 if (dolock) {
2690 rw_enter(&ip->i_contents, RW_READER);
2691 }
2692 #endif
2693
2694 if (!vn_has_cached_data(vp)) {
2695 #ifdef __lock_lint
2696 rw_exit(&ip->i_contents);
2697 #else
2698 if (dolock) {
2699 rw_exit(&ip->i_contents);
2700 }
2701 #endif
2702 return (0);
2703 }
2704
2705 if (len == 0) {
2706 /*
2707 * Search the entire vp list for pages >= off.
2708 */
2709 err = pvn_vplist_dirty(vp, (u_offset_t)off, ud_putapage,
2710 flags, cr);
2711 } else {
2712 /*
2713 * Loop over all offsets in the range looking for
2714 * pages to deal with.
2715 */
2716 if ((eoff = blkroundup(ip->i_udf, ip->i_size)) != 0) {
2717 eoff = MIN(off + len, eoff);
2718 } else {
2719 eoff = off + len;
2720 }
2721
2722 for (io_off = off; io_off < eoff; io_off += io_len) {
2723 /*
2724 * If we are not invalidating, synchronously
2725 * freeing or writing pages, use the routine
2726 * page_lookup_nowait() to prevent reclaiming
2727 * them from the free list.
2728 */
2729 if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
2730 pp = page_lookup(vp, io_off,
2731 (flags & (B_INVAL | B_FREE)) ?
2732 SE_EXCL : SE_SHARED);
2733 } else {
2734 pp = page_lookup_nowait(vp, io_off,
2735 (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2736 }
2737
2738 if (pp == NULL || pvn_getdirty(pp, flags) == 0) {
2739 io_len = PAGESIZE;
2740 } else {
2741
2742 err = ud_putapage(vp, pp,
2743 &io_off, &io_len, flags, cr);
2744 if (err != 0) {
2745 break;
2746 }
2747 /*
2748 * "io_off" and "io_len" are returned as
2749 * the range of pages we actually wrote.
2750 * This allows us to skip ahead more quickly
2751 * since several pages may've been dealt
2752 * with by this iteration of the loop.
2753 */
2754 }
2755 }
2756 }
2757 if (err == 0 && off == 0 && (len == 0 || len >= ip->i_size)) {
2758 /*
2759 * We have just sync'ed back all the pages on
2760 * the inode, turn off the IMODTIME flag.
2761 */
2762 mutex_enter(&ip->i_tlock);
2763 ip->i_flag &= ~IMODTIME;
2764 mutex_exit(&ip->i_tlock);
2765 }
2766 #ifdef __lock_lint
2767 rw_exit(&ip->i_contents);
2768 #else
2769 if (dolock) {
2770 rw_exit(&ip->i_contents);
2771 }
2772 #endif
2773 return (err);
2774 }
2775
2776 /* ARGSUSED */
2777 int32_t
2778 ud_putapage(struct vnode *vp,
2779 page_t *pp, u_offset_t *offp,
2780 size_t *lenp, int32_t flags, struct cred *cr)
2781 {
2782 daddr_t bn;
2783 size_t io_len;
2784 struct ud_inode *ip;
2785 int32_t error = 0, contig, multi_io = 0;
2786 struct udf_vfs *udf_vfsp;
2787 u_offset_t off, io_off;
2788 caddr_t kaddr, caddr;
2789 struct buf *bp = NULL;
2790 int32_t lbmask;
2791 uint64_t isize;
2792 uint16_t crc_len;
2793 struct file_entry *fe;
2794
2795 ud_printf("ud_putapage\n");
2796
2797 ip = VTOI(vp);
2798 ASSERT(ip);
2799 ASSERT(RW_LOCK_HELD(&ip->i_contents));
2800 lbmask = ip->i_udf->udf_lbmask;
2801 isize = (ip->i_size + lbmask) & (~lbmask);
2802
2803 udf_vfsp = ip->i_udf;
2804 ASSERT(udf_vfsp->udf_flags & UDF_FL_RW);
2805
2806 /*
2807 * If the modified time on the inode has not already been
2808 * set elsewhere (e.g. for write/setattr) we set the time now.
2809 * This gives us approximate modified times for mmap'ed files
2810 * which are modified via stores in the user address space.
2811 */
2812 if (((ip->i_flag & IMODTIME) == 0) || (flags & B_FORCE)) {
2813 mutex_enter(&ip->i_tlock);
2814 ip->i_flag |= IUPD;
2815 ITIMES_NOLOCK(ip);
2816 mutex_exit(&ip->i_tlock);
2817 }
2818
2819
2820 /*
2821 * Align the request to a block boundry (for old file systems),
2822 * and go ask bmap() how contiguous things are for this file.
2823 */
2824 off = pp->p_offset & ~(offset_t)lbmask;
2825 /* block align it */
2826
2827
2828 if (ip->i_desc_type == ICB_FLAG_ONE_AD) {
2829 ASSERT(ip->i_size <= ip->i_max_emb);
2830
2831 pp = pvn_write_kluster(vp, pp, &io_off,
2832 &io_len, off, PAGESIZE, flags);
2833 if (io_len == 0) {
2834 io_len = PAGESIZE;
2835 }
2836
2837 bp = ud_bread(ip->i_dev,
2838 ip->i_icb_lbano << udf_vfsp->udf_l2d_shift,
2839 udf_vfsp->udf_lbsize);
2840 fe = (struct file_entry *)bp->b_un.b_addr;
2841 if ((bp->b_flags & B_ERROR) ||
2842 (ud_verify_tag_and_desc(&fe->fe_tag, UD_FILE_ENTRY,
2843 ip->i_icb_block,
2844 1, udf_vfsp->udf_lbsize) != 0)) {
2845 if (pp != NULL)
2846 pvn_write_done(pp, B_ERROR | B_WRITE | flags);
2847 if (bp->b_flags & B_ERROR) {
2848 error = EIO;
2849 } else {
2850 error = EINVAL;
2851 }
2852 brelse(bp);
2853 return (error);
2854 }
2855 if ((bp->b_error == 0) &&
2856 (bp->b_resid == 0)) {
2857
2858 caddr = bp->b_un.b_addr + ip->i_data_off;
2859 kaddr = (caddr_t)ppmapin(pp,
2860 PROT_READ | PROT_WRITE, (caddr_t)-1);
2861 (void) kcopy(kaddr, caddr, ip->i_size);
2862 ppmapout(kaddr);
2863 }
2864 crc_len = offsetof(struct file_entry, fe_spec) +
2865 SWAP_32(fe->fe_len_ear);
2866 crc_len += ip->i_size;
2867 ud_make_tag(ip->i_udf, &fe->fe_tag,
2868 UD_FILE_ENTRY, ip->i_icb_block, crc_len);
2869
2870 bwrite(bp);
2871
2872 if (flags & B_ASYNC) {
2873 pvn_write_done(pp, flags);
2874 }
2875 contig = ip->i_size;
2876 } else {
2877
2878 if (error = ud_bmap_read(ip, off, &bn, &contig)) {
2879 goto out;
2880 }
2881 contig = MIN(contig, PAGESIZE);
2882 contig = (contig + lbmask) & (~lbmask);
2883
2884 if (contig < PAGESIZE) {
2885 uint64_t count;
2886
2887 count = isize - off;
2888 if (contig != count) {
2889 multi_io = 1;
2890 contig = (int32_t)(MIN(count, PAGESIZE));
2891 }
2892 }
2893
2894 if ((off + contig) > isize) {
2895 contig = isize - off;
2896 }
2897
2898 if (contig > PAGESIZE) {
2899 if (contig & PAGEOFFSET) {
2900 contig &= PAGEMASK;
2901 }
2902 }
2903
2904 pp = pvn_write_kluster(vp, pp, &io_off,
2905 &io_len, off, contig, flags);
2906 if (io_len == 0) {
2907 io_len = PAGESIZE;
2908 }
2909
2910 bp = pageio_setup(pp, contig, ip->i_devvp, B_WRITE | flags);
2911 ASSERT(bp != NULL);
2912
2913 bp->b_edev = ip->i_dev;
2914 bp->b_dev = cmpdev(ip->i_dev);
2915 bp->b_blkno = bn;
2916 bp->b_un.b_addr = 0;
2917 bp->b_file = vp;
2918 bp->b_offset = (offset_t)off;
2919
2920
2921 /*
2922 * write throttle
2923 */
2924 ASSERT(bp->b_iodone == NULL);
2925 bp->b_iodone = ud_iodone;
2926 mutex_enter(&ip->i_tlock);
2927 ip->i_writes += bp->b_bcount;
2928 mutex_exit(&ip->i_tlock);
2929
2930 if (multi_io == 0) {
2931
2932 (void) bdev_strategy(bp);
2933 } else {
2934 error = ud_multi_strat(ip, pp, bp, off);
2935 if (error != 0) {
2936 goto out;
2937 }
2938 }
2939
2940 if ((flags & B_ASYNC) == 0) {
2941 /*
2942 * Wait for i/o to complete.
2943 */
2944 error = biowait(bp);
2945 pageio_done(bp);
2946 }
2947 }
2948
2949 if ((flags & B_ASYNC) == 0) {
2950 pvn_write_done(pp, ((error) ? B_ERROR : 0) | B_WRITE | flags);
2951 }
2952
2953 pp = NULL;
2954
2955 out:
2956 if (error != 0 && pp != NULL) {
2957 pvn_write_done(pp, B_ERROR | B_WRITE | flags);
2958 }
2959
2960 if (offp) {
2961 *offp = io_off;
2962 }
2963 if (lenp) {
2964 *lenp = io_len;
2965 }
2966
2967 return (error);
2968 }
2969
2970
2971 int32_t
2972 ud_iodone(struct buf *bp)
2973 {
2974 struct ud_inode *ip;
2975
2976 ASSERT((bp->b_pages->p_vnode != NULL) && !(bp->b_flags & B_READ));
2977
2978 bp->b_iodone = NULL;
2979
2980 ip = VTOI(bp->b_pages->p_vnode);
2981
2982 mutex_enter(&ip->i_tlock);
2983 if (ip->i_writes >= ud_LW) {
2984 if ((ip->i_writes -= bp->b_bcount) <= ud_LW) {
2985 if (ud_WRITES) {
2986 cv_broadcast(&ip->i_wrcv); /* wake all up */
2987 }
2988 }
2989 } else {
2990 ip->i_writes -= bp->b_bcount;
2991 }
2992 mutex_exit(&ip->i_tlock);
2993 iodone(bp);
2994 return (0);
2995 }
2996
2997 /* ARGSUSED3 */
2998 int32_t
2999 ud_rdip(struct ud_inode *ip, struct uio *uio, int32_t ioflag, cred_t *cr)
3000 {
3001 struct vnode *vp;
3002 struct udf_vfs *udf_vfsp;
3003 krw_t rwtype;
3004 caddr_t base;
3005 uint32_t flags;
3006 int32_t error, n, on, mapon, dofree;
3007 u_offset_t off;
3008 long oresid = uio->uio_resid;
3009
3010 ASSERT(RW_LOCK_HELD(&ip->i_contents));
3011 if ((ip->i_type != VREG) &&
3012 (ip->i_type != VDIR) &&
3013 (ip->i_type != VLNK)) {
3014 return (EIO);
3015 }
3016
3017 if (uio->uio_loffset > MAXOFFSET_T) {
3018 return (0);
3019 }
3020
3021 if ((uio->uio_loffset < (offset_t)0) ||
3022 ((uio->uio_loffset + uio->uio_resid) < 0)) {
3023 return (EINVAL);
3024 }
3025 if (uio->uio_resid == 0) {
3026 return (0);
3027 }
3028
3029 vp = ITOV(ip);
3030 udf_vfsp = ip->i_udf;
3031 mutex_enter(&ip->i_tlock);
3032 ip->i_flag |= IACC;
3033 mutex_exit(&ip->i_tlock);
3034
3035 rwtype = (rw_write_held(&ip->i_contents)?RW_WRITER:RW_READER);
3036
3037 do {
3038 offset_t diff;
3039 u_offset_t uoff = uio->uio_loffset;
3040 off = uoff & (offset_t)MAXBMASK;
3041 mapon = (int)(uoff & (offset_t)MAXBOFFSET);
3042 on = (int)blkoff(udf_vfsp, uoff);
3043 n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);
3044
3045 diff = ip->i_size - uoff;
3046
3047 if (diff <= (offset_t)0) {
3048 error = 0;
3049 goto out;
3050 }
3051 if (diff < (offset_t)n) {
3052 n = (int)diff;
3053 }
3054 dofree = ud_freebehind &&
3055 ip->i_nextr == (off & PAGEMASK) &&
3056 off > ud_smallfile;
3057
3058 #ifndef __lock_lint
3059 if (rwtype == RW_READER) {
3060 rw_exit(&ip->i_contents);
3061 }
3062 #endif
3063
3064 base = segmap_getmapflt(segkmap, vp, (off + mapon),
3065 (uint32_t)n, 1, S_READ);
3066 error = uiomove(base + mapon, (long)n, UIO_READ, uio);
3067
3068 flags = 0;
3069 if (!error) {
3070 /*
3071 * If read a whole block, or read to eof,
3072 * won't need this buffer again soon.
3073 */
3074 if (n + on == MAXBSIZE && ud_freebehind && dofree &&
3075 freemem < lotsfree + pages_before_pager) {
3076 flags = SM_FREE | SM_DONTNEED |SM_ASYNC;
3077 }
3078 /*
3079 * In POSIX SYNC (FSYNC and FDSYNC) read mode,
3080 * we want to make sure that the page which has
3081 * been read, is written on disk if it is dirty.
3082 * And corresponding indirect blocks should also
3083 * be flushed out.
3084 */
3085 if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) {
3086 flags &= ~SM_ASYNC;
3087 flags |= SM_WRITE;
3088 }
3089 error = segmap_release(segkmap, base, flags);
3090 } else {
3091 (void) segmap_release(segkmap, base, flags);
3092 }
3093
3094 #ifndef __lock_lint
3095 if (rwtype == RW_READER) {
3096 rw_enter(&ip->i_contents, rwtype);
3097 }
3098 #endif
3099 } while (error == 0 && uio->uio_resid > 0 && n != 0);
3100 out:
3101 /*
3102 * Inode is updated according to this table if FRSYNC is set.
3103 *
3104 * FSYNC FDSYNC(posix.4)
3105 * --------------------------
3106 * always IATTCHG|IBDWRITE
3107 */
3108 if (ioflag & FRSYNC) {
3109 if ((ioflag & FSYNC) ||
3110 ((ioflag & FDSYNC) &&
3111 (ip->i_flag & (IATTCHG|IBDWRITE)))) {
3112 rw_exit(&ip->i_contents);
3113 rw_enter(&ip->i_contents, RW_WRITER);
3114 ud_iupdat(ip, 1);
3115 }
3116 }
3117 /*
3118 * If we've already done a partial read, terminate
3119 * the read but return no error.
3120 */
3121 if (oresid != uio->uio_resid) {
3122 error = 0;
3123 }
3124 ITIMES(ip);
3125
3126 return (error);
3127 }
3128
3129 int32_t
3130 ud_wrip(struct ud_inode *ip, struct uio *uio, int ioflag, struct cred *cr)
3131 {
3132 caddr_t base;
3133 struct vnode *vp;
3134 struct udf_vfs *udf_vfsp;
3135 uint32_t flags;
3136 int32_t error = 0, iupdat_flag, n, on, mapon, i_size_changed = 0;
3137 int32_t pagecreate, newpage;
3138 uint64_t old_i_size;
3139 u_offset_t off;
3140 long start_resid = uio->uio_resid, premove_resid;
3141 rlim64_t limit = uio->uio_limit;
3142
3143
3144 ASSERT(RW_WRITE_HELD(&ip->i_contents));
3145 if ((ip->i_type != VREG) &&
3146 (ip->i_type != VDIR) &&
3147 (ip->i_type != VLNK)) {
3148 return (EIO);
3149 }
3150
3151 if (uio->uio_loffset >= MAXOFFSET_T) {
3152 return (EFBIG);
3153 }
3154 /*
3155 * see udf_l_pathconf
3156 */
3157 if (limit > (((uint64_t)1 << 40) - 1)) {
3158 limit = ((uint64_t)1 << 40) - 1;
3159 }
3160 if (uio->uio_loffset >= limit) {
3161 proc_t *p = ttoproc(curthread);
3162
3163 mutex_enter(&p->p_lock);
3164 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
3165 p, RCA_UNSAFE_SIGINFO);
3166 mutex_exit(&p->p_lock);
3167 return (EFBIG);
3168 }
3169 if ((uio->uio_loffset < (offset_t)0) ||
3170 ((uio->uio_loffset + uio->uio_resid) < 0)) {
3171 return (EINVAL);
3172 }
3173 if (uio->uio_resid == 0) {
3174 return (0);
3175 }
3176
3177 mutex_enter(&ip->i_tlock);
3178 ip->i_flag |= INOACC;
3179
3180 if (ioflag & (FSYNC | FDSYNC)) {
3181 ip->i_flag |= ISYNC;
3182 iupdat_flag = 1;
3183 }
3184 mutex_exit(&ip->i_tlock);
3185
3186 udf_vfsp = ip->i_udf;
3187 vp = ITOV(ip);
3188
3189 do {
3190 u_offset_t uoff = uio->uio_loffset;
3191 off = uoff & (offset_t)MAXBMASK;
3192 mapon = (int)(uoff & (offset_t)MAXBOFFSET);
3193 on = (int)blkoff(udf_vfsp, uoff);
3194 n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);
3195
3196 if (ip->i_type == VREG && uoff + n >= limit) {
3197 if (uoff >= limit) {
3198 error = EFBIG;
3199 goto out;
3200 }
3201 n = (int)(limit - (rlim64_t)uoff);
3202 }
3203 if (uoff + n > ip->i_size) {
3204 /*
3205 * We are extending the length of the file.
3206 * bmap is used so that we are sure that
3207 * if we need to allocate new blocks, that it
3208 * is done here before we up the file size.
3209 */
3210 error = ud_bmap_write(ip, uoff,
3211 (int)(on + n), mapon == 0, cr);
3212 if (error) {
3213 break;
3214 }
3215 i_size_changed = 1;
3216 old_i_size = ip->i_size;
3217 ip->i_size = uoff + n;
3218 /*
3219 * If we are writing from the beginning of
3220 * the mapping, we can just create the
3221 * pages without having to read them.
3222 */
3223 pagecreate = (mapon == 0);
3224 } else if (n == MAXBSIZE) {
3225 /*
3226 * Going to do a whole mappings worth,
3227 * so we can just create the pages w/o
3228 * having to read them in. But before
3229 * we do that, we need to make sure any
3230 * needed blocks are allocated first.
3231 */
3232 error = ud_bmap_write(ip, uoff,
3233 (int)(on + n), 1, cr);
3234 if (error) {
3235 break;
3236 }
3237 pagecreate = 1;
3238 } else {
3239 pagecreate = 0;
3240 }
3241
3242 rw_exit(&ip->i_contents);
3243
3244 /*
3245 * Touch the page and fault it in if it is not in
3246 * core before segmap_getmapflt can lock it. This
3247 * is to avoid the deadlock if the buffer is mapped
3248 * to the same file through mmap which we want to
3249 * write to.
3250 */
3251 uio_prefaultpages((long)n, uio);
3252
3253 base = segmap_getmapflt(segkmap, vp, (off + mapon),
3254 (uint32_t)n, !pagecreate, S_WRITE);
3255
3256 /*
3257 * segmap_pagecreate() returns 1 if it calls
3258 * page_create_va() to allocate any pages.
3259 */
3260 newpage = 0;
3261 if (pagecreate) {
3262 newpage = segmap_pagecreate(segkmap, base,
3263 (size_t)n, 0);
3264 }
3265
3266 premove_resid = uio->uio_resid;
3267 error = uiomove(base + mapon, (long)n, UIO_WRITE, uio);
3268
3269 if (pagecreate &&
3270 uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) {
3271 /*
3272 * We created pages w/o initializing them completely,
3273 * thus we need to zero the part that wasn't set up.
3274 * This happens on most EOF write cases and if
3275 * we had some sort of error during the uiomove.
3276 */
3277 int nzero, nmoved;
3278
3279 nmoved = (int)(uio->uio_loffset - (off + mapon));
3280 ASSERT(nmoved >= 0 && nmoved <= n);
3281 nzero = roundup(on + n, PAGESIZE) - nmoved;
3282 ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE);
3283 (void) kzero(base + mapon + nmoved, (uint32_t)nzero);
3284 }
3285
3286 /*
3287 * Unlock the pages allocated by page_create_va()
3288 * in segmap_pagecreate()
3289 */
3290 if (newpage) {
3291 segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE);
3292 }
3293
3294 if (error) {
3295 /*
3296 * If we failed on a write, we may have already
3297 * allocated file blocks as well as pages. It's
3298 * hard to undo the block allocation, but we must
3299 * be sure to invalidate any pages that may have
3300 * been allocated.
3301 */
3302 (void) segmap_release(segkmap, base, SM_INVAL);
3303 } else {
3304 flags = 0;
3305 /*
3306 * Force write back for synchronous write cases.
3307 */
3308 if ((ioflag & (FSYNC|FDSYNC)) || ip->i_type == VDIR) {
3309 /*
3310 * If the sticky bit is set but the
3311 * execute bit is not set, we do a
3312 * synchronous write back and free
3313 * the page when done. We set up swap
3314 * files to be handled this way to
3315 * prevent servers from keeping around
3316 * the client's swap pages too long.
3317 * XXX - there ought to be a better way.
3318 */
3319 if (IS_SWAPVP(vp)) {
3320 flags = SM_WRITE | SM_FREE |
3321 SM_DONTNEED;
3322 iupdat_flag = 0;
3323 } else {
3324 flags = SM_WRITE;
3325 }
3326 } else if (((mapon + n) == MAXBSIZE) ||
3327 IS_SWAPVP(vp)) {
3328 /*
3329 * Have written a whole block.
3330 * Start an asynchronous write and
3331 * mark the buffer to indicate that
3332 * it won't be needed again soon.
3333 */
3334 flags = SM_WRITE |SM_ASYNC | SM_DONTNEED;
3335 }
3336 error = segmap_release(segkmap, base, flags);
3337
3338 /*
3339 * If the operation failed and is synchronous,
3340 * then we need to unwind what uiomove() last
3341 * did so we can potentially return an error to
3342 * the caller. If this write operation was
3343 * done in two pieces and the first succeeded,
3344 * then we won't return an error for the second
3345 * piece that failed. However, we only want to
3346 * return a resid value that reflects what was
3347 * really done.
3348 *
3349 * Failures for non-synchronous operations can
3350 * be ignored since the page subsystem will
3351 * retry the operation until it succeeds or the
3352 * file system is unmounted.
3353 */
3354 if (error) {
3355 if ((ioflag & (FSYNC | FDSYNC)) ||
3356 ip->i_type == VDIR) {
3357 uio->uio_resid = premove_resid;
3358 } else {
3359 error = 0;
3360 }
3361 }
3362 }
3363
3364 /*
3365 * Re-acquire contents lock.
3366 */
3367 rw_enter(&ip->i_contents, RW_WRITER);
3368 /*
3369 * If the uiomove() failed or if a synchronous
3370 * page push failed, fix up i_size.
3371 */
3372 if (error) {
3373 if (i_size_changed) {
3374 /*
3375 * The uiomove failed, and we
3376 * allocated blocks,so get rid
3377 * of them.
3378 */
3379 (void) ud_itrunc(ip, old_i_size, 0, cr);
3380 }
3381 } else {
3382 /*
3383 * XXX - Can this be out of the loop?
3384 */
3385 ip->i_flag |= IUPD | ICHG;
3386 if (i_size_changed) {
3387 ip->i_flag |= IATTCHG;
3388 }
3389 if ((ip->i_perm & (IEXEC | (IEXEC >> 5) |
3390 (IEXEC >> 10))) != 0 &&
3391 (ip->i_char & (ISUID | ISGID)) != 0 &&
3392 secpolicy_vnode_setid_retain(cr,
3393 (ip->i_char & ISUID) != 0 && ip->i_uid == 0) != 0) {
3394 /*
3395 * Clear Set-UID & Set-GID bits on
3396 * successful write if not privileged
3397 * and at least one of the execute bits
3398 * is set. If we always clear Set-GID,
3399 * mandatory file and record locking is
3400 * unuseable.
3401 */
3402 ip->i_char &= ~(ISUID | ISGID);
3403 }
3404 }
3405 } while (error == 0 && uio->uio_resid > 0 && n != 0);
3406
3407 out:
3408 /*
3409 * Inode is updated according to this table -
3410 *
3411 * FSYNC FDSYNC(posix.4)
3412 * --------------------------
3413 * always@ IATTCHG|IBDWRITE
3414 *
3415 * @ - If we are doing synchronous write the only time we should
3416 * not be sync'ing the ip here is if we have the stickyhack
3417 * activated, the file is marked with the sticky bit and
3418 * no exec bit, the file length has not been changed and
3419 * no new blocks have been allocated during this write.
3420 */
3421 if ((ip->i_flag & ISYNC) != 0) {
3422 /*
3423 * we have eliminated nosync
3424 */
3425 if ((ip->i_flag & (IATTCHG|IBDWRITE)) ||
3426 ((ioflag & FSYNC) && iupdat_flag)) {
3427 ud_iupdat(ip, 1);
3428 }
3429 }
3430
3431 /*
3432 * If we've already done a partial-write, terminate
3433 * the write but return no error.
3434 */
3435 if (start_resid != uio->uio_resid) {
3436 error = 0;
3437 }
3438 ip->i_flag &= ~(INOACC | ISYNC);
3439 ITIMES_NOLOCK(ip);
3440
3441 return (error);
3442 }
3443
3444 int32_t
3445 ud_multi_strat(struct ud_inode *ip,
3446 page_t *pp, struct buf *bp, u_offset_t start)
3447 {
3448 daddr_t bn;
3449 int32_t error = 0, io_count, contig, alloc_sz, i;
3450 uint32_t io_off;
3451 mio_master_t *mm = NULL;
3452 mio_slave_t *ms = NULL;
3453 struct buf *rbp;
3454
3455 ASSERT(!(start & PAGEOFFSET));
3456
3457 /*
3458 * Figure out how many buffers to allocate
3459 */
3460 io_count = 0;
3461 for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
3462 contig = 0;
3463 if (error = ud_bmap_read(ip, (u_offset_t)(start + io_off),
3464 &bn, &contig)) {
3465 goto end;
3466 }
3467 if (contig == 0) {
3468 goto end;
3469 }
3470 contig = MIN(contig, PAGESIZE - io_off);
3471 if (bn != UDF_HOLE) {
3472 io_count ++;
3473 } else {
3474 /*
3475 * HOLE
3476 */
3477 if (bp->b_flags & B_READ) {
3478
3479 /*
3480 * This is a hole and is read
3481 * it should be filled with 0's
3482 */
3483 pagezero(pp, io_off, contig);
3484 }
3485 }
3486 }
3487
3488
3489 if (io_count != 0) {
3490
3491 /*
3492 * Allocate memory for all the
3493 * required number of buffers
3494 */
3495 alloc_sz = sizeof (mio_master_t) +
3496 (sizeof (mio_slave_t) * io_count);
3497 mm = (mio_master_t *)kmem_zalloc(alloc_sz, KM_SLEEP);
3498 if (mm == NULL) {
3499 error = ENOMEM;
3500 goto end;
3501 }
3502
3503 /*
3504 * initialize master
3505 */
3506 mutex_init(&mm->mm_mutex, NULL, MUTEX_DEFAULT, NULL);
3507 mm->mm_size = alloc_sz;
3508 mm->mm_bp = bp;
3509 mm->mm_resid = 0;
3510 mm->mm_error = 0;
3511 mm->mm_index = master_index++;
3512
3513 ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));
3514
3515 /*
3516 * Initialize buffers
3517 */
3518 io_count = 0;
3519 for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
3520 contig = 0;
3521 if (error = ud_bmap_read(ip,
3522 (u_offset_t)(start + io_off),
3523 &bn, &contig)) {
3524 goto end;
3525 }
3526 ASSERT(contig);
3527 if ((io_off + contig) > bp->b_bcount) {
3528 contig = bp->b_bcount - io_off;
3529 }
3530 if (bn != UDF_HOLE) {
3531 /*
3532 * Clone the buffer
3533 * and prepare to start I/O
3534 */
3535 ms->ms_ptr = mm;
3536 bioinit(&ms->ms_buf);
3537 rbp = bioclone(bp, io_off, (size_t)contig,
3538 bp->b_edev, bn, ud_slave_done,
3539 &ms->ms_buf, KM_NOSLEEP);
3540 ASSERT(rbp == &ms->ms_buf);
3541 mm->mm_resid += contig;
3542 io_count++;
3543 ms ++;
3544 }
3545 }
3546
3547 /*
3548 * Start I/O's
3549 */
3550 ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));
3551 for (i = 0; i < io_count; i++) {
3552 (void) bdev_strategy(&ms->ms_buf);
3553 ms ++;
3554 }
3555 }
3556
3557 end:
3558 if (error != 0) {
3559 bp->b_flags |= B_ERROR;
3560 bp->b_error = error;
3561 if (mm != NULL) {
3562 mutex_destroy(&mm->mm_mutex);
3563 kmem_free(mm, mm->mm_size);
3564 }
3565 }
3566 return (error);
3567 }
3568
3569 int32_t
3570 ud_slave_done(struct buf *bp)
3571 {
3572 mio_master_t *mm;
3573 int32_t resid;
3574
3575 ASSERT(SEMA_HELD(&bp->b_sem));
3576 ASSERT((bp->b_flags & B_DONE) == 0);
3577
3578 mm = ((mio_slave_t *)bp)->ms_ptr;
3579
3580 /*
3581 * Propagate error and byte count info from slave struct to
3582 * the master struct
3583 */
3584 mutex_enter(&mm->mm_mutex);
3585 if (bp->b_flags & B_ERROR) {
3586
3587 /*
3588 * If multiple slave buffers get
3589 * error we forget the old errors
3590 * this is ok because we any way
3591 * cannot return multiple errors
3592 */
3593 mm->mm_error = bp->b_error;
3594 }
3595 mm->mm_resid -= bp->b_bcount;
3596 resid = mm->mm_resid;
3597 mutex_exit(&mm->mm_mutex);
3598
3599 /*
3600 * free up the resources allocated to cloned buffers.
3601 */
3602 bp_mapout(bp);
3603 biofini(bp);
3604
3605 if (resid == 0) {
3606
3607 /*
3608 * This is the last I/O operation
3609 * clean up and return the original buffer
3610 */
3611 if (mm->mm_error) {
3612 mm->mm_bp->b_flags |= B_ERROR;
3613 mm->mm_bp->b_error = mm->mm_error;
3614 }
3615 biodone(mm->mm_bp);
3616 mutex_destroy(&mm->mm_mutex);
3617 kmem_free(mm, mm->mm_size);
3618 }
3619 return (0);
3620 }