1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 /*
27 * Copyright 2015, Joyent, Inc.
28 */
29
30 #include <sys/types.h>
31 #include <sys/t_lock.h>
32 #include <sys/param.h>
33 #include <sys/time.h>
34 #include <sys/systm.h>
35 #include <sys/sysmacros.h>
36 #include <sys/resource.h>
37 #include <sys/signal.h>
38 #include <sys/cred.h>
39 #include <sys/user.h>
40 #include <sys/buf.h>
41 #include <sys/vfs.h>
42 #include <sys/vfs_opreg.h>
43 #include <sys/stat.h>
44 #include <sys/vnode.h>
45 #include <sys/mode.h>
46 #include <sys/proc.h>
47 #include <sys/disp.h>
48 #include <sys/file.h>
49 #include <sys/fcntl.h>
50 #include <sys/flock.h>
51 #include <sys/kmem.h>
52 #include <sys/uio.h>
53 #include <sys/dnlc.h>
54 #include <sys/conf.h>
55 #include <sys/errno.h>
56 #include <sys/mman.h>
57 #include <sys/fbuf.h>
58 #include <sys/pathname.h>
59 #include <sys/debug.h>
60 #include <sys/vmsystm.h>
61 #include <sys/cmn_err.h>
62 #include <sys/dirent.h>
63 #include <sys/errno.h>
64 #include <sys/modctl.h>
65 #include <sys/statvfs.h>
66 #include <sys/mount.h>
67 #include <sys/sunddi.h>
68 #include <sys/bootconf.h>
69 #include <sys/policy.h>
70
71 #include <vm/hat.h>
72 #include <vm/page.h>
73 #include <vm/pvn.h>
74 #include <vm/as.h>
75 #include <vm/seg.h>
76 #include <vm/seg_map.h>
77 #include <vm/seg_kmem.h>
78 #include <vm/seg_vn.h>
79 #include <vm/rm.h>
80 #include <vm/page.h>
81 #include <sys/swap.h>
82
83 #include <fs/fs_subr.h>
84
85 #include <sys/fs/udf_volume.h>
86 #include <sys/fs/udf_inode.h>
87
/*
 * Forward declarations of the file-local vnode operations.  Each of these
 * routines is installed in udf_vnodeops_template[] further down in this
 * file.
 */
static int32_t udf_open(struct vnode **,
	int32_t, struct cred *, caller_context_t *);
static int32_t udf_close(struct vnode *,
	int32_t, int32_t, offset_t, struct cred *, caller_context_t *);
static int32_t udf_read(struct vnode *,
	struct uio *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_write(struct vnode *,
	struct uio *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_ioctl(struct vnode *,
	int32_t, intptr_t, int32_t, struct cred *, int32_t *,
	caller_context_t *);
static int32_t udf_getattr(struct vnode *,
	struct vattr *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_setattr(struct vnode *,
	struct vattr *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_access(struct vnode *,
	int32_t, int32_t, struct cred *, caller_context_t *);
static int32_t udf_lookup(struct vnode *,
	char *, struct vnode **, struct pathname *,
	int32_t, struct vnode *, struct cred *,
	caller_context_t *, int *, pathname_t *);
static int32_t udf_create(struct vnode *,
	char *, struct vattr *, enum vcexcl,
	int32_t, struct vnode **, struct cred *, int32_t,
	caller_context_t *, vsecattr_t *);
static int32_t udf_remove(struct vnode *,
	char *, struct cred *, caller_context_t *, int);
static int32_t udf_link(struct vnode *,
	struct vnode *, char *, struct cred *, caller_context_t *, int);
static int32_t udf_rename(struct vnode *,
	char *, struct vnode *, char *, struct cred *, caller_context_t *, int);
static int32_t udf_mkdir(struct vnode *,
	char *, struct vattr *, struct vnode **, struct cred *,
	caller_context_t *, int, vsecattr_t *);
static int32_t udf_rmdir(struct vnode *,
	char *, struct vnode *, struct cred *, caller_context_t *, int);
static int32_t udf_readdir(struct vnode *,
	struct uio *, struct cred *, int32_t *, caller_context_t *, int);
static int32_t udf_symlink(struct vnode *,
	char *, struct vattr *, char *, struct cred *, caller_context_t *, int);
static int32_t udf_readlink(struct vnode *,
	struct uio *, struct cred *, caller_context_t *);
static int32_t udf_fsync(struct vnode *,
	int32_t, struct cred *, caller_context_t *);
static void udf_inactive(struct vnode *,
	struct cred *, caller_context_t *);
static int32_t udf_fid(struct vnode *, struct fid *, caller_context_t *);
static int udf_rwlock(struct vnode *, int32_t, caller_context_t *);
static void udf_rwunlock(struct vnode *, int32_t, caller_context_t *);
static int32_t udf_seek(struct vnode *, offset_t, offset_t *,
	caller_context_t *);
static int32_t udf_frlock(struct vnode *, int32_t,
	struct flock64 *, int32_t, offset_t, struct flk_callback *, cred_t *,
	caller_context_t *);
static int32_t udf_space(struct vnode *, int32_t,
	struct flock64 *, int32_t, offset_t, cred_t *, caller_context_t *);
static int32_t udf_getpage(struct vnode *, offset_t,
	size_t, uint32_t *, struct page **, size_t,
	struct seg *, caddr_t, enum seg_rw, struct cred *, caller_context_t *);
static int32_t udf_putpage(struct vnode *, offset_t,
	size_t, int32_t, struct cred *, caller_context_t *);
static int32_t udf_map(struct vnode *, offset_t, struct as *,
	caddr_t *, size_t, uint8_t, uint8_t, uint32_t, struct cred *,
	caller_context_t *);
static int32_t udf_addmap(struct vnode *, offset_t, struct as *,
	caddr_t, size_t, uint8_t, uint8_t, uint32_t, struct cred *,
	caller_context_t *);
static int32_t udf_delmap(struct vnode *, offset_t, struct as *,
	caddr_t, size_t, uint32_t, uint32_t, uint32_t, struct cred *,
	caller_context_t *);
static int32_t udf_l_pathconf(struct vnode *, int32_t,
	ulong_t *, struct cred *, caller_context_t *);
static int32_t udf_pageio(struct vnode *, struct page *,
	u_offset_t, size_t, int32_t, struct cred *, caller_context_t *);
162
/*
 * Prototypes for non-static ud_* helper routines.  Of these,
 * ud_putpages(), ud_rdip() and ud_wrip() are called directly from
 * udf_close(), udf_read() and udf_write() respectively.
 */
int32_t ud_getpage_miss(struct vnode *, u_offset_t,
	size_t, struct seg *, caddr_t, page_t *pl[],
	size_t, enum seg_rw, int32_t);
void ud_getpage_ra(struct vnode *, u_offset_t, struct seg *, caddr_t);
int32_t ud_putpages(struct vnode *, offset_t, size_t, int32_t, struct cred *);
int32_t ud_page_fill(struct ud_inode *, page_t *,
	u_offset_t, uint32_t, u_offset_t *);
int32_t ud_iodone(struct buf *);
int32_t ud_rdip(struct ud_inode *, struct uio *, int32_t, cred_t *);
int32_t ud_wrip(struct ud_inode *, struct uio *, int32_t, cred_t *);
int32_t ud_multi_strat(struct ud_inode *, page_t *, struct buf *, u_offset_t);
int32_t ud_slave_done(struct buf *);
175
/*
 * Structures to control multiple IO operations to get or put pages
 * that are backed by discontiguous blocks. The master struct is
 * a dummy that holds the original bp from pageio_setup. The
 * slave struct holds the working bp's to do the actual IO. Once
 * all the slave IOs complete, the master is processed as if a single
 * IO op has completed.
 */
/*
 * NOTE(review): master_index is not referenced in this part of the file;
 * presumably it is the source of mio_master_t.mm_index -- confirm.
 */
uint32_t master_index = 0;

/*
 * Bookkeeping for one original buf whose transfer has been split into
 * several slave bufs.
 */
typedef struct mio_master {
	kmutex_t	mm_mutex;	/* protect the fields below */
	int32_t		mm_size;	/* NOTE(review): presumably the allocation size -- confirm */
	buf_t		*mm_bp;		/* original bp */
	int32_t		mm_resid;	/* bytes remaining to transfer */
	int32_t		mm_error;	/* accumulated error from slaves */
	int32_t		mm_index;	/* XXX debugging */
} mio_master_t;

/*
 * One slave IO: an embedded working buf plus a back pointer to the
 * master it belongs to.
 */
typedef struct mio_slave {
	buf_t		ms_buf;		/* working buffer for this IO chunk */
	mio_master_t	*ms_ptr;	/* pointer to master */
} mio_slave_t;
198
/*
 * Vnode operations vector for UDF vnodes.
 * NOTE(review): it is not assigned in this part of the file; presumably
 * it is built from udf_vnodeops_template[] when the file system is
 * initialized -- confirm.
 */
struct vnodeops *udf_vnodeops;

/*
 * Table pairing each VOPNAME_* operation name with the routine that
 * implements it.  All entries point at udf_* routines declared in this
 * file except VOPNAME_VNEVENT, which uses fs_vnevent_support.  The
 * trailing NULL, NULL pair terminates the table.
 */
const fs_operation_def_t udf_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = udf_open },
	VOPNAME_CLOSE,		{ .vop_close = udf_close },
	VOPNAME_READ,		{ .vop_read = udf_read },
	VOPNAME_WRITE,		{ .vop_write = udf_write },
	VOPNAME_IOCTL,		{ .vop_ioctl = udf_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = udf_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = udf_setattr },
	VOPNAME_ACCESS,		{ .vop_access = udf_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = udf_lookup },
	VOPNAME_CREATE,		{ .vop_create = udf_create },
	VOPNAME_REMOVE,		{ .vop_remove = udf_remove },
	VOPNAME_LINK,		{ .vop_link = udf_link },
	VOPNAME_RENAME,		{ .vop_rename = udf_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = udf_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = udf_rmdir },
	VOPNAME_READDIR,	{ .vop_readdir = udf_readdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = udf_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = udf_readlink },
	VOPNAME_FSYNC,		{ .vop_fsync = udf_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = udf_inactive },
	VOPNAME_FID,		{ .vop_fid = udf_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = udf_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = udf_rwunlock },
	VOPNAME_SEEK,		{ .vop_seek = udf_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = udf_frlock },
	VOPNAME_SPACE,		{ .vop_space = udf_space },
	VOPNAME_GETPAGE,	{ .vop_getpage = udf_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = udf_putpage },
	VOPNAME_MAP,		{ .vop_map = udf_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = udf_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = udf_delmap },
	VOPNAME_PATHCONF,	{ .vop_pathconf = udf_l_pathconf },
	VOPNAME_PAGEIO,		{ .vop_pageio = udf_pageio },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL,			NULL
};
238
239 /* ARGSUSED */
240 static int32_t
241 udf_open(
242 struct vnode **vpp,
243 int32_t flag,
244 struct cred *cr,
245 caller_context_t *ct)
246 {
247 ud_printf("udf_open\n");
248
249 return (0);
250 }
251
252 /* ARGSUSED */
253 static int32_t
254 udf_close(
255 struct vnode *vp,
256 int32_t flag,
257 int32_t count,
258 offset_t offset,
259 struct cred *cr,
260 caller_context_t *ct)
261 {
262 struct ud_inode *ip = VTOI(vp);
263
264 ud_printf("udf_close\n");
265
266 ITIMES(ip);
267
268 cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
269 cleanshares(vp, ttoproc(curthread)->p_pid);
270
271 /*
272 * Push partially filled cluster at last close.
273 * ``last close'' is approximated because the dnlc
274 * may have a hold on the vnode.
275 */
276 if (vp->v_count <= 2 && vp->v_type != VBAD) {
277 struct ud_inode *ip = VTOI(vp);
278 if (ip->i_delaylen) {
279 (void) ud_putpages(vp, ip->i_delayoff, ip->i_delaylen,
280 B_ASYNC | B_FREE, cr);
281 ip->i_delaylen = 0;
282 }
283 }
284
285 return (0);
286 }
287
288 /* ARGSUSED */
289 static int32_t
290 udf_read(
291 struct vnode *vp,
292 struct uio *uiop,
293 int32_t ioflag,
294 struct cred *cr,
295 caller_context_t *ct)
296 {
297 struct ud_inode *ip = VTOI(vp);
298 int32_t error;
299
300 ud_printf("udf_read\n");
301
302 ASSERT(RW_READ_HELD(&ip->i_rwlock));
303
304 if (MANDLOCK(vp, ip->i_char)) {
305 /*
306 * udf_getattr ends up being called by chklock
307 */
308 error = chklock(vp, FREAD, uiop->uio_loffset,
309 uiop->uio_resid, uiop->uio_fmode, ct);
310 if (error) {
311 goto end;
312 }
313 }
314
315 rw_enter(&ip->i_contents, RW_READER);
316 error = ud_rdip(ip, uiop, ioflag, cr);
317 rw_exit(&ip->i_contents);
318
319 end:
320 return (error);
321 }
322
323
/*
 * Write throttling knobs used by udf_write().
 */
int32_t ud_WRITES = 1;		/* nonzero: udf_write() throttles writers */
int32_t ud_HW = 96 * 1024;	/* writers wait while ip->i_writes exceeds this */
int32_t ud_LW = 64 * 1024;	/* NOTE(review): not used in this part of the file; presumably the wakeup threshold -- confirm */
int32_t ud_throttles = 0;	/* number of times a writer waited in udf_write() */
328
329 /* ARGSUSED */
330 static int32_t
331 udf_write(
332 struct vnode *vp,
333 struct uio *uiop,
334 int32_t ioflag,
335 struct cred *cr,
336 caller_context_t *ct)
337 {
338 struct ud_inode *ip = VTOI(vp);
339 int32_t error = 0;
340
341 ud_printf("udf_write\n");
342
343 ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
344
345 if (MANDLOCK(vp, ip->i_char)) {
346 /*
347 * ud_getattr ends up being called by chklock
348 */
349 error = chklock(vp, FWRITE, uiop->uio_loffset,
350 uiop->uio_resid, uiop->uio_fmode, ct);
351 if (error) {
352 goto end;
353 }
354 }
355 /*
356 * Throttle writes.
357 */
358 mutex_enter(&ip->i_tlock);
359 if (ud_WRITES && (ip->i_writes > ud_HW)) {
360 while (ip->i_writes > ud_HW) {
361 ud_throttles++;
362 cv_wait(&ip->i_wrcv, &ip->i_tlock);
363 }
364 }
365 mutex_exit(&ip->i_tlock);
366
367 /*
368 * Write to the file
369 */
370 rw_enter(&ip->i_contents, RW_WRITER);
371 if ((ioflag & FAPPEND) != 0 && (ip->i_type == VREG)) {
372 /*
373 * In append mode start at end of file.
374 */
375 uiop->uio_loffset = ip->i_size;
376 }
377 error = ud_wrip(ip, uiop, ioflag, cr);
378 rw_exit(&ip->i_contents);
379
380 end:
381 return (error);
382 }
383
384 /* ARGSUSED */
385 static int32_t
386 udf_ioctl(
387 struct vnode *vp,
388 int32_t cmd,
389 intptr_t arg,
390 int32_t flag,
391 struct cred *cr,
392 int32_t *rvalp,
393 caller_context_t *ct)
394 {
395 return (ENOTTY);
396 }
397
398 /* ARGSUSED */
399 static int32_t
400 udf_getattr(
401 struct vnode *vp,
402 struct vattr *vap,
403 int32_t flags,
404 struct cred *cr,
405 caller_context_t *ct)
406 {
407 struct ud_inode *ip = VTOI(vp);
408
409 ud_printf("udf_getattr\n");
410
411 if (vap->va_mask == AT_SIZE) {
412 /*
413 * for performance, if only the size is requested don't bother
414 * with anything else.
415 */
416 vap->va_size = ip->i_size;
417 return (0);
418 }
419
420 rw_enter(&ip->i_contents, RW_READER);
421
422 vap->va_type = vp->v_type;
423 vap->va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char;
424
425 vap->va_uid = ip->i_uid;
426 vap->va_gid = ip->i_gid;
427 vap->va_fsid = ip->i_dev;
428 vap->va_nodeid = ip->i_icb_lbano;
429 vap->va_nlink = ip->i_nlink;
430 vap->va_size = ip->i_size;
431 vap->va_seq = ip->i_seq;
432 if (vp->v_type == VCHR || vp->v_type == VBLK) {
433 vap->va_rdev = ip->i_rdev;
434 } else {
435 vap->va_rdev = 0;
436 }
437
438 mutex_enter(&ip->i_tlock);
439 ITIMES_NOLOCK(ip); /* mark correct time in inode */
440 vap->va_atime.tv_sec = (time_t)ip->i_atime.tv_sec;
441 vap->va_atime.tv_nsec = ip->i_atime.tv_nsec;
442 vap->va_mtime.tv_sec = (time_t)ip->i_mtime.tv_sec;
443 vap->va_mtime.tv_nsec = ip->i_mtime.tv_nsec;
444 vap->va_ctime.tv_sec = (time_t)ip->i_ctime.tv_sec;
445 vap->va_ctime.tv_nsec = ip->i_ctime.tv_nsec;
446 mutex_exit(&ip->i_tlock);
447
448 switch (ip->i_type) {
449 case VBLK:
450 vap->va_blksize = MAXBSIZE;
451 break;
452 case VCHR:
453 vap->va_blksize = MAXBSIZE;
454 break;
455 default:
456 vap->va_blksize = ip->i_udf->udf_lbsize;
457 break;
458 }
459 vap->va_nblocks = ip->i_lbr << ip->i_udf->udf_l2d_shift;
460
461 rw_exit(&ip->i_contents);
462
463 return (0);
464 }
465
/*
 * Access-check callback handed to secpolicy_vnode_setattr() by
 * udf_setattr().  Converts the vnode-style mode bits with
 * UD_UPERM2DPERM() and forwards the check to ud_iaccess() with a final
 * argument of 0.
 */
static int
ud_iaccess_vmode(void *ip, int mode, struct cred *cr)
{
	int rc;

	rc = ud_iaccess(ip, UD_UPERM2DPERM(mode), cr, 0);
	return (rc);
}
471
472 /*ARGSUSED4*/
473 static int32_t
474 udf_setattr(
475 struct vnode *vp,
476 struct vattr *vap,
477 int32_t flags,
478 struct cred *cr,
479 caller_context_t *ct)
480 {
481 int32_t error = 0;
482 uint32_t mask = vap->va_mask;
483 struct ud_inode *ip;
484 timestruc_t now;
485 struct vattr ovap;
486
487 ud_printf("udf_setattr\n");
488
489 ip = VTOI(vp);
490
491 /*
492 * not updates allowed to 4096 files
493 */
494 if (ip->i_astrat == STRAT_TYPE4096) {
495 return (EINVAL);
496 }
497
498 /*
499 * Cannot set these attributes
500 */
501 if (mask & AT_NOSET) {
502 return (EINVAL);
503 }
504
505 rw_enter(&ip->i_rwlock, RW_WRITER);
506 rw_enter(&ip->i_contents, RW_WRITER);
507
508 ovap.va_uid = ip->i_uid;
509 ovap.va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char;
510 error = secpolicy_vnode_setattr(cr, vp, vap, &ovap, flags,
511 ud_iaccess_vmode, ip);
512 if (error)
513 goto update_inode;
514
515 mask = vap->va_mask;
516 /*
517 * Change file access modes.
518 */
519 if (mask & AT_MODE) {
520 ip->i_perm = VA2UD_PERM(vap->va_mode);
521 ip->i_char = vap->va_mode & (VSUID | VSGID | VSVTX);
522 mutex_enter(&ip->i_tlock);
523 ip->i_flag |= ICHG;
524 mutex_exit(&ip->i_tlock);
525 }
526 if (mask & (AT_UID|AT_GID)) {
527 if (mask & AT_UID) {
528 ip->i_uid = vap->va_uid;
529 }
530 if (mask & AT_GID) {
531 ip->i_gid = vap->va_gid;
532 }
533 mutex_enter(&ip->i_tlock);
534 ip->i_flag |= ICHG;
535 mutex_exit(&ip->i_tlock);
536 }
537 /*
538 * Truncate file. Must have write permission and not be a directory.
539 */
540 if (mask & AT_SIZE) {
541 if (vp->v_type == VDIR) {
542 error = EISDIR;
543 goto update_inode;
544 }
545 if (error = ud_iaccess(ip, IWRITE, cr, 0)) {
546 goto update_inode;
547 }
548 if (vap->va_size > MAXOFFSET_T) {
549 error = EFBIG;
550 goto update_inode;
551 }
552 if (error = ud_itrunc(ip, vap->va_size, 0, cr)) {
553 goto update_inode;
554 }
555
556 if (vap->va_size == 0)
557 vnevent_truncate(vp, ct);
558 }
559 /*
560 * Change file access or modified times.
561 */
562 if (mask & (AT_ATIME|AT_MTIME)) {
563 mutex_enter(&ip->i_tlock);
564 if (mask & AT_ATIME) {
565 ip->i_atime.tv_sec = vap->va_atime.tv_sec;
566 ip->i_atime.tv_nsec = vap->va_atime.tv_nsec;
567 ip->i_flag &= ~IACC;
568 }
569 if (mask & AT_MTIME) {
570 ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
571 ip->i_mtime.tv_nsec = vap->va_mtime.tv_nsec;
572 gethrestime(&now);
573 ip->i_ctime.tv_sec = now.tv_sec;
574 ip->i_ctime.tv_nsec = now.tv_nsec;
575 ip->i_flag &= ~(IUPD|ICHG);
576 ip->i_flag |= IMODTIME;
577 }
578 ip->i_flag |= IMOD;
579 mutex_exit(&ip->i_tlock);
580 }
581
582 update_inode:
583 if (curthread->t_flag & T_DONTPEND) {
584 ud_iupdat(ip, 1);
585 } else {
586 ITIMES_NOLOCK(ip);
587 }
588 rw_exit(&ip->i_contents);
589 rw_exit(&ip->i_rwlock);
590
591 return (error);
592 }
593
594 /* ARGSUSED */
595 static int32_t
596 udf_access(
597 struct vnode *vp,
598 int32_t mode,
599 int32_t flags,
600 struct cred *cr,
601 caller_context_t *ct)
602 {
603 struct ud_inode *ip = VTOI(vp);
604
605 ud_printf("udf_access\n");
606
607 if (ip->i_udf == NULL) {
608 return (EIO);
609 }
610
611 return (ud_iaccess(ip, UD_UPERM2DPERM(mode), cr, 1));
612 }
613
/*
 * When nonzero, udf_lookup() sets VISSWAP on the vnode of any
 * non-directory whose inode has ISVTX set and no IEXEC permission bit.
 */
int32_t udfs_stickyhack = 1;
615
616 /* ARGSUSED */
617 static int32_t
618 udf_lookup(
619 struct vnode *dvp,
620 char *nm,
621 struct vnode **vpp,
622 struct pathname *pnp,
623 int32_t flags,
624 struct vnode *rdir,
625 struct cred *cr,
626 caller_context_t *ct,
627 int *direntflags,
628 pathname_t *realpnp)
629 {
630 int32_t error;
631 struct vnode *vp;
632 struct ud_inode *ip, *xip;
633
634 ud_printf("udf_lookup\n");
635 /*
636 * Null component name is a synonym for directory being searched.
637 */
638 if (*nm == '\0') {
639 VN_HOLD(dvp);
640 *vpp = dvp;
641 error = 0;
642 goto out;
643 }
644
645 /*
646 * Fast path: Check the directory name lookup cache.
647 */
648 ip = VTOI(dvp);
649 if (vp = dnlc_lookup(dvp, nm)) {
650 /*
651 * Check accessibility of directory.
652 */
653 if ((error = ud_iaccess(ip, IEXEC, cr, 1)) != 0) {
654 VN_RELE(vp);
655 }
656 xip = VTOI(vp);
657 } else {
658 error = ud_dirlook(ip, nm, &xip, cr, 1);
659 ITIMES(ip);
660 }
661
662 if (error == 0) {
663 ip = xip;
664 *vpp = ITOV(ip);
665 if ((ip->i_type != VDIR) &&
666 (ip->i_char & ISVTX) &&
667 ((ip->i_perm & IEXEC) == 0) &&
668 udfs_stickyhack) {
669 mutex_enter(&(*vpp)->v_lock);
670 (*vpp)->v_flag |= VISSWAP;
671 mutex_exit(&(*vpp)->v_lock);
672 }
673 ITIMES(ip);
674 /*
675 * If vnode is a device return special vnode instead.
676 */
677 if (IS_DEVVP(*vpp)) {
678 struct vnode *newvp;
679 newvp = specvp(*vpp, (*vpp)->v_rdev,
680 (*vpp)->v_type, cr);
681 VN_RELE(*vpp);
682 if (newvp == NULL) {
683 error = ENOSYS;
684 } else {
685 *vpp = newvp;
686 }
687 }
688 }
689 out:
690 return (error);
691 }
692
/*
 * Create entry point: create (or open an existing) file `name' in
 * directory dvp.
 *
 * An empty name refers to dvp itself and is treated as an existing file.
 * Otherwise ud_direnter(DE_CREATE) is called with the directory's
 * i_rwlock held as writer.
 *
 * For an existing file (EEXIST): a non-exclusive create succeeds when the
 * access check for `mode' passes (write access to a directory yields
 * EISDIR); an exclusive create fails with EEXIST.  When the caller asked
 * for a zero size on an existing regular file, the file is truncated and
 * vnevent_create() is raised.
 *
 * On success *vpp is the resulting vnode; for device nodes it is replaced
 * by the vnode from specvp(), or ENOSYS is returned if specvp() fails.
 * flag and vsecp are unused.
 */
/* ARGSUSED */
static int32_t
udf_create(
	struct vnode *dvp,
	char *name,
	struct vattr *vap,
	enum vcexcl excl,
	int32_t mode,
	struct vnode **vpp,
	struct cred *cr,
	int32_t flag,
	caller_context_t *ct,
	vsecattr_t *vsecp)
{
	int32_t error;
	struct ud_inode *ip = VTOI(dvp), *xip;

	ud_printf("udf_create\n");

	/* Strip the sticky bit when the policy check does not allow it. */
	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr) != 0)
		vap->va_mode &= ~VSVTX;

	if (*name == '\0') {
		/*
		 * Null component name refers to the directory itself.
		 */
		VN_HOLD(dvp);
		ITIMES(ip);
		error = EEXIST;
	} else {
		xip = NULL;
		rw_enter(&ip->i_rwlock, RW_WRITER);
		error = ud_direnter(ip, name, DE_CREATE,
		    (struct ud_inode *)0, (struct ud_inode *)0,
		    vap, &xip, cr, ct);
		rw_exit(&ip->i_rwlock);
		ITIMES(ip);
		/*
		 * From here on ip is the target inode (NULL if none was
		 * returned).  NOTE(review): the EEXIST handling below
		 * dereferences ip, so ud_direnter() presumably returns the
		 * existing inode, held, in xip on EEXIST -- confirm.
		 */
		ip = xip;
	}
	if (ip != NULL) {
		rw_enter(&ip->i_contents, RW_WRITER);
	}

	/*
	 * If the file already exists and this is a non-exclusive create,
	 * check permissions and allow access for non-directories.
	 * Read-only create of an existing directory is also allowed.
	 * We fail an exclusive create of anything which already exists.
	 */
	if (error == EEXIST) {
		if (excl == NONEXCL) {
			if ((ip->i_type == VDIR) && (mode & VWRITE)) {
				error = EISDIR;
			} else if (mode) {
				error = ud_iaccess(ip,
				    UD_UPERM2DPERM(mode), cr, 0);
			} else {
				error = 0;
			}
		}
		if (error) {
			/* Failure: unlock and drop the hold on the target. */
			rw_exit(&ip->i_contents);
			VN_RELE(ITOV(ip));
			goto out;
		} else if ((ip->i_type == VREG) &&
		    (vap->va_mask & AT_SIZE) && vap->va_size == 0) {
			/*
			 * Truncate regular files, if requested by caller.
			 * Grab i_rwlock to make sure no one else is
			 * currently writing to the file (we promised
			 * bmap we would do this).
			 * Must get the locks in the correct order.
			 */
			if (ip->i_size == 0) {
				/* Already empty: only mark the times. */
				ip->i_flag |= ICHG | IUPD;
			} else {
				/*
				 * i_contents is dropped and re-taken so that
				 * i_rwlock is acquired before it.
				 */
				rw_exit(&ip->i_contents);
				rw_enter(&ip->i_rwlock, RW_WRITER);
				rw_enter(&ip->i_contents, RW_WRITER);
				(void) ud_itrunc(ip, 0, 0, cr);
				rw_exit(&ip->i_rwlock);
			}
			vnevent_create(ITOV(ip), ct);
		}
	}

	if (error == 0) {
		*vpp = ITOV(ip);
		ITIMES(ip);
	}
	if (ip != NULL) {
		rw_exit(&ip->i_contents);
	}
	if (error) {
		goto out;
	}

	/*
	 * If vnode is a device return special vnode instead.
	 * (error is always 0 here; the !error test is redundant.)
	 */
	if (!error && IS_DEVVP(*vpp)) {
		struct vnode *newvp;

		newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
		VN_RELE(*vpp);
		if (newvp == NULL) {
			error = ENOSYS;
			goto out;
		}
		*vpp = newvp;
	}
out:
	return (error);
}
807
808 /* ARGSUSED */
809 static int32_t
810 udf_remove(
811 struct vnode *vp,
812 char *nm,
813 struct cred *cr,
814 caller_context_t *ct,
815 int flags)
816 {
817 int32_t error;
818 struct ud_inode *ip = VTOI(vp);
819
820 ud_printf("udf_remove\n");
821
822 rw_enter(&ip->i_rwlock, RW_WRITER);
823 error = ud_dirremove(ip, nm,
824 (struct ud_inode *)0, (struct vnode *)0, DR_REMOVE, cr, ct);
825 rw_exit(&ip->i_rwlock);
826 ITIMES(ip);
827
828 return (error);
829 }
830
831 /* ARGSUSED */
832 static int32_t
833 udf_link(
834 struct vnode *tdvp,
835 struct vnode *svp,
836 char *tnm,
837 struct cred *cr,
838 caller_context_t *ct,
839 int flags)
840 {
841 int32_t error;
842 struct vnode *realvp;
843 struct ud_inode *sip;
844 struct ud_inode *tdp;
845
846 ud_printf("udf_link\n");
847 if (VOP_REALVP(svp, &realvp, ct) == 0) {
848 svp = realvp;
849 }
850
851 /*
852 * Do not allow links to directories
853 */
854 if (svp->v_type == VDIR) {
855 return (EPERM);
856 }
857
858 sip = VTOI(svp);
859
860 if (sip->i_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0)
861 return (EPERM);
862
863 tdp = VTOI(tdvp);
864
865 rw_enter(&tdp->i_rwlock, RW_WRITER);
866 error = ud_direnter(tdp, tnm, DE_LINK, (struct ud_inode *)0,
867 sip, (struct vattr *)0, (struct ud_inode **)0, cr, ct);
868 rw_exit(&tdp->i_rwlock);
869 ITIMES(sip);
870 ITIMES(tdp);
871
872 if (error == 0) {
873 vnevent_link(svp, ct);
874 }
875
876 return (error);
877 }
878
/*
 * Rename entry point: rename snm in directory sdvp to tnm in directory
 * tdvp.
 *
 * The whole operation runs under the file system's udf_rename_lck mutex.
 * Outline:
 *   1. Look up the source inode (held until errout).
 *   2. Refuse with EBUSY if another file system is mounted on it.
 *   3. Check write access to the source directory and the sticky-bit
 *      removal rules.
 *   4. Refuse with EINVAL to rename ".", ".." or the directory itself.
 *   5. Raise the pre-rename vnode events.
 *   6. Enter the source under the new name with ud_direnter(DE_RENAME),
 *      then remove the old entry with ud_dirremove(DR_RENAME).
 *
 * ESAME from ud_direnter() and ENOENT from ud_dirremove() are mapped to
 * success.  flags is unused.
 */
/* ARGSUSED */
static int32_t
udf_rename(
	struct vnode *sdvp,
	char *snm,
	struct vnode *tdvp,
	char *tnm,
	struct cred *cr,
	caller_context_t *ct,
	int flags)
{
	int32_t error = 0;
	struct udf_vfs *udf_vfsp;
	struct ud_inode *sip;		/* source inode */
	struct ud_inode *tip;		/* target inode */
	struct ud_inode *sdp, *tdp;	/* source and target parent inode */
	struct vnode *realvp;

	ud_printf("udf_rename\n");

	/* Operate on the real target directory vnode when there is one. */
	if (VOP_REALVP(tdvp, &realvp, ct) == 0) {
		tdvp = realvp;
	}

	sdp = VTOI(sdvp);
	tdp = VTOI(tdvp);

	udf_vfsp = sdp->i_udf;

	mutex_enter(&udf_vfsp->udf_rename_lck);
	/*
	 * Look up inode of file we're supposed to rename.
	 */
	if (error = ud_dirlook(sdp, snm, &sip, cr, 0)) {
		mutex_exit(&udf_vfsp->udf_rename_lck);
		return (error);
	}
	/*
	 * be sure this is not a directory with another file system mounted
	 * over it. If it is just give up the locks, and return with
	 * EBUSY
	 */
	if (vn_mountedvfs(ITOV(sip)) != NULL) {
		error = EBUSY;
		goto errout;
	}
	/*
	 * Make sure we can delete the source entry. This requires
	 * write permission on the containing directory. If that
	 * directory is "sticky" it further requires (except for
	 * privileged users) that the user own the directory or the
	 * source entry, or else have permission to write the source
	 * entry.
	 */
	rw_enter(&sdp->i_contents, RW_READER);
	rw_enter(&sip->i_contents, RW_READER);
	if ((error = ud_iaccess(sdp, IWRITE, cr, 0)) != 0 ||
	    (error = ud_sticky_remove_access(sdp, sip, cr)) != 0) {
		rw_exit(&sip->i_contents);
		rw_exit(&sdp->i_contents);
		ITIMES(sip);
		goto errout;
	}

	/*
	 * Check for renaming '.' or '..' or alias of '.'
	 */
	if ((strcmp(snm, ".") == 0) ||
	    (strcmp(snm, "..") == 0) ||
	    (sdp == sip)) {
		error = EINVAL;
		rw_exit(&sip->i_contents);
		rw_exit(&sdp->i_contents);
		goto errout;
	}

	rw_exit(&sip->i_contents);
	rw_exit(&sdp->i_contents);

	/*
	 * If the target name already exists, tell interested parties that
	 * it is about to be replaced, then drop the hold from the lookup.
	 */
	if (ud_dirlook(tdp, tnm, &tip, cr, 0) == 0) {
		vnevent_pre_rename_dest(ITOV(tip), tdvp, tnm, ct);
		VN_RELE(ITOV(tip));
	}

	/* Notify the target dir. if not the same as the source dir. */
	if (sdvp != tdvp)
		vnevent_pre_rename_dest_dir(tdvp, ITOV(sip), tnm, ct);

	vnevent_pre_rename_src(ITOV(sip), sdvp, snm, ct);

	/*
	 * Link source to the target.
	 */
	rw_enter(&tdp->i_rwlock, RW_WRITER);
	if (error = ud_direnter(tdp, tnm, DE_RENAME, sdp, sip,
	    (struct vattr *)0, (struct ud_inode **)0, cr, ct)) {
		/*
		 * ESAME isn't really an error; it indicates that the
		 * operation should not be done because the source and target
		 * are the same file, but that no error should be reported.
		 */
		if (error == ESAME) {
			error = 0;
		}
		rw_exit(&tdp->i_rwlock);
		goto errout;
	}
	rw_exit(&tdp->i_rwlock);

	rw_enter(&sdp->i_rwlock, RW_WRITER);
	/*
	 * Unlink the source.
	 * Remove the source entry. ud_dirremove() checks that the entry
	 * still reflects sip, and returns an error if it doesn't.
	 * If the entry has changed just forget about it. Release
	 * the source inode.
	 */
	if ((error = ud_dirremove(sdp, snm, sip, (struct vnode *)0,
	    DR_RENAME, cr, ct)) == ENOENT) {
		error = 0;
	}
	rw_exit(&sdp->i_rwlock);

	if (error == 0) {
		vnevent_rename_src(ITOV(sip), sdvp, snm, ct);
		/*
		 * vnevent_rename_dest and vnevent_rename_dest_dir are called
		 * in ud_direnter().
		 */
	}

errout:
	/*
	 * Common exit: refresh both directories' time stamps, drop the hold
	 * on the source inode taken by ud_dirlook() and release the rename
	 * lock.
	 */
	ITIMES(sdp);
	ITIMES(tdp);
	VN_RELE(ITOV(sip));
	mutex_exit(&udf_vfsp->udf_rename_lck);

	return (error);
}
1018
1019 /* ARGSUSED */
1020 static int32_t
1021 udf_mkdir(
1022 struct vnode *dvp,
1023 char *dirname,
1024 struct vattr *vap,
1025 struct vnode **vpp,
1026 struct cred *cr,
1027 caller_context_t *ct,
1028 int flags,
1029 vsecattr_t *vsecp)
1030 {
1031 int32_t error;
1032 struct ud_inode *ip;
1033 struct ud_inode *xip;
1034
1035 ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
1036
1037 ud_printf("udf_mkdir\n");
1038
1039 ip = VTOI(dvp);
1040 rw_enter(&ip->i_rwlock, RW_WRITER);
1041 error = ud_direnter(ip, dirname, DE_MKDIR,
1042 (struct ud_inode *)0, (struct ud_inode *)0, vap, &xip, cr, ct);
1043 rw_exit(&ip->i_rwlock);
1044 ITIMES(ip);
1045 if (error == 0) {
1046 ip = xip;
1047 *vpp = ITOV(ip);
1048 ITIMES(ip);
1049 } else if (error == EEXIST) {
1050 ITIMES(xip);
1051 VN_RELE(ITOV(xip));
1052 }
1053
1054 return (error);
1055 }
1056
1057 /* ARGSUSED */
1058 static int32_t
1059 udf_rmdir(
1060 struct vnode *vp,
1061 char *nm,
1062 struct vnode *cdir,
1063 struct cred *cr,
1064 caller_context_t *ct,
1065 int flags)
1066 {
1067 int32_t error;
1068 struct ud_inode *ip = VTOI(vp);
1069
1070 ud_printf("udf_rmdir\n");
1071
1072 rw_enter(&ip->i_rwlock, RW_WRITER);
1073 error = ud_dirremove(ip, nm, (struct ud_inode *)0, cdir, DR_RMDIR,
1074 cr, ct);
1075 rw_exit(&ip->i_rwlock);
1076 ITIMES(ip);
1077
1078 return (error);
1079 }
1080
/*
 * Readdir entry point: fill the caller's buffer with dirent64 records
 * for directory vp, starting at uiop->uio_offset.
 *
 * Entries are built in a kernel staging buffer sized to the first
 * iovec's length and copied out with a single uiomove().  A "." entry is
 * synthesized at offset 0; its d_off is the cookie 0x10, which on a later
 * call is mapped back to directory offset 0.  Each file identifier read
 * by ud_get_next_fid() that is not marked deleted produces one record:
 * ".." for the parent identifier, otherwise the name decoded by
 * ud_uncompress().
 *
 * Returns 0 on success, EINVAL when an entry does not fit (see the notes
 * in the body), or an error from ud_get_next_fid(), ud_uncompress() or
 * uiomove().  *eofp, when supplied, is set only on success.  cr, ct and
 * flags are unused.
 */
/* ARGSUSED */
static int32_t
udf_readdir(
	struct vnode *vp,
	struct uio *uiop,
	struct cred *cr,
	int32_t *eofp,
	caller_context_t *ct,
	int flags)
{
	struct ud_inode *ip;
	struct dirent64 *nd;
	struct udf_vfs *udf_vfsp;
	int32_t error = 0, len, outcount = 0;
	uint32_t dirsiz, offset;
	uint32_t bufsize, ndlen, dummy;
	caddr_t outbuf;
	caddr_t outb, end_outb;
	struct iovec *iovp;

	uint8_t *dname;
	int32_t length;

	uint8_t *buf = NULL;

	struct fbuf *fbp = NULL;
	struct file_id *fid;
	uint8_t *name;


	ud_printf("udf_readdir\n");

	ip = VTOI(vp);
	udf_vfsp = ip->i_udf;

	/* Nothing to return past the end or for an unlinked directory. */
	dirsiz = ip->i_size;
	if ((uiop->uio_offset >= dirsiz) ||
	    (ip->i_nlink <= 0)) {
		if (eofp) {
			*eofp = 1;
		}
		return (0);
	}

	offset = uiop->uio_offset;
	iovp = uiop->uio_iov;
	bufsize = iovp->iov_len;

	/* Staging buffer; nd always points at the next record to fill. */
	outb = outbuf = (char *)kmem_alloc((uint32_t)bufsize, KM_SLEEP);
	end_outb = outb + bufsize;
	nd = (struct dirent64 *)outbuf;

	/* Scratch space: decoded name, and a buffer for ud_get_next_fid(). */
	dname = (uint8_t *)kmem_zalloc(1024, KM_SLEEP);
	buf = (uint8_t *)kmem_zalloc(udf_vfsp->udf_lbsize, KM_SLEEP);

	if (offset == 0) {
		/*
		 * Synthesize the "." entry.
		 * NOTE(review): the test uses >=, so a record that would
		 * end exactly at the end of the buffer is rejected.
		 */
		len = DIRENT64_RECLEN(1);
		if (((caddr_t)nd + len) >= end_outb) {
			error = EINVAL;
			goto end;
		}
		nd->d_ino = ip->i_icb_lbano;
		nd->d_reclen = (uint16_t)len;
		nd->d_off = 0x10;
		nd->d_name[0] = '.';
		bzero(&nd->d_name[1], DIRENT64_NAMELEN(len) - 1);
		nd = (struct dirent64 *)((char *)nd + nd->d_reclen);
		outcount++;
	} else if (offset == 0x10) {
		/* Resuming right after "." : start at the first identifier. */
		offset = 0;
	}

	while (offset < dirsiz) {
		error = ud_get_next_fid(ip, &fbp,
		    offset, &fid, &name, buf);
		if (error != 0) {
			break;
		}

		/* Deleted identifiers are skipped without producing output. */
		if ((fid->fid_flags & FID_DELETED) == 0) {
			if (fid->fid_flags & FID_PARENT) {

				/*
				 * Parent identifier: emit "..".  Unlike the
				 * named-entry case below, lack of room is
				 * EINVAL here even if entries were already
				 * produced.
				 */
				len = DIRENT64_RECLEN(2);
				if (((caddr_t)nd + len) >= end_outb) {
					error = EINVAL;
					break;
				}

				nd->d_ino = ip->i_icb_lbano;
				nd->d_reclen = (uint16_t)len;
				nd->d_off = offset + FID_LEN(fid);
				nd->d_name[0] = '.';
				nd->d_name[1] = '.';
				bzero(&nd->d_name[2],
				    DIRENT64_NAMELEN(len) - 2);
				nd = (struct dirent64 *)
				    ((char *)nd + nd->d_reclen);
			} else {
				if ((error = ud_uncompress(fid->fid_idlen,
				    &length, name, dname)) != 0) {
					break;
				}
				/* An empty decoded name yields no entry. */
				if (length == 0) {
					offset += FID_LEN(fid);
					continue;
				}
				len = DIRENT64_RECLEN(length);
				if (((caddr_t)nd + len) >= end_outb) {
					/*
					 * Out of room.  That is an error only
					 * if nothing at all was produced;
					 * otherwise return what we have and
					 * leave offset at this entry so the
					 * next call resumes here.
					 */
					if (!outcount) {
						error = EINVAL;
					}
					break;
				}
				(void) strncpy(nd->d_name,
				    (caddr_t)dname, length);
				bzero(&nd->d_name[length],
				    DIRENT64_NAMELEN(len) - length);
				nd->d_ino = ud_xlate_to_daddr(udf_vfsp,
				    SWAP_16(fid->fid_icb.lad_ext_prn),
				    SWAP_32(fid->fid_icb.lad_ext_loc), 1,
				    &dummy);
				nd->d_reclen = (uint16_t)len;
				nd->d_off = offset + FID_LEN(fid);
				nd = (struct dirent64 *)
				    ((char *)nd + nd->d_reclen);
			}
			outcount++;
		}

		offset += FID_LEN(fid);
	}

end:
	if (fbp != NULL) {
		fbrelse(fbp, S_OTHER);
	}
	/* Number of bytes of dirent64 records staged so far. */
	ndlen = ((char *)nd - outbuf);
	/*
	 * In case of error do not call uiomove.
	 * Return the error to the caller.
	 */
	if ((error == 0) && (ndlen != 0)) {
		error = uiomove(outbuf, (long)ndlen, UIO_READ, uiop);
		/* The next call resumes at the directory offset reached. */
		uiop->uio_offset = offset;
	}
	kmem_free((caddr_t)buf, udf_vfsp->udf_lbsize);
	kmem_free((caddr_t)dname, 1024);
	kmem_free(outbuf, (uint32_t)bufsize);
	if (eofp && error == 0) {
		*eofp = (uiop->uio_offset >= dirsiz);
	}
	return (error);
}
1234
1235 /* ARGSUSED */
1236 static int32_t
1237 udf_symlink(
1238 struct vnode *dvp,
1239 char *linkname,
1240 struct vattr *vap,
1241 char *target,
1242 struct cred *cr,
1243 caller_context_t *ct,
1244 int flags)
1245 {
1246 int32_t error = 0, outlen;
1247 uint32_t ioflag = 0;
1248 struct ud_inode *ip, *dip = VTOI(dvp);
1249
1250 struct path_comp *pc;
1251 int8_t *dname = NULL, *uname = NULL, *sp;
1252
1253 ud_printf("udf_symlink\n");
1254
1255 ip = (struct ud_inode *)0;
1256 vap->va_type = VLNK;
1257 vap->va_rdev = 0;
1258
1259 rw_enter(&dip->i_rwlock, RW_WRITER);
1260 error = ud_direnter(dip, linkname, DE_CREATE,
1261 (struct ud_inode *)0, (struct ud_inode *)0, vap, &ip, cr, ct);
1262 rw_exit(&dip->i_rwlock);
1263 if (error == 0) {
1264 dname = kmem_zalloc(1024, KM_SLEEP);
1265 uname = kmem_zalloc(PAGESIZE, KM_SLEEP);
1266
1267 pc = (struct path_comp *)uname;
1268 /*
1269 * If the first character in target is "/"
1270 * then skip it and create entry for it
1271 */
1272 if (*target == '/') {
1273 pc->pc_type = 2;
1274 pc->pc_len = 0;
1275 pc = (struct path_comp *)(((char *)pc) + 4);
1276 while (*target == '/') {
1277 target++;
1278 }
1279 }
1280
1281 while (*target != NULL) {
1282 sp = target;
1283 while ((*target != '/') && (*target != '\0')) {
1284 target ++;
1285 }
1286 /*
1287 * We got the next component of the
1288 * path name. Create path_comp of
1289 * appropriate type
1290 */
1291 if (((target - sp) == 1) && (*sp == '.')) {
1292 /*
1293 * Dot entry.
1294 */
1295 pc->pc_type = 4;
1296 pc = (struct path_comp *)(((char *)pc) + 4);
1297 } else if (((target - sp) == 2) &&
1298 (*sp == '.') && ((*(sp + 1)) == '.')) {
1299 /*
1300 * DotDot entry.
1301 */
1302 pc->pc_type = 3;
1303 pc = (struct path_comp *)(((char *)pc) + 4);
1304 } else {
1305 /*
1306 * convert the user given name
1307 * into appropriate form to be put
1308 * on the media
1309 */
1310 outlen = 1024; /* set to size of dname */
1311 if (error = ud_compress(target - sp, &outlen,
1312 (uint8_t *)sp, (uint8_t *)dname)) {
1313 break;
1314 }
1315 pc->pc_type = 5;
1316 /* LINTED */
1317 pc->pc_len = outlen;
1318 dname[outlen] = '\0';
1319 (void) strcpy((char *)pc->pc_id, dname);
1320 pc = (struct path_comp *)
1321 (((char *)pc) + 4 + outlen);
1322 }
1323 while (*target == '/') {
1324 target++;
1325 }
1326 if (*target == NULL) {
1327 break;
1328 }
1329 }
1330
1331 rw_enter(&ip->i_contents, RW_WRITER);
1332 if (error == 0) {
1333 ioflag = FWRITE;
1334 if (curthread->t_flag & T_DONTPEND) {
1335 ioflag |= FDSYNC;
1336 }
1337 error = ud_rdwri(UIO_WRITE, ioflag, ip,
1338 uname, ((int8_t *)pc) - uname,
1339 (offset_t)0, UIO_SYSSPACE, (int32_t *)0, cr);
1340 }
1341 if (error) {
1342 ud_idrop(ip);
1343 rw_exit(&ip->i_contents);
1344 rw_enter(&dip->i_rwlock, RW_WRITER);
1345 (void) ud_dirremove(dip, linkname, (struct ud_inode *)0,
1346 (struct vnode *)0, DR_REMOVE, cr, ct);
1347 rw_exit(&dip->i_rwlock);
1348 goto update_inode;
1349 }
1350 rw_exit(&ip->i_contents);
1351 }
1352
1353 if ((error == 0) || (error == EEXIST)) {
1354 VN_RELE(ITOV(ip));
1355 }
1356
1357 update_inode:
1358 ITIMES(VTOI(dvp));
1359 if (uname != NULL) {
1360 kmem_free(uname, PAGESIZE);
1361 }
1362 if (dname != NULL) {
1363 kmem_free(dname, 1024);
1364 }
1365
1366 return (error);
1367 }
1368
1369 /* ARGSUSED */
1370 static int32_t
1371 udf_readlink(
1372 struct vnode *vp,
1373 struct uio *uiop,
1374 struct cred *cr,
1375 caller_context_t *ct)
1376 {
1377 int32_t error = 0, off, id_len, size, len;
1378 int8_t *dname = NULL, *uname = NULL;
1379 struct ud_inode *ip;
1380 struct fbuf *fbp = NULL;
1381 struct path_comp *pc;
1382
1383 ud_printf("udf_readlink\n");
1384
1385 if (vp->v_type != VLNK) {
1386 return (EINVAL);
1387 }
1388
1389 ip = VTOI(vp);
1390 size = ip->i_size;
1391 if (size > PAGESIZE) {
1392 return (EIO);
1393 }
1394
1395 if (size == 0) {
1396 return (0);
1397 }
1398
1399 dname = kmem_zalloc(1024, KM_SLEEP);
1400 uname = kmem_zalloc(PAGESIZE, KM_SLEEP);
1401
1402 rw_enter(&ip->i_contents, RW_READER);
1403
1404 if ((error = fbread(vp, 0, size, S_READ, &fbp)) != 0) {
1405 goto end;
1406 }
1407
1408 off = 0;
1409
1410 while (off < size) {
1411 pc = (struct path_comp *)(fbp->fb_addr + off);
1412 switch (pc->pc_type) {
1413 case 1 :
1414 (void) strcpy(uname, ip->i_udf->udf_fsmnt);
1415 (void) strcat(uname, "/");
1416 break;
1417 case 2 :
1418 if (pc->pc_len != 0) {
1419 goto end;
1420 }
1421 uname[0] = '/';
1422 uname[1] = '\0';
1423 break;
1424 case 3 :
1425 (void) strcat(uname, "../");
1426 break;
1427 case 4 :
1428 (void) strcat(uname, "./");
1429 break;
1430 case 5 :
1431 if ((error = ud_uncompress(pc->pc_len, &id_len,
1432 pc->pc_id, (uint8_t *)dname)) != 0) {
1433 break;
1434 }
1435 dname[id_len] = '\0';
1436 (void) strcat(uname, dname);
1437 (void) strcat(uname, "/");
1438 break;
1439 default :
1440 error = EINVAL;
1441 goto end;
1442 }
1443 off += 4 + pc->pc_len;
1444 }
1445 len = strlen(uname) - 1;
1446 if (uname[len] == '/') {
1447 if (len == 0) {
1448 /*
1449 * special case link to /
1450 */
1451 len = 1;
1452 } else {
1453 uname[len] = '\0';
1454 }
1455 }
1456
1457 error = uiomove(uname, len, UIO_READ, uiop);
1458
1459 ITIMES(ip);
1460
1461 end:
1462 if (fbp != NULL) {
1463 fbrelse(fbp, S_OTHER);
1464 }
1465 rw_exit(&ip->i_contents);
1466 if (uname != NULL) {
1467 kmem_free(uname, PAGESIZE);
1468 }
1469 if (dname != NULL) {
1470 kmem_free(dname, 1024);
1471 }
1472 return (error);
1473 }
1474
1475 /* ARGSUSED */
1476 static int32_t
1477 udf_fsync(
1478 struct vnode *vp,
1479 int32_t syncflag,
1480 struct cred *cr,
1481 caller_context_t *ct)
1482 {
1483 int32_t error = 0;
1484 struct ud_inode *ip = VTOI(vp);
1485
1486 ud_printf("udf_fsync\n");
1487
1488 rw_enter(&ip->i_contents, RW_WRITER);
1489 if (!(IS_SWAPVP(vp))) {
1490 error = ud_syncip(ip, 0, I_SYNC); /* Do synchronous writes */
1491 }
1492 if (error == 0) {
1493 error = ud_sync_indir(ip);
1494 }
1495 ITIMES(ip); /* XXX: is this necessary ??? */
1496 rw_exit(&ip->i_contents);
1497
1498 return (error);
1499 }
1500
1501 /* ARGSUSED */
1502 static void
1503 udf_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct)
1504 {
1505 ud_printf("udf_iinactive\n");
1506
1507 ud_iinactive(VTOI(vp), cr);
1508 }
1509
1510 /* ARGSUSED */
1511 static int32_t
1512 udf_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
1513 {
1514 struct udf_fid *udfidp;
1515 struct ud_inode *ip = VTOI(vp);
1516
1517 ud_printf("udf_fid\n");
1518
1519 if (fidp->fid_len < (sizeof (struct udf_fid) - sizeof (uint16_t))) {
1520 fidp->fid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
1521 return (ENOSPC);
1522 }
1523
1524 udfidp = (struct udf_fid *)fidp;
1525 bzero((char *)udfidp, sizeof (struct udf_fid));
1526 rw_enter(&ip->i_contents, RW_READER);
1527 udfidp->udfid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
1528 udfidp->udfid_uinq_lo = ip->i_uniqid & 0xffffffff;
1529 udfidp->udfid_prn = ip->i_icb_prn;
1530 udfidp->udfid_icb_lbn = ip->i_icb_block;
1531 rw_exit(&ip->i_contents);
1532
1533 return (0);
1534 }
1535
1536 /* ARGSUSED2 */
1537 static int
1538 udf_rwlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
1539 {
1540 struct ud_inode *ip = VTOI(vp);
1541
1542 ud_printf("udf_rwlock\n");
1543
1544 if (write_lock) {
1545 rw_enter(&ip->i_rwlock, RW_WRITER);
1546 } else {
1547 rw_enter(&ip->i_rwlock, RW_READER);
1548 }
1549 return (write_lock);
1550 }
1551
1552 /* ARGSUSED */
1553 static void
1554 udf_rwunlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
1555 {
1556 struct ud_inode *ip = VTOI(vp);
1557
1558 ud_printf("udf_rwunlock\n");
1559
1560 rw_exit(&ip->i_rwlock);
1561
1562 }
1563
1564 /* ARGSUSED */
1565 static int32_t
1566 udf_seek(struct vnode *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
1567 {
1568 return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
1569 }
1570
1571 static int32_t
1572 udf_frlock(
1573 struct vnode *vp,
1574 int32_t cmd,
1575 struct flock64 *bfp,
1576 int32_t flag,
1577 offset_t offset,
1578 struct flk_callback *flk_cbp,
1579 cred_t *cr,
1580 caller_context_t *ct)
1581 {
1582 struct ud_inode *ip = VTOI(vp);
1583
1584 ud_printf("udf_frlock\n");
1585
1586 /*
1587 * If file is being mapped, disallow frlock.
1588 * XXX I am not holding tlock while checking i_mapcnt because the
1589 * current locking strategy drops all locks before calling fs_frlock.
1590 * So, mapcnt could change before we enter fs_frlock making is
1591 * meaningless to have held tlock in the first place.
1592 */
1593 if ((ip->i_mapcnt > 0) &&
1594 (MANDLOCK(vp, ip->i_char))) {
1595 return (EAGAIN);
1596 }
1597
1598 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
1599 }
1600
1601 /*ARGSUSED6*/
1602 static int32_t
1603 udf_space(
1604 struct vnode *vp,
1605 int32_t cmd,
1606 struct flock64 *bfp,
1607 int32_t flag,
1608 offset_t offset,
1609 cred_t *cr,
1610 caller_context_t *ct)
1611 {
1612 int32_t error = 0;
1613
1614 ud_printf("udf_space\n");
1615
1616 if (cmd != F_FREESP) {
1617 error = EINVAL;
1618 } else if ((error = convoff(vp, bfp, 0, offset)) == 0) {
1619 error = ud_freesp(vp, bfp, flag, cr);
1620
1621 if (error == 0 && bfp->l_start == 0)
1622 vnevent_truncate(vp, ct);
1623 }
1624
1625 return (error);
1626 }
1627
/*
 * VOP_GETPAGE: return the pages covering <off, off + len) of `vp'.
 *
 *	protp	if non-NULL, receives the allowed protections; starts as
 *		PROT_ALL and loses PROT_WRITE for non-write faults on a
 *		file with holes.
 *	plarr	NULL-terminated array that receives the locked pages;
 *		NULL requests asynchronous fault-ahead only.
 *	plsz	space available in plarr, in bytes.
 *	rw	access type; S_WRITE/S_CREATE may allocate blocks.
 *
 * i_contents is taken here unless the calling thread already owns it.
 * For S_WRITE/S_CREATE on a file with holes (or a request beyond EOF)
 * the lock is upgraded to writer so ud_bmap_write() can allocate
 * blocks, then downgraded again before the pages are looked up.
 *
 * Returns 0 on success; ENOSYS if the vnode is VNOMAP; EFAULT if the
 * request extends beyond EOF and `seg' is not segkmap; otherwise the
 * error from ud_bmap_write() or ud_getpage_miss().  On error, any
 * pages already placed in plarr are unlocked and plarr[0] is NULL.
 */
/* ARGSUSED */
static int32_t
udf_getpage(
	struct vnode *vp,
	offset_t off,
	size_t len,
	uint32_t *protp,
	struct page **plarr,
	size_t plsz,
	struct seg *seg,
	caddr_t addr,
	enum seg_rw rw,
	struct cred *cr,
	caller_context_t *ct)
{
	struct ud_inode *ip = VTOI(vp);
	int32_t error, has_holes, beyond_eof, seqmode, dolock;
	int32_t pgsize = PAGESIZE;
	struct udf_vfs *udf_vfsp = ip->i_udf;
	page_t **pl;
	u_offset_t pgoff, eoff, uoff;
	krw_t rwtype;
	caddr_t pgaddr;

	ud_printf("udf_getpage\n");

	uoff = (u_offset_t)off; /* type conversion */
	if (protp) {
		*protp = PROT_ALL;
	}
	if (vp->v_flag & VNOMAP) {
		return (ENOSYS);
	}
	/* Sequential if this request starts where the previous one ended */
	seqmode = ip->i_nextr == uoff && rw != S_CREATE;

	rwtype = RW_READER;
	/* Do not re-enter i_contents if this thread already owns it */
	dolock = (rw_owner(&ip->i_contents) != curthread);
retrylock:
	if (dolock) {
		rw_enter(&ip->i_contents, rwtype);
	}

	/*
	 * We may be getting called as a side effect of a bmap using
	 * fbread() when the blocks might be being allocated and the
	 * size has not yet been up'ed.  In this case we want to be
	 * able to return zero pages if we get back UDF_HOLE from
	 * calling bmap for a non write case here.  We also might have
	 * to read some frags from the disk into a page if we are
	 * extending the number of frags for a given lbn in bmap().
	 */
	beyond_eof = uoff + len > ip->i_size + PAGEOFFSET;
	if (beyond_eof && seg != segkmap) {
		if (dolock) {
			rw_exit(&ip->i_contents);
		}
		return (EFAULT);
	}

	/*
	 * Must hold i_contents lock throughout the call to pvn_getpages
	 * since locked pages are returned from each call to ud_getapage.
	 * Must *not* return locked pages and then try for contents lock
	 * due to lock ordering requirements (inode > page)
	 */

	has_holes = ud_bmap_has_holes(ip);

	if ((rw == S_WRITE || rw == S_CREATE) && (has_holes || beyond_eof)) {
		int32_t	blk_size, count;
		u_offset_t offset;

		/*
		 * We must acquire the RW_WRITER lock in order to
		 * call bmap_write().
		 */
		if (dolock && rwtype == RW_READER) {
			rwtype = RW_WRITER;

			if (!rw_tryupgrade(&ip->i_contents)) {

				/*
				 * Upgrade failed: drop the lock and start
				 * over as a writer, re-evaluating the
				 * checks made above.
				 */
				rw_exit(&ip->i_contents);

				goto retrylock;
			}
		}

		/*
		 * May be allocating disk blocks for holes here as
		 * a result of mmap faults. write(2) does the bmap_write
		 * in rdip/wrip, not here. We are not dealing with frags
		 * in this case.
		 */
		offset = uoff;
		while ((offset < uoff + len) &&
		    (offset < ip->i_size)) {
			/*
			 * Allocate one logical block at a time; the final
			 * piece is clipped so that it does not extend
			 * past i_size.
			 */

			blk_size = udf_vfsp->udf_lbsize;
			if ((offset + blk_size) > ip->i_size) {
				count = ip->i_size - offset;
			} else {
				count = blk_size;
			}
			error = ud_bmap_write(ip, offset, count, 0, cr);
			if (error) {
				goto update_inode;
			}
			offset += count; /* XXX - make this contig */
		}
	}

	/*
	 * Can be a reader from now on.
	 */
	if (dolock && rwtype == RW_WRITER) {
		rw_downgrade(&ip->i_contents);
	}

	/*
	 * We remove PROT_WRITE in cases when the file has UDF holes
	 * because we don't want to call bmap_read() to check each
	 * page if it is backed with a disk block.
	 */
	if (protp && has_holes && rw != S_WRITE && rw != S_CREATE) {
		*protp &= ~PROT_WRITE;
	}

	error = 0;

	/*
	 * The loop looks up pages in the range <off, off + len).
	 * For each page, we first check if we should initiate an asynchronous
	 * read ahead before we call page_lookup (we may sleep in page_lookup
	 * for a previously initiated disk read).
	 */
	eoff = (uoff + len);
	for (pgoff = uoff, pgaddr = addr, pl = plarr;
	    pgoff < eoff; /* empty */) {
		page_t	*pp;
		u_offset_t	nextrio;
		se_t	se;

		/* S_CREATE needs the page exclusively; others share it */
		se = ((rw == S_CREATE) ? SE_EXCL : SE_SHARED);

		/*
		 * Handle async getpage (faultahead)
		 */
		if (plarr == NULL) {
			ip->i_nextrio = pgoff;
			ud_getpage_ra(vp, pgoff, seg, pgaddr);
			pgoff += pgsize;
			pgaddr += pgsize;
			continue;
		}

		/*
		 * Check if we should initiate read ahead of next cluster.
		 * We call page_exists only when we need to confirm that
		 * we have the current page before we initiate the read ahead.
		 */
		nextrio = ip->i_nextrio;
		if (seqmode &&
		    pgoff + RD_CLUSTSZ(ip) >= nextrio && pgoff <= nextrio &&
		    nextrio < ip->i_size && page_exists(vp, pgoff))
			ud_getpage_ra(vp, pgoff, seg, pgaddr);

		if ((pp = page_lookup(vp, pgoff, se)) != NULL) {

			/*
			 * We found the page in the page cache.
			 */
			*pl++ = pp;
			pgoff += pgsize;
			pgaddr += pgsize;
			len -= pgsize;
			plsz -= pgsize;
		} else {

			/*
			 * We have to create the page, or read it from disk.
			 */
			if (error = ud_getpage_miss(vp, pgoff, len,
			    seg, pgaddr, pl, plsz, rw, seqmode)) {
				goto error_out;
			}

			/*
			 * Step past every page ud_getpage_miss() added.
			 * It may add none (another thread entered the
			 * page first), in which case the lookup above is
			 * retried on the next pass.
			 */
			while (*pl != NULL) {
				pl++;
				pgoff += pgsize;
				pgaddr += pgsize;
				len -= pgsize;
				plsz -= pgsize;
			}
		}
	}

	/*
	 * Return pages up to plsz if they are in the page cache.
	 * We cannot return pages if there is a chance that they are
	 * backed with a UDF hole and rw is S_WRITE or S_CREATE.
	 */
	if (plarr && !(has_holes && (rw == S_WRITE || rw == S_CREATE))) {

		ASSERT((protp == NULL) ||
		    !(has_holes && (*protp & PROT_WRITE)));

		eoff = pgoff + plsz;
		while (pgoff < eoff) {
			page_t	*pp;

			/* Opportunistic only: never block for extra pages */
			if ((pp = page_lookup_nowait(vp, pgoff,
			    SE_SHARED)) == NULL)
				break;

			*pl++ = pp;
			pgoff += pgsize;
			plsz -= pgsize;
		}
	}

	if (plarr)
		*pl = NULL;		/* Terminate page list */
	/* Remember where this request ended for the seqmode test above */
	ip->i_nextr = pgoff;

error_out:
	if (error && plarr) {
		/*
		 * Release any pages we have locked.
		 */
		while (pl > &plarr[0])
			page_unlock(*--pl);

		plarr[0] = NULL;
	}

update_inode:
	if (dolock) {
		rw_exit(&ip->i_contents);
	}

	/*
	 * If the inode is not already marked for IACC (in rwip() for read)
	 * and the inode is not marked for no access time update (in rwip()
	 * for write) then update the inode access time and mod time now.
	 */
	mutex_enter(&ip->i_tlock);
	if ((ip->i_flag & (IACC | INOACC)) == 0) {
		if ((rw != S_OTHER) && (ip->i_type != VDIR)) {
			ip->i_flag |= IACC;
		}
		if (rw == S_WRITE) {
			ip->i_flag |= IUPD;
		}
		ITIMES_NOLOCK(ip);
	}
	mutex_exit(&ip->i_tlock);

	return (error);
}
1892
1893 int32_t ud_delay = 1;
1894
1895 /* ARGSUSED */
1896 static int32_t
1897 udf_putpage(
1898 struct vnode *vp,
1899 offset_t off,
1900 size_t len,
1901 int32_t flags,
1902 struct cred *cr,
1903 caller_context_t *ct)
1904 {
1905 struct ud_inode *ip;
1906 int32_t error = 0;
1907
1908 ud_printf("udf_putpage\n");
1909
1910 ip = VTOI(vp);
1911
1912 if (vp->v_count == 0) {
1913 cmn_err(CE_WARN, "ud_putpage : bad v_count");
1914 error = EINVAL;
1915 goto out;
1916 }
1917
1918 if (vp->v_flag & VNOMAP) {
1919 error = ENOSYS;
1920 goto out;
1921 }
1922
1923 if (flags & B_ASYNC) {
1924 if (ud_delay && len &&
1925 (flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0) {
1926 mutex_enter(&ip->i_tlock);
1927
1928 /*
1929 * If nobody stalled, start a new cluster.
1930 */
1931 if (ip->i_delaylen == 0) {
1932 ip->i_delayoff = off;
1933 ip->i_delaylen = len;
1934 mutex_exit(&ip->i_tlock);
1935 goto out;
1936 }
1937
1938 /*
1939 * If we have a full cluster or they are not contig,
1940 * then push last cluster and start over.
1941 */
1942 if (ip->i_delaylen >= WR_CLUSTSZ(ip) ||
1943 ip->i_delayoff + ip->i_delaylen != off) {
1944 u_offset_t doff;
1945 size_t dlen;
1946
1947 doff = ip->i_delayoff;
1948 dlen = ip->i_delaylen;
1949 ip->i_delayoff = off;
1950 ip->i_delaylen = len;
1951 mutex_exit(&ip->i_tlock);
1952 error = ud_putpages(vp, doff, dlen, flags, cr);
1953 /* LMXXX - flags are new val, not old */
1954 goto out;
1955 }
1956
1957 /*
1958 * There is something there, it's not full, and
1959 * it is contig.
1960 */
1961 ip->i_delaylen += len;
1962 mutex_exit(&ip->i_tlock);
1963 goto out;
1964 }
1965
1966 /*
1967 * Must have weird flags or we are not clustering.
1968 */
1969 }
1970
1971 error = ud_putpages(vp, off, len, flags, cr);
1972
1973 out:
1974 return (error);
1975 }
1976
1977 /* ARGSUSED */
1978 static int32_t
1979 udf_map(
1980 struct vnode *vp,
1981 offset_t off,
1982 struct as *as,
1983 caddr_t *addrp,
1984 size_t len,
1985 uint8_t prot,
1986 uint8_t maxprot,
1987 uint32_t flags,
1988 struct cred *cr,
1989 caller_context_t *ct)
1990 {
1991 struct segvn_crargs vn_a;
1992 int32_t error = 0;
1993
1994 ud_printf("udf_map\n");
1995
1996 if (vp->v_flag & VNOMAP) {
1997 error = ENOSYS;
1998 goto end;
1999 }
2000
2001 if ((off < (offset_t)0) ||
2002 ((off + len) < (offset_t)0)) {
2003 error = EINVAL;
2004 goto end;
2005 }
2006
2007 if (vp->v_type != VREG) {
2008 error = ENODEV;
2009 goto end;
2010 }
2011
2012 /*
2013 * If file is being locked, disallow mapping.
2014 */
2015 if (vn_has_mandatory_locks(vp, VTOI(vp)->i_char)) {
2016 error = EAGAIN;
2017 goto end;
2018 }
2019
2020 as_rangelock(as);
2021 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
2022 if (error != 0) {
2023 as_rangeunlock(as);
2024 goto end;
2025 }
2026
2027 vn_a.vp = vp;
2028 vn_a.offset = off;
2029 vn_a.type = flags & MAP_TYPE;
2030 vn_a.prot = prot;
2031 vn_a.maxprot = maxprot;
2032 vn_a.cred = cr;
2033 vn_a.amp = NULL;
2034 vn_a.flags = flags & ~MAP_TYPE;
2035 vn_a.szc = 0;
2036 vn_a.lgrp_mem_policy_flags = 0;
2037
2038 error = as_map(as, *addrp, len, segvn_create, (caddr_t)&vn_a);
2039 as_rangeunlock(as);
2040
2041 end:
2042 return (error);
2043 }
2044
2045 /* ARGSUSED */
2046 static int32_t
2047 udf_addmap(struct vnode *vp,
2048 offset_t off,
2049 struct as *as,
2050 caddr_t addr,
2051 size_t len,
2052 uint8_t prot,
2053 uint8_t maxprot,
2054 uint32_t flags,
2055 struct cred *cr,
2056 caller_context_t *ct)
2057 {
2058 struct ud_inode *ip = VTOI(vp);
2059
2060 ud_printf("udf_addmap\n");
2061
2062 if (vp->v_flag & VNOMAP) {
2063 return (ENOSYS);
2064 }
2065
2066 mutex_enter(&ip->i_tlock);
2067 ip->i_mapcnt += btopr(len);
2068 mutex_exit(&ip->i_tlock);
2069
2070 return (0);
2071 }
2072
2073 /* ARGSUSED */
2074 static int32_t
2075 udf_delmap(
2076 struct vnode *vp, offset_t off,
2077 struct as *as,
2078 caddr_t addr,
2079 size_t len,
2080 uint32_t prot,
2081 uint32_t maxprot,
2082 uint32_t flags,
2083 struct cred *cr,
2084 caller_context_t *ct)
2085 {
2086 struct ud_inode *ip = VTOI(vp);
2087
2088 ud_printf("udf_delmap\n");
2089
2090 if (vp->v_flag & VNOMAP) {
2091 return (ENOSYS);
2092 }
2093
2094 mutex_enter(&ip->i_tlock);
2095 ip->i_mapcnt -= btopr(len); /* Count released mappings */
2096 ASSERT(ip->i_mapcnt >= 0);
2097 mutex_exit(&ip->i_tlock);
2098
2099 return (0);
2100 }
2101
2102 /* ARGSUSED */
2103 static int32_t
2104 udf_l_pathconf(
2105 struct vnode *vp,
2106 int32_t cmd,
2107 ulong_t *valp,
2108 struct cred *cr,
2109 caller_context_t *ct)
2110 {
2111 int32_t error = 0;
2112
2113 ud_printf("udf_l_pathconf\n");
2114
2115 if (cmd == _PC_FILESIZEBITS) {
2116 /*
2117 * udf supports 64 bits as file size
2118 * but there are several other restrictions
2119 * it only supports 32-bit block numbers and
2120 * daddr32_t is only and int32_t so taking these
2121 * into account we can stay just as where ufs is
2122 */
2123 *valp = 41;
2124 } else if (cmd == _PC_TIMESTAMP_RESOLUTION) {
2125 /* nanosecond timestamp resolution */
2126 *valp = 1L;
2127 } else {
2128 error = fs_pathconf(vp, cmd, valp, cr, ct);
2129 }
2130
2131 return (error);
2132 }
2133
2134 uint32_t ud_pageio_reads = 0, ud_pageio_writes = 0;
2135
2136 /*
2137 * Assumption is that there will not be a pageio request
2138 * to a enbedded file
2139 */
2140 /* ARGSUSED */
2141 static int32_t
2142 udf_pageio(
2143 struct vnode *vp,
2144 struct page *pp,
2145 u_offset_t io_off,
2146 size_t io_len,
2147 int32_t flags,
2148 struct cred *cr,
2149 caller_context_t *ct)
2150 {
2151 daddr_t bn;
2152 struct buf *bp;
2153 struct ud_inode *ip = VTOI(vp);
2154 int32_t dolock, error = 0, contig, multi_io;
2155 size_t done_len = 0, cur_len = 0;
2156 page_t *npp = NULL, *opp = NULL, *cpp = pp;
2157
2158 if (pp == NULL) {
2159 return (EINVAL);
2160 }
2161
2162 dolock = (rw_owner(&ip->i_contents) != curthread);
2163
2164 /*
2165 * We need a better check. Ideally, we would use another
2166 * vnodeops so that hlocked and forcibly unmounted file
2167 * systems would return EIO where appropriate and w/o the
2168 * need for these checks.
2169 */
2170 if (ip->i_udf == NULL) {
2171 return (EIO);
2172 }
2173
2174 if (dolock) {
2175 rw_enter(&ip->i_contents, RW_READER);
2176 }
2177
2178 /*
2179 * Break the io request into chunks, one for each contiguous
2180 * stretch of disk blocks in the target file.
2181 */
2182 while (done_len < io_len) {
2183 ASSERT(cpp);
2184 bp = NULL;
2185 contig = 0;
2186 if (error = ud_bmap_read(ip, (u_offset_t)(io_off + done_len),
2187 &bn, &contig)) {
2188 break;
2189 }
2190
2191 if (bn == UDF_HOLE) { /* No holey swapfiles */
2192 cmn_err(CE_WARN, "SWAP file has HOLES");
2193 error = EINVAL;
2194 break;
2195 }
2196
2197 cur_len = MIN(io_len - done_len, contig);
2198
2199 /*
2200 * Check if more than one I/O is
2201 * required to complete the given
2202 * I/O operation
2203 */
2204 if (ip->i_udf->udf_lbsize < PAGESIZE) {
2205 if (cur_len >= PAGESIZE) {
2206 multi_io = 0;
2207 cur_len &= PAGEMASK;
2208 } else {
2209 multi_io = 1;
2210 cur_len = MIN(io_len - done_len, PAGESIZE);
2211 }
2212 }
2213 page_list_break(&cpp, &npp, btop(cur_len));
2214
2215 bp = pageio_setup(cpp, cur_len, ip->i_devvp, flags);
2216 ASSERT(bp != NULL);
2217
2218 bp->b_edev = ip->i_dev;
2219 bp->b_dev = cmpdev(ip->i_dev);
2220 bp->b_blkno = bn;
2221 bp->b_un.b_addr = (caddr_t)0;
2222 bp->b_file = vp;
2223 bp->b_offset = (offset_t)(io_off + done_len);
2224
2225 /*
2226 * ub.ub_pageios.value.ul++;
2227 */
2228 if (multi_io == 0) {
2229 (void) bdev_strategy(bp);
2230 } else {
2231 error = ud_multi_strat(ip, cpp, bp,
2232 (u_offset_t)(io_off + done_len));
2233 if (error != 0) {
2234 pageio_done(bp);
2235 break;
2236 }
2237 }
2238 if (flags & B_READ) {
2239 ud_pageio_reads++;
2240 } else {
2241 ud_pageio_writes++;
2242 }
2243
2244 /*
2245 * If the request is not B_ASYNC, wait for i/o to complete
2246 * and re-assemble the page list to return to the caller.
2247 * If it is B_ASYNC we leave the page list in pieces and
2248 * cleanup() will dispose of them.
2249 */
2250 if ((flags & B_ASYNC) == 0) {
2251 error = biowait(bp);
2252 pageio_done(bp);
2253 if (error) {
2254 break;
2255 }
2256 page_list_concat(&opp, &cpp);
2257 }
2258 cpp = npp;
2259 npp = NULL;
2260 done_len += cur_len;
2261 }
2262
2263 ASSERT(error || (cpp == NULL && npp == NULL && done_len == io_len));
2264 if (error) {
2265 if (flags & B_ASYNC) {
2266 /* Cleanup unprocessed parts of list */
2267 page_list_concat(&cpp, &npp);
2268 if (flags & B_READ) {
2269 pvn_read_done(cpp, B_ERROR);
2270 } else {
2271 pvn_write_done(cpp, B_ERROR);
2272 }
2273 } else {
2274 /* Re-assemble list and let caller clean up */
2275 page_list_concat(&opp, &cpp);
2276 page_list_concat(&opp, &npp);
2277 }
2278 }
2279
2280 if (dolock) {
2281 rw_exit(&ip->i_contents);
2282 }
2283
2284 return (error);
2285 }
2286
2287
2288
2289
2290 /* -------------------- local functions --------------------------- */
2291
2292
2293
2294 int32_t
2295 ud_rdwri(enum uio_rw rw, int32_t ioflag,
2296 struct ud_inode *ip, caddr_t base, int32_t len,
2297 offset_t offset, enum uio_seg seg, int32_t *aresid, struct cred *cr)
2298 {
2299 int32_t error;
2300 struct uio auio;
2301 struct iovec aiov;
2302
2303 ud_printf("ud_rdwri\n");
2304
2305 bzero((caddr_t)&auio, sizeof (uio_t));
2306 bzero((caddr_t)&aiov, sizeof (iovec_t));
2307
2308 aiov.iov_base = base;
2309 aiov.iov_len = len;
2310 auio.uio_iov = &aiov;
2311 auio.uio_iovcnt = 1;
2312 auio.uio_loffset = offset;
2313 auio.uio_segflg = (int16_t)seg;
2314 auio.uio_resid = len;
2315
2316 if (rw == UIO_WRITE) {
2317 auio.uio_fmode = FWRITE;
2318 auio.uio_extflg = UIO_COPY_DEFAULT;
2319 auio.uio_llimit = curproc->p_fsz_ctl;
2320 error = ud_wrip(ip, &auio, ioflag, cr);
2321 } else {
2322 auio.uio_fmode = FREAD;
2323 auio.uio_extflg = UIO_COPY_CACHED;
2324 auio.uio_llimit = MAXOFFSET_T;
2325 error = ud_rdip(ip, &auio, ioflag, cr);
2326 }
2327
2328 if (aresid) {
2329 *aresid = auio.uio_resid;
2330 } else if (auio.uio_resid) {
2331 error = EIO;
2332 }
2333 return (error);
2334 }
2335
/*
 * Free behind hacks.  The pager is busted.
 * XXX - need to pass the information down to writedone() in a flag like B_SEQ
 * or B_FREE_IF_TIGHT_ON_MEMORY.
 *
 * Neither variable is referenced in this part of the file.
 * NOTE(review): presumably ud_freebehind switches free-behind on and
 * ud_smallfile is a size threshold in bytes (32K) -- confirm against the
 * read path that consumes them.
 */
int32_t ud_freebehind = 1;
int32_t ud_smallfile = 32 * 1024;
2343
2344 /* ARGSUSED */
2345 int32_t
2346 ud_getpage_miss(struct vnode *vp, u_offset_t off,
2347 size_t len, struct seg *seg, caddr_t addr, page_t *pl[],
2348 size_t plsz, enum seg_rw rw, int32_t seq)
2349 {
2350 struct ud_inode *ip = VTOI(vp);
2351 int32_t err = 0;
2352 size_t io_len;
2353 u_offset_t io_off;
2354 u_offset_t pgoff;
2355 page_t *pp;
2356
2357 pl[0] = NULL;
2358
2359 /*
2360 * Figure out whether the page can be created, or must be
2361 * read from the disk
2362 */
2363 if (rw == S_CREATE) {
2364 if ((pp = page_create_va(vp, off,
2365 PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
2366 cmn_err(CE_WARN, "ud_getpage_miss: page_create");
2367 return (EINVAL);
2368 }
2369 io_len = PAGESIZE;
2370 } else {
2371 pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
2372 &io_len, off, PAGESIZE, 0);
2373
2374 /*
2375 * Some other thread has entered the page.
2376 * ud_getpage will retry page_lookup.
2377 */
2378 if (pp == NULL) {
2379 return (0);
2380 }
2381
2382 /*
2383 * Fill the page with as much data as we can from the file.
2384 */
2385 err = ud_page_fill(ip, pp, off, B_READ, &pgoff);
2386 if (err) {
2387 pvn_read_done(pp, B_ERROR);
2388 return (err);
2389 }
2390
2391 /*
2392 * XXX ??? ufs has io_len instead of pgoff below
2393 */
2394 ip->i_nextrio = off + ((pgoff + PAGESIZE - 1) & PAGEMASK);
2395
2396 /*
2397 * If the file access is sequential, initiate read ahead
2398 * of the next cluster.
2399 */
2400 if (seq && ip->i_nextrio < ip->i_size) {
2401 ud_getpage_ra(vp, off, seg, addr);
2402 }
2403 }
2404
2405 outmiss:
2406 pvn_plist_init(pp, pl, plsz, (offset_t)off, io_len, rw);
2407 return (err);
2408 }
2409
2410 /* ARGSUSED */
2411 void
2412 ud_getpage_ra(struct vnode *vp,
2413 u_offset_t off, struct seg *seg, caddr_t addr)
2414 {
2415 page_t *pp;
2416 size_t io_len;
2417 struct ud_inode *ip = VTOI(vp);
2418 u_offset_t io_off = ip->i_nextrio, pgoff;
2419 caddr_t addr2 = addr + (io_off - off);
2420 daddr_t bn;
2421 int32_t contig = 0;
2422
2423 /*
2424 * Is this test needed?
2425 */
2426
2427 if (addr2 >= seg->s_base + seg->s_size) {
2428 return;
2429 }
2430
2431 contig = 0;
2432 if (ud_bmap_read(ip, io_off, &bn, &contig) != 0 || bn == UDF_HOLE) {
2433 return;
2434 }
2435
2436 pp = pvn_read_kluster(vp, io_off, seg, addr2,
2437 &io_off, &io_len, io_off, PAGESIZE, 1);
2438
2439 /*
2440 * Some other thread has entered the page.
2441 * So no read head done here (ie we will have to and wait
2442 * for the read when needed).
2443 */
2444
2445 if (pp == NULL) {
2446 return;
2447 }
2448
2449 (void) ud_page_fill(ip, pp, io_off, (B_READ|B_ASYNC), &pgoff);
2450 ip->i_nextrio = io_off + ((pgoff + PAGESIZE - 1) & PAGEMASK);
2451 }
2452
/*
 * Fill page `pp' with the file data of inode `ip' at offset `off'.
 *
 *	bflgs	buf flags handed to pageio_setup(); when B_ASYNC is set
 *		the I/O is started but not waited for.
 *	pg_off	receives the number of bytes accounted for in the page
 *		(clipped so that off + *pg_off does not exceed i_size).
 *
 * Embedded files (ICB_FLAG_ONE_AD) are copied out of the file entry
 * block read through the buffer cache.  Otherwise the block is mapped
 * with ud_bmap_read() and either zero filled (hole), read with a
 * single bdev_strategy() call, or read in pieces via ud_multi_strat()
 * when the contiguous run does not cover the page.
 *
 * Returns 0 or the error from ud_bmap_read(), ud_multi_strat() or
 * biowait().
 */
int
ud_page_fill(struct ud_inode *ip, page_t *pp, u_offset_t off,
	uint32_t bflgs, u_offset_t *pg_off)
{
	daddr_t bn;
	struct buf *bp;
	caddr_t kaddr, caddr;
	int32_t error = 0, contig = 0, multi_io = 0;
	int32_t lbsize = ip->i_udf->udf_lbsize;
	int32_t lbmask = ip->i_udf->udf_lbmask;
	uint64_t isize;

	/* File size rounded up to a whole number of logical blocks */
	isize = (ip->i_size + lbmask) & (~lbmask);
	if (ip->i_desc_type == ICB_FLAG_ONE_AD) {

		/*
		 * Embedded file read file_entry
		 * from buffer cache and copy the required
		 * portions
		 */
		bp = ud_bread(ip->i_dev,
		    ip->i_icb_lbano << ip->i_udf->udf_l2d_shift, lbsize);
		/*
		 * NOTE(review): a failed or short read skips the copy but
		 * still returns 0 with *pg_off set to i_size -- confirm
		 * this is intended.
		 */
		if ((bp->b_error == 0) &&
		    (bp->b_resid == 0)) {

			caddr = bp->b_un.b_addr + ip->i_data_off;

			/*
			 * mapin to kvm
			 */
			kaddr = (caddr_t)ppmapin(pp,
			    PROT_READ | PROT_WRITE, (caddr_t)-1);
			(void) kcopy(caddr, kaddr, ip->i_size);

			/*
			 * mapout of kvm
			 */
			ppmapout(kaddr);
		}
		brelse(bp);
		contig = ip->i_size;
	} else {

		/*
		 * Get the continuous size and block number
		 * at offset "off"
		 */
		if (error = ud_bmap_read(ip, off, &bn, &contig))
			goto out;
		/* Clip to one page, then round up to whole logical blocks */
		contig = MIN(contig, PAGESIZE);
		contig = (contig + lbmask) & (~lbmask);

		/*
		 * Zero part of the page which we are not
		 * going to read from the disk.
		 */

		if (bn == UDF_HOLE) {

			/*
			 * This is a HOLE. Just zero out
			 * the page
			 */
			if (((off + contig) == isize) ||
			    (contig == PAGESIZE)) {
				pagezero(pp->p_prev, 0, PAGESIZE);
				goto out;
			}
			/*
			 * NOTE(review): a hole that neither fills the page
			 * nor reaches the rounded end of file falls through
			 * to the I/O setup below with bn == UDF_HOLE --
			 * confirm ud_multi_strat() copes with that.
			 */
		}

		if (contig < PAGESIZE) {
			uint64_t count;

			/* Bytes left between off and the rounded file end */
			count = isize - off;
			if (contig != count) {
				/*
				 * The run ends before the file does, so the
				 * rest of the page lives elsewhere on disk.
				 */
				multi_io = 1;
				contig = (int32_t)(MIN(count, PAGESIZE));
			} else {
				/* Last partial page: zero the unread tail */
				pagezero(pp->p_prev, contig, PAGESIZE - contig);
			}
		}

		/*
		 * Get a bp and initialize it
		 */
		bp = pageio_setup(pp, contig, ip->i_devvp, bflgs);
		ASSERT(bp != NULL);

		bp->b_edev = ip->i_dev;
		bp->b_dev = cmpdev(ip->i_dev);
		bp->b_blkno = bn;
		bp->b_un.b_addr = 0;
		bp->b_file = ip->i_vnode;

		/*
		 * Start I/O
		 */
		if (multi_io == 0) {

			/*
			 * Single I/O is sufficient for this page
			 */
			(void) bdev_strategy(bp);
		} else {

			/*
			 * We need to do the I/O in
			 * pieces
			 */
			error = ud_multi_strat(ip, pp, bp, off);
			/*
			 * NOTE(review): unlike udf_pageio(), this error path
			 * does not call pageio_done(bp) -- verify whether
			 * the buf is leaked here.
			 */
			if (error != 0) {
				goto out;
			}
		}
		if ((bflgs & B_ASYNC) == 0) {

			/*
			 * Wait for i/o to complete.
			 */

			error = biowait(bp);
			pageio_done(bp);
			if (error) {
				goto out;
			}
		}
	}
	/* Never report more data than the file actually contains */
	if ((off + contig) >= ip->i_size) {
		contig = ip->i_size - off;
	}

out:
	*pg_off = contig;
	return (error);
}
2588
2589 int32_t
2590 ud_putpages(struct vnode *vp, offset_t off,
2591 size_t len, int32_t flags, struct cred *cr)
2592 {
2593 struct ud_inode *ip;
2594 page_t *pp;
2595 u_offset_t io_off;
2596 size_t io_len;
2597 u_offset_t eoff;
2598 int32_t err = 0;
2599 int32_t dolock;
2600
2601 ud_printf("ud_putpages\n");
2602
2603 if (vp->v_count == 0) {
2604 cmn_err(CE_WARN, "ud_putpages: bad v_count");
2605 return (EINVAL);
2606 }
2607
2608 ip = VTOI(vp);
2609
2610 /*
2611 * Acquire the readers/write inode lock before locking
2612 * any pages in this inode.
2613 * The inode lock is held during i/o.
2614 */
2615 if (len == 0) {
2616 mutex_enter(&ip->i_tlock);
2617 ip->i_delayoff = ip->i_delaylen = 0;
2618 mutex_exit(&ip->i_tlock);
2619 }
2620 dolock = (rw_owner(&ip->i_contents) != curthread);
2621 if (dolock) {
2622 rw_enter(&ip->i_contents, RW_READER);
2623 }
2624
2625 if (!vn_has_cached_data(vp)) {
2626 if (dolock) {
2627 rw_exit(&ip->i_contents);
2628 }
2629 return (0);
2630 }
2631
2632 if (len == 0) {
2633 /*
2634 * Search the entire vp list for pages >= off.
2635 */
2636 err = pvn_vplist_dirty(vp, (u_offset_t)off, ud_putapage,
2637 flags, cr);
2638 } else {
2639 /*
2640 * Loop over all offsets in the range looking for
2641 * pages to deal with.
2642 */
2643 if ((eoff = blkroundup(ip->i_udf, ip->i_size)) != 0) {
2644 eoff = MIN(off + len, eoff);
2645 } else {
2646 eoff = off + len;
2647 }
2648
2649 for (io_off = off; io_off < eoff; io_off += io_len) {
2650 /*
2651 * If we are not invalidating, synchronously
2652 * freeing or writing pages, use the routine
2653 * page_lookup_nowait() to prevent reclaiming
2654 * them from the free list.
2655 */
2656 if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
2657 pp = page_lookup(vp, io_off,
2658 (flags & (B_INVAL | B_FREE)) ?
2659 SE_EXCL : SE_SHARED);
2660 } else {
2661 pp = page_lookup_nowait(vp, io_off,
2662 (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2663 }
2664
2665 if (pp == NULL || pvn_getdirty(pp, flags) == 0) {
2666 io_len = PAGESIZE;
2667 } else {
2668
2669 err = ud_putapage(vp, pp,
2670 &io_off, &io_len, flags, cr);
2671 if (err != 0) {
2672 break;
2673 }
2674 /*
2675 * "io_off" and "io_len" are returned as
2676 * the range of pages we actually wrote.
2677 * This allows us to skip ahead more quickly
2678 * since several pages may've been dealt
2679 * with by this iteration of the loop.
2680 */
2681 }
2682 }
2683 }
2684 if (err == 0 && off == 0 && (len == 0 || len >= ip->i_size)) {
2685 /*
2686 * We have just sync'ed back all the pages on
2687 * the inode, turn off the IMODTIME flag.
2688 */
2689 mutex_enter(&ip->i_tlock);
2690 ip->i_flag &= ~IMODTIME;
2691 mutex_exit(&ip->i_tlock);
2692 }
2693 if (dolock) {
2694 rw_exit(&ip->i_contents);
2695 }
2696 return (err);
2697 }
2698
2699 /* ARGSUSED */
2700 int32_t
2701 ud_putapage(struct vnode *vp,
2702 page_t *pp, u_offset_t *offp,
2703 size_t *lenp, int32_t flags, struct cred *cr)
2704 {
2705 daddr_t bn;
2706 size_t io_len;
2707 struct ud_inode *ip;
2708 int32_t error = 0, contig, multi_io = 0;
2709 struct udf_vfs *udf_vfsp;
2710 u_offset_t off, io_off;
2711 caddr_t kaddr, caddr;
2712 struct buf *bp = NULL;
2713 int32_t lbmask;
2714 uint64_t isize;
2715 uint16_t crc_len;
2716 struct file_entry *fe;
2717
2718 ud_printf("ud_putapage\n");
2719
2720 ip = VTOI(vp);
2721 ASSERT(ip);
2722 ASSERT(RW_LOCK_HELD(&ip->i_contents));
2723 lbmask = ip->i_udf->udf_lbmask;
2724 isize = (ip->i_size + lbmask) & (~lbmask);
2725
2726 udf_vfsp = ip->i_udf;
2727 ASSERT(udf_vfsp->udf_flags & UDF_FL_RW);
2728
2729 /*
2730 * If the modified time on the inode has not already been
2731 * set elsewhere (e.g. for write/setattr) we set the time now.
2732 * This gives us approximate modified times for mmap'ed files
2733 * which are modified via stores in the user address space.
2734 */
2735 if (((ip->i_flag & IMODTIME) == 0) || (flags & B_FORCE)) {
2736 mutex_enter(&ip->i_tlock);
2737 ip->i_flag |= IUPD;
2738 ITIMES_NOLOCK(ip);
2739 mutex_exit(&ip->i_tlock);
2740 }
2741
2742
2743 /*
2744 * Align the request to a block boundry (for old file systems),
2745 * and go ask bmap() how contiguous things are for this file.
2746 */
2747 off = pp->p_offset & ~(offset_t)lbmask;
2748 /* block align it */
2749
2750
2751 if (ip->i_desc_type == ICB_FLAG_ONE_AD) {
2752 ASSERT(ip->i_size <= ip->i_max_emb);
2753
2754 pp = pvn_write_kluster(vp, pp, &io_off,
2755 &io_len, off, PAGESIZE, flags);
2756 if (io_len == 0) {
2757 io_len = PAGESIZE;
2758 }
2759
2760 bp = ud_bread(ip->i_dev,
2761 ip->i_icb_lbano << udf_vfsp->udf_l2d_shift,
2762 udf_vfsp->udf_lbsize);
2763 fe = (struct file_entry *)bp->b_un.b_addr;
2764 if ((bp->b_flags & B_ERROR) ||
2765 (ud_verify_tag_and_desc(&fe->fe_tag, UD_FILE_ENTRY,
2766 ip->i_icb_block,
2767 1, udf_vfsp->udf_lbsize) != 0)) {
2768 if (pp != NULL)
2769 pvn_write_done(pp, B_ERROR | B_WRITE | flags);
2770 if (bp->b_flags & B_ERROR) {
2771 error = EIO;
2772 } else {
2773 error = EINVAL;
2774 }
2775 brelse(bp);
2776 return (error);
2777 }
2778 if ((bp->b_error == 0) &&
2779 (bp->b_resid == 0)) {
2780
2781 caddr = bp->b_un.b_addr + ip->i_data_off;
2782 kaddr = (caddr_t)ppmapin(pp,
2783 PROT_READ | PROT_WRITE, (caddr_t)-1);
2784 (void) kcopy(kaddr, caddr, ip->i_size);
2785 ppmapout(kaddr);
2786 }
2787 crc_len = offsetof(struct file_entry, fe_spec) +
2788 SWAP_32(fe->fe_len_ear);
2789 crc_len += ip->i_size;
2790 ud_make_tag(ip->i_udf, &fe->fe_tag,
2791 UD_FILE_ENTRY, ip->i_icb_block, crc_len);
2792
2793 bwrite(bp);
2794
2795 if (flags & B_ASYNC) {
2796 pvn_write_done(pp, flags);
2797 }
2798 contig = ip->i_size;
2799 } else {
2800
2801 if (error = ud_bmap_read(ip, off, &bn, &contig)) {
2802 goto out;
2803 }
2804 contig = MIN(contig, PAGESIZE);
2805 contig = (contig + lbmask) & (~lbmask);
2806
2807 if (contig < PAGESIZE) {
2808 uint64_t count;
2809
2810 count = isize - off;
2811 if (contig != count) {
2812 multi_io = 1;
2813 contig = (int32_t)(MIN(count, PAGESIZE));
2814 }
2815 }
2816
2817 if ((off + contig) > isize) {
2818 contig = isize - off;
2819 }
2820
2821 if (contig > PAGESIZE) {
2822 if (contig & PAGEOFFSET) {
2823 contig &= PAGEMASK;
2824 }
2825 }
2826
2827 pp = pvn_write_kluster(vp, pp, &io_off,
2828 &io_len, off, contig, flags);
2829 if (io_len == 0) {
2830 io_len = PAGESIZE;
2831 }
2832
2833 bp = pageio_setup(pp, contig, ip->i_devvp, B_WRITE | flags);
2834 ASSERT(bp != NULL);
2835
2836 bp->b_edev = ip->i_dev;
2837 bp->b_dev = cmpdev(ip->i_dev);
2838 bp->b_blkno = bn;
2839 bp->b_un.b_addr = 0;
2840 bp->b_file = vp;
2841 bp->b_offset = (offset_t)off;
2842
2843
2844 /*
2845 * write throttle
2846 */
2847 ASSERT(bp->b_iodone == NULL);
2848 bp->b_iodone = ud_iodone;
2849 mutex_enter(&ip->i_tlock);
2850 ip->i_writes += bp->b_bcount;
2851 mutex_exit(&ip->i_tlock);
2852
2853 if (multi_io == 0) {
2854
2855 (void) bdev_strategy(bp);
2856 } else {
2857 error = ud_multi_strat(ip, pp, bp, off);
2858 if (error != 0) {
2859 goto out;
2860 }
2861 }
2862
2863 if ((flags & B_ASYNC) == 0) {
2864 /*
2865 * Wait for i/o to complete.
2866 */
2867 error = biowait(bp);
2868 pageio_done(bp);
2869 }
2870 }
2871
2872 if ((flags & B_ASYNC) == 0) {
2873 pvn_write_done(pp, ((error) ? B_ERROR : 0) | B_WRITE | flags);
2874 }
2875
2876 pp = NULL;
2877
2878 out:
2879 if (error != 0 && pp != NULL) {
2880 pvn_write_done(pp, B_ERROR | B_WRITE | flags);
2881 }
2882
2883 if (offp) {
2884 *offp = io_off;
2885 }
2886 if (lenp) {
2887 *lenp = io_len;
2888 }
2889
2890 return (error);
2891 }
2892
2893
2894 int32_t
2895 ud_iodone(struct buf *bp)
2896 {
2897 struct ud_inode *ip;
2898
2899 ASSERT((bp->b_pages->p_vnode != NULL) && !(bp->b_flags & B_READ));
2900
2901 bp->b_iodone = NULL;
2902
2903 ip = VTOI(bp->b_pages->p_vnode);
2904
2905 mutex_enter(&ip->i_tlock);
2906 if (ip->i_writes >= ud_LW) {
2907 if ((ip->i_writes -= bp->b_bcount) <= ud_LW) {
2908 if (ud_WRITES) {
2909 cv_broadcast(&ip->i_wrcv); /* wake all up */
2910 }
2911 }
2912 } else {
2913 ip->i_writes -= bp->b_bcount;
2914 }
2915 mutex_exit(&ip->i_tlock);
2916 iodone(bp);
2917 return (0);
2918 }
2919
2920 /* ARGSUSED3 */
2921 int32_t
2922 ud_rdip(struct ud_inode *ip, struct uio *uio, int32_t ioflag, cred_t *cr)
2923 {
2924 struct vnode *vp;
2925 struct udf_vfs *udf_vfsp;
2926 krw_t rwtype;
2927 caddr_t base;
2928 uint32_t flags;
2929 int32_t error, n, on, mapon, dofree;
2930 u_offset_t off;
2931 long oresid = uio->uio_resid;
2932
2933 ASSERT(RW_LOCK_HELD(&ip->i_contents));
2934 if ((ip->i_type != VREG) &&
2935 (ip->i_type != VDIR) &&
2936 (ip->i_type != VLNK)) {
2937 return (EIO);
2938 }
2939
2940 if (uio->uio_loffset > MAXOFFSET_T) {
2941 return (0);
2942 }
2943
2944 if ((uio->uio_loffset < (offset_t)0) ||
2945 ((uio->uio_loffset + uio->uio_resid) < 0)) {
2946 return (EINVAL);
2947 }
2948 if (uio->uio_resid == 0) {
2949 return (0);
2950 }
2951
2952 vp = ITOV(ip);
2953 udf_vfsp = ip->i_udf;
2954 mutex_enter(&ip->i_tlock);
2955 ip->i_flag |= IACC;
2956 mutex_exit(&ip->i_tlock);
2957
2958 rwtype = (rw_write_held(&ip->i_contents)?RW_WRITER:RW_READER);
2959
2960 do {
2961 offset_t diff;
2962 u_offset_t uoff = uio->uio_loffset;
2963 off = uoff & (offset_t)MAXBMASK;
2964 mapon = (int)(uoff & (offset_t)MAXBOFFSET);
2965 on = (int)blkoff(udf_vfsp, uoff);
2966 n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);
2967
2968 diff = ip->i_size - uoff;
2969
2970 if (diff <= (offset_t)0) {
2971 error = 0;
2972 goto out;
2973 }
2974 if (diff < (offset_t)n) {
2975 n = (int)diff;
2976 }
2977 dofree = ud_freebehind &&
2978 ip->i_nextr == (off & PAGEMASK) &&
2979 off > ud_smallfile;
2980
2981 if (rwtype == RW_READER) {
2982 rw_exit(&ip->i_contents);
2983 }
2984
2985 base = segmap_getmapflt(segkmap, vp, (off + mapon),
2986 (uint32_t)n, 1, S_READ);
2987 error = uiomove(base + mapon, (long)n, UIO_READ, uio);
2988
2989 flags = 0;
2990 if (!error) {
2991 /*
2992 * If read a whole block, or read to eof,
2993 * won't need this buffer again soon.
2994 */
2995 if (n + on == MAXBSIZE && ud_freebehind && dofree &&
2996 freemem < lotsfree + pages_before_pager) {
2997 flags = SM_FREE | SM_DONTNEED |SM_ASYNC;
2998 }
2999 /*
3000 * In POSIX SYNC (FSYNC and FDSYNC) read mode,
3001 * we want to make sure that the page which has
3002 * been read, is written on disk if it is dirty.
3003 * And corresponding indirect blocks should also
3004 * be flushed out.
3005 */
3006 if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) {
3007 flags &= ~SM_ASYNC;
3008 flags |= SM_WRITE;
3009 }
3010 error = segmap_release(segkmap, base, flags);
3011 } else {
3012 (void) segmap_release(segkmap, base, flags);
3013 }
3014
3015 if (rwtype == RW_READER) {
3016 rw_enter(&ip->i_contents, rwtype);
3017 }
3018 } while (error == 0 && uio->uio_resid > 0 && n != 0);
3019 out:
3020 /*
3021 * Inode is updated according to this table if FRSYNC is set.
3022 *
3023 * FSYNC FDSYNC(posix.4)
3024 * --------------------------
3025 * always IATTCHG|IBDWRITE
3026 */
3027 if (ioflag & FRSYNC) {
3028 if ((ioflag & FSYNC) ||
3029 ((ioflag & FDSYNC) &&
3030 (ip->i_flag & (IATTCHG|IBDWRITE)))) {
3031 rw_exit(&ip->i_contents);
3032 rw_enter(&ip->i_contents, RW_WRITER);
3033 ud_iupdat(ip, 1);
3034 }
3035 }
3036 /*
3037 * If we've already done a partial read, terminate
3038 * the read but return no error.
3039 */
3040 if (oresid != uio->uio_resid) {
3041 error = 0;
3042 }
3043 ITIMES(ip);
3044
3045 return (error);
3046 }
3047
/*
 * Write the data described by "uio" into the file backing "ip".
 *
 * The caller must hold ip->i_contents as writer.  The lock is dropped
 * around each segmap/uiomove step and re-taken before the inode is
 * touched again.  Data is moved at most one logical block at a time.
 * When a pass extends the file, blocks are allocated with
 * ud_bmap_write() before i_size is raised; if that pass then fails the
 * file is truncated back to the size it had before the pass.
 *
 * ip     - inode to write; must be a regular file, directory or symlink
 * uio    - describes the source data, the file offset and the size limit
 * ioflag - FSYNC/FDSYNC request synchronous behaviour
 * cr     - credentials used for block allocation, truncation and the
 *          set-id privilege check
 *
 * Returns 0 or an errno.  Once any data has been transferred an error
 * is not reported.
 */
int32_t
ud_wrip(struct ud_inode *ip, struct uio *uio, int ioflag, struct cred *cr)
{
	caddr_t base;
	struct vnode *vp;
	struct udf_vfs *udf_vfsp;
	uint32_t flags;
	/*
	 * NOTE(review): iupdat_flag is assigned only when FSYNC or FDSYNC
	 * is set, and it is read below only when FSYNC is set.
	 */
	int32_t error = 0, iupdat_flag, n, on, mapon, i_size_changed = 0;
	int32_t pagecreate, newpage;
	/* Valid only once i_size_changed has been set. */
	uint64_t old_i_size;
	u_offset_t off;
	long start_resid = uio->uio_resid, premove_resid;
	rlim64_t limit = uio->uio_limit;


	ASSERT(RW_WRITE_HELD(&ip->i_contents));
	if ((ip->i_type != VREG) &&
	    (ip->i_type != VDIR) &&
	    (ip->i_type != VLNK)) {
		return (EIO);
	}

	if (uio->uio_loffset >= MAXOFFSET_T) {
		return (EFBIG);
	}
	/*
	 * see udf_l_pathconf
	 *
	 * The effective limit is never allowed above 2^40 - 1.
	 */
	if (limit > (((uint64_t)1 << 40) - 1)) {
		limit = ((uint64_t)1 << 40) - 1;
	}
	if (uio->uio_loffset >= limit) {
		proc_t *p = ttoproc(curthread);

		/*
		 * Starting at or beyond the limit: run the RLIMIT_FSIZE
		 * resource control action and fail the write.
		 */
		mutex_enter(&p->p_lock);
		(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
		    p, RCA_UNSAFE_SIGINFO);
		mutex_exit(&p->p_lock);
		return (EFBIG);
	}
	if ((uio->uio_loffset < (offset_t)0) ||
	    ((uio->uio_loffset + uio->uio_resid) < 0)) {
		return (EINVAL);
	}
	if (uio->uio_resid == 0) {
		return (0);
	}

	mutex_enter(&ip->i_tlock);
	ip->i_flag |= INOACC;

	if (ioflag & (FSYNC | FDSYNC)) {
		ip->i_flag |= ISYNC;
		iupdat_flag = 1;
	}
	mutex_exit(&ip->i_tlock);

	udf_vfsp = ip->i_udf;
	vp = ITOV(ip);

	do {
		/*
		 * off   - start of the MAXBSIZE window containing uoff
		 * mapon - offset of uoff within that window
		 * on    - offset of uoff within its logical block
		 * n     - bytes to move this pass
		 */
		u_offset_t uoff = uio->uio_loffset;
		off = uoff & (offset_t)MAXBMASK;
		mapon = (int)(uoff & (offset_t)MAXBOFFSET);
		on = (int)blkoff(udf_vfsp, uoff);
		n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);

		/*
		 * Regular files are clipped at the size limit.
		 */
		if (ip->i_type == VREG && uoff + n >= limit) {
			if (uoff >= limit) {
				error = EFBIG;
				goto out;
			}
			n = (int)(limit - (rlim64_t)uoff);
		}
		if (uoff + n > ip->i_size) {
			/*
			 * We are extending the length of the file.
			 * bmap is used so that we are sure that
			 * if we need to allocate new blocks, that it
			 * is done here before we up the file size.
			 */
			error = ud_bmap_write(ip, uoff,
			    (int)(on + n), mapon == 0, cr);
			if (error) {
				break;
			}
			i_size_changed = 1;
			old_i_size = ip->i_size;
			ip->i_size = uoff + n;
			/*
			 * If we are writing from the beginning of
			 * the mapping, we can just create the
			 * pages without having to read them.
			 */
			pagecreate = (mapon == 0);
		} else if (n == MAXBSIZE) {
			/*
			 * Going to do a whole mappings worth,
			 * so we can just create the pages w/o
			 * having to read them in.  But before
			 * we do that, we need to make sure any
			 * needed blocks are allocated first.
			 */
			error = ud_bmap_write(ip, uoff,
			    (int)(on + n), 1, cr);
			if (error) {
				break;
			}
			pagecreate = 1;
		} else {
			pagecreate = 0;
		}

		/*
		 * The contents lock is not held across the segmap and
		 * uiomove calls; it is re-taken further down.
		 */
		rw_exit(&ip->i_contents);

		/*
		 * Touch the page and fault it in if it is not in
		 * core before segmap_getmapflt can lock it. This
		 * is to avoid the deadlock if the buffer is mapped
		 * to the same file through mmap which we want to
		 * write to.
		 */
		uio_prefaultpages((long)n, uio);

		base = segmap_getmapflt(segkmap, vp, (off + mapon),
		    (uint32_t)n, !pagecreate, S_WRITE);

		/*
		 * segmap_pagecreate() returns 1 if it calls
		 * page_create_va() to allocate any pages.
		 */
		newpage = 0;
		if (pagecreate) {
			newpage = segmap_pagecreate(segkmap, base,
			    (size_t)n, 0);
		}

		/* Remembered so a failed synchronous push can be unwound. */
		premove_resid = uio->uio_resid;
		error = uiomove(base + mapon, (long)n, UIO_WRITE, uio);

		if (pagecreate &&
		    uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) {
			/*
			 * We created pages w/o initializing them completely,
			 * thus we need to zero the part that wasn't set up.
			 * This happens on most EOF write cases and if
			 * we had some sort of error during the uiomove.
			 */
			int nzero, nmoved;

			nmoved = (int)(uio->uio_loffset - (off + mapon));
			ASSERT(nmoved >= 0 && nmoved <= n);
			nzero = roundup(on + n, PAGESIZE) - nmoved;
			ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE);
			(void) kzero(base + mapon + nmoved, (uint32_t)nzero);
		}

		/*
		 * Unlock the pages allocated by page_create_va()
		 * in segmap_pagecreate()
		 */
		if (newpage) {
			segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE);
		}

		if (error) {
			/*
			 * If we failed on a write, we may have already
			 * allocated file blocks as well as pages.  It's
			 * hard to undo the block allocation, but we must
			 * be sure to invalidate any pages that may have
			 * been allocated.
			 */
			(void) segmap_release(segkmap, base, SM_INVAL);
		} else {
			flags = 0;
			/*
			 * Force write back for synchronous write cases.
			 */
			if ((ioflag & (FSYNC|FDSYNC)) || ip->i_type == VDIR) {
				/*
				 * If the sticky bit is set but the
				 * execute bit is not set, we do a
				 * synchronous write back and free
				 * the page when done.  We set up swap
				 * files to be handled this way to
				 * prevent servers from keeping around
				 * the client's swap pages too long.
				 * XXX - there ought to be a better way.
				 */
				if (IS_SWAPVP(vp)) {
					flags = SM_WRITE | SM_FREE |
					    SM_DONTNEED;
					iupdat_flag = 0;
				} else {
					flags = SM_WRITE;
				}
			} else if (((mapon + n) == MAXBSIZE) ||
			    IS_SWAPVP(vp)) {
				/*
				 * Have written a whole block.
				 * Start an asynchronous write and
				 * mark the buffer to indicate that
				 * it won't be needed again soon.
				 */
				flags = SM_WRITE |SM_ASYNC | SM_DONTNEED;
			}
			error = segmap_release(segkmap, base, flags);

			/*
			 * If the operation failed and is synchronous,
			 * then we need to unwind what uiomove() last
			 * did so we can potentially return an error to
			 * the caller.  If this write operation was
			 * done in two pieces and the first succeeded,
			 * then we won't return an error for the second
			 * piece that failed.  However, we only want to
			 * return a resid value that reflects what was
			 * really done.
			 *
			 * Failures for non-synchronous operations can
			 * be ignored since the page subsystem will
			 * retry the operation until it succeeds or the
			 * file system is unmounted.
			 */
			if (error) {
				if ((ioflag & (FSYNC | FDSYNC)) ||
				    ip->i_type == VDIR) {
					uio->uio_resid = premove_resid;
				} else {
					error = 0;
				}
			}
		}

		/*
		 * Re-acquire contents lock.
		 */
		rw_enter(&ip->i_contents, RW_WRITER);
		/*
		 * If the uiomove() failed or if a synchronous
		 * page push failed, fix up i_size.
		 */
		if (error) {
			if (i_size_changed) {
				/*
				 * The uiomove failed, and we
				 * allocated blocks, so get rid
				 * of them.
				 */
				(void) ud_itrunc(ip, old_i_size, 0, cr);
			}
		} else {
			/*
			 * XXX - Can this be out of the loop?
			 */
			ip->i_flag |= IUPD | ICHG;
			if (i_size_changed) {
				ip->i_flag |= IATTCHG;
			}
			if ((ip->i_perm & (IEXEC | (IEXEC >> 5) |
			    (IEXEC >> 10))) != 0 &&
			    (ip->i_char & (ISUID | ISGID)) != 0 &&
			    secpolicy_vnode_setid_retain(cr,
			    (ip->i_char & ISUID) != 0 && ip->i_uid == 0) != 0) {
				/*
				 * Clear Set-UID & Set-GID bits on
				 * successful write if not privileged
				 * and at least one of the execute bits
				 * is set.  If we always clear Set-GID,
				 * mandatory file and record locking is
				 * unusable.
				 */
				ip->i_char &= ~(ISUID | ISGID);
			}
		}
	} while (error == 0 && uio->uio_resid > 0 && n != 0);

out:
	/*
	 * Inode is updated according to this table -
	 *
	 *	FSYNC		FDSYNC(posix.4)
	 *	--------------------------
	 *	always@		IATTCHG|IBDWRITE
	 *
	 * @ - If we are doing synchronous write the only time we should
	 *	not be sync'ing the ip here is if we have the stickyhack
	 *	activated, the file is marked with the sticky bit and
	 *	no exec bit, the file length has not been changed and
	 *	no new blocks have been allocated during this write.
	 */
	if ((ip->i_flag & ISYNC) != 0) {
		/*
		 * we have eliminated nosync
		 */
		if ((ip->i_flag & (IATTCHG|IBDWRITE)) ||
		    ((ioflag & FSYNC) && iupdat_flag)) {
			ud_iupdat(ip, 1);
		}
	}

	/*
	 * If we've already done a partial-write, terminate
	 * the write but return no error.
	 */
	if (start_resid != uio->uio_resid) {
		error = 0;
	}
	/* Drop the per-call flags set on entry and refresh the times. */
	ip->i_flag &= ~(INOACC | ISYNC);
	ITIMES_NOLOCK(ip);

	return (error);
}
3362
3363 int32_t
3364 ud_multi_strat(struct ud_inode *ip,
3365 page_t *pp, struct buf *bp, u_offset_t start)
3366 {
3367 daddr_t bn;
3368 int32_t error = 0, io_count, contig, alloc_sz, i;
3369 uint32_t io_off;
3370 mio_master_t *mm = NULL;
3371 mio_slave_t *ms = NULL;
3372 struct buf *rbp;
3373
3374 ASSERT(!(start & PAGEOFFSET));
3375
3376 /*
3377 * Figure out how many buffers to allocate
3378 */
3379 io_count = 0;
3380 for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
3381 contig = 0;
3382 if (error = ud_bmap_read(ip, (u_offset_t)(start + io_off),
3383 &bn, &contig)) {
3384 goto end;
3385 }
3386 if (contig == 0) {
3387 goto end;
3388 }
3389 contig = MIN(contig, PAGESIZE - io_off);
3390 if (bn != UDF_HOLE) {
3391 io_count ++;
3392 } else {
3393 /*
3394 * HOLE
3395 */
3396 if (bp->b_flags & B_READ) {
3397
3398 /*
3399 * This is a hole and is read
3400 * it should be filled with 0's
3401 */
3402 pagezero(pp, io_off, contig);
3403 }
3404 }
3405 }
3406
3407
3408 if (io_count != 0) {
3409
3410 /*
3411 * Allocate memory for all the
3412 * required number of buffers
3413 */
3414 alloc_sz = sizeof (mio_master_t) +
3415 (sizeof (mio_slave_t) * io_count);
3416 mm = (mio_master_t *)kmem_zalloc(alloc_sz, KM_SLEEP);
3417 if (mm == NULL) {
3418 error = ENOMEM;
3419 goto end;
3420 }
3421
3422 /*
3423 * initialize master
3424 */
3425 mutex_init(&mm->mm_mutex, NULL, MUTEX_DEFAULT, NULL);
3426 mm->mm_size = alloc_sz;
3427 mm->mm_bp = bp;
3428 mm->mm_resid = 0;
3429 mm->mm_error = 0;
3430 mm->mm_index = master_index++;
3431
3432 ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));
3433
3434 /*
3435 * Initialize buffers
3436 */
3437 io_count = 0;
3438 for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
3439 contig = 0;
3440 if (error = ud_bmap_read(ip,
3441 (u_offset_t)(start + io_off),
3442 &bn, &contig)) {
3443 goto end;
3444 }
3445 ASSERT(contig);
3446 if ((io_off + contig) > bp->b_bcount) {
3447 contig = bp->b_bcount - io_off;
3448 }
3449 if (bn != UDF_HOLE) {
3450 /*
3451 * Clone the buffer
3452 * and prepare to start I/O
3453 */
3454 ms->ms_ptr = mm;
3455 bioinit(&ms->ms_buf);
3456 rbp = bioclone(bp, io_off, (size_t)contig,
3457 bp->b_edev, bn, ud_slave_done,
3458 &ms->ms_buf, KM_NOSLEEP);
3459 ASSERT(rbp == &ms->ms_buf);
3460 mm->mm_resid += contig;
3461 io_count++;
3462 ms ++;
3463 }
3464 }
3465
3466 /*
3467 * Start I/O's
3468 */
3469 ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));
3470 for (i = 0; i < io_count; i++) {
3471 (void) bdev_strategy(&ms->ms_buf);
3472 ms ++;
3473 }
3474 }
3475
3476 end:
3477 if (error != 0) {
3478 bp->b_flags |= B_ERROR;
3479 bp->b_error = error;
3480 if (mm != NULL) {
3481 mutex_destroy(&mm->mm_mutex);
3482 kmem_free(mm, mm->mm_size);
3483 }
3484 }
3485 return (error);
3486 }
3487
3488 int32_t
3489 ud_slave_done(struct buf *bp)
3490 {
3491 mio_master_t *mm;
3492 int32_t resid;
3493
3494 ASSERT(SEMA_HELD(&bp->b_sem));
3495 ASSERT((bp->b_flags & B_DONE) == 0);
3496
3497 mm = ((mio_slave_t *)bp)->ms_ptr;
3498
3499 /*
3500 * Propagate error and byte count info from slave struct to
3501 * the master struct
3502 */
3503 mutex_enter(&mm->mm_mutex);
3504 if (bp->b_flags & B_ERROR) {
3505
3506 /*
3507 * If multiple slave buffers get
3508 * error we forget the old errors
3509 * this is ok because we any way
3510 * cannot return multiple errors
3511 */
3512 mm->mm_error = bp->b_error;
3513 }
3514 mm->mm_resid -= bp->b_bcount;
3515 resid = mm->mm_resid;
3516 mutex_exit(&mm->mm_mutex);
3517
3518 /*
3519 * free up the resources allocated to cloned buffers.
3520 */
3521 bp_mapout(bp);
3522 biofini(bp);
3523
3524 if (resid == 0) {
3525
3526 /*
3527 * This is the last I/O operation
3528 * clean up and return the original buffer
3529 */
3530 if (mm->mm_error) {
3531 mm->mm_bp->b_flags |= B_ERROR;
3532 mm->mm_bp->b_error = mm->mm_error;
3533 }
3534 biodone(mm->mm_bp);
3535 mutex_destroy(&mm->mm_mutex);
3536 kmem_free(mm, mm->mm_size);
3537 }
3538 return (0);
3539 }