1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
25 * Copyright 2019 OmniOS Community Edition (OmniOSce) Association.
26 */
27
28 #include <sys/types.h>
29 #include <sys/t_lock.h>
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/buf.h>
33 #include <sys/conf.h>
34 #include <sys/cred.h>
35 #include <sys/kmem.h>
36 #include <sys/sysmacros.h>
37 #include <sys/vfs.h>
38 #include <sys/vfs_opreg.h>
39 #include <sys/vnode.h>
40 #include <sys/debug.h>
41 #include <sys/errno.h>
42 #include <sys/time.h>
43 #include <sys/file.h>
44 #include <sys/open.h>
45 #include <sys/user.h>
46 #include <sys/termios.h>
47 #include <sys/stream.h>
48 #include <sys/strsubr.h>
49 #include <sys/strsun.h>
50 #include <sys/esunddi.h>
51 #include <sys/flock.h>
52 #include <sys/modctl.h>
53 #include <sys/cmn_err.h>
54 #include <sys/mkdev.h>
55 #include <sys/pathname.h>
56 #include <sys/ddi.h>
57 #include <sys/stat.h>
58 #include <sys/fs/snode.h>
59 #include <sys/fs/dv_node.h>
60 #include <sys/zone.h>
61
62 #include <sys/socket.h>
63 #include <sys/socketvar.h>
64 #include <netinet/in.h>
65 #include <sys/un.h>
66 #include <sys/ucred.h>
67
68 #include <sys/tiuser.h>
69 #define _SUN_TPI_VERSION 2
70 #include <sys/tihdr.h>
71
72 #include <c2/audit.h>
73
74 #include <fs/sockfs/nl7c.h>
75 #include <fs/sockfs/sockcommon.h>
76 #include <fs/sockfs/sockfilter_impl.h>
77 #include <fs/sockfs/socktpi.h>
78 #include <fs/sockfs/socktpi_impl.h>
79 #include <fs/sockfs/sodirect.h>
80
/*
 * Macros that operate on struct cmsghdr.
 * The CMSG_VALID macro does not assume that the last option buffer is padded.
 */

/* Address of the data that immediately follows the cmsghdr itself. */
#define CMSG_CONTENT(cmsg) (&((cmsg)[1]))

/* Number of content bytes: cmsg_len minus the size of the header. */
#define CMSG_CONTENTLEN(cmsg) ((cmsg)->cmsg_len - sizeof (struct cmsghdr))

/*
 * Non-zero when cmsg is a usable header inside the buffer [start, end):
 * it is aligned, it lies at or after start and before end, its cmsg_len
 * covers at least the header, and header plus content end at or before end.
 */
#define CMSG_VALID(cmsg, start, end) \
(ISALIGNED_cmsghdr(cmsg) && \
((uintptr_t)(cmsg) >= (uintptr_t)(start)) && \
((uintptr_t)(cmsg) < (uintptr_t)(end)) && \
((ssize_t)(cmsg)->cmsg_len >= sizeof (struct cmsghdr)) && \
((uintptr_t)(cmsg) + (cmsg)->cmsg_len <= (uintptr_t)(end)))

/*
 * Passed as the time argument to cv_wait_stop() by so_lock_single() and
 * so_lock_read().
 */
#define SO_LOCK_WAKEUP_TIME 3000 /* Wakeup time in milliseconds */

/* Built by sockinit() with makedevice() from the major returned by getudev() */
dev_t sockdev; /* For fsid in getattr */

/* Set to 1 by sockinit() when modrootloaded is zero and nl7c_init() is skipped */
int sockfs_defer_nl7c_init = 0;

/* Its sl_lock mutex is initialized by sockinit() */
struct socklist socklist;

/* kmem cache of struct sonode objects, created by sockinit() */
struct kmem_cache *socket_cache;

/*
 * sockconf_lock protects the socket configuration (socket types and
 * socket filters) which is changed via the sockconfig system call.
 */
krwlock_t sockconf_lock;

/* kstat callbacks; defined elsewhere in this file */
static int sockfs_update(kstat_t *, int);
static int sockfs_snapshot(kstat_t *, void *, int);

/* Its result is handed to smod_add() by sockinit() */
extern smod_info_t *sotpi_smod_create(void);

/* Called once from sockinit() */
extern void sendfile_init();

/* Called from sockinit() only when modrootloaded is non-zero */
extern void nl7c_init(void);

/* Tested by sockinit() to decide whether nl7c_init() must be deferred */
extern int modrootloaded;
117
118 /*
119 * Translate from a device pathname (e.g. "/dev/tcp") to a vnode.
120 * Returns with the vnode held.
121 */
122 int
123 sogetvp(char *devpath, vnode_t **vpp, int uioflag)
124 {
125 struct snode *csp;
126 vnode_t *vp, *dvp;
127 major_t maj;
128 int error;
129
130 ASSERT(uioflag == UIO_SYSSPACE || uioflag == UIO_USERSPACE);
131
132 /*
133 * Lookup the underlying filesystem vnode.
134 */
135 error = lookupname(devpath, uioflag, FOLLOW, NULLVPP, &vp);
136 if (error)
137 return (error);
138
139 /* Check that it is the correct vnode */
140 if (vp->v_type != VCHR) {
141 VN_RELE(vp);
142 return (ENOTSOCK);
143 }
144
145 /*
146 * If devpath went through devfs, the device should already
147 * be configured. If devpath is a mknod file, however, we
148 * need to make sure the device is properly configured.
149 * To do this, we do something similar to spec_open()
150 * except that we resolve to the minor/leaf level since
151 * we need to return a vnode.
152 */
153 csp = VTOS(VTOS(vp)->s_commonvp);
154 if (!(csp->s_flag & SDIPSET)) {
155 char *pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
156 error = ddi_dev_pathname(vp->v_rdev, S_IFCHR, pathname);
157 if (error == 0)
158 error = devfs_lookupname(pathname, NULLVPP, &dvp);
159 VN_RELE(vp);
160 kmem_free(pathname, MAXPATHLEN);
161 if (error != 0)
162 return (ENXIO);
163 vp = dvp; /* use the devfs vp */
164 }
165
166 /* device is configured at this point */
167 maj = getmajor(vp->v_rdev);
168 if (!STREAMSTAB(maj)) {
169 VN_RELE(vp);
170 return (ENOSTR);
171 }
172
173 *vpp = vp;
174 return (0);
175 }
176
177 /*
178 * Update the accessed, updated, or changed times in an sonode
179 * with the current time.
180 *
181 * Note that both SunOS 4.X and 4.4BSD sockets do not present reasonable
182 * attributes in a fstat call. (They return the current time and 0 for
183 * all timestamps, respectively.) We maintain the current timestamps
184 * here primarily so that should sockmod be popped the resulting
185 * file descriptor will behave like a stream w.r.t. the timestamps.
186 */
187 void
188 so_update_attrs(struct sonode *so, int flag)
189 {
190 time_t now = gethrestime_sec();
191
192 if (SOCK_IS_NONSTR(so))
193 return;
194
195 mutex_enter(&so->so_lock);
196 so->so_flag |= flag;
197 if (flag & SOACC)
198 SOTOTPI(so)->sti_atime = now;
199 if (flag & SOMOD)
200 SOTOTPI(so)->sti_mtime = now;
201 mutex_exit(&so->so_lock);
202 }
203
204 extern so_create_func_t sock_comm_create_function;
205 extern so_destroy_func_t sock_comm_destroy_function;
206 /*
207 * Init function called when sockfs is loaded.
208 */
209 int
210 sockinit(int fstype, char *name)
211 {
212 static const fs_operation_def_t sock_vfsops_template[] = {
213 NULL, NULL
214 };
215 int error;
216 major_t dev;
217 char *err_str;
218
219 error = vfs_setfsops(fstype, sock_vfsops_template, NULL);
220 if (error != 0) {
221 zcmn_err(GLOBAL_ZONEID, CE_WARN,
222 "sockinit: bad vfs ops template");
223 return (error);
224 }
225
226 error = vn_make_ops(name, socket_vnodeops_template,
227 &socket_vnodeops);
228 if (error != 0) {
229 err_str = "sockinit: bad socket vnode ops template";
230 /* vn_make_ops() does not reset socktpi_vnodeops on failure. */
231 socket_vnodeops = NULL;
232 goto failure;
233 }
234
235 socket_cache = kmem_cache_create("socket_cache",
236 sizeof (struct sonode), 0, sonode_constructor,
237 sonode_destructor, NULL, NULL, NULL, 0);
238
239 rw_init(&sockconf_lock, NULL, RW_DEFAULT, NULL);
240
241 error = socktpi_init();
242 if (error != 0) {
243 err_str = NULL;
244 goto failure;
245 }
246
247 error = sod_init();
248 if (error != 0) {
249 err_str = NULL;
250 goto failure;
251 }
252
253 /*
254 * Set up the default create and destroy functions
255 */
256 sock_comm_create_function = socket_sonode_create;
257 sock_comm_destroy_function = socket_sonode_destroy;
258
259 /*
260 * Build initial list mapping socket parameters to vnode.
261 */
262 smod_init();
263 smod_add(sotpi_smod_create());
264
265 sockparams_init();
266
267 /*
268 * If sockets are needed before init runs /sbin/soconfig
269 * it is possible to preload the sockparams list here using
270 * calls like:
271 * sockconfig(1,2,3, "/dev/tcp", 0);
272 */
273
274 /*
275 * Create a unique dev_t for use in so_fsid.
276 */
277
278 if ((dev = getudev()) == (major_t)-1)
279 dev = 0;
280 sockdev = makedevice(dev, 0);
281
282 mutex_init(&socklist.sl_lock, NULL, MUTEX_DEFAULT, NULL);
283 sendfile_init();
284 if (!modrootloaded) {
285 sockfs_defer_nl7c_init = 1;
286 } else {
287 nl7c_init();
288 }
289
290 /* Initialize socket filters */
291 sof_init();
292
293 return (0);
294
295 failure:
296 (void) vfs_freevfsops_by_type(fstype);
297 if (socket_vnodeops != NULL)
298 vn_freevnodeops(socket_vnodeops);
299 if (err_str != NULL)
300 zcmn_err(GLOBAL_ZONEID, CE_WARN, err_str);
301 return (error);
302 }
303
304 /*
305 * Caller must hold the mutex. Used to set SOLOCKED.
306 */
307 void
308 so_lock_single(struct sonode *so)
309 {
310 ASSERT(MUTEX_HELD(&so->so_lock));
311
312 while (so->so_flag & (SOLOCKED | SOASYNC_UNBIND)) {
313 cv_wait_stop(&so->so_single_cv, &so->so_lock,
314 SO_LOCK_WAKEUP_TIME);
315 }
316 so->so_flag |= SOLOCKED;
317 }
318
319 /*
320 * Caller must hold the mutex and pass in SOLOCKED or SOASYNC_UNBIND.
321 * Used to clear SOLOCKED or SOASYNC_UNBIND.
322 */
323 void
324 so_unlock_single(struct sonode *so, int flag)
325 {
326 ASSERT(MUTEX_HELD(&so->so_lock));
327 ASSERT(flag & (SOLOCKED|SOASYNC_UNBIND));
328 ASSERT((flag & ~(SOLOCKED|SOASYNC_UNBIND)) == 0);
329 ASSERT(so->so_flag & flag);
330 /*
331 * Process the T_DISCON_IND on sti_discon_ind_mp.
332 *
333 * Call to so_drain_discon_ind will result in so_lock
334 * being dropped and re-acquired later.
335 */
336 if (!SOCK_IS_NONSTR(so)) {
337 sotpi_info_t *sti = SOTOTPI(so);
338
339 if (sti->sti_discon_ind_mp != NULL)
340 so_drain_discon_ind(so);
341 }
342
343 cv_signal(&so->so_single_cv);
344 so->so_flag &= ~flag;
345 }
346
347 /*
348 * Caller must hold the mutex. Used to set SOREADLOCKED.
349 * If the caller wants nonblocking behavior it should set fmode.
350 */
351 int
352 so_lock_read(struct sonode *so, int fmode)
353 {
354 ASSERT(MUTEX_HELD(&so->so_lock));
355
356 while (so->so_flag & SOREADLOCKED) {
357 if (fmode & (FNDELAY|FNONBLOCK))
358 return (EWOULDBLOCK);
359 cv_wait_stop(&so->so_read_cv, &so->so_lock,
360 SO_LOCK_WAKEUP_TIME);
361 }
362 so->so_flag |= SOREADLOCKED;
363 return (0);
364 }
365
366 /*
367 * Like so_lock_read above but allows signals.
368 */
369 int
370 so_lock_read_intr(struct sonode *so, int fmode)
371 {
372 ASSERT(MUTEX_HELD(&so->so_lock));
373
374 while (so->so_flag & SOREADLOCKED) {
375 if (fmode & (FNDELAY|FNONBLOCK))
376 return (EWOULDBLOCK);
377 if (!cv_wait_sig(&so->so_read_cv, &so->so_lock))
378 return (EINTR);
379 }
380 so->so_flag |= SOREADLOCKED;
381 return (0);
382 }
383
384 /*
385 * Caller must hold the mutex. Used to clear SOREADLOCKED,
386 * set in so_lock_read() or so_lock_read_intr().
387 */
388 void
389 so_unlock_read(struct sonode *so)
390 {
391 ASSERT(MUTEX_HELD(&so->so_lock));
392 ASSERT(so->so_flag & SOREADLOCKED);
393
394 cv_signal(&so->so_read_cv);
395 so->so_flag &= ~SOREADLOCKED;
396 }
397
398 /*
399 * Verify that the specified offset falls within the mblk and
400 * that the resulting pointer is aligned.
401 * Returns NULL if not.
402 */
403 void *
404 sogetoff(mblk_t *mp, t_uscalar_t offset,
405 t_uscalar_t length, uint_t align_size)
406 {
407 uintptr_t ptr1, ptr2;
408
409 ASSERT(mp && mp->b_wptr >= mp->b_rptr);
410 ptr1 = (uintptr_t)mp->b_rptr + offset;
411 ptr2 = (uintptr_t)ptr1 + length;
412 if (ptr1 < (uintptr_t)mp->b_rptr || ptr2 > (uintptr_t)mp->b_wptr) {
413 eprintline(0);
414 return (NULL);
415 }
416 if ((ptr1 & (align_size - 1)) != 0) {
417 eprintline(0);
418 return (NULL);
419 }
420 return ((void *)ptr1);
421 }
422
423 /*
424 * Return the AF_UNIX underlying filesystem vnode matching a given name.
425 * Makes sure the sending and the destination sonodes are compatible.
426 * The vnode is returned held.
427 *
428 * The underlying filesystem VSOCK vnode has a v_stream pointer that
429 * references the actual stream head (hence indirectly the actual sonode).
430 */
431 static int
432 so_ux_lookup(struct sonode *so, struct sockaddr_un *soun, int checkaccess,
433 vnode_t **vpp)
434 {
435 vnode_t *vp; /* Underlying filesystem vnode */
436 vnode_t *rvp; /* real vnode */
437 vnode_t *svp; /* sockfs vnode */
438 struct sonode *so2;
439 int error;
440
441 dprintso(so, 1, ("so_ux_lookup(%p) name <%s>\n", (void *)so,
442 soun->sun_path));
443
444 error = lookupname(soun->sun_path, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
445 if (error) {
446 eprintsoline(so, error);
447 return (error);
448 }
449
450 /*
451 * Traverse lofs mounts get the real vnode
452 */
453 if (VOP_REALVP(vp, &rvp, NULL) == 0) {
454 VN_HOLD(rvp); /* hold the real vnode */
455 VN_RELE(vp); /* release hold from lookup */
456 vp = rvp;
457 }
458
459 if (vp->v_type != VSOCK) {
460 error = ENOTSOCK;
461 eprintsoline(so, error);
462 goto done2;
463 }
464
465 if (checkaccess) {
466 /*
467 * Check that we have permissions to access the destination
468 * vnode. This check is not done in BSD but it is required
469 * by X/Open.
470 */
471 if (error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED(), NULL)) {
472 eprintsoline(so, error);
473 goto done2;
474 }
475 }
476
477 /*
478 * Check if the remote socket has been closed.
479 *
480 * Synchronize with vn_rele_stream by holding v_lock while traversing
481 * v_stream->sd_vnode.
482 */
483 mutex_enter(&vp->v_lock);
484 if (vp->v_stream == NULL) {
485 mutex_exit(&vp->v_lock);
486 if (so->so_type == SOCK_DGRAM)
487 error = EDESTADDRREQ;
488 else
489 error = ECONNREFUSED;
490
491 eprintsoline(so, error);
492 goto done2;
493 }
494 ASSERT(vp->v_stream->sd_vnode);
495 svp = vp->v_stream->sd_vnode;
496 /*
497 * holding v_lock on underlying filesystem vnode and acquiring
498 * it on sockfs vnode. Assumes that no code ever attempts to
499 * acquire these locks in the reverse order.
500 */
501 VN_HOLD(svp);
502 mutex_exit(&vp->v_lock);
503
504 if (svp->v_type != VSOCK) {
505 error = ENOTSOCK;
506 eprintsoline(so, error);
507 goto done;
508 }
509
510 so2 = VTOSO(svp);
511
512 if (so->so_type != so2->so_type) {
513 error = EPROTOTYPE;
514 eprintsoline(so, error);
515 goto done;
516 }
517
518 VN_RELE(svp);
519 *vpp = vp;
520 return (0);
521
522 done:
523 VN_RELE(svp);
524 done2:
525 VN_RELE(vp);
526 return (error);
527 }
528
529 /*
530 * Verify peer address for connect and sendto/sendmsg.
531 * Since sendto/sendmsg would not get synchronous errors from the transport
532 * provider we have to do these ugly checks in the socket layer to
533 * preserve compatibility with SunOS 4.X.
534 */
535 int
536 so_addr_verify(struct sonode *so, const struct sockaddr *name,
537 socklen_t namelen)
538 {
539 int family;
540
541 dprintso(so, 1, ("so_addr_verify(%p, %p, %d)\n",
542 (void *)so, (void *)name, namelen));
543
544 ASSERT(name != NULL);
545
546 family = so->so_family;
547 switch (family) {
548 case AF_INET:
549 if (name->sa_family != family) {
550 eprintsoline(so, EAFNOSUPPORT);
551 return (EAFNOSUPPORT);
552 }
553 if (namelen != (socklen_t)sizeof (struct sockaddr_in)) {
554 eprintsoline(so, EINVAL);
555 return (EINVAL);
556 }
557 break;
558 case AF_INET6: {
559 #ifdef DEBUG
560 struct sockaddr_in6 *sin6;
561 #endif /* DEBUG */
562
563 if (name->sa_family != family) {
564 eprintsoline(so, EAFNOSUPPORT);
565 return (EAFNOSUPPORT);
566 }
567 if (namelen != (socklen_t)sizeof (struct sockaddr_in6)) {
568 eprintsoline(so, EINVAL);
569 return (EINVAL);
570 }
571 #ifdef DEBUG
572 /* Verify that apps don't forget to clear sin6_scope_id etc */
573 sin6 = (struct sockaddr_in6 *)name;
574 if (sin6->sin6_scope_id != 0 &&
575 !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
576 zcmn_err(getzoneid(), CE_WARN,
577 "connect/send* with uninitialized sin6_scope_id "
578 "(%d) on socket. Pid = %d\n",
579 (int)sin6->sin6_scope_id, (int)curproc->p_pid);
580 }
581 #endif /* DEBUG */
582 break;
583 }
584 case AF_UNIX:
585 if (SOTOTPI(so)->sti_faddr_noxlate) {
586 return (0);
587 }
588 if (namelen < (socklen_t)sizeof (short)) {
589 eprintsoline(so, ENOENT);
590 return (ENOENT);
591 }
592 if (name->sa_family != family) {
593 eprintsoline(so, EAFNOSUPPORT);
594 return (EAFNOSUPPORT);
595 }
596 /* MAXPATHLEN + soun_family + nul termination */
597 if (namelen > (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
598 eprintsoline(so, ENAMETOOLONG);
599 return (ENAMETOOLONG);
600 }
601
602 break;
603
604 default:
605 /*
606 * Default is don't do any length or sa_family check
607 * to allow non-sockaddr style addresses.
608 */
609 break;
610 }
611
612 return (0);
613 }
614
615
616 /*
617 * Translate an AF_UNIX sockaddr_un to the transport internal name.
618 * Assumes caller has called so_addr_verify first. The translated
619 * (internal form) address is stored in sti->sti_ux_taddr.
620 */
621 /*ARGSUSED*/
622 int
623 so_ux_addr_xlate(struct sonode *so, struct sockaddr *name,
624 socklen_t namelen, int checkaccess,
625 void **addrp, socklen_t *addrlenp)
626 {
627 int error;
628 struct sockaddr_un *soun;
629 vnode_t *vp;
630 void *addr;
631 socklen_t addrlen;
632 sotpi_info_t *sti = SOTOTPI(so);
633
634 dprintso(so, 1, ("so_ux_addr_xlate(%p, %p, %d, %d)\n",
635 (void *)so, (void *)name, namelen, checkaccess));
636
637 ASSERT(name != NULL);
638 ASSERT(so->so_family == AF_UNIX);
639 ASSERT(!sti->sti_faddr_noxlate);
640 ASSERT(namelen >= (socklen_t)sizeof (short));
641 ASSERT(name->sa_family == AF_UNIX);
642 soun = (struct sockaddr_un *)name;
643 /*
644 * Lookup vnode for the specified path name and verify that
645 * it is a socket.
646 */
647 error = so_ux_lookup(so, soun, checkaccess, &vp);
648 if (error) {
649 eprintsoline(so, error);
650 return (error);
651 }
652 /*
653 * Use the address of the peer vnode as the address to send
654 * to. We release the peer vnode here. In case it has been
655 * closed by the time the T_CONN_REQ or T_UNITDATA_REQ reaches the
656 * transport the message will get an error or be dropped.
657 * Note that that soua_vp is never dereferenced; it's just a
658 * convenient value by which we can identify the peer.
659 */
660 sti->sti_ux_taddr.soua_vp = vp;
661 sti->sti_ux_taddr.soua_magic = SOU_MAGIC_EXPLICIT;
662 addr = &sti->sti_ux_taddr;
663 addrlen = (socklen_t)sizeof (sti->sti_ux_taddr);
664 dprintso(so, 1, ("ux_xlate UNIX: addrlen %d, vp %p\n",
665 addrlen, (void *)vp));
666 VN_RELE(vp);
667 *addrp = addr;
668 *addrlenp = (socklen_t)addrlen;
669 return (0);
670 }
671
672 /*
673 * Esballoc free function for messages that contain SO_FILEP option.
674 * Decrement the reference count on the file pointers using closef.
675 */
676 void
677 fdbuf_free(struct fdbuf *fdbuf)
678 {
679 int i;
680 struct file *fp;
681
682 dprint(1, ("fdbuf_free: %d fds\n", fdbuf->fd_numfd));
683 for (i = 0; i < fdbuf->fd_numfd; i++) {
684 /*
685 * We need pointer size alignment for fd_fds. On a LP64
686 * kernel, the required alignment is 8 bytes while
687 * the option headers and values are only 4 bytes
688 * aligned. So its safer to do a bcopy compared to
689 * assigning fdbuf->fd_fds[i] to fp.
690 */
691 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp));
692 dprint(1, ("fdbuf_free: [%d] = %p\n", i, (void *)fp));
693 (void) closef(fp);
694 }
695 if (fdbuf->fd_ebuf != NULL)
696 kmem_free(fdbuf->fd_ebuf, fdbuf->fd_ebuflen);
697 kmem_free(fdbuf, fdbuf->fd_size);
698 }
699
700 /*
701 * Allocate an esballoc'ed message for AF_UNIX file descriptor passing.
702 * Waits if memory is not available.
703 */
704 mblk_t *
705 fdbuf_allocmsg(int size, struct fdbuf *fdbuf)
706 {
707 uchar_t *buf;
708 mblk_t *mp;
709
710 dprint(1, ("fdbuf_allocmsg: size %d, %d fds\n", size, fdbuf->fd_numfd));
711 buf = kmem_alloc(size, KM_SLEEP);
712 fdbuf->fd_ebuf = (caddr_t)buf;
713 fdbuf->fd_ebuflen = size;
714 fdbuf->fd_frtn.free_func = fdbuf_free;
715 fdbuf->fd_frtn.free_arg = (caddr_t)fdbuf;
716
717 mp = esballoc_wait(buf, size, BPRI_MED, &fdbuf->fd_frtn);
718 mp->b_datap->db_type = M_PROTO;
719 return (mp);
720 }
721
722 /*
723 * Extract file descriptors from a fdbuf.
724 * Return list in rights/rightslen.
725 */
726 /*ARGSUSED*/
727 static int
728 fdbuf_extract(struct fdbuf *fdbuf, void *rights, int rightslen)
729 {
730 int i, fd;
731 int *rp;
732 struct file *fp;
733 int numfd;
734
735 dprint(1, ("fdbuf_extract: %d fds, len %d\n",
736 fdbuf->fd_numfd, rightslen));
737
738 numfd = fdbuf->fd_numfd;
739 ASSERT(rightslen == numfd * (int)sizeof (int));
740
741 /*
742 * Allocate a file descriptor and increment the f_count.
743 * The latter is needed since we always call fdbuf_free
744 * which performs a closef.
745 */
746 rp = (int *)rights;
747 for (i = 0; i < numfd; i++) {
748 if ((fd = ufalloc(0)) == -1)
749 goto cleanup;
750 /*
751 * We need pointer size alignment for fd_fds. On a LP64
752 * kernel, the required alignment is 8 bytes while
753 * the option headers and values are only 4 bytes
754 * aligned. So its safer to do a bcopy compared to
755 * assigning fdbuf->fd_fds[i] to fp.
756 */
757 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp));
758 mutex_enter(&fp->f_tlock);
759 fp->f_count++;
760 mutex_exit(&fp->f_tlock);
761 setf(fd, fp);
762 *rp++ = fd;
763 if (AU_AUDITING())
764 audit_fdrecv(fd, fp);
765 dprint(1, ("fdbuf_extract: [%d] = %d, %p refcnt %d\n",
766 i, fd, (void *)fp, fp->f_count));
767 }
768 return (0);
769
770 cleanup:
771 /*
772 * Undo whatever partial work the loop above has done.
773 */
774 {
775 int j;
776
777 rp = (int *)rights;
778 for (j = 0; j < i; j++) {
779 dprint(0,
780 ("fdbuf_extract: cleanup[%d] = %d\n", j, *rp));
781 (void) closeandsetf(*rp++, NULL);
782 }
783 }
784
785 return (EMFILE);
786 }
787
788 /*
789 * Insert file descriptors into an fdbuf.
790 * Returns a kmem_alloc'ed fdbuf. The fdbuf should be freed
791 * by calling fdbuf_free().
792 */
793 int
794 fdbuf_create(void *rights, int rightslen, struct fdbuf **fdbufp)
795 {
796 int numfd, i;
797 int *fds;
798 struct file *fp;
799 struct fdbuf *fdbuf;
800 int fdbufsize;
801
802 dprint(1, ("fdbuf_create: len %d\n", rightslen));
803
804 numfd = rightslen / (int)sizeof (int);
805
806 fdbufsize = (int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *));
807 fdbuf = kmem_alloc(fdbufsize, KM_SLEEP);
808 fdbuf->fd_size = fdbufsize;
809 fdbuf->fd_numfd = 0;
810 fdbuf->fd_ebuf = NULL;
811 fdbuf->fd_ebuflen = 0;
812 fds = (int *)rights;
813 for (i = 0; i < numfd; i++) {
814 if ((fp = getf(fds[i])) == NULL) {
815 fdbuf_free(fdbuf);
816 return (EBADF);
817 }
818 dprint(1, ("fdbuf_create: [%d] = %d, %p refcnt %d\n",
819 i, fds[i], (void *)fp, fp->f_count));
820 mutex_enter(&fp->f_tlock);
821 fp->f_count++;
822 mutex_exit(&fp->f_tlock);
823 /*
824 * The maximum alignment for fdbuf (or any option header
825 * and its value) it 4 bytes. On a LP64 kernel, the alignment
826 * is not sufficient for pointers (fd_fds in this case). Since
827 * we just did a kmem_alloc (we get a double word alignment),
828 * we don't need to do anything on the send side (we loose
829 * the double word alignment because fdbuf goes after an
830 * option header (eg T_unitdata_req) which is only 4 byte
831 * aligned). We take care of this when we extract the file
832 * descriptor in fdbuf_extract or fdbuf_free.
833 */
834 fdbuf->fd_fds[i] = fp;
835 fdbuf->fd_numfd++;
836 releasef(fds[i]);
837 if (AU_AUDITING())
838 audit_fdsend(fds[i], fp, 0);
839 }
840 *fdbufp = fdbuf;
841 return (0);
842 }
843
844 static int
845 fdbuf_optlen(int rightslen)
846 {
847 int numfd;
848
849 numfd = rightslen / (int)sizeof (int);
850
851 return ((int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *)));
852 }
853
/*
 * Inverse of fdbuf_optlen(): given the length of an fdbuf, return the
 * number of bytes of int file descriptors it corresponds to. The header
 * size is subtracted, the remainder is divided by the size of a file
 * pointer to get the descriptor count, and that count is multiplied by
 * sizeof (int).
 */
static t_uscalar_t
fdbuf_cmsglen(int fdbuflen)
{
	return (t_uscalar_t)((fdbuflen - FDBUF_HDRSIZE) /
	    (int)sizeof (struct file *) * (int)sizeof (int));
}
860
861
862 /*
863 * Return non-zero if the mblk and fdbuf are consistent.
864 */
865 static int
866 fdbuf_verify(mblk_t *mp, struct fdbuf *fdbuf, int fdbuflen)
867 {
868 if (fdbuflen >= FDBUF_HDRSIZE &&
869 fdbuflen == fdbuf->fd_size) {
870 frtn_t *frp = mp->b_datap->db_frtnp;
871 /*
872 * Check that the SO_FILEP portion of the
873 * message has not been modified by
874 * the loopback transport. The sending sockfs generates
875 * a message that is esballoc'ed with the free function
876 * being fdbuf_free() and where free_arg contains the
877 * identical information as the SO_FILEP content.
878 *
879 * If any of these constraints are not satisfied we
880 * silently ignore the option.
881 */
882 ASSERT(mp);
883 if (frp != NULL &&
884 frp->free_func == fdbuf_free &&
885 frp->free_arg != NULL &&
886 bcmp(frp->free_arg, fdbuf, fdbuflen) == 0) {
887 dprint(1, ("fdbuf_verify: fdbuf %p len %d\n",
888 (void *)fdbuf, fdbuflen));
889 return (1);
890 } else {
891 zcmn_err(getzoneid(), CE_WARN,
892 "sockfs: mismatched fdbuf content (%p)",
893 (void *)mp);
894 return (0);
895 }
896 } else {
897 zcmn_err(getzoneid(), CE_WARN,
898 "sockfs: mismatched fdbuf len %d, %d\n",
899 fdbuflen, fdbuf->fd_size);
900 return (0);
901 }
902 }
903
904 /*
905 * When the file descriptors returned by sorecvmsg can not be passed
906 * to the application this routine will cleanup the references on
907 * the files. Start at startoff bytes into the buffer.
908 */
909 static void
910 close_fds(void *fdbuf, int fdbuflen, int startoff)
911 {
912 int *fds = (int *)fdbuf;
913 int numfd = fdbuflen / (int)sizeof (int);
914 int i;
915
916 dprint(1, ("close_fds(%p, %d, %d)\n", fdbuf, fdbuflen, startoff));
917
918 for (i = 0; i < numfd; i++) {
919 if (startoff < 0)
920 startoff = 0;
921 if (startoff < (int)sizeof (int)) {
922 /*
923 * This file descriptor is partially or fully after
924 * the offset
925 */
926 dprint(0,
927 ("close_fds: cleanup[%d] = %d\n", i, fds[i]));
928 (void) closeandsetf(fds[i], NULL);
929 }
930 startoff -= (int)sizeof (int);
931 }
932 }
933
934 /*
935 * Close all file descriptors contained in the control part starting at
936 * the startoffset.
937 */
938 void
939 so_closefds(void *control, t_uscalar_t controllen, int oldflg,
940 int startoff)
941 {
942 struct cmsghdr *cmsg;
943
944 if (control == NULL)
945 return;
946
947 if (oldflg) {
948 close_fds(control, controllen, startoff);
949 return;
950 }
951 /* Scan control part for file descriptors. */
952 for (cmsg = (struct cmsghdr *)control;
953 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
954 cmsg = CMSG_NEXT(cmsg)) {
955 if (cmsg->cmsg_level == SOL_SOCKET &&
956 cmsg->cmsg_type == SCM_RIGHTS) {
957 close_fds(CMSG_CONTENT(cmsg),
958 (int)CMSG_CONTENTLEN(cmsg),
959 startoff - (int)sizeof (struct cmsghdr));
960 }
961 startoff -= cmsg->cmsg_len;
962 }
963 }
964
965 /*
966 * Returns a pointer/length for the file descriptors contained
967 * in the control buffer. Returns with *fdlenp == -1 if there are no
968 * file descriptor options present. This is different than there being
969 * a zero-length file descriptor option.
970 * Fail if there are multiple SCM_RIGHT cmsgs.
971 */
972 int
973 so_getfdopt(void *control, t_uscalar_t controllen, int oldflg,
974 void **fdsp, int *fdlenp)
975 {
976 struct cmsghdr *cmsg;
977 void *fds;
978 int fdlen;
979
980 if (control == NULL) {
981 *fdsp = NULL;
982 *fdlenp = -1;
983 return (0);
984 }
985
986 if (oldflg) {
987 *fdsp = control;
988 if (controllen == 0)
989 *fdlenp = -1;
990 else
991 *fdlenp = controllen;
992 dprint(1, ("so_getfdopt: old %d\n", *fdlenp));
993 return (0);
994 }
995
996 fds = NULL;
997 fdlen = 0;
998
999 for (cmsg = (struct cmsghdr *)control;
1000 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
1001 cmsg = CMSG_NEXT(cmsg)) {
1002 if (cmsg->cmsg_level == SOL_SOCKET &&
1003 cmsg->cmsg_type == SCM_RIGHTS) {
1004 if (fds != NULL)
1005 return (EINVAL);
1006 fds = CMSG_CONTENT(cmsg);
1007 fdlen = (int)CMSG_CONTENTLEN(cmsg);
1008 dprint(1, ("so_getfdopt: new %lu\n",
1009 (size_t)CMSG_CONTENTLEN(cmsg)));
1010 }
1011 }
1012 if (fds == NULL) {
1013 dprint(1, ("so_getfdopt: NONE\n"));
1014 *fdlenp = -1;
1015 } else
1016 *fdlenp = fdlen;
1017 *fdsp = fds;
1018 return (0);
1019 }
1020
1021 /*
1022 * Return the length of the options including any file descriptor options.
1023 */
1024 t_uscalar_t
1025 so_optlen(void *control, t_uscalar_t controllen, int oldflg)
1026 {
1027 struct cmsghdr *cmsg;
1028 t_uscalar_t optlen = 0;
1029 t_uscalar_t len;
1030
1031 if (control == NULL)
1032 return (0);
1033
1034 if (oldflg)
1035 return ((t_uscalar_t)(sizeof (struct T_opthdr) +
1036 fdbuf_optlen(controllen)));
1037
1038 for (cmsg = (struct cmsghdr *)control;
1039 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
1040 cmsg = CMSG_NEXT(cmsg)) {
1041 if (cmsg->cmsg_level == SOL_SOCKET &&
1042 cmsg->cmsg_type == SCM_RIGHTS) {
1043 len = fdbuf_optlen((int)CMSG_CONTENTLEN(cmsg));
1044 } else {
1045 len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg);
1046 }
1047 optlen += (t_uscalar_t)(_TPI_ALIGN_TOPT(len) +
1048 sizeof (struct T_opthdr));
1049 }
1050 dprint(1, ("so_optlen: controllen %d, flg %d -> optlen %d\n",
1051 controllen, oldflg, optlen));
1052 return (optlen);
1053 }
1054
1055 /*
1056 * Copy options from control to the mblk. Skip any file descriptor options.
1057 */
1058 void
1059 so_cmsg2opt(void *control, t_uscalar_t controllen, int oldflg, mblk_t *mp)
1060 {
1061 struct T_opthdr toh;
1062 struct cmsghdr *cmsg;
1063
1064 if (control == NULL)
1065 return;
1066
1067 if (oldflg) {
1068 /* No real options - caller has handled file descriptors */
1069 return;
1070 }
1071 for (cmsg = (struct cmsghdr *)control;
1072 CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
1073 cmsg = CMSG_NEXT(cmsg)) {
1074 /*
1075 * Note: The caller handles file descriptors prior
1076 * to calling this function.
1077 */
1078 t_uscalar_t len;
1079
1080 if (cmsg->cmsg_level == SOL_SOCKET &&
1081 cmsg->cmsg_type == SCM_RIGHTS)
1082 continue;
1083
1084 len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg);
1085 toh.level = cmsg->cmsg_level;
1086 toh.name = cmsg->cmsg_type;
1087 toh.len = len + (t_uscalar_t)sizeof (struct T_opthdr);
1088 toh.status = 0;
1089
1090 soappendmsg(mp, &toh, sizeof (toh));
1091 soappendmsg(mp, CMSG_CONTENT(cmsg), len);
1092 mp->b_wptr += _TPI_ALIGN_TOPT(len) - len;
1093 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
1094 }
1095 }
1096
1097 /*
1098 * Return the length of the control message derived from the options.
1099 * Exclude SO_SRCADDR and SO_UNIX_CLOSE options. Include SO_FILEP.
1100 * When oldflg is set only include SO_FILEP.
1101 * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen
1102 * allocates the space that so_opt2cmsg fills. If one changes, the other should
1103 * also be checked for any possible impacts.
1104 */
1105 t_uscalar_t
1106 so_cmsglen(mblk_t *mp, void *opt, t_uscalar_t optlen, int oldflg)
1107 {
1108 t_uscalar_t cmsglen = 0;
1109 struct T_opthdr *tohp;
1110 t_uscalar_t len;
1111 t_uscalar_t last_roundup = 0;
1112
1113 ASSERT(__TPI_TOPT_ISALIGNED(opt));
1114
1115 for (tohp = (struct T_opthdr *)opt;
1116 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
1117 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
1118 dprint(1, ("so_cmsglen: level 0x%x, name %d, len %d\n",
1119 tohp->level, tohp->name, tohp->len));
1120 if (tohp->level == SOL_SOCKET &&
1121 (tohp->name == SO_SRCADDR ||
1122 tohp->name == SO_UNIX_CLOSE)) {
1123 continue;
1124 }
1125 if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) {
1126 struct fdbuf *fdbuf;
1127 int fdbuflen;
1128
1129 fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp);
1130 fdbuflen = (int)_TPI_TOPT_DATALEN(tohp);
1131
1132 if (!fdbuf_verify(mp, fdbuf, fdbuflen))
1133 continue;
1134 if (oldflg) {
1135 cmsglen += fdbuf_cmsglen(fdbuflen);
1136 continue;
1137 }
1138 len = fdbuf_cmsglen(fdbuflen);
1139 } else if (tohp->level == SOL_SOCKET &&
1140 tohp->name == SCM_TIMESTAMP) {
1141 if (oldflg)
1142 continue;
1143
1144 if (get_udatamodel() == DATAMODEL_NATIVE) {
1145 len = sizeof (struct timeval);
1146 } else {
1147 len = sizeof (struct timeval32);
1148 }
1149 } else {
1150 if (oldflg)
1151 continue;
1152 len = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp);
1153 }
1154 /*
1155 * Exclude roundup for last option to not set
1156 * MSG_CTRUNC when the cmsg fits but the padding doesn't fit.
1157 */
1158 last_roundup = (t_uscalar_t)
1159 (ROUNDUP_cmsglen(len + (int)sizeof (struct cmsghdr)) -
1160 (len + (int)sizeof (struct cmsghdr)));
1161 cmsglen += (t_uscalar_t)(len + (int)sizeof (struct cmsghdr)) +
1162 last_roundup;
1163 }
1164 cmsglen -= last_roundup;
1165 dprint(1, ("so_cmsglen: optlen %d, flg %d -> cmsglen %d\n",
1166 optlen, oldflg, cmsglen));
1167 return (cmsglen);
1168 }
1169
1170 /*
1171 * Copy options from options to the control. Convert SO_FILEP to
1172 * file descriptors.
1173 * Returns errno or zero.
1174 * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen
1175 * allocates the space that so_opt2cmsg fills. If one changes, the other should
1176 * also be checked for any possible impacts.
1177 */
1178 int
1179 so_opt2cmsg(mblk_t *mp, void *opt, t_uscalar_t optlen, int oldflg,
1180 void *control, t_uscalar_t controllen)
1181 {
1182 struct T_opthdr *tohp;
1183 struct cmsghdr *cmsg;
1184 struct fdbuf *fdbuf;
1185 int fdbuflen;
1186 int error;
1187 #if defined(DEBUG) || defined(__lint)
1188 struct cmsghdr *cend = (struct cmsghdr *)
1189 (((uint8_t *)control) + ROUNDUP_cmsglen(controllen));
1190 #endif
1191 cmsg = (struct cmsghdr *)control;
1192
1193 ASSERT(__TPI_TOPT_ISALIGNED(opt));
1194
1195 for (tohp = (struct T_opthdr *)opt;
1196 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
1197 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
1198 dprint(1, ("so_opt2cmsg: level 0x%x, name %d, len %d\n",
1199 tohp->level, tohp->name, tohp->len));
1200
1201 if (tohp->level == SOL_SOCKET &&
1202 (tohp->name == SO_SRCADDR ||
1203 tohp->name == SO_UNIX_CLOSE)) {
1204 continue;
1205 }
1206 ASSERT((uintptr_t)cmsg <= (uintptr_t)control + controllen);
1207 if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) {
1208 fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp);
1209 fdbuflen = (int)_TPI_TOPT_DATALEN(tohp);
1210
1211 if (!fdbuf_verify(mp, fdbuf, fdbuflen))
1212 return (EPROTO);
1213 if (oldflg) {
1214 error = fdbuf_extract(fdbuf, control,
1215 (int)controllen);
1216 if (error != 0)
1217 return (error);
1218 continue;
1219 } else {
1220 int fdlen;
1221
1222 fdlen = (int)fdbuf_cmsglen(
1223 (int)_TPI_TOPT_DATALEN(tohp));
1224
1225 cmsg->cmsg_level = tohp->level;
1226 cmsg->cmsg_type = SCM_RIGHTS;
1227 cmsg->cmsg_len = (socklen_t)(fdlen +
1228 sizeof (struct cmsghdr));
1229
1230 error = fdbuf_extract(fdbuf,
1231 CMSG_CONTENT(cmsg), fdlen);
1232 if (error != 0)
1233 return (error);
1234 }
1235 } else if (tohp->level == SOL_SOCKET &&
1236 tohp->name == SCM_TIMESTAMP) {
1237 timestruc_t *timestamp;
1238
1239 if (oldflg)
1240 continue;
1241
1242 cmsg->cmsg_level = tohp->level;
1243 cmsg->cmsg_type = tohp->name;
1244
1245 timestamp =
1246 (timestruc_t *)P2ROUNDUP((intptr_t)&tohp[1],
1247 sizeof (intptr_t));
1248
1249 if (get_udatamodel() == DATAMODEL_NATIVE) {
1250 struct timeval tv;
1251
1252 cmsg->cmsg_len = sizeof (struct timeval) +
1253 sizeof (struct cmsghdr);
1254 tv.tv_sec = timestamp->tv_sec;
1255 tv.tv_usec = timestamp->tv_nsec /
1256 (NANOSEC / MICROSEC);
1257 /*
1258 * on LP64 systems, the struct timeval in
1259 * the destination will not be 8-byte aligned,
1260 * so use bcopy to avoid alignment trouble
1261 */
1262 bcopy(&tv, CMSG_CONTENT(cmsg), sizeof (tv));
1263 } else {
1264 struct timeval32 *time32;
1265
1266 cmsg->cmsg_len = sizeof (struct timeval32) +
1267 sizeof (struct cmsghdr);
1268 time32 = (struct timeval32 *)CMSG_CONTENT(cmsg);
1269 time32->tv_sec = (time32_t)timestamp->tv_sec;
1270 time32->tv_usec =
1271 (int32_t)(timestamp->tv_nsec /
1272 (NANOSEC / MICROSEC));
1273 }
1274
1275 } else {
1276 if (oldflg)
1277 continue;
1278
1279 cmsg->cmsg_level = tohp->level;
1280 cmsg->cmsg_type = tohp->name;
1281 cmsg->cmsg_len = (socklen_t)(_TPI_TOPT_DATALEN(tohp) +
1282 sizeof (struct cmsghdr));
1283
1284 /* copy content to control data part */
1285 bcopy(&tohp[1], CMSG_CONTENT(cmsg),
1286 CMSG_CONTENTLEN(cmsg));
1287 }
1288 /* move to next CMSG structure! */
1289 cmsg = CMSG_NEXT(cmsg);
1290 }
1291 dprint(1, ("so_opt2cmsg: buf %p len %d; cend %p; final cmsg %p\n",
1292 control, controllen, (void *)cend, (void *)cmsg));
1293 ASSERT(cmsg <= cend);
1294 return (0);
1295 }
1296
1297 /*
1298 * Extract the SO_SRCADDR option value if present.
1299 */
1300 void
1301 so_getopt_srcaddr(void *opt, t_uscalar_t optlen, void **srcp,
1302 t_uscalar_t *srclenp)
1303 {
1304 struct T_opthdr *tohp;
1305
1306 ASSERT(__TPI_TOPT_ISALIGNED(opt));
1307
1308 ASSERT(srcp != NULL && srclenp != NULL);
1309 *srcp = NULL;
1310 *srclenp = 0;
1311
1312 for (tohp = (struct T_opthdr *)opt;
1313 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
1314 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
1315 dprint(1, ("so_getopt_srcaddr: level 0x%x, name %d, len %d\n",
1316 tohp->level, tohp->name, tohp->len));
1317 if (tohp->level == SOL_SOCKET &&
1318 tohp->name == SO_SRCADDR) {
1319 *srcp = _TPI_TOPT_DATA(tohp);
1320 *srclenp = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp);
1321 }
1322 }
1323 }
1324
1325 /*
1326 * Verify if the SO_UNIX_CLOSE option is present.
1327 */
1328 int
1329 so_getopt_unix_close(void *opt, t_uscalar_t optlen)
1330 {
1331 struct T_opthdr *tohp;
1332
1333 ASSERT(__TPI_TOPT_ISALIGNED(opt));
1334
1335 for (tohp = (struct T_opthdr *)opt;
1336 tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
1337 tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
1338 dprint(1,
1339 ("so_getopt_unix_close: level 0x%x, name %d, len %d\n",
1340 tohp->level, tohp->name, tohp->len));
1341 if (tohp->level == SOL_SOCKET &&
1342 tohp->name == SO_UNIX_CLOSE)
1343 return (1);
1344 }
1345 return (0);
1346 }
1347
1348 /*
1349 * Allocate an M_PROTO message.
1350 *
1351 * If allocation fails the behavior depends on sleepflg:
1352 * _ALLOC_NOSLEEP fail immediately
1353 * _ALLOC_INTR sleep for memory until a signal is caught
1354 * _ALLOC_SLEEP sleep forever. Don't return NULL.
1355 */
1356 mblk_t *
1357 soallocproto(size_t size, int sleepflg, cred_t *cr)
1358 {
1359 mblk_t *mp;
1360
1361 /* Round up size for reuse */
1362 size = MAX(size, 64);
1363 if (cr != NULL)
1364 mp = allocb_cred(size, cr, curproc->p_pid);
1365 else
1366 mp = allocb(size, BPRI_MED);
1367
1368 if (mp == NULL) {
1369 int error; /* Dummy - error not returned to caller */
1370
1371 switch (sleepflg) {
1372 case _ALLOC_SLEEP:
1373 if (cr != NULL) {
1374 mp = allocb_cred_wait(size, STR_NOSIG, &error,
1375 cr, curproc->p_pid);
1376 } else {
1377 mp = allocb_wait(size, BPRI_MED, STR_NOSIG,
1378 &error);
1379 }
1380 ASSERT(mp);
1381 break;
1382 case _ALLOC_INTR:
1383 if (cr != NULL) {
1384 mp = allocb_cred_wait(size, 0, &error, cr,
1385 curproc->p_pid);
1386 } else {
1387 mp = allocb_wait(size, BPRI_MED, 0, &error);
1388 }
1389 if (mp == NULL) {
1390 /* Caught signal while sleeping for memory */
1391 eprintline(ENOBUFS);
1392 return (NULL);
1393 }
1394 break;
1395 case _ALLOC_NOSLEEP:
1396 default:
1397 eprintline(ENOBUFS);
1398 return (NULL);
1399 }
1400 }
1401 DB_TYPE(mp) = M_PROTO;
1402 return (mp);
1403 }
1404
1405 /*
1406 * Allocate an M_PROTO message with a single component.
1407 * len is the length of buf. size is the amount to allocate.
1408 *
1409 * buf can be NULL with a non-zero len.
1410 * This results in a bzero'ed chunk being placed the message.
1411 */
1412 mblk_t *
1413 soallocproto1(const void *buf, ssize_t len, ssize_t size, int sleepflg,
1414 cred_t *cr)
1415 {
1416 mblk_t *mp;
1417
1418 if (size == 0)
1419 size = len;
1420
1421 ASSERT(size >= len);
1422 /* Round up size for reuse */
1423 size = MAX(size, 64);
1424 mp = soallocproto(size, sleepflg, cr);
1425 if (mp == NULL)
1426 return (NULL);
1427 mp->b_datap->db_type = M_PROTO;
1428 if (len != 0) {
1429 if (buf != NULL)
1430 bcopy(buf, mp->b_wptr, len);
1431 else
1432 bzero(mp->b_wptr, len);
1433 mp->b_wptr += len;
1434 }
1435 return (mp);
1436 }
1437
1438 /*
1439 * Append buf/len to mp.
1440 * The caller has to ensure that there is enough room in the mblk.
1441 *
1442 * buf can be NULL with a non-zero len.
1443 * This results in a bzero'ed chunk being placed the message.
1444 */
1445 void
1446 soappendmsg(mblk_t *mp, const void *buf, ssize_t len)
1447 {
1448 ASSERT(mp);
1449
1450 if (len != 0) {
1451 /* Assert for room left */
1452 ASSERT(mp->b_datap->db_lim - mp->b_wptr >= len);
1453 if (buf != NULL)
1454 bcopy(buf, mp->b_wptr, len);
1455 else
1456 bzero(mp->b_wptr, len);
1457 }
1458 mp->b_wptr += len;
1459 }
1460
1461 /*
1462 * Create a message using two kernel buffers.
1463 * If size is set that will determine the allocation size (e.g. for future
1464 * soappendmsg calls). If size is zero it is derived from the buffer
1465 * lengths.
1466 */
1467 mblk_t *
1468 soallocproto2(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2,
1469 ssize_t size, int sleepflg, cred_t *cr)
1470 {
1471 mblk_t *mp;
1472
1473 if (size == 0)
1474 size = len1 + len2;
1475 ASSERT(size >= len1 + len2);
1476
1477 mp = soallocproto1(buf1, len1, size, sleepflg, cr);
1478 if (mp)
1479 soappendmsg(mp, buf2, len2);
1480 return (mp);
1481 }
1482
1483 /*
1484 * Create a message using three kernel buffers.
1485 * If size is set that will determine the allocation size (for future
1486 * soappendmsg calls). If size is zero it is derived from the buffer
1487 * lengths.
1488 */
1489 mblk_t *
1490 soallocproto3(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2,
1491 const void *buf3, ssize_t len3, ssize_t size, int sleepflg, cred_t *cr)
1492 {
1493 mblk_t *mp;
1494
1495 if (size == 0)
1496 size = len1 + len2 +len3;
1497 ASSERT(size >= len1 + len2 + len3);
1498
1499 mp = soallocproto1(buf1, len1, size, sleepflg, cr);
1500 if (mp != NULL) {
1501 soappendmsg(mp, buf2, len2);
1502 soappendmsg(mp, buf3, len3);
1503 }
1504 return (mp);
1505 }
1506
1507 #ifdef DEBUG
1508 char *
1509 pr_state(uint_t state, uint_t mode)
1510 {
1511 static char buf[1024];
1512
1513 buf[0] = 0;
1514 if (state & SS_ISCONNECTED)
1515 (void) strcat(buf, "ISCONNECTED ");
1516 if (state & SS_ISCONNECTING)
1517 (void) strcat(buf, "ISCONNECTING ");
1518 if (state & SS_ISDISCONNECTING)
1519 (void) strcat(buf, "ISDISCONNECTING ");
1520 if (state & SS_CANTSENDMORE)
1521 (void) strcat(buf, "CANTSENDMORE ");
1522
1523 if (state & SS_CANTRCVMORE)
1524 (void) strcat(buf, "CANTRCVMORE ");
1525 if (state & SS_ISBOUND)
1526 (void) strcat(buf, "ISBOUND ");
1527 if (state & SS_NDELAY)
1528 (void) strcat(buf, "NDELAY ");
1529 if (state & SS_NONBLOCK)
1530 (void) strcat(buf, "NONBLOCK ");
1531
1532 if (state & SS_ASYNC)
1533 (void) strcat(buf, "ASYNC ");
1534 if (state & SS_ACCEPTCONN)
1535 (void) strcat(buf, "ACCEPTCONN ");
1536 if (state & SS_SAVEDEOR)
1537 (void) strcat(buf, "SAVEDEOR ");
1538
1539 if (state & SS_RCVATMARK)
1540 (void) strcat(buf, "RCVATMARK ");
1541 if (state & SS_OOBPEND)
1542 (void) strcat(buf, "OOBPEND ");
1543 if (state & SS_HAVEOOBDATA)
1544 (void) strcat(buf, "HAVEOOBDATA ");
1545 if (state & SS_HADOOBDATA)
1546 (void) strcat(buf, "HADOOBDATA ");
1547
1548 if (mode & SM_PRIV)
1549 (void) strcat(buf, "PRIV ");
1550 if (mode & SM_ATOMIC)
1551 (void) strcat(buf, "ATOMIC ");
1552 if (mode & SM_ADDR)
1553 (void) strcat(buf, "ADDR ");
1554 if (mode & SM_CONNREQUIRED)
1555 (void) strcat(buf, "CONNREQUIRED ");
1556
1557 if (mode & SM_FDPASSING)
1558 (void) strcat(buf, "FDPASSING ");
1559 if (mode & SM_EXDATA)
1560 (void) strcat(buf, "EXDATA ");
1561 if (mode & SM_OPTDATA)
1562 (void) strcat(buf, "OPTDATA ");
1563 if (mode & SM_BYTESTREAM)
1564 (void) strcat(buf, "BYTESTREAM ");
1565 return (buf);
1566 }
1567
1568 char *
1569 pr_addr(int family, struct sockaddr *addr, t_uscalar_t addrlen)
1570 {
1571 static char buf[1024];
1572
1573 if (addr == NULL || addrlen == 0) {
1574 (void) sprintf(buf, "(len %d) %p", addrlen, (void *)addr);
1575 return (buf);
1576 }
1577 switch (family) {
1578 case AF_INET: {
1579 struct sockaddr_in sin;
1580
1581 bcopy(addr, &sin, sizeof (sin));
1582
1583 (void) sprintf(buf, "(len %d) %x/%d",
1584 addrlen, ntohl(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1585 break;
1586 }
1587 case AF_INET6: {
1588 struct sockaddr_in6 sin6;
1589 uint16_t *piece = (uint16_t *)&sin6.sin6_addr;
1590
1591 bcopy((char *)addr, (char *)&sin6, sizeof (sin6));
1592 (void) sprintf(buf, "(len %d) %x:%x:%x:%x:%x:%x:%x:%x/%d",
1593 addrlen,
1594 ntohs(piece[0]), ntohs(piece[1]),
1595 ntohs(piece[2]), ntohs(piece[3]),
1596 ntohs(piece[4]), ntohs(piece[5]),
1597 ntohs(piece[6]), ntohs(piece[7]),
1598 ntohs(sin6.sin6_port));
1599 break;
1600 }
1601 case AF_UNIX: {
1602 struct sockaddr_un *soun = (struct sockaddr_un *)addr;
1603
1604 (void) sprintf(buf, "(len %d) %s", addrlen,
1605 (soun == NULL) ? "(none)" : soun->sun_path);
1606 break;
1607 }
1608 default:
1609 (void) sprintf(buf, "(unknown af %d)", family);
1610 break;
1611 }
1612 return (buf);
1613 }
1614
1615 /* The logical equivalence operator (a if-and-only-if b) */
1616 #define EQUIVALENT(a, b) (((a) && (b)) || (!(a) && (!(b))))
1617
1618 /*
1619 * Verify limitations and invariants on oob state.
1620 * Return 1 if OK, otherwise 0 so that it can be used as
1621 * ASSERT(verify_oobstate(so));
1622 */
1623 int
1624 so_verify_oobstate(struct sonode *so)
1625 {
1626 boolean_t havemark;
1627
1628 ASSERT(MUTEX_HELD(&so->so_lock));
1629
1630 /*
1631 * The possible state combinations are:
1632 * 0
1633 * SS_OOBPEND
1634 * SS_OOBPEND|SS_HAVEOOBDATA
1635 * SS_OOBPEND|SS_HADOOBDATA
1636 * SS_HADOOBDATA
1637 */
1638 switch (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA)) {
1639 case 0:
1640 case SS_OOBPEND:
1641 case SS_OOBPEND|SS_HAVEOOBDATA:
1642 case SS_OOBPEND|SS_HADOOBDATA:
1643 case SS_HADOOBDATA:
1644 break;
1645 default:
1646 printf("Bad oob state 1 (%p): state %s\n",
1647 (void *)so, pr_state(so->so_state, so->so_mode));
1648 return (0);
1649 }
1650
1651 /* SS_RCVATMARK should only be set when SS_OOBPEND is set */
1652 if ((so->so_state & (SS_RCVATMARK|SS_OOBPEND)) == SS_RCVATMARK) {
1653 printf("Bad oob state 2 (%p): state %s\n",
1654 (void *)so, pr_state(so->so_state, so->so_mode));
1655 return (0);
1656 }
1657
1658 /*
1659 * (havemark != 0 or SS_RCVATMARK) iff SS_OOBPEND
1660 * For TPI, the presence of a "mark" is indicated by sti_oobsigcnt.
1661 */
1662 havemark = (SOCK_IS_NONSTR(so)) ? so->so_oobmark > 0 :
1663 SOTOTPI(so)->sti_oobsigcnt > 0;
1664
1665 if (!EQUIVALENT(havemark || (so->so_state & SS_RCVATMARK),
1666 so->so_state & SS_OOBPEND)) {
1667 printf("Bad oob state 3 (%p): state %s\n",
1668 (void *)so, pr_state(so->so_state, so->so_mode));
1669 return (0);
1670 }
1671
1672 /*
1673 * Unless SO_OOBINLINE we have so_oobmsg != NULL iff SS_HAVEOOBDATA
1674 */
1675 if (!(so->so_options & SO_OOBINLINE) &&
1676 !EQUIVALENT(so->so_oobmsg != NULL, so->so_state & SS_HAVEOOBDATA)) {
1677 printf("Bad oob state 4 (%p): state %s\n",
1678 (void *)so, pr_state(so->so_state, so->so_mode));
1679 return (0);
1680 }
1681
1682 if (!SOCK_IS_NONSTR(so) &&
1683 SOTOTPI(so)->sti_oobsigcnt < SOTOTPI(so)->sti_oobcnt) {
1684 printf("Bad oob state 5 (%p): counts %d/%d state %s\n",
1685 (void *)so, SOTOTPI(so)->sti_oobsigcnt,
1686 SOTOTPI(so)->sti_oobcnt,
1687 pr_state(so->so_state, so->so_mode));
1688 return (0);
1689 }
1690
1691 return (1);
1692 }
1693 #undef EQUIVALENT
1694 #endif /* DEBUG */
1695
1696 /* initialize sockfs zone specific kstat related items */
1697 void *
1698 sock_kstat_init(zoneid_t zoneid)
1699 {
1700 kstat_t *ksp;
1701
1702 ksp = kstat_create_zone("sockfs", 0, "sock_unix_list", "misc",
1703 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE|KSTAT_FLAG_VIRTUAL, zoneid);
1704
1705 if (ksp != NULL) {
1706 ksp->ks_update = sockfs_update;
1707 ksp->ks_snapshot = sockfs_snapshot;
1708 ksp->ks_lock = &socklist.sl_lock;
1709 ksp->ks_private = (void *)(uintptr_t)zoneid;
1710 kstat_install(ksp);
1711 }
1712
1713 return (ksp);
1714 }
1715
1716 /* tear down sockfs zone specific kstat related items */
1717 /*ARGSUSED*/
1718 void
1719 sock_kstat_fini(zoneid_t zoneid, void *arg)
1720 {
1721 kstat_t *ksp = (kstat_t *)arg;
1722
1723 if (ksp != NULL) {
1724 ASSERT(zoneid == (zoneid_t)(uintptr_t)ksp->ks_private);
1725 kstat_delete(ksp);
1726 }
1727 }
1728
1729 /*
1730 * Zones:
1731 * Note that nactive is going to be different for each zone.
1732 * This means we require kstat to call sockfs_update and then sockfs_snapshot
1733 * for the same zone, or sockfs_snapshot will be taken into the wrong size
1734 * buffer. This is safe, but if the buffer is too small, user will not be
1735 * given details of all sockets. However, as this kstat has a ks_lock, kstat
1736 * driver will keep it locked between the update and the snapshot, so no
1737 * other process (zone) can currently get inbetween resulting in a wrong size
1738 * buffer allocation.
1739 */
1740 static int
1741 sockfs_update(kstat_t *ksp, int rw)
1742 {
1743 uint_t nactive = 0; /* # of active AF_UNIX sockets */
1744 struct sonode *so; /* current sonode on socklist */
1745 zoneid_t myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private;
1746
1747 ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid());
1748
1749 if (rw == KSTAT_WRITE) { /* bounce all writes */
1750 return (EACCES);
1751 }
1752
1753 for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) {
1754 if (so->so_count != 0 && so->so_zoneid == myzoneid) {
1755 nactive++;
1756 }
1757 }
1758 ksp->ks_ndata = nactive;
1759 ksp->ks_data_size = nactive * sizeof (struct sockinfo);
1760
1761 return (0);
1762 }
1763
1764 static int
1765 sockfs_snapshot(kstat_t *ksp, void *buf, int rw)
1766 {
1767 int ns; /* # of sonodes we've copied */
1768 struct sonode *so; /* current sonode on socklist */
1769 struct sockinfo *psi; /* where we put sockinfo data */
1770 t_uscalar_t sn_len; /* soa_len */
1771 zoneid_t myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private;
1772 sotpi_info_t *sti;
1773
1774 ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid());
1775
1776 ksp->ks_snaptime = gethrtime();
1777
1778 if (rw == KSTAT_WRITE) { /* bounce all writes */
1779 return (EACCES);
1780 }
1781
1782 /*
1783 * For each sonode on the socklist, we massage the important
1784 * info into buf, in sockinfo format.
1785 */
1786 psi = (struct sockinfo *)buf;
1787 ns = 0;
1788 for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) {
1789 vattr_t attr;
1790
1791 /* only stuff active sonodes and the same zone: */
1792 if (so->so_count == 0 || so->so_zoneid != myzoneid) {
1793 continue;
1794 }
1795
1796 /*
1797 * If the sonode was activated between the update and the
1798 * snapshot, we're done - as this is only a snapshot.
1799 */
1800 if ((caddr_t)(psi) >= (caddr_t)buf + ksp->ks_data_size) {
1801 break;
1802 }
1803
1804 sti = SOTOTPI(so);
1805 /* copy important info into buf: */
1806 psi->si_size = sizeof (struct sockinfo);
1807 psi->si_family = so->so_family;
1808 psi->si_type = so->so_type;
1809 psi->si_flag = so->so_flag;
1810 psi->si_state = so->so_state;
1811 psi->si_serv_type = sti->sti_serv_type;
1812 psi->si_ux_laddr_sou_magic = sti->sti_ux_laddr.soua_magic;
1813 psi->si_ux_faddr_sou_magic = sti->sti_ux_faddr.soua_magic;
1814 psi->si_laddr_soa_len = sti->sti_laddr.soa_len;
1815 psi->si_faddr_soa_len = sti->sti_faddr.soa_len;
1816 psi->si_szoneid = so->so_zoneid;
1817 psi->si_faddr_noxlate = sti->sti_faddr_noxlate;
1818
1819 /*
1820 * Grab the inode, if possible.
1821 * This must be done before entering so_lock as VOP_GETATTR
1822 * will acquire it.
1823 */
1824 if (so->so_vnode == NULL ||
1825 VOP_GETATTR(so->so_vnode, &attr, 0, CRED(), NULL) != 0)
1826 attr.va_nodeid = 0;
1827
1828 psi->si_inode = attr.va_nodeid;
1829
1830 mutex_enter(&so->so_lock);
1831
1832 if (sti->sti_laddr_sa != NULL) {
1833 ASSERT(sti->sti_laddr_sa->sa_data != NULL);
1834 sn_len = sti->sti_laddr_len;
1835 ASSERT(sn_len <= sizeof (short) +
1836 sizeof (psi->si_laddr_sun_path));
1837
1838 psi->si_laddr_family =
1839 sti->sti_laddr_sa->sa_family;
1840 if (sn_len != 0) {
1841 /* AF_UNIX socket names are NULL terminated */
1842 (void) strncpy(psi->si_laddr_sun_path,
1843 sti->sti_laddr_sa->sa_data,
1844 sizeof (psi->si_laddr_sun_path));
1845 sn_len = strlen(psi->si_laddr_sun_path);
1846 }
1847 psi->si_laddr_sun_path[sn_len] = 0;
1848 }
1849
1850 if (sti->sti_faddr_sa != NULL) {
1851 ASSERT(sti->sti_faddr_sa->sa_data != NULL);
1852 sn_len = sti->sti_faddr_len;
1853 ASSERT(sn_len <= sizeof (short) +
1854 sizeof (psi->si_faddr_sun_path));
1855
1856 psi->si_faddr_family =
1857 sti->sti_faddr_sa->sa_family;
1858 if (sn_len != 0) {
1859 (void) strncpy(psi->si_faddr_sun_path,
1860 sti->sti_faddr_sa->sa_data,
1861 sizeof (psi->si_faddr_sun_path));
1862 sn_len = strlen(psi->si_faddr_sun_path);
1863 }
1864 psi->si_faddr_sun_path[sn_len] = 0;
1865 }
1866
1867 mutex_exit(&so->so_lock);
1868
1869 (void) snprintf(psi->si_son_straddr,
1870 sizeof (psi->si_son_straddr), "%p", (void *)so);
1871 (void) snprintf(psi->si_lvn_straddr,
1872 sizeof (psi->si_lvn_straddr), "%p",
1873 (void *)sti->sti_ux_laddr.soua_vp);
1874 (void) snprintf(psi->si_fvn_straddr,
1875 sizeof (psi->si_fvn_straddr), "%p",
1876 (void *)sti->sti_ux_faddr.soua_vp);
1877
1878 ns++;
1879 psi++;
1880 }
1881
1882 ksp->ks_ndata = ns;
1883 return (0);
1884 }
1885
1886 ssize_t
1887 soreadfile(file_t *fp, uchar_t *buf, u_offset_t fileoff, int *err, size_t size)
1888 {
1889 struct uio auio;
1890 struct iovec aiov[MSG_MAXIOVLEN];
1891 register vnode_t *vp;
1892 int ioflag, rwflag;
1893 ssize_t cnt;
1894 int error = 0;
1895 int iovcnt = 0;
1896 short fflag;
1897
1898 vp = fp->f_vnode;
1899 fflag = fp->f_flag;
1900
1901 rwflag = 0;
1902 aiov[0].iov_base = (caddr_t)buf;
1903 aiov[0].iov_len = size;
1904 iovcnt = 1;
1905 cnt = (ssize_t)size;
1906 (void) VOP_RWLOCK(vp, rwflag, NULL);
1907
1908 auio.uio_loffset = fileoff;
1909 auio.uio_iov = aiov;
1910 auio.uio_iovcnt = iovcnt;
1911 auio.uio_resid = cnt;
1912 auio.uio_segflg = UIO_SYSSPACE;
1913 auio.uio_llimit = MAXOFFSET_T;
1914 auio.uio_fmode = fflag;
1915 auio.uio_extflg = UIO_COPY_CACHED;
1916
1917 ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1918
1919 /* If read sync is not asked for, filter sync flags */
1920 if ((ioflag & FRSYNC) == 0)
1921 ioflag &= ~(FSYNC|FDSYNC);
1922 error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
1923 cnt -= auio.uio_resid;
1924
1925 VOP_RWUNLOCK(vp, rwflag, NULL);
1926
1927 if (error == EINTR && cnt != 0)
1928 error = 0;
1929 out:
1930 if (error != 0) {
1931 *err = error;
1932 return (0);
1933 } else {
1934 *err = 0;
1935 return (cnt);
1936 }
1937 }
1938
1939 int
1940 so_copyin(const void *from, void *to, size_t size, int fromkernel)
1941 {
1942 if (fromkernel) {
1943 bcopy(from, to, size);
1944 return (0);
1945 }
1946 return (xcopyin(from, to, size));
1947 }
1948
1949 int
1950 so_copyout(const void *from, void *to, size_t size, int tokernel)
1951 {
1952 if (tokernel) {
1953 bcopy(from, to, size);
1954 return (0);
1955 }
1956 return (xcopyout(from, to, size));
1957 }