1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 #include <sys/types.h>
  27 #include <sys/t_lock.h>
  28 #include <sys/param.h>
  29 #include <sys/systm.h>
  30 #include <sys/buf.h>
  31 #include <sys/conf.h>
  32 #include <sys/cred.h>
  33 #include <sys/kmem.h>
  34 #include <sys/sysmacros.h>
  35 #include <sys/vfs.h>
  36 #include <sys/vfs_opreg.h>
  37 #include <sys/vnode.h>
  38 #include <sys/debug.h>
  39 #include <sys/errno.h>
  40 #include <sys/time.h>
  41 #include <sys/file.h>
  42 #include <sys/open.h>
  43 #include <sys/user.h>
  44 #include <sys/termios.h>
  45 #include <sys/stream.h>
  46 #include <sys/strsubr.h>
  47 #include <sys/strsun.h>
  48 #include <sys/esunddi.h>
  49 #include <sys/flock.h>
  50 #include <sys/modctl.h>
  51 #include <sys/cmn_err.h>
  52 #include <sys/mkdev.h>
  53 #include <sys/pathname.h>
  54 #include <sys/ddi.h>
  55 #include <sys/stat.h>
  56 #include <sys/fs/snode.h>
  57 #include <sys/fs/dv_node.h>
  58 #include <sys/zone.h>
  59 
  60 #include <sys/socket.h>
  61 #include <sys/socketvar.h>
  62 #include <netinet/in.h>
  63 #include <sys/un.h>
  64 #include <sys/ucred.h>
  65 
  66 #include <sys/tiuser.h>
  67 #define _SUN_TPI_VERSION        2
  68 #include <sys/tihdr.h>
  69 
  70 #include <c2/audit.h>
  71 
  72 #include <fs/sockfs/nl7c.h>
  73 #include <fs/sockfs/sockcommon.h>
  74 #include <fs/sockfs/sockfilter_impl.h>
  75 #include <fs/sockfs/socktpi.h>
  76 #include <fs/sockfs/socktpi_impl.h>
  77 #include <fs/sockfs/sodirect.h>
  78 
  79 /*
  80  * Macros that operate on struct cmsghdr.
  81  * The CMSG_VALID macro does not assume that the last option buffer is padded.
  82  */
  83 #define CMSG_CONTENT(cmsg)      (&((cmsg)[1]))
  84 #define CMSG_CONTENTLEN(cmsg)   ((cmsg)->cmsg_len - sizeof (struct cmsghdr))
  85 #define CMSG_VALID(cmsg, start, end)                                    \
  86         (ISALIGNED_cmsghdr(cmsg) &&                                     \
  87         ((uintptr_t)(cmsg) >= (uintptr_t)(start)) &&                 \
  88         ((uintptr_t)(cmsg) < (uintptr_t)(end)) &&                    \
  89         ((ssize_t)(cmsg)->cmsg_len >= sizeof (struct cmsghdr)) && \
  90         ((uintptr_t)(cmsg) + (cmsg)->cmsg_len <= (uintptr_t)(end)))
  91 #define SO_LOCK_WAKEUP_TIME     3000    /* Wakeup time in milliseconds */
  92 
  93 dev_t sockdev;  /* For fsid in getattr */
  94 int sockfs_defer_nl7c_init = 0;
  95 
  96 struct socklist socklist;
  97 
  98 struct kmem_cache *socket_cache;
  99 
 100 /*
 101  * sockconf_lock protects the socket configuration (socket types and
 102  * socket filters) which is changed via the sockconfig system call.
 103  */
 104 krwlock_t sockconf_lock;
 105 
 106 static int sockfs_update(kstat_t *, int);
 107 static int sockfs_snapshot(kstat_t *, void *, int);
 108 extern smod_info_t *sotpi_smod_create(void);
 109 
 110 extern void sendfile_init();
 111 
 112 extern void nl7c_init(void);
 113 
 114 extern int modrootloaded;
 115 
 116 /*
 117  * Translate from a device pathname (e.g. "/dev/tcp") to a vnode.
 118  * Returns with the vnode held.
 119  */
 120 int
 121 sogetvp(char *devpath, vnode_t **vpp, int uioflag)
 122 {
 123         struct snode *csp;
 124         vnode_t *vp, *dvp;
 125         major_t maj;
 126         int error;
 127 
 128         ASSERT(uioflag == UIO_SYSSPACE || uioflag == UIO_USERSPACE);
 129 
 130         /*
 131          * Lookup the underlying filesystem vnode.
 132          */
 133         error = lookupname(devpath, uioflag, FOLLOW, NULLVPP, &vp);
 134         if (error)
 135                 return (error);
 136 
 137         /* Check that it is the correct vnode */
 138         if (vp->v_type != VCHR) {
 139                 VN_RELE(vp);
 140                 return (ENOTSOCK);
 141         }
 142 
 143         /*
 144          * If devpath went through devfs, the device should already
 145          * be configured. If devpath is a mknod file, however, we
 146          * need to make sure the device is properly configured.
 147          * To do this, we do something similar to spec_open()
 148          * except that we resolve to the minor/leaf level since
 149          * we need to return a vnode.
 150          */
 151         csp = VTOS(VTOS(vp)->s_commonvp);
 152         if (!(csp->s_flag & SDIPSET)) {
 153                 char *pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 154                 error = ddi_dev_pathname(vp->v_rdev, S_IFCHR, pathname);
 155                 if (error == 0)
 156                         error = devfs_lookupname(pathname, NULLVPP, &dvp);
 157                 VN_RELE(vp);
 158                 kmem_free(pathname, MAXPATHLEN);
 159                 if (error != 0)
 160                         return (ENXIO);
 161                 vp = dvp;       /* use the devfs vp */
 162         }
 163 
 164         /* device is configured at this point */
 165         maj = getmajor(vp->v_rdev);
 166         if (!STREAMSTAB(maj)) {
 167                 VN_RELE(vp);
 168                 return (ENOSTR);
 169         }
 170 
 171         *vpp = vp;
 172         return (0);
 173 }
 174 
 175 /*
 176  * Update the accessed, updated, or changed times in an sonode
 177  * with the current time.
 178  *
 179  * Note that both SunOS 4.X and 4.4BSD sockets do not present reasonable
 180  * attributes in a fstat call. (They return the current time and 0 for
 181  * all timestamps, respectively.) We maintain the current timestamps
 182  * here primarily so that should sockmod be popped the resulting
 183  * file descriptor will behave like a stream w.r.t. the timestamps.
 184  */
 185 void
 186 so_update_attrs(struct sonode *so, int flag)
 187 {
 188         time_t now = gethrestime_sec();
 189 
 190         if (SOCK_IS_NONSTR(so))
 191                 return;
 192 
 193         mutex_enter(&so->so_lock);
 194         so->so_flag |= flag;
 195         if (flag & SOACC)
 196                 SOTOTPI(so)->sti_atime = now;
 197         if (flag & SOMOD)
 198                 SOTOTPI(so)->sti_mtime = now;
 199         mutex_exit(&so->so_lock);
 200 }
 201 
 202 extern so_create_func_t sock_comm_create_function;
 203 extern so_destroy_func_t sock_comm_destroy_function;
 204 /*
 205  * Init function called when sockfs is loaded.
 206  */
 207 int
 208 sockinit(int fstype, char *name)
 209 {
 210         static const fs_operation_def_t sock_vfsops_template[] = {
 211                 NULL, NULL
 212         };
 213         int error;
 214         major_t dev;
 215         char *err_str;
 216 
 217         error = vfs_setfsops(fstype, sock_vfsops_template, NULL);
 218         if (error != 0) {
 219                 zcmn_err(GLOBAL_ZONEID, CE_WARN,
 220                     "sockinit: bad vfs ops template");
 221                 return (error);
 222         }
 223 
 224         error = vn_make_ops(name, socket_vnodeops_template,
 225             &socket_vnodeops);
 226         if (error != 0) {
 227                 err_str = "sockinit: bad socket vnode ops template";
 228                 /* vn_make_ops() does not reset socktpi_vnodeops on failure. */
 229                 socket_vnodeops = NULL;
 230                 goto failure;
 231         }
 232 
 233         socket_cache = kmem_cache_create("socket_cache",
 234             sizeof (struct sonode), 0, sonode_constructor,
 235             sonode_destructor, NULL, NULL, NULL, 0);
 236 
 237         rw_init(&sockconf_lock, NULL, RW_DEFAULT, NULL);
 238 
 239         error = socktpi_init();
 240         if (error != 0) {
 241                 err_str = NULL;
 242                 goto failure;
 243         }
 244 
 245         error = sod_init();
 246         if (error != 0) {
 247                 err_str = NULL;
 248                 goto failure;
 249         }
 250 
 251         /*
 252          * Set up the default create and destroy functions
 253          */
 254         sock_comm_create_function = socket_sonode_create;
 255         sock_comm_destroy_function = socket_sonode_destroy;
 256 
 257         /*
 258          * Build initial list mapping socket parameters to vnode.
 259          */
 260         smod_init();
 261         smod_add(sotpi_smod_create());
 262 
 263         sockparams_init();
 264 
 265         /*
 266          * If sockets are needed before init runs /sbin/soconfig
 267          * it is possible to preload the sockparams list here using
 268          * calls like:
 269          *      sockconfig(1,2,3, "/dev/tcp", 0);
 270          */
 271 
 272         /*
 273          * Create a unique dev_t for use in so_fsid.
 274          */
 275 
 276         if ((dev = getudev()) == (major_t)-1)
 277                 dev = 0;
 278         sockdev = makedevice(dev, 0);
 279 
 280         mutex_init(&socklist.sl_lock, NULL, MUTEX_DEFAULT, NULL);
 281         sendfile_init();
 282         if (!modrootloaded) {
 283                 sockfs_defer_nl7c_init = 1;
 284         } else {
 285                 nl7c_init();
 286         }
 287 
 288         /* Initialize socket filters */
 289         sof_init();
 290 
 291         return (0);
 292 
 293 failure:
 294         (void) vfs_freevfsops_by_type(fstype);
 295         if (socket_vnodeops != NULL)
 296                 vn_freevnodeops(socket_vnodeops);
 297         if (err_str != NULL)
 298                 zcmn_err(GLOBAL_ZONEID, CE_WARN, err_str);
 299         return (error);
 300 }
 301 
 302 /*
 303  * Caller must hold the mutex. Used to set SOLOCKED.
 304  */
 305 void
 306 so_lock_single(struct sonode *so)
 307 {
 308         ASSERT(MUTEX_HELD(&so->so_lock));
 309 
 310         while (so->so_flag & (SOLOCKED | SOASYNC_UNBIND)) {
 311                 cv_wait_stop(&so->so_single_cv, &so->so_lock,
 312                     SO_LOCK_WAKEUP_TIME);
 313         }
 314         so->so_flag |= SOLOCKED;
 315 }
 316 
 317 /*
 318  * Caller must hold the mutex and pass in SOLOCKED or SOASYNC_UNBIND.
 319  * Used to clear SOLOCKED or SOASYNC_UNBIND.
 320  */
 321 void
 322 so_unlock_single(struct sonode *so, int flag)
 323 {
 324         ASSERT(MUTEX_HELD(&so->so_lock));
 325         ASSERT(flag & (SOLOCKED|SOASYNC_UNBIND));
 326         ASSERT((flag & ~(SOLOCKED|SOASYNC_UNBIND)) == 0);
 327         ASSERT(so->so_flag & flag);
 328         /*
 329          * Process the T_DISCON_IND on sti_discon_ind_mp.
 330          *
 331          * Call to so_drain_discon_ind will result in so_lock
 332          * being dropped and re-acquired later.
 333          */
 334         if (!SOCK_IS_NONSTR(so)) {
 335                 sotpi_info_t *sti = SOTOTPI(so);
 336 
 337                 if (sti->sti_discon_ind_mp != NULL)
 338                         so_drain_discon_ind(so);
 339         }
 340 
 341         cv_signal(&so->so_single_cv);
 342         so->so_flag &= ~flag;
 343 }
 344 
 345 /*
 346  * Caller must hold the mutex. Used to set SOREADLOCKED.
 347  * If the caller wants nonblocking behavior it should set fmode.
 348  */
 349 int
 350 so_lock_read(struct sonode *so, int fmode)
 351 {
 352         ASSERT(MUTEX_HELD(&so->so_lock));
 353 
 354         while (so->so_flag & SOREADLOCKED) {
 355                 if (fmode & (FNDELAY|FNONBLOCK))
 356                         return (EWOULDBLOCK);
 357                 cv_wait_stop(&so->so_read_cv, &so->so_lock,
 358                     SO_LOCK_WAKEUP_TIME);
 359         }
 360         so->so_flag |= SOREADLOCKED;
 361         return (0);
 362 }
 363 
 364 /*
 365  * Like so_lock_read above but allows signals.
 366  */
 367 int
 368 so_lock_read_intr(struct sonode *so, int fmode)
 369 {
 370         ASSERT(MUTEX_HELD(&so->so_lock));
 371 
 372         while (so->so_flag & SOREADLOCKED) {
 373                 if (fmode & (FNDELAY|FNONBLOCK))
 374                         return (EWOULDBLOCK);
 375                 if (!cv_wait_sig(&so->so_read_cv, &so->so_lock))
 376                         return (EINTR);
 377         }
 378         so->so_flag |= SOREADLOCKED;
 379         return (0);
 380 }
 381 
 382 /*
 383  * Caller must hold the mutex. Used to clear SOREADLOCKED,
 384  * set in so_lock_read() or so_lock_read_intr().
 385  */
 386 void
 387 so_unlock_read(struct sonode *so)
 388 {
 389         ASSERT(MUTEX_HELD(&so->so_lock));
 390         ASSERT(so->so_flag & SOREADLOCKED);
 391 
 392         cv_signal(&so->so_read_cv);
 393         so->so_flag &= ~SOREADLOCKED;
 394 }
 395 
 396 /*
 397  * Verify that the specified offset falls within the mblk and
 398  * that the resulting pointer is aligned.
 399  * Returns NULL if not.
 400  */
 401 void *
 402 sogetoff(mblk_t *mp, t_uscalar_t offset,
 403     t_uscalar_t length, uint_t align_size)
 404 {
 405         uintptr_t ptr1, ptr2;
 406 
 407         ASSERT(mp && mp->b_wptr >= mp->b_rptr);
 408         ptr1 = (uintptr_t)mp->b_rptr + offset;
 409         ptr2 = (uintptr_t)ptr1 + length;
 410         if (ptr1 < (uintptr_t)mp->b_rptr || ptr2 > (uintptr_t)mp->b_wptr) {
 411                 eprintline(0);
 412                 return (NULL);
 413         }
 414         if ((ptr1 & (align_size - 1)) != 0) {
 415                 eprintline(0);
 416                 return (NULL);
 417         }
 418         return ((void *)ptr1);
 419 }
 420 
 421 /*
 422  * Return the AF_UNIX underlying filesystem vnode matching a given name.
 423  * Makes sure the sending and the destination sonodes are compatible.
 424  * The vnode is returned held.
 425  *
 426  * The underlying filesystem VSOCK vnode has a v_stream pointer that
 427  * references the actual stream head (hence indirectly the actual sonode).
 428  */
 429 static int
 430 so_ux_lookup(struct sonode *so, struct sockaddr_un *soun, int checkaccess,
 431     vnode_t **vpp)
 432 {
 433         vnode_t         *vp;    /* Underlying filesystem vnode */
 434         vnode_t         *rvp;   /* real vnode */
 435         vnode_t         *svp;   /* sockfs vnode */
 436         struct sonode   *so2;
 437         int             error;
 438 
 439         dprintso(so, 1, ("so_ux_lookup(%p) name <%s>\n", (void *)so,
 440             soun->sun_path));
 441 
 442         error = lookupname(soun->sun_path, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
 443         if (error) {
 444                 eprintsoline(so, error);
 445                 return (error);
 446         }
 447 
 448         /*
 449          * Traverse lofs mounts get the real vnode
 450          */
 451         if (VOP_REALVP(vp, &rvp, NULL) == 0) {
 452                 VN_HOLD(rvp);           /* hold the real vnode */
 453                 VN_RELE(vp);            /* release hold from lookup */
 454                 vp = rvp;
 455         }
 456 
 457         if (vp->v_type != VSOCK) {
 458                 error = ENOTSOCK;
 459                 eprintsoline(so, error);
 460                 goto done2;
 461         }
 462 
 463         if (checkaccess) {
 464                 /*
 465                  * Check that we have permissions to access the destination
 466                  * vnode. This check is not done in BSD but it is required
 467                  * by X/Open.
 468                  */
 469                 if (error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED(), NULL)) {
 470                         eprintsoline(so, error);
 471                         goto done2;
 472                 }
 473         }
 474 
 475         /*
 476          * Check if the remote socket has been closed.
 477          *
 478          * Synchronize with vn_rele_stream by holding v_lock while traversing
 479          * v_stream->sd_vnode.
 480          */
 481         mutex_enter(&vp->v_lock);
 482         if (vp->v_stream == NULL) {
 483                 mutex_exit(&vp->v_lock);
 484                 if (so->so_type == SOCK_DGRAM)
 485                         error = EDESTADDRREQ;
 486                 else
 487                         error = ECONNREFUSED;
 488 
 489                 eprintsoline(so, error);
 490                 goto done2;
 491         }
 492         ASSERT(vp->v_stream->sd_vnode);
 493         svp = vp->v_stream->sd_vnode;
 494         /*
 495          * holding v_lock on underlying filesystem vnode and acquiring
 496          * it on sockfs vnode. Assumes that no code ever attempts to
 497          * acquire these locks in the reverse order.
 498          */
 499         VN_HOLD(svp);
 500         mutex_exit(&vp->v_lock);
 501 
 502         if (svp->v_type != VSOCK) {
 503                 error = ENOTSOCK;
 504                 eprintsoline(so, error);
 505                 goto done;
 506         }
 507 
 508         so2 = VTOSO(svp);
 509 
 510         if (so->so_type != so2->so_type) {
 511                 error = EPROTOTYPE;
 512                 eprintsoline(so, error);
 513                 goto done;
 514         }
 515 
 516         VN_RELE(svp);
 517         *vpp = vp;
 518         return (0);
 519 
 520 done:
 521         VN_RELE(svp);
 522 done2:
 523         VN_RELE(vp);
 524         return (error);
 525 }
 526 
 527 /*
 528  * Verify peer address for connect and sendto/sendmsg.
 529  * Since sendto/sendmsg would not get synchronous errors from the transport
 530  * provider we have to do these ugly checks in the socket layer to
 531  * preserve compatibility with SunOS 4.X.
 532  */
 533 int
 534 so_addr_verify(struct sonode *so, const struct sockaddr *name,
 535     socklen_t namelen)
 536 {
 537         int             family;
 538 
 539         dprintso(so, 1, ("so_addr_verify(%p, %p, %d)\n",
 540             (void *)so, (void *)name, namelen));
 541 
 542         ASSERT(name != NULL);
 543 
 544         family = so->so_family;
 545         switch (family) {
 546         case AF_INET:
 547                 if (name->sa_family != family) {
 548                         eprintsoline(so, EAFNOSUPPORT);
 549                         return (EAFNOSUPPORT);
 550                 }
 551                 if (namelen != (socklen_t)sizeof (struct sockaddr_in)) {
 552                         eprintsoline(so, EINVAL);
 553                         return (EINVAL);
 554                 }
 555                 break;
 556         case AF_INET6: {
 557 #ifdef DEBUG
 558                 struct sockaddr_in6 *sin6;
 559 #endif /* DEBUG */
 560 
 561                 if (name->sa_family != family) {
 562                         eprintsoline(so, EAFNOSUPPORT);
 563                         return (EAFNOSUPPORT);
 564                 }
 565                 if (namelen != (socklen_t)sizeof (struct sockaddr_in6)) {
 566                         eprintsoline(so, EINVAL);
 567                         return (EINVAL);
 568                 }
 569 #ifdef DEBUG
 570                 /* Verify that apps don't forget to clear sin6_scope_id etc */
 571                 sin6 = (struct sockaddr_in6 *)name;
 572                 if (sin6->sin6_scope_id != 0 &&
 573                     !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
 574                         zcmn_err(getzoneid(), CE_WARN,
 575                             "connect/send* with uninitialized sin6_scope_id "
 576                             "(%d) on socket. Pid = %d\n",
 577                             (int)sin6->sin6_scope_id, (int)curproc->p_pid);
 578                 }
 579 #endif /* DEBUG */
 580                 break;
 581         }
 582         case AF_UNIX:
 583                 if (SOTOTPI(so)->sti_faddr_noxlate) {
 584                         return (0);
 585                 }
 586                 if (namelen < (socklen_t)sizeof (short)) {
 587                         eprintsoline(so, ENOENT);
 588                         return (ENOENT);
 589                 }
 590                 if (name->sa_family != family) {
 591                         eprintsoline(so, EAFNOSUPPORT);
 592                         return (EAFNOSUPPORT);
 593                 }
 594                 /* MAXPATHLEN + soun_family + nul termination */
 595                 if (namelen > (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
 596                         eprintsoline(so, ENAMETOOLONG);
 597                         return (ENAMETOOLONG);
 598                 }
 599 
 600                 break;
 601 
 602         default:
 603                 /*
 604                  * Default is don't do any length or sa_family check
 605                  * to allow non-sockaddr style addresses.
 606                  */
 607                 break;
 608         }
 609 
 610         return (0);
 611 }
 612 
 613 
 614 /*
 615  * Translate an AF_UNIX sockaddr_un to the transport internal name.
 616  * Assumes caller has called so_addr_verify first.
 617  */
 618 /*ARGSUSED*/
 619 int
 620 so_ux_addr_xlate(struct sonode *so, struct sockaddr *name,
 621     socklen_t namelen, int checkaccess,
 622     void **addrp, socklen_t *addrlenp)
 623 {
 624         int                     error;
 625         struct sockaddr_un      *soun;
 626         vnode_t                 *vp;
 627         void                    *addr;
 628         socklen_t               addrlen;
 629         sotpi_info_t            *sti = SOTOTPI(so);
 630 
 631         dprintso(so, 1, ("so_ux_addr_xlate(%p, %p, %d, %d)\n",
 632             (void *)so, (void *)name, namelen, checkaccess));
 633 
 634         ASSERT(name != NULL);
 635         ASSERT(so->so_family == AF_UNIX);
 636         ASSERT(!sti->sti_faddr_noxlate);
 637         ASSERT(namelen >= (socklen_t)sizeof (short));
 638         ASSERT(name->sa_family == AF_UNIX);
 639         soun = (struct sockaddr_un *)name;
 640         /*
 641          * Lookup vnode for the specified path name and verify that
 642          * it is a socket.
 643          */
 644         error = so_ux_lookup(so, soun, checkaccess, &vp);
 645         if (error) {
 646                 eprintsoline(so, error);
 647                 return (error);
 648         }
 649         /*
 650          * Use the address of the peer vnode as the address to send
 651          * to. We release the peer vnode here. In case it has been
 652          * closed by the time the T_CONN_REQ or T_UNIDATA_REQ reaches the
 653          * transport the message will get an error or be dropped.
 654          */
 655         sti->sti_ux_faddr.soua_vp = vp;
 656         sti->sti_ux_faddr.soua_magic = SOU_MAGIC_EXPLICIT;
 657         addr = &sti->sti_ux_faddr;
 658         addrlen = (socklen_t)sizeof (sti->sti_ux_faddr);
 659         dprintso(so, 1, ("ux_xlate UNIX: addrlen %d, vp %p\n",
 660             addrlen, (void *)vp));
 661         VN_RELE(vp);
 662         *addrp = addr;
 663         *addrlenp = (socklen_t)addrlen;
 664         return (0);
 665 }
 666 
 667 /*
 668  * Esballoc free function for messages that contain SO_FILEP option.
 669  * Decrement the reference count on the file pointers using closef.
 670  */
 671 void
 672 fdbuf_free(struct fdbuf *fdbuf)
 673 {
 674         int     i;
 675         struct file *fp;
 676 
 677         dprint(1, ("fdbuf_free: %d fds\n", fdbuf->fd_numfd));
 678         for (i = 0; i < fdbuf->fd_numfd; i++) {
 679                 /*
 680                  * We need pointer size alignment for fd_fds. On a LP64
 681                  * kernel, the required alignment is 8 bytes while
 682                  * the option headers and values are only 4 bytes
 683                  * aligned. So its safer to do a bcopy compared to
 684                  * assigning fdbuf->fd_fds[i] to fp.
 685                  */
 686                 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp));
 687                 dprint(1, ("fdbuf_free: [%d] = %p\n", i, (void *)fp));
 688                 (void) closef(fp);
 689         }
 690         if (fdbuf->fd_ebuf != NULL)
 691                 kmem_free(fdbuf->fd_ebuf, fdbuf->fd_ebuflen);
 692         kmem_free(fdbuf, fdbuf->fd_size);
 693 }
 694 
 695 /*
 696  * Allocate an esballoc'ed message for AF_UNIX file descriptor passing.
 697  * Waits if memory is not available.
 698  */
 699 mblk_t *
 700 fdbuf_allocmsg(int size, struct fdbuf *fdbuf)
 701 {
 702         uchar_t *buf;
 703         mblk_t  *mp;
 704 
 705         dprint(1, ("fdbuf_allocmsg: size %d, %d fds\n", size, fdbuf->fd_numfd));
 706         buf = kmem_alloc(size, KM_SLEEP);
 707         fdbuf->fd_ebuf = (caddr_t)buf;
 708         fdbuf->fd_ebuflen = size;
 709         fdbuf->fd_frtn.free_func = fdbuf_free;
 710         fdbuf->fd_frtn.free_arg = (caddr_t)fdbuf;
 711 
 712         mp = esballoc_wait(buf, size, BPRI_MED, &fdbuf->fd_frtn);
 713         mp->b_datap->db_type = M_PROTO;
 714         return (mp);
 715 }
 716 
 717 /*
 718  * Extract file descriptors from a fdbuf.
 719  * Return list in rights/rightslen.
 720  */
 721 /*ARGSUSED*/
 722 static int
 723 fdbuf_extract(struct fdbuf *fdbuf, void *rights, int rightslen)
 724 {
 725         int     i, fd;
 726         int     *rp;
 727         struct file *fp;
 728         int     numfd;
 729 
 730         dprint(1, ("fdbuf_extract: %d fds, len %d\n",
 731             fdbuf->fd_numfd, rightslen));
 732 
 733         numfd = fdbuf->fd_numfd;
 734         ASSERT(rightslen == numfd * (int)sizeof (int));
 735 
 736         /*
 737          * Allocate a file descriptor and increment the f_count.
 738          * The latter is needed since we always call fdbuf_free
 739          * which performs a closef.
 740          */
 741         rp = (int *)rights;
 742         for (i = 0; i < numfd; i++) {
 743                 if ((fd = ufalloc(0)) == -1)
 744                         goto cleanup;
 745                 /*
 746                  * We need pointer size alignment for fd_fds. On a LP64
 747                  * kernel, the required alignment is 8 bytes while
 748                  * the option headers and values are only 4 bytes
 749                  * aligned. So its safer to do a bcopy compared to
 750                  * assigning fdbuf->fd_fds[i] to fp.
 751                  */
 752                 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp));
 753                 mutex_enter(&fp->f_tlock);
 754                 fp->f_count++;
 755                 mutex_exit(&fp->f_tlock);
 756                 setf(fd, fp);
 757                 *rp++ = fd;
 758 
 759                 /*
 760                  * Add the current pid to the list associated with this
 761                  * descriptor.
 762                  */
 763                 if (fp->f_vnode != NULL)
 764                         (void) VOP_IOCTL(fp->f_vnode, F_ASSOCI_PID,
 765                             (intptr_t)curproc->p_pidp->pid_id, FKIOCTL, kcred,
 766                             NULL, NULL);
 767 
 768                 if (AU_AUDITING())
 769                         audit_fdrecv(fd, fp);
 770                 dprint(1, ("fdbuf_extract: [%d] = %d, %p refcnt %d\n",
 771                     i, fd, (void *)fp, fp->f_count));
 772         }
 773         return (0);
 774 
 775 cleanup:
 776         /*
 777          * Undo whatever partial work the loop above has done.
 778          */
 779         {
 780                 int j;
 781 
 782                 rp = (int *)rights;
 783                 for (j = 0; j < i; j++) {
 784                         dprint(0,
 785                             ("fdbuf_extract: cleanup[%d] = %d\n", j, *rp));
 786                         (void) closeandsetf(*rp++, NULL);
 787                 }
 788         }
 789 
 790         return (EMFILE);
 791 }
 792 
 793 /*
 794  * Insert file descriptors into an fdbuf.
 795  * Returns a kmem_alloc'ed fdbuf. The fdbuf should be freed
 796  * by calling fdbuf_free().
 797  */
 798 int
 799 fdbuf_create(void *rights, int rightslen, struct fdbuf **fdbufp)
 800 {
 801         int             numfd, i;
 802         int             *fds;
 803         struct file     *fp;
 804         struct fdbuf    *fdbuf;
 805         int             fdbufsize;
 806 
 807         dprint(1, ("fdbuf_create: len %d\n", rightslen));
 808 
 809         numfd = rightslen / (int)sizeof (int);
 810 
 811         fdbufsize = (int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *));
 812         fdbuf = kmem_alloc(fdbufsize, KM_SLEEP);
 813         fdbuf->fd_size = fdbufsize;
 814         fdbuf->fd_numfd = 0;
 815         fdbuf->fd_ebuf = NULL;
 816         fdbuf->fd_ebuflen = 0;
 817         fds = (int *)rights;
 818         for (i = 0; i < numfd; i++) {
 819                 if ((fp = getf(fds[i])) == NULL) {
 820                         fdbuf_free(fdbuf);
 821                         return (EBADF);
 822                 }
 823                 dprint(1, ("fdbuf_create: [%d] = %d, %p refcnt %d\n",
 824                     i, fds[i], (void *)fp, fp->f_count));
 825                 mutex_enter(&fp->f_tlock);
 826                 fp->f_count++;
 827                 mutex_exit(&fp->f_tlock);
 828                 /*
 829                  * The maximum alignment for fdbuf (or any option header
 830                  * and its value) it 4 bytes. On a LP64 kernel, the alignment
 831                  * is not sufficient for pointers (fd_fds in this case). Since
 832                  * we just did a kmem_alloc (we get a double word alignment),
 833                  * we don't need to do anything on the send side (we loose
 834                  * the double word alignment because fdbuf goes after an
 835                  * option header (eg T_unitdata_req) which is only 4 byte
 836                  * aligned). We take care of this when we extract the file
 837                  * descriptor in fdbuf_extract or fdbuf_free.
 838                  */
 839                 fdbuf->fd_fds[i] = fp;
 840                 fdbuf->fd_numfd++;
 841                 releasef(fds[i]);
 842                 if (AU_AUDITING())
 843                         audit_fdsend(fds[i], fp, 0);
 844         }
 845         *fdbufp = fdbuf;
 846         return (0);
 847 }
 848 
 849 static int
 850 fdbuf_optlen(int rightslen)
 851 {
 852         int numfd;
 853 
 854         numfd = rightslen / (int)sizeof (int);
 855 
 856         return ((int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *)));
 857 }
 858 
 859 static t_uscalar_t
 860 fdbuf_cmsglen(int fdbuflen)
 861 {
 862         return (t_uscalar_t)((fdbuflen - FDBUF_HDRSIZE) /
 863             (int)sizeof (struct file *) * (int)sizeof (int));
 864 }
 865 
 866 
 867 /*
 868  * Return non-zero if the mblk and fdbuf are consistent.
 869  */
 870 static int
 871 fdbuf_verify(mblk_t *mp, struct fdbuf *fdbuf, int fdbuflen)
 872 {
 873         if (fdbuflen >= FDBUF_HDRSIZE &&
 874             fdbuflen == fdbuf->fd_size) {
 875                 frtn_t *frp = mp->b_datap->db_frtnp;
 876                 /*
 877                  * Check that the SO_FILEP portion of the
 878                  * message has not been modified by
 879                  * the loopback transport. The sending sockfs generates
 880                  * a message that is esballoc'ed with the free function
 881                  * being fdbuf_free() and where free_arg contains the
 882                  * identical information as the SO_FILEP content.
 883                  *
 884                  * If any of these constraints are not satisfied we
 885                  * silently ignore the option.
 886                  */
 887                 ASSERT(mp);
 888                 if (frp != NULL &&
 889                     frp->free_func == fdbuf_free &&
 890                     frp->free_arg != NULL &&
 891                     bcmp(frp->free_arg, fdbuf, fdbuflen) == 0) {
 892                         dprint(1, ("fdbuf_verify: fdbuf %p len %d\n",
 893                             (void *)fdbuf, fdbuflen));
 894                         return (1);
 895                 } else {
 896                         zcmn_err(getzoneid(), CE_WARN,
 897                             "sockfs: mismatched fdbuf content (%p)",
 898                             (void *)mp);
 899                         return (0);
 900                 }
 901         } else {
 902                 zcmn_err(getzoneid(), CE_WARN,
 903                     "sockfs: mismatched fdbuf len %d, %d\n",
 904                     fdbuflen, fdbuf->fd_size);
 905                 return (0);
 906         }
 907 }
 908 
 909 /*
 910  * When the file descriptors returned by sorecvmsg can not be passed
 911  * to the application this routine will cleanup the references on
 912  * the files. Start at startoff bytes into the buffer.
 913  */
 914 static void
 915 close_fds(void *fdbuf, int fdbuflen, int startoff)
 916 {
 917         int *fds = (int *)fdbuf;
 918         int numfd = fdbuflen / (int)sizeof (int);
 919         int i;
 920 
 921         dprint(1, ("close_fds(%p, %d, %d)\n", fdbuf, fdbuflen, startoff));
 922 
 923         for (i = 0; i < numfd; i++) {
 924                 if (startoff < 0)
 925                         startoff = 0;
 926                 if (startoff < (int)sizeof (int)) {
 927                         /*
 928                          * This file descriptor is partially or fully after
 929                          * the offset
 930                          */
 931                         dprint(0,
 932                             ("close_fds: cleanup[%d] = %d\n", i, fds[i]));
 933                         (void) closeandsetf(fds[i], NULL);
 934                 }
 935                 startoff -= (int)sizeof (int);
 936         }
 937 }
 938 
 939 /*
 940  * Close all file descriptors contained in the control part starting at
 941  * the startoffset.
 942  */
 943 void
 944 so_closefds(void *control, t_uscalar_t controllen, int oldflg,
 945     int startoff)
 946 {
 947         struct cmsghdr *cmsg;
 948 
 949         if (control == NULL)
 950                 return;
 951 
 952         if (oldflg) {
 953                 close_fds(control, controllen, startoff);
 954                 return;
 955         }
 956         /* Scan control part for file descriptors. */
 957         for (cmsg = (struct cmsghdr *)control;
 958             CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
 959             cmsg = CMSG_NEXT(cmsg)) {
 960                 if (cmsg->cmsg_level == SOL_SOCKET &&
 961                     cmsg->cmsg_type == SCM_RIGHTS) {
 962                         close_fds(CMSG_CONTENT(cmsg),
 963                             (int)CMSG_CONTENTLEN(cmsg),
 964                             startoff - (int)sizeof (struct cmsghdr));
 965                 }
 966                 startoff -= cmsg->cmsg_len;
 967         }
 968 }
 969 
 970 /*
 971  * Returns a pointer/length for the file descriptors contained
 972  * in the control buffer. Returns with *fdlenp == -1 if there are no
 973  * file descriptor options present. This is different than there being
 974  * a zero-length file descriptor option.
 975  * Fail if there are multiple SCM_RIGHT cmsgs.
 976  */
 977 int
 978 so_getfdopt(void *control, t_uscalar_t controllen, int oldflg,
 979     void **fdsp, int *fdlenp)
 980 {
 981         struct cmsghdr *cmsg;
 982         void *fds;
 983         int fdlen;
 984 
 985         if (control == NULL) {
 986                 *fdsp = NULL;
 987                 *fdlenp = -1;
 988                 return (0);
 989         }
 990 
 991         if (oldflg) {
 992                 *fdsp = control;
 993                 if (controllen == 0)
 994                         *fdlenp = -1;
 995                 else
 996                         *fdlenp = controllen;
 997                 dprint(1, ("so_getfdopt: old %d\n", *fdlenp));
 998                 return (0);
 999         }
1000 
1001         fds = NULL;
1002         fdlen = 0;
1003 
1004         for (cmsg = (struct cmsghdr *)control;
1005             CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
1006             cmsg = CMSG_NEXT(cmsg)) {
1007                 if (cmsg->cmsg_level == SOL_SOCKET &&
1008                     cmsg->cmsg_type == SCM_RIGHTS) {
1009                         if (fds != NULL)
1010                                 return (EINVAL);
1011                         fds = CMSG_CONTENT(cmsg);
1012                         fdlen = (int)CMSG_CONTENTLEN(cmsg);
1013                         dprint(1, ("so_getfdopt: new %lu\n",
1014                             (size_t)CMSG_CONTENTLEN(cmsg)));
1015                 }
1016         }
1017         if (fds == NULL) {
1018                 dprint(1, ("so_getfdopt: NONE\n"));
1019                 *fdlenp = -1;
1020         } else
1021                 *fdlenp = fdlen;
1022         *fdsp = fds;
1023         return (0);
1024 }
1025 
1026 /*
1027  * Return the length of the options including any file descriptor options.
1028  */
1029 t_uscalar_t
1030 so_optlen(void *control, t_uscalar_t controllen, int oldflg)
1031 {
1032         struct cmsghdr *cmsg;
1033         t_uscalar_t optlen = 0;
1034         t_uscalar_t len;
1035 
1036         if (control == NULL)
1037                 return (0);
1038 
1039         if (oldflg)
1040                 return ((t_uscalar_t)(sizeof (struct T_opthdr) +
1041                     fdbuf_optlen(controllen)));
1042 
1043         for (cmsg = (struct cmsghdr *)control;
1044             CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
1045             cmsg = CMSG_NEXT(cmsg)) {
1046                 if (cmsg->cmsg_level == SOL_SOCKET &&
1047                     cmsg->cmsg_type == SCM_RIGHTS) {
1048                         len = fdbuf_optlen((int)CMSG_CONTENTLEN(cmsg));
1049                 } else {
1050                         len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg);
1051                 }
1052                 optlen += (t_uscalar_t)(_TPI_ALIGN_TOPT(len) +
1053                     sizeof (struct T_opthdr));
1054         }
1055         dprint(1, ("so_optlen: controllen %d, flg %d -> optlen %d\n",
1056             controllen, oldflg, optlen));
1057         return (optlen);
1058 }
1059 
1060 /*
1061  * Copy options from control to the mblk. Skip any file descriptor options.
1062  */
1063 void
1064 so_cmsg2opt(void *control, t_uscalar_t controllen, int oldflg, mblk_t *mp)
1065 {
1066         struct T_opthdr toh;
1067         struct cmsghdr *cmsg;
1068 
1069         if (control == NULL)
1070                 return;
1071 
1072         if (oldflg) {
1073                 /* No real options - caller has handled file descriptors */
1074                 return;
1075         }
1076         for (cmsg = (struct cmsghdr *)control;
1077             CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
1078             cmsg = CMSG_NEXT(cmsg)) {
1079                 /*
1080                  * Note: The caller handles file descriptors prior
1081                  * to calling this function.
1082                  */
1083                 t_uscalar_t len;
1084 
1085                 if (cmsg->cmsg_level == SOL_SOCKET &&
1086                     cmsg->cmsg_type == SCM_RIGHTS)
1087                         continue;
1088 
1089                 len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg);
1090                 toh.level = cmsg->cmsg_level;
1091                 toh.name = cmsg->cmsg_type;
1092                 toh.len = len + (t_uscalar_t)sizeof (struct T_opthdr);
1093                 toh.status = 0;
1094 
1095                 soappendmsg(mp, &toh, sizeof (toh));
1096                 soappendmsg(mp, CMSG_CONTENT(cmsg), len);
1097                 mp->b_wptr += _TPI_ALIGN_TOPT(len) - len;
1098                 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
1099         }
1100 }
1101 
1102 /*
1103  * Return the length of the control message derived from the options.
1104  * Exclude SO_SRCADDR and SO_UNIX_CLOSE options. Include SO_FILEP.
1105  * When oldflg is set only include SO_FILEP.
1106  * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen
1107  * allocates the space that so_opt2cmsg fills. If one changes, the other should
1108  * also be checked for any possible impacts.
1109  */
1110 t_uscalar_t
1111 so_cmsglen(mblk_t *mp, void *opt, t_uscalar_t optlen, int oldflg)
1112 {
1113         t_uscalar_t cmsglen = 0;
1114         struct T_opthdr *tohp;
1115         t_uscalar_t len;
1116         t_uscalar_t last_roundup = 0;
1117 
1118         ASSERT(__TPI_TOPT_ISALIGNED(opt));
1119 
1120         for (tohp = (struct T_opthdr *)opt;
1121             tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
1122             tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
1123                 dprint(1, ("so_cmsglen: level 0x%x, name %d, len %d\n",
1124                     tohp->level, tohp->name, tohp->len));
1125                 if (tohp->level == SOL_SOCKET &&
1126                     (tohp->name == SO_SRCADDR ||
1127                     tohp->name == SO_UNIX_CLOSE)) {
1128                         continue;
1129                 }
1130                 if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) {
1131                         struct fdbuf *fdbuf;
1132                         int fdbuflen;
1133 
1134                         fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp);
1135                         fdbuflen = (int)_TPI_TOPT_DATALEN(tohp);
1136 
1137                         if (!fdbuf_verify(mp, fdbuf, fdbuflen))
1138                                 continue;
1139                         if (oldflg) {
1140                                 cmsglen += fdbuf_cmsglen(fdbuflen);
1141                                 continue;
1142                         }
1143                         len = fdbuf_cmsglen(fdbuflen);
1144                 } else if (tohp->level == SOL_SOCKET &&
1145                     tohp->name == SCM_TIMESTAMP) {
1146                         if (oldflg)
1147                                 continue;
1148 
1149                         if (get_udatamodel() == DATAMODEL_NATIVE) {
1150                                 len = sizeof (struct timeval);
1151                         } else {
1152                                 len = sizeof (struct timeval32);
1153                         }
1154                 } else {
1155                         if (oldflg)
1156                                 continue;
1157                         len = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp);
1158                 }
1159                 /*
1160                  * Exclude roundup for last option to not set
1161                  * MSG_CTRUNC when the cmsg fits but the padding doesn't fit.
1162                  */
1163                 last_roundup = (t_uscalar_t)
1164                     (ROUNDUP_cmsglen(len + (int)sizeof (struct cmsghdr)) -
1165                     (len + (int)sizeof (struct cmsghdr)));
1166                 cmsglen += (t_uscalar_t)(len + (int)sizeof (struct cmsghdr)) +
1167                     last_roundup;
1168         }
1169         cmsglen -= last_roundup;
1170         dprint(1, ("so_cmsglen: optlen %d, flg %d -> cmsglen %d\n",
1171             optlen, oldflg, cmsglen));
1172         return (cmsglen);
1173 }
1174 
/*
 * Copy options from options to the control. Convert SO_FILEP to
 * file descriptors.
 * Returns errno or zero.
 * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen
 * allocates the space that so_opt2cmsg fills. If one changes, the other should
 * also be checked for any possible impacts.
 *
 * mp           message the options were taken from; only passed on to
 *              fdbuf_verify()
 * opt/optlen   buffer of T_opthdr-formatted options and its length in bytes
 * oldflg       when set only SO_FILEP is converted, and the descriptors are
 *              written at the start of control without a cmsghdr
 * control/controllen
 *              destination buffer and its length in bytes
 *
 * Returns EPROTO when an SO_FILEP option fails fdbuf_verify(), a non-zero
 * value from fdbuf_extract() when that fails, and 0 otherwise.
 */
int
so_opt2cmsg(mblk_t *mp, void *opt, t_uscalar_t optlen, int oldflg,
    void *control, t_uscalar_t controllen)
{
        struct T_opthdr *tohp;
        struct cmsghdr *cmsg;
        struct fdbuf *fdbuf;
        int fdbuflen;
        int error;
        /*
         * cend is only declared for DEBUG and lint builds. It is referenced
         * solely by the dprint() and ASSERT() at the end of the function;
         * presumably both compile away in other builds - TODO confirm.
         */
#if defined(DEBUG) || defined(__lint)
        struct cmsghdr *cend = (struct cmsghdr *)
            (((uint8_t *)control) + ROUNDUP_cmsglen(controllen));
#endif
        /* cmsg always points at the next cmsghdr to be filled in */
        cmsg = (struct cmsghdr *)control;

        ASSERT(__TPI_TOPT_ISALIGNED(opt));

        for (tohp = (struct T_opthdr *)opt;
            tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
            tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
                dprint(1, ("so_opt2cmsg: level 0x%x, name %d, len %d\n",
                    tohp->level, tohp->name, tohp->len));

                /* These two options never appear in the control data */
                if (tohp->level == SOL_SOCKET &&
                    (tohp->name == SO_SRCADDR ||
                    tohp->name == SO_UNIX_CLOSE)) {
                        continue;
                }
                ASSERT((uintptr_t)cmsg <= (uintptr_t)control + controllen);
                if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) {
                        fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp);
                        fdbuflen = (int)_TPI_TOPT_DATALEN(tohp);

                        /*
                         * Unlike so_cmsglen, which skips an fdbuf that
                         * fails verification, the conversion is aborted.
                         */
                        if (!fdbuf_verify(mp, fdbuf, fdbuflen))
                                return (EPROTO);
                        if (oldflg) {
                                /*
                                 * The descriptors go straight to the start
                                 * of control; no cmsghdr is written and
                                 * cmsg is not advanced.
                                 */
                                error = fdbuf_extract(fdbuf, control,
                                    (int)controllen);
                                if (error != 0)
                                        return (error);
                                continue;
                        } else {
                                int fdlen;

                                /* Bytes of int descriptors for this fdbuf */
                                fdlen = (int)fdbuf_cmsglen(
                                    (int)_TPI_TOPT_DATALEN(tohp));

                                cmsg->cmsg_level = tohp->level;
                                cmsg->cmsg_type = SCM_RIGHTS;
                                cmsg->cmsg_len = (socklen_t)(fdlen +
                                    sizeof (struct cmsghdr));

                                error = fdbuf_extract(fdbuf,
                                    CMSG_CONTENT(cmsg), fdlen);
                                if (error != 0)
                                        return (error);
                        }
                } else if (tohp->level == SOL_SOCKET &&
                    tohp->name == SCM_TIMESTAMP) {
                        timestruc_t *timestamp;

                        if (oldflg)
                                continue;

                        cmsg->cmsg_level = tohp->level;
                        cmsg->cmsg_type = tohp->name;

                        /*
                         * The timestruc_t is read from the first
                         * sizeof (intptr_t) aligned address at or after
                         * the end of the option header.
                         */
                        timestamp =
                            (timestruc_t *)P2ROUNDUP((intptr_t)&tohp[1],
                            sizeof (intptr_t));

                        /*
                         * The nanosecond timestamp is delivered as a
                         * microsecond timeval sized for the data model
                         * reported by get_udatamodel().
                         */
                        if (get_udatamodel() == DATAMODEL_NATIVE) {
                                struct timeval tv;

                                cmsg->cmsg_len = sizeof (struct timeval) +
                                    sizeof (struct cmsghdr);
                                tv.tv_sec = timestamp->tv_sec;
                                tv.tv_usec = timestamp->tv_nsec /
                                    (NANOSEC / MICROSEC);
                                /*
                                 * on LP64 systems, the struct timeval in
                                 * the destination will not be 8-byte aligned,
                                 * so use bcopy to avoid alignment trouble
                                 */
                                bcopy(&tv, CMSG_CONTENT(cmsg), sizeof (tv));
                        } else {
                                struct timeval32 *time32;

                                cmsg->cmsg_len = sizeof (struct timeval32) +
                                    sizeof (struct cmsghdr);
                                time32 = (struct timeval32 *)CMSG_CONTENT(cmsg);
                                time32->tv_sec = (time32_t)timestamp->tv_sec;
                                time32->tv_usec =
                                    (int32_t)(timestamp->tv_nsec /
                                    (NANOSEC / MICROSEC));
                        }

                } else {
                        if (oldflg)
                                continue;

                        /* Any other option is copied through unchanged */
                        cmsg->cmsg_level = tohp->level;
                        cmsg->cmsg_type = tohp->name;
                        cmsg->cmsg_len = (socklen_t)(_TPI_TOPT_DATALEN(tohp) +
                            sizeof (struct cmsghdr));

                        /* copy content to control data part */
                        bcopy(&tohp[1], CMSG_CONTENT(cmsg),
                            CMSG_CONTENTLEN(cmsg));
                }
                /* move to next CMSG structure! */
                cmsg = CMSG_NEXT(cmsg);
        }
        dprint(1, ("so_opt2cmsg: buf %p len %d; cend %p; final cmsg %p\n",
            control, controllen, (void *)cend, (void *)cmsg));
        /* The fill position may not pass the rounded-up end of control */
        ASSERT(cmsg <= cend);
        return (0);
}
1301 
1302 /*
1303  * Extract the SO_SRCADDR option value if present.
1304  */
1305 void
1306 so_getopt_srcaddr(void *opt, t_uscalar_t optlen, void **srcp,
1307     t_uscalar_t *srclenp)
1308 {
1309         struct T_opthdr         *tohp;
1310 
1311         ASSERT(__TPI_TOPT_ISALIGNED(opt));
1312 
1313         ASSERT(srcp != NULL && srclenp != NULL);
1314         *srcp = NULL;
1315         *srclenp = 0;
1316 
1317         for (tohp = (struct T_opthdr *)opt;
1318             tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
1319             tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
1320                 dprint(1, ("so_getopt_srcaddr: level 0x%x, name %d, len %d\n",
1321                     tohp->level, tohp->name, tohp->len));
1322                 if (tohp->level == SOL_SOCKET &&
1323                     tohp->name == SO_SRCADDR) {
1324                         *srcp = _TPI_TOPT_DATA(tohp);
1325                         *srclenp = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp);
1326                 }
1327         }
1328 }
1329 
1330 /*
1331  * Verify if the SO_UNIX_CLOSE option is present.
1332  */
1333 int
1334 so_getopt_unix_close(void *opt, t_uscalar_t optlen)
1335 {
1336         struct T_opthdr         *tohp;
1337 
1338         ASSERT(__TPI_TOPT_ISALIGNED(opt));
1339 
1340         for (tohp = (struct T_opthdr *)opt;
1341             tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
1342             tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
1343                 dprint(1,
1344                     ("so_getopt_unix_close: level 0x%x, name %d, len %d\n",
1345                     tohp->level, tohp->name, tohp->len));
1346                 if (tohp->level == SOL_SOCKET &&
1347                     tohp->name == SO_UNIX_CLOSE)
1348                         return (1);
1349         }
1350         return (0);
1351 }
1352 
1353 /*
1354  * Allocate an M_PROTO message.
1355  *
1356  * If allocation fails the behavior depends on sleepflg:
1357  *      _ALLOC_NOSLEEP  fail immediately
1358  *      _ALLOC_INTR     sleep for memory until a signal is caught
1359  *      _ALLOC_SLEEP    sleep forever. Don't return NULL.
1360  */
1361 mblk_t *
1362 soallocproto(size_t size, int sleepflg, cred_t *cr)
1363 {
1364         mblk_t  *mp;
1365 
1366         /* Round up size for reuse */
1367         size = MAX(size, 64);
1368         if (cr != NULL)
1369                 mp = allocb_cred(size, cr, curproc->p_pid);
1370         else
1371                 mp = allocb(size, BPRI_MED);
1372 
1373         if (mp == NULL) {
1374                 int error;      /* Dummy - error not returned to caller */
1375 
1376                 switch (sleepflg) {
1377                 case _ALLOC_SLEEP:
1378                         if (cr != NULL) {
1379                                 mp = allocb_cred_wait(size, STR_NOSIG, &error,
1380                                     cr, curproc->p_pid);
1381                         } else {
1382                                 mp = allocb_wait(size, BPRI_MED, STR_NOSIG,
1383                                     &error);
1384                         }
1385                         ASSERT(mp);
1386                         break;
1387                 case _ALLOC_INTR:
1388                         if (cr != NULL) {
1389                                 mp = allocb_cred_wait(size, 0, &error, cr,
1390                                     curproc->p_pid);
1391                         } else {
1392                                 mp = allocb_wait(size, BPRI_MED, 0, &error);
1393                         }
1394                         if (mp == NULL) {
1395                                 /* Caught signal while sleeping for memory */
1396                                 eprintline(ENOBUFS);
1397                                 return (NULL);
1398                         }
1399                         break;
1400                 case _ALLOC_NOSLEEP:
1401                 default:
1402                         eprintline(ENOBUFS);
1403                         return (NULL);
1404                 }
1405         }
1406         DB_TYPE(mp) = M_PROTO;
1407         return (mp);
1408 }
1409 
1410 /*
1411  * Allocate an M_PROTO message with a single component.
1412  * len is the length of buf. size is the amount to allocate.
1413  *
1414  * buf can be NULL with a non-zero len.
1415  * This results in a bzero'ed chunk being placed the message.
1416  */
1417 mblk_t *
1418 soallocproto1(const void *buf, ssize_t len, ssize_t size, int sleepflg,
1419     cred_t *cr)
1420 {
1421         mblk_t  *mp;
1422 
1423         if (size == 0)
1424                 size = len;
1425 
1426         ASSERT(size >= len);
1427         /* Round up size for reuse */
1428         size = MAX(size, 64);
1429         mp = soallocproto(size, sleepflg, cr);
1430         if (mp == NULL)
1431                 return (NULL);
1432         mp->b_datap->db_type = M_PROTO;
1433         if (len != 0) {
1434                 if (buf != NULL)
1435                         bcopy(buf, mp->b_wptr, len);
1436                 else
1437                         bzero(mp->b_wptr, len);
1438                 mp->b_wptr += len;
1439         }
1440         return (mp);
1441 }
1442 
1443 /*
1444  * Append buf/len to mp.
1445  * The caller has to ensure that there is enough room in the mblk.
1446  *
1447  * buf can be NULL with a non-zero len.
1448  * This results in a bzero'ed chunk being placed the message.
1449  */
1450 void
1451 soappendmsg(mblk_t *mp, const void *buf, ssize_t len)
1452 {
1453         ASSERT(mp);
1454 
1455         if (len != 0) {
1456                 /* Assert for room left */
1457                 ASSERT(mp->b_datap->db_lim - mp->b_wptr >= len);
1458                 if (buf != NULL)
1459                         bcopy(buf, mp->b_wptr, len);
1460                 else
1461                         bzero(mp->b_wptr, len);
1462         }
1463         mp->b_wptr += len;
1464 }
1465 
1466 /*
1467  * Create a message using two kernel buffers.
1468  * If size is set that will determine the allocation size (e.g. for future
1469  * soappendmsg calls). If size is zero it is derived from the buffer
1470  * lengths.
1471  */
1472 mblk_t *
1473 soallocproto2(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2,
1474     ssize_t size, int sleepflg, cred_t *cr)
1475 {
1476         mblk_t *mp;
1477 
1478         if (size == 0)
1479                 size = len1 + len2;
1480         ASSERT(size >= len1 + len2);
1481 
1482         mp = soallocproto1(buf1, len1, size, sleepflg, cr);
1483         if (mp)
1484                 soappendmsg(mp, buf2, len2);
1485         return (mp);
1486 }
1487 
1488 /*
1489  * Create a message using three kernel buffers.
1490  * If size is set that will determine the allocation size (for future
1491  * soappendmsg calls). If size is zero it is derived from the buffer
1492  * lengths.
1493  */
1494 mblk_t *
1495 soallocproto3(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2,
1496     const void *buf3, ssize_t len3, ssize_t size, int sleepflg, cred_t *cr)
1497 {
1498         mblk_t *mp;
1499 
1500         if (size == 0)
1501                 size = len1 + len2 +len3;
1502         ASSERT(size >= len1 + len2 + len3);
1503 
1504         mp = soallocproto1(buf1, len1, size, sleepflg, cr);
1505         if (mp != NULL) {
1506                 soappendmsg(mp, buf2, len2);
1507                 soappendmsg(mp, buf3, len3);
1508         }
1509         return (mp);
1510 }
1511 
1512 #ifdef DEBUG
1513 char *
1514 pr_state(uint_t state, uint_t mode)
1515 {
1516         static char buf[1024];
1517 
1518         buf[0] = 0;
1519         if (state & SS_ISCONNECTED)
1520                 (void) strcat(buf, "ISCONNECTED ");
1521         if (state & SS_ISCONNECTING)
1522                 (void) strcat(buf, "ISCONNECTING ");
1523         if (state & SS_ISDISCONNECTING)
1524                 (void) strcat(buf, "ISDISCONNECTING ");
1525         if (state & SS_CANTSENDMORE)
1526                 (void) strcat(buf, "CANTSENDMORE ");
1527 
1528         if (state & SS_CANTRCVMORE)
1529                 (void) strcat(buf, "CANTRCVMORE ");
1530         if (state & SS_ISBOUND)
1531                 (void) strcat(buf, "ISBOUND ");
1532         if (state & SS_NDELAY)
1533                 (void) strcat(buf, "NDELAY ");
1534         if (state & SS_NONBLOCK)
1535                 (void) strcat(buf, "NONBLOCK ");
1536 
1537         if (state & SS_ASYNC)
1538                 (void) strcat(buf, "ASYNC ");
1539         if (state & SS_ACCEPTCONN)
1540                 (void) strcat(buf, "ACCEPTCONN ");
1541         if (state & SS_SAVEDEOR)
1542                 (void) strcat(buf, "SAVEDEOR ");
1543 
1544         if (state & SS_RCVATMARK)
1545                 (void) strcat(buf, "RCVATMARK ");
1546         if (state & SS_OOBPEND)
1547                 (void) strcat(buf, "OOBPEND ");
1548         if (state & SS_HAVEOOBDATA)
1549                 (void) strcat(buf, "HAVEOOBDATA ");
1550         if (state & SS_HADOOBDATA)
1551                 (void) strcat(buf, "HADOOBDATA ");
1552 
1553         if (mode & SM_PRIV)
1554                 (void) strcat(buf, "PRIV ");
1555         if (mode & SM_ATOMIC)
1556                 (void) strcat(buf, "ATOMIC ");
1557         if (mode & SM_ADDR)
1558                 (void) strcat(buf, "ADDR ");
1559         if (mode & SM_CONNREQUIRED)
1560                 (void) strcat(buf, "CONNREQUIRED ");
1561 
1562         if (mode & SM_FDPASSING)
1563                 (void) strcat(buf, "FDPASSING ");
1564         if (mode & SM_EXDATA)
1565                 (void) strcat(buf, "EXDATA ");
1566         if (mode & SM_OPTDATA)
1567                 (void) strcat(buf, "OPTDATA ");
1568         if (mode & SM_BYTESTREAM)
1569                 (void) strcat(buf, "BYTESTREAM ");
1570         return (buf);
1571 }
1572 
1573 char *
1574 pr_addr(int family, struct sockaddr *addr, t_uscalar_t addrlen)
1575 {
1576         static char buf[1024];
1577 
1578         if (addr == NULL || addrlen == 0) {
1579                 (void) sprintf(buf, "(len %d) %p", addrlen, (void *)addr);
1580                 return (buf);
1581         }
1582         switch (family) {
1583         case AF_INET: {
1584                 struct sockaddr_in sin;
1585 
1586                 bcopy(addr, &sin, sizeof (sin));
1587 
1588                 (void) sprintf(buf, "(len %d) %x/%d",
1589                     addrlen, ntohl(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1590                 break;
1591         }
1592         case AF_INET6: {
1593                 struct sockaddr_in6 sin6;
1594                 uint16_t *piece = (uint16_t *)&sin6.sin6_addr;
1595 
1596                 bcopy((char *)addr, (char *)&sin6, sizeof (sin6));
1597                 (void) sprintf(buf, "(len %d) %x:%x:%x:%x:%x:%x:%x:%x/%d",
1598                     addrlen,
1599                     ntohs(piece[0]), ntohs(piece[1]),
1600                     ntohs(piece[2]), ntohs(piece[3]),
1601                     ntohs(piece[4]), ntohs(piece[5]),
1602                     ntohs(piece[6]), ntohs(piece[7]),
1603                     ntohs(sin6.sin6_port));
1604                 break;
1605         }
1606         case AF_UNIX: {
1607                 struct sockaddr_un *soun = (struct sockaddr_un *)addr;
1608 
1609                 (void) sprintf(buf, "(len %d) %s", addrlen,
1610                     (soun == NULL) ? "(none)" : soun->sun_path);
1611                 break;
1612         }
1613         default:
1614                 (void) sprintf(buf, "(unknown af %d)", family);
1615                 break;
1616         }
1617         return (buf);
1618 }
1619 
1620 /* The logical equivalence operator (a if-and-only-if b) */
1621 #define EQUIVALENT(a, b)        (((a) && (b)) || (!(a) && (!(b))))
1622 
1623 /*
1624  * Verify limitations and invariants on oob state.
1625  * Return 1 if OK, otherwise 0 so that it can be used as
1626  *      ASSERT(verify_oobstate(so));
1627  */
1628 int
1629 so_verify_oobstate(struct sonode *so)
1630 {
1631         boolean_t havemark;
1632 
1633         ASSERT(MUTEX_HELD(&so->so_lock));
1634 
1635         /*
1636          * The possible state combinations are:
1637          *      0
1638          *      SS_OOBPEND
1639          *      SS_OOBPEND|SS_HAVEOOBDATA
1640          *      SS_OOBPEND|SS_HADOOBDATA
1641          *      SS_HADOOBDATA
1642          */
1643         switch (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA)) {
1644         case 0:
1645         case SS_OOBPEND:
1646         case SS_OOBPEND|SS_HAVEOOBDATA:
1647         case SS_OOBPEND|SS_HADOOBDATA:
1648         case SS_HADOOBDATA:
1649                 break;
1650         default:
1651                 printf("Bad oob state 1 (%p): state %s\n",
1652                     (void *)so, pr_state(so->so_state, so->so_mode));
1653                 return (0);
1654         }
1655 
1656         /* SS_RCVATMARK should only be set when SS_OOBPEND is set */
1657         if ((so->so_state & (SS_RCVATMARK|SS_OOBPEND)) == SS_RCVATMARK) {
1658                 printf("Bad oob state 2 (%p): state %s\n",
1659                     (void *)so, pr_state(so->so_state, so->so_mode));
1660                 return (0);
1661         }
1662 
1663         /*
1664          * (havemark != 0 or SS_RCVATMARK) iff SS_OOBPEND
1665          * For TPI, the presence of a "mark" is indicated by sti_oobsigcnt.
1666          */
1667         havemark = (SOCK_IS_NONSTR(so)) ? so->so_oobmark > 0 :
1668             SOTOTPI(so)->sti_oobsigcnt > 0;
1669 
1670         if (!EQUIVALENT(havemark || (so->so_state & SS_RCVATMARK),
1671             so->so_state & SS_OOBPEND)) {
1672                 printf("Bad oob state 3 (%p): state %s\n",
1673                     (void *)so, pr_state(so->so_state, so->so_mode));
1674                 return (0);
1675         }
1676 
1677         /*
1678          * Unless SO_OOBINLINE we have so_oobmsg != NULL iff SS_HAVEOOBDATA
1679          */
1680         if (!(so->so_options & SO_OOBINLINE) &&
1681             !EQUIVALENT(so->so_oobmsg != NULL, so->so_state & SS_HAVEOOBDATA)) {
1682                 printf("Bad oob state 4 (%p): state %s\n",
1683                     (void *)so, pr_state(so->so_state, so->so_mode));
1684                 return (0);
1685         }
1686 
1687         if (!SOCK_IS_NONSTR(so) &&
1688             SOTOTPI(so)->sti_oobsigcnt < SOTOTPI(so)->sti_oobcnt) {
1689                 printf("Bad oob state 5 (%p): counts %d/%d state %s\n",
1690                     (void *)so, SOTOTPI(so)->sti_oobsigcnt,
1691                     SOTOTPI(so)->sti_oobcnt,
1692                     pr_state(so->so_state, so->so_mode));
1693                 return (0);
1694         }
1695 
1696         return (1);
1697 }
1698 #undef  EQUIVALENT
1699 #endif /* DEBUG */
1700 
1701 /* initialize sockfs zone specific kstat related items                  */
1702 void *
1703 sock_kstat_init(zoneid_t zoneid)
1704 {
1705         kstat_t *ksp;
1706 
1707         ksp = kstat_create_zone("sockfs", 0, "sock_unix_list", "misc",
1708             KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE|KSTAT_FLAG_VIRTUAL, zoneid);
1709 
1710         if (ksp != NULL) {
1711                 ksp->ks_update = sockfs_update;
1712                 ksp->ks_snapshot = sockfs_snapshot;
1713                 ksp->ks_lock = &socklist.sl_lock;
1714                 ksp->ks_private = (void *)(uintptr_t)zoneid;
1715                 kstat_install(ksp);
1716         }
1717 
1718         return (ksp);
1719 }
1720 
1721 /* tear down sockfs zone specific kstat related items                   */
1722 /*ARGSUSED*/
1723 void
1724 sock_kstat_fini(zoneid_t zoneid, void *arg)
1725 {
1726         kstat_t *ksp = (kstat_t *)arg;
1727 
1728         if (ksp != NULL) {
1729                 ASSERT(zoneid == (zoneid_t)(uintptr_t)ksp->ks_private);
1730                 kstat_delete(ksp);
1731         }
1732 }
1733 
1734 /*
1735  * Zones:
1736  * Note that nactive is going to be different for each zone.
1737  * This means we require kstat to call sockfs_update and then sockfs_snapshot
1738  * for the same zone, or sockfs_snapshot will be taken into the wrong size
1739  * buffer. This is safe, but if the buffer is too small, user will not be
1740  * given details of all sockets. However, as this kstat has a ks_lock, kstat
1741  * driver will keep it locked between the update and the snapshot, so no
1742  * other process (zone) can currently get inbetween resulting in a wrong size
1743  * buffer allocation.
1744  */
1745 static int
1746 sockfs_update(kstat_t *ksp, int rw)
1747 {
1748         uint_t  n, nactive = 0;         /* # of active AF_UNIX sockets  */
1749         uint_t  tsze;
1750         struct sonode   *so;            /* current sonode on socklist   */
1751         zoneid_t        myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private;
1752 
1753         tsze = 0;
1754 
1755         ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid());
1756 
1757         if (rw == KSTAT_WRITE) {        /* bounce all writes            */
1758                 return (EACCES);
1759         }
1760 
1761         for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) {
1762                 if (so->so_count != 0 && so->so_zoneid == myzoneid) {
1763 
1764                         nactive++;
1765 
1766                         mutex_enter(&so->so_pid_tree_lock);
1767                         n = avl_numnodes(&so->so_pid_tree);
1768                         mutex_exit(&so->so_pid_tree_lock);
1769 
1770                         tsze += sizeof (struct sockinfo);
1771                         tsze += (n > 1) ? ((n - 1) * sizeof (pid_t)) : 0;
1772                 }
1773         }
1774         ksp->ks_ndata = nactive;
1775         ksp->ks_data_size = tsze;
1776 
1777         return (0);
1778 }
1779 
1780 static int
1781 sockfs_snapshot(kstat_t *ksp, void *buf, int rw)
1782 {
1783         int                     ns;     /* # of sonodes we've copied    */
1784         struct sonode           *so;    /* current sonode on socklist   */
1785         struct sockinfo         *psi;   /* where we put sockinfo data   */
1786         t_uscalar_t             sn_len; /* soa_len                      */
1787         zoneid_t                myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private;
1788         sotpi_info_t            *sti;
1789 
1790         uint_t                          sze;
1791         mblk_t                          *mblk;
1792         conn_pid_info_t                 *cpi;
1793 
1794         ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid());
1795 
1796         ksp->ks_snaptime = gethrtime();
1797 
1798         if (rw == KSTAT_WRITE) {        /* bounce all writes            */
1799                 return (EACCES);
1800         }
1801 
1802         /*
1803          * for each sonode on the socklist, we massage the important
1804          * info into buf, in k_sockinfo format.
1805          */
1806         psi = (struct sockinfo *)buf;
1807         ns = 0;
1808         for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) {
1809                 /* only stuff active sonodes and the same zone:         */
1810                 if (so->so_count == 0 || so->so_zoneid != myzoneid) {
1811                         continue;
1812                 }
1813 
1814                 mblk = so_get_sock_pid_mblk((sock_upper_handle_t)so);
1815                 if (mblk == NULL) {
1816                         continue;
1817                 }
1818                 cpi = (conn_pid_info_t *)mblk->b_datap->db_base;
1819                 sze = sizeof (struct sockinfo);
1820                 sze += (cpi->cpi_pids_cnt > 1) ?
1821                     ((cpi->cpi_pids_cnt - 1) * sizeof (pid_t)) : 0;
1822 
1823                 /*
1824                  * If the sonode was activated between the update and the
1825                  * snapshot, we're done - as this is only a snapshot. We need
1826                  * to make sure that we have space for this sockinfo. In the
1827                  * time window between the update and the snapshot, the size of
1828                  * sockinfo may change, as new pids are added/removed to/from
1829                  * the list. We have to take that into consideration and only
1830                  * include the sockinfo if we have enough space. That means the
1831                  * number of entries we return by snapshot might not equal the
1832                  * the number of entries calculated by update.
1833                  */
1834                 if (((caddr_t)(psi) + sze) >
1835                     ((caddr_t)buf + ksp->ks_data_size)) {
1836                         break;
1837                 }
1838 
1839                 sti = SOTOTPI(so);
1840                 /* copy important info into buf:                        */
1841                 psi->si_size = sze;
1842                 psi->si_family = so->so_family;
1843                 psi->si_type = so->so_type;
1844                 psi->si_flag = so->so_flag;
1845                 psi->si_state = so->so_state;
1846                 psi->si_serv_type = sti->sti_serv_type;
1847                 psi->si_ux_laddr_sou_magic =
1848                     sti->sti_ux_laddr.soua_magic;
1849                 psi->si_ux_faddr_sou_magic =
1850                     sti->sti_ux_faddr.soua_magic;
1851                 psi->si_laddr_soa_len = sti->sti_laddr.soa_len;
1852                 psi->si_faddr_soa_len = sti->sti_faddr.soa_len;
1853                 psi->si_szoneid = so->so_zoneid;
1854                 psi->si_faddr_noxlate = sti->sti_faddr_noxlate;
1855 
1856 
1857                 mutex_enter(&so->so_lock);
1858 
1859                 if (sti->sti_laddr_sa != NULL) {
1860                         ASSERT(sti->sti_laddr_sa->sa_data != NULL);
1861                         sn_len = sti->sti_laddr_len;
1862                         ASSERT(sn_len <= sizeof (short) +
1863                             sizeof (psi->si_laddr_sun_path));
1864 
1865                         psi->si_laddr_family =
1866                             sti->sti_laddr_sa->sa_family;
1867                         if (sn_len != 0) {
1868                                 /* AF_UNIX socket names are NULL terminated */
1869                                 (void) strncpy(psi->si_laddr_sun_path,
1870                                     sti->sti_laddr_sa->sa_data,
1871                                     sizeof (psi->si_laddr_sun_path));
1872                                 sn_len = strlen(psi->si_laddr_sun_path);
1873                         }
1874                         psi->si_laddr_sun_path[sn_len] = 0;
1875                 }
1876 
1877                 if (sti->sti_faddr_sa != NULL) {
1878                         ASSERT(sti->sti_faddr_sa->sa_data != NULL);
1879                         sn_len = sti->sti_faddr_len;
1880                         ASSERT(sn_len <= sizeof (short) +
1881                             sizeof (psi->si_faddr_sun_path));
1882 
1883                         psi->si_faddr_family =
1884                             sti->sti_faddr_sa->sa_family;
1885                         if (sn_len != 0) {
1886                                 (void) strncpy(psi->si_faddr_sun_path,
1887                                     sti->sti_faddr_sa->sa_data,
1888                                     sizeof (psi->si_faddr_sun_path));
1889                                 sn_len = strlen(psi->si_faddr_sun_path);
1890                         }
1891                         psi->si_faddr_sun_path[sn_len] = 0;
1892                 }
1893 
1894                 mutex_exit(&so->so_lock);
1895 
1896                 (void) sprintf(psi->si_son_straddr, "%p", (void *)so);
1897                 (void) sprintf(psi->si_lvn_straddr, "%p",
1898                     (void *)sti->sti_ux_laddr.soua_vp);
1899                 (void) sprintf(psi->si_fvn_straddr, "%p",
1900                     (void *)sti->sti_ux_faddr.soua_vp);
1901 
1902                 psi->si_pids[0] = 0;
1903                 if ((psi->si_pn_cnt = cpi->cpi_pids_cnt) > 0) {
1904                         (void) memcpy(psi->si_pids, cpi->cpi_pids,
1905                             psi->si_pn_cnt * sizeof (pid_t));
1906                 }
1907 
1908                 freemsg(mblk);
1909 
1910                 psi = (struct sockinfo *)((caddr_t)psi + psi->si_size);
1911                 ns++;
1912         }
1913 
1914         ksp->ks_ndata = ns;
1915         return (0);
1916 }
1917 
1918 ssize_t
1919 soreadfile(file_t *fp, uchar_t *buf, u_offset_t fileoff, int *err, size_t size)
1920 {
1921         struct uio auio;
1922         struct iovec aiov[MSG_MAXIOVLEN];
1923         register vnode_t *vp;
1924         int ioflag, rwflag;
1925         ssize_t cnt;
1926         int error = 0;
1927         int iovcnt = 0;
1928         short fflag;
1929 
1930         vp = fp->f_vnode;
1931         fflag = fp->f_flag;
1932 
1933         rwflag = 0;
1934         aiov[0].iov_base = (caddr_t)buf;
1935         aiov[0].iov_len = size;
1936         iovcnt = 1;
1937         cnt = (ssize_t)size;
1938         (void) VOP_RWLOCK(vp, rwflag, NULL);
1939 
1940         auio.uio_loffset = fileoff;
1941         auio.uio_iov = aiov;
1942         auio.uio_iovcnt = iovcnt;
1943         auio.uio_resid = cnt;
1944         auio.uio_segflg = UIO_SYSSPACE;
1945         auio.uio_llimit = MAXOFFSET_T;
1946         auio.uio_fmode = fflag;
1947         auio.uio_extflg = UIO_COPY_CACHED;
1948 
1949         ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1950 
1951         /* If read sync is not asked for, filter sync flags */
1952         if ((ioflag & FRSYNC) == 0)
1953                 ioflag &= ~(FSYNC|FDSYNC);
1954         error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
1955         cnt -= auio.uio_resid;
1956 
1957         VOP_RWUNLOCK(vp, rwflag, NULL);
1958 
1959         if (error == EINTR && cnt != 0)
1960                 error = 0;
1961 out:
1962         if (error != 0) {
1963                 *err = error;
1964                 return (0);
1965         } else {
1966                 *err = 0;
1967                 return (cnt);
1968         }
1969 }
1970 
/*
 * Copy 'size' bytes from 'from' to 'to'.
 *
 * When 'fromkernel' is non-zero the source is copied with bcopy() and 0 is
 * returned; otherwise the copy is done with xcopyin() and its return value
 * is passed back to the caller.
 */
int
so_copyin(const void *from, void *to, size_t size, int fromkernel)
{
        if (!fromkernel)
                return (xcopyin(from, to, size));

        bcopy(from, to, size);
        return (0);
}
1980 
/*
 * Copy 'size' bytes from 'from' to 'to'.
 *
 * When 'tokernel' is non-zero the destination is written with bcopy() and 0
 * is returned; otherwise the copy is done with xcopyout() and its return
 * value is passed back to the caller.
 */
int
so_copyout(const void *from, void *to, size_t size, int tokernel)
{
        if (!tokernel)
                return (xcopyout(from, to, size));

        bcopy(from, to, size);
        return (0);
}