1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  25  * Copyright 2015, Joyent, Inc. All rights reserved.
  26  * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
  27  */
  28 
  29 #include <sys/types.h>
  30 #include <sys/t_lock.h>
  31 #include <sys/param.h>
  32 #include <sys/systm.h>
  33 #include <sys/buf.h>
  34 #include <sys/conf.h>
  35 #include <sys/cred.h>
  36 #include <sys/kmem.h>
  37 #include <sys/sysmacros.h>
  38 #include <sys/vfs.h>
  39 #include <sys/vfs_opreg.h>
  40 #include <sys/vnode.h>
  41 #include <sys/debug.h>
  42 #include <sys/errno.h>
  43 #include <sys/time.h>
  44 #include <sys/file.h>
  45 #include <sys/open.h>
  46 #include <sys/user.h>
  47 #include <sys/termios.h>
  48 #include <sys/stream.h>
  49 #include <sys/strsubr.h>
  50 #include <sys/strsun.h>
  51 #include <sys/esunddi.h>
  52 #include <sys/flock.h>
  53 #include <sys/modctl.h>
  54 #include <sys/cmn_err.h>
  55 #include <sys/mkdev.h>
  56 #include <sys/pathname.h>
  57 #include <sys/ddi.h>
  58 #include <sys/stat.h>
  59 #include <sys/fs/snode.h>
  60 #include <sys/fs/dv_node.h>
  61 #include <sys/zone.h>
  62 
  63 #include <sys/socket.h>
  64 #include <sys/socketvar.h>
  65 #include <netinet/in.h>
  66 #include <sys/un.h>
  67 #include <sys/ucred.h>
  68 
  69 #include <sys/tiuser.h>
  70 #define _SUN_TPI_VERSION        2
  71 #include <sys/tihdr.h>
  72 
  73 #include <c2/audit.h>
  74 
  75 #include <fs/sockfs/nl7c.h>
  76 #include <fs/sockfs/sockcommon.h>
  77 #include <fs/sockfs/sockfilter_impl.h>
  78 #include <fs/sockfs/socktpi.h>
  79 #include <fs/sockfs/socktpi_impl.h>
  80 #include <fs/sockfs/sodirect.h>
  81 
  82 /*
  83  * Macros that operate on struct cmsghdr.
  84  * The CMSG_VALID macro does not assume that the last option buffer is padded.
  85  */
  86 #define CMSG_CONTENT(cmsg)      (&((cmsg)[1]))
  87 #define CMSG_CONTENTLEN(cmsg)   ((cmsg)->cmsg_len - sizeof (struct cmsghdr))
  88 #define CMSG_VALID(cmsg, start, end)                                    \
  89         (ISALIGNED_cmsghdr(cmsg) &&                                     \
  90         ((uintptr_t)(cmsg) >= (uintptr_t)(start)) &&                 \
  91         ((uintptr_t)(cmsg) < (uintptr_t)(end)) &&                    \
  92         ((ssize_t)(cmsg)->cmsg_len >= sizeof (struct cmsghdr)) && \
  93         ((uintptr_t)(cmsg) + (cmsg)->cmsg_len <= (uintptr_t)(end)))
  94 #define SO_LOCK_WAKEUP_TIME     3000    /* Wakeup time in milliseconds */
  95 
  96 dev_t sockdev;  /* For fsid in getattr */
  97 int sockfs_defer_nl7c_init = 0;
  98 
  99 struct socklist socklist;
 100 
 101 struct kmem_cache *socket_cache;
 102 
 103 /*
 104  * sockconf_lock protects the socket configuration (socket types and
 105  * socket filters) which is changed via the sockconfig system call.
 106  */
 107 krwlock_t sockconf_lock;
 108 
 109 static int sockfs_update(kstat_t *, int);
 110 static int sockfs_snapshot(kstat_t *, void *, int);
 111 extern smod_info_t *sotpi_smod_create(void);
 112 
 113 extern void sendfile_init();
 114 
 115 extern void nl7c_init(void);
 116 
 117 extern int modrootloaded;
 118 
 119 /*
 120  * Translate from a device pathname (e.g. "/dev/tcp") to a vnode.
 121  * Returns with the vnode held.
 122  */
 123 int
 124 sogetvp(char *devpath, vnode_t **vpp, int uioflag)
 125 {
 126         struct snode *csp;
 127         vnode_t *vp, *dvp;
 128         major_t maj;
 129         int error;
 130 
 131         ASSERT(uioflag == UIO_SYSSPACE || uioflag == UIO_USERSPACE);
 132 
 133         /*
 134          * Lookup the underlying filesystem vnode.
 135          */
 136         error = lookupname(devpath, uioflag, FOLLOW, NULLVPP, &vp);
 137         if (error)
 138                 return (error);
 139 
 140         /* Check that it is the correct vnode */
 141         if (vp->v_type != VCHR) {
 142                 VN_RELE(vp);
 143                 return (ENOTSOCK);
 144         }
 145 
 146         /*
 147          * If devpath went through devfs, the device should already
 148          * be configured. If devpath is a mknod file, however, we
 149          * need to make sure the device is properly configured.
 150          * To do this, we do something similar to spec_open()
 151          * except that we resolve to the minor/leaf level since
 152          * we need to return a vnode.
 153          */
 154         csp = VTOS(VTOS(vp)->s_commonvp);
 155         if (!(csp->s_flag & SDIPSET)) {
 156                 char *pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 157                 error = ddi_dev_pathname(vp->v_rdev, S_IFCHR, pathname);
 158                 if (error == 0)
 159                         error = devfs_lookupname(pathname, NULLVPP, &dvp);
 160                 VN_RELE(vp);
 161                 kmem_free(pathname, MAXPATHLEN);
 162                 if (error != 0)
 163                         return (ENXIO);
 164                 vp = dvp;       /* use the devfs vp */
 165         }
 166 
 167         /* device is configured at this point */
 168         maj = getmajor(vp->v_rdev);
 169         if (!STREAMSTAB(maj)) {
 170                 VN_RELE(vp);
 171                 return (ENOSTR);
 172         }
 173 
 174         *vpp = vp;
 175         return (0);
 176 }
 177 
 178 /*
 179  * Update the accessed, updated, or changed times in an sonode
 180  * with the current time.
 181  *
 182  * Note that both SunOS 4.X and 4.4BSD sockets do not present reasonable
 183  * attributes in a fstat call. (They return the current time and 0 for
 184  * all timestamps, respectively.) We maintain the current timestamps
 185  * here primarily so that should sockmod be popped the resulting
 186  * file descriptor will behave like a stream w.r.t. the timestamps.
 187  */
 188 void
 189 so_update_attrs(struct sonode *so, int flag)
 190 {
 191         time_t now = gethrestime_sec();
 192 
 193         if (SOCK_IS_NONSTR(so))
 194                 return;
 195 
 196         mutex_enter(&so->so_lock);
 197         so->so_flag |= flag;
 198         if (flag & SOACC)
 199                 SOTOTPI(so)->sti_atime = now;
 200         if (flag & SOMOD)
 201                 SOTOTPI(so)->sti_mtime = now;
 202         mutex_exit(&so->so_lock);
 203 }
 204 
 205 extern so_create_func_t sock_comm_create_function;
 206 extern so_destroy_func_t sock_comm_destroy_function;
 207 /*
 208  * Init function called when sockfs is loaded.
 209  */
 210 int
 211 sockinit(int fstype, char *name)
 212 {
 213         static const fs_operation_def_t sock_vfsops_template[] = {
 214                 NULL, NULL
 215         };
 216         int error;
 217         major_t dev;
 218         char *err_str;
 219 
 220         error = vfs_setfsops(fstype, sock_vfsops_template, NULL);
 221         if (error != 0) {
 222                 zcmn_err(GLOBAL_ZONEID, CE_WARN,
 223                     "sockinit: bad vfs ops template");
 224                 return (error);
 225         }
 226 
 227         error = vn_make_ops(name, socket_vnodeops_template,
 228             &socket_vnodeops);
 229         if (error != 0) {
 230                 err_str = "sockinit: bad socket vnode ops template";
 231                 /* vn_make_ops() does not reset socktpi_vnodeops on failure. */
 232                 socket_vnodeops = NULL;
 233                 goto failure;
 234         }
 235 
 236         socket_cache = kmem_cache_create("socket_cache",
 237             sizeof (struct sonode), 0, sonode_constructor,
 238             sonode_destructor, NULL, NULL, NULL, 0);
 239 
 240         rw_init(&sockconf_lock, NULL, RW_DEFAULT, NULL);
 241 
 242         error = socktpi_init();
 243         if (error != 0) {
 244                 err_str = NULL;
 245                 goto failure;
 246         }
 247 
 248         error = sod_init();
 249         if (error != 0) {
 250                 err_str = NULL;
 251                 goto failure;
 252         }
 253 
 254         /*
 255          * Set up the default create and destroy functions
 256          */
 257         sock_comm_create_function = socket_sonode_create;
 258         sock_comm_destroy_function = socket_sonode_destroy;
 259 
 260         /*
 261          * Build initial list mapping socket parameters to vnode.
 262          */
 263         smod_init();
 264         smod_add(sotpi_smod_create());
 265 
 266         sockparams_init();
 267 
 268         /*
 269          * If sockets are needed before init runs /sbin/soconfig
 270          * it is possible to preload the sockparams list here using
 271          * calls like:
 272          *      sockconfig(1,2,3, "/dev/tcp", 0);
 273          */
 274 
 275         /*
 276          * Create a unique dev_t for use in so_fsid.
 277          */
 278 
 279         if ((dev = getudev()) == (major_t)-1)
 280                 dev = 0;
 281         sockdev = makedevice(dev, 0);
 282 
 283         mutex_init(&socklist.sl_lock, NULL, MUTEX_DEFAULT, NULL);
 284         sendfile_init();
 285         if (!modrootloaded) {
 286                 sockfs_defer_nl7c_init = 1;
 287         } else {
 288                 nl7c_init();
 289         }
 290 
 291         /* Initialize socket filters */
 292         sof_init();
 293 
 294         return (0);
 295 
 296 failure:
 297         (void) vfs_freevfsops_by_type(fstype);
 298         if (socket_vnodeops != NULL)
 299                 vn_freevnodeops(socket_vnodeops);
 300         if (err_str != NULL)
 301                 zcmn_err(GLOBAL_ZONEID, CE_WARN, err_str);
 302         return (error);
 303 }
 304 
 305 /*
 306  * Caller must hold the mutex. Used to set SOLOCKED.
 307  */
 308 void
 309 so_lock_single(struct sonode *so)
 310 {
 311         ASSERT(MUTEX_HELD(&so->so_lock));
 312 
 313         while (so->so_flag & (SOLOCKED | SOASYNC_UNBIND)) {
 314                 cv_wait_stop(&so->so_single_cv, &so->so_lock,
 315                     SO_LOCK_WAKEUP_TIME);
 316         }
 317         so->so_flag |= SOLOCKED;
 318 }
 319 
 320 /*
 321  * Caller must hold the mutex and pass in SOLOCKED or SOASYNC_UNBIND.
 322  * Used to clear SOLOCKED or SOASYNC_UNBIND.
 323  */
 324 void
 325 so_unlock_single(struct sonode *so, int flag)
 326 {
 327         ASSERT(MUTEX_HELD(&so->so_lock));
 328         ASSERT(flag & (SOLOCKED|SOASYNC_UNBIND));
 329         ASSERT((flag & ~(SOLOCKED|SOASYNC_UNBIND)) == 0);
 330         ASSERT(so->so_flag & flag);
 331         /*
 332          * Process the T_DISCON_IND on sti_discon_ind_mp.
 333          *
 334          * Call to so_drain_discon_ind will result in so_lock
 335          * being dropped and re-acquired later.
 336          */
 337         if (!SOCK_IS_NONSTR(so)) {
 338                 sotpi_info_t *sti = SOTOTPI(so);
 339 
 340                 if (sti->sti_discon_ind_mp != NULL)
 341                         so_drain_discon_ind(so);
 342         }
 343 
 344         cv_signal(&so->so_single_cv);
 345         so->so_flag &= ~flag;
 346 }
 347 
 348 /*
 349  * Caller must hold the mutex. Used to set SOREADLOCKED.
 350  * If the caller wants nonblocking behavior it should set fmode.
 351  */
 352 int
 353 so_lock_read(struct sonode *so, int fmode)
 354 {
 355         ASSERT(MUTEX_HELD(&so->so_lock));
 356 
 357         while (so->so_flag & SOREADLOCKED) {
 358                 if (fmode & (FNDELAY|FNONBLOCK))
 359                         return (EWOULDBLOCK);
 360                 cv_wait_stop(&so->so_read_cv, &so->so_lock,
 361                     SO_LOCK_WAKEUP_TIME);
 362         }
 363         so->so_flag |= SOREADLOCKED;
 364         return (0);
 365 }
 366 
 367 /*
 368  * Like so_lock_read above but allows signals.
 369  */
 370 int
 371 so_lock_read_intr(struct sonode *so, int fmode)
 372 {
 373         ASSERT(MUTEX_HELD(&so->so_lock));
 374 
 375         while (so->so_flag & SOREADLOCKED) {
 376                 if (fmode & (FNDELAY|FNONBLOCK))
 377                         return (EWOULDBLOCK);
 378                 if (!cv_wait_sig(&so->so_read_cv, &so->so_lock))
 379                         return (EINTR);
 380         }
 381         so->so_flag |= SOREADLOCKED;
 382         return (0);
 383 }
 384 
 385 /*
 386  * Caller must hold the mutex. Used to clear SOREADLOCKED,
 387  * set in so_lock_read() or so_lock_read_intr().
 388  */
 389 void
 390 so_unlock_read(struct sonode *so)
 391 {
 392         ASSERT(MUTEX_HELD(&so->so_lock));
 393         ASSERT(so->so_flag & SOREADLOCKED);
 394 
 395         cv_signal(&so->so_read_cv);
 396         so->so_flag &= ~SOREADLOCKED;
 397 }
 398 
 399 /*
 400  * Verify that the specified offset falls within the mblk and
 401  * that the resulting pointer is aligned.
 402  * Returns NULL if not.
 403  */
 404 void *
 405 sogetoff(mblk_t *mp, t_uscalar_t offset,
 406     t_uscalar_t length, uint_t align_size)
 407 {
 408         uintptr_t ptr1, ptr2;
 409 
 410         ASSERT(mp && mp->b_wptr >= mp->b_rptr);
 411         ptr1 = (uintptr_t)mp->b_rptr + offset;
 412         ptr2 = (uintptr_t)ptr1 + length;
 413         if (ptr1 < (uintptr_t)mp->b_rptr || ptr2 > (uintptr_t)mp->b_wptr) {
 414                 eprintline(0);
 415                 return (NULL);
 416         }
 417         if ((ptr1 & (align_size - 1)) != 0) {
 418                 eprintline(0);
 419                 return (NULL);
 420         }
 421         return ((void *)ptr1);
 422 }
 423 
 424 /*
 425  * Return the AF_UNIX underlying filesystem vnode matching a given name.
 426  * Makes sure the sending and the destination sonodes are compatible.
 427  * The vnode is returned held.
 428  *
 429  * The underlying filesystem VSOCK vnode has a v_stream pointer that
 430  * references the actual stream head (hence indirectly the actual sonode).
 431  */
 432 static int
 433 so_ux_lookup(struct sonode *so, struct sockaddr_un *soun, int checkaccess,
 434     vnode_t **vpp)
 435 {
 436         vnode_t         *vp;    /* Underlying filesystem vnode */
 437         vnode_t         *rvp;   /* real vnode */
 438         vnode_t         *svp;   /* sockfs vnode */
 439         struct sonode   *so2;
 440         int             error;
 441 
 442         dprintso(so, 1, ("so_ux_lookup(%p) name <%s>\n", (void *)so,
 443             soun->sun_path));
 444 
 445         error = lookupname(soun->sun_path, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
 446         if (error) {
 447                 eprintsoline(so, error);
 448                 return (error);
 449         }
 450 
 451         /*
 452          * Traverse lofs mounts get the real vnode
 453          */
 454         if (VOP_REALVP(vp, &rvp, NULL) == 0) {
 455                 VN_HOLD(rvp);           /* hold the real vnode */
 456                 VN_RELE(vp);            /* release hold from lookup */
 457                 vp = rvp;
 458         }
 459 
 460         if (vp->v_type != VSOCK) {
 461                 error = ENOTSOCK;
 462                 eprintsoline(so, error);
 463                 goto done2;
 464         }
 465 
 466         if (checkaccess) {
 467                 /*
 468                  * Check that we have permissions to access the destination
 469                  * vnode. This check is not done in BSD but it is required
 470                  * by X/Open.
 471                  */
 472                 if (error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED(), NULL)) {
 473                         eprintsoline(so, error);
 474                         goto done2;
 475                 }
 476         }
 477 
 478         /*
 479          * Check if the remote socket has been closed.
 480          *
 481          * Synchronize with vn_rele_stream by holding v_lock while traversing
 482          * v_stream->sd_vnode.
 483          */
 484         mutex_enter(&vp->v_lock);
 485         if (vp->v_stream == NULL) {
 486                 mutex_exit(&vp->v_lock);
 487                 if (so->so_type == SOCK_DGRAM)
 488                         error = EDESTADDRREQ;
 489                 else
 490                         error = ECONNREFUSED;
 491 
 492                 eprintsoline(so, error);
 493                 goto done2;
 494         }
 495         ASSERT(vp->v_stream->sd_vnode);
 496         svp = vp->v_stream->sd_vnode;
 497         /*
 498          * holding v_lock on underlying filesystem vnode and acquiring
 499          * it on sockfs vnode. Assumes that no code ever attempts to
 500          * acquire these locks in the reverse order.
 501          */
 502         VN_HOLD(svp);
 503         mutex_exit(&vp->v_lock);
 504 
 505         if (svp->v_type != VSOCK) {
 506                 error = ENOTSOCK;
 507                 eprintsoline(so, error);
 508                 goto done;
 509         }
 510 
 511         so2 = VTOSO(svp);
 512 
 513         if (so->so_type != so2->so_type) {
 514                 error = EPROTOTYPE;
 515                 eprintsoline(so, error);
 516                 goto done;
 517         }
 518 
 519         VN_RELE(svp);
 520         *vpp = vp;
 521         return (0);
 522 
 523 done:
 524         VN_RELE(svp);
 525 done2:
 526         VN_RELE(vp);
 527         return (error);
 528 }
 529 
 530 /*
 531  * Verify peer address for connect and sendto/sendmsg.
 532  * Since sendto/sendmsg would not get synchronous errors from the transport
 533  * provider we have to do these ugly checks in the socket layer to
 534  * preserve compatibility with SunOS 4.X.
 535  */
 536 int
 537 so_addr_verify(struct sonode *so, const struct sockaddr *name,
 538     socklen_t namelen)
 539 {
 540         int             family;
 541 
 542         dprintso(so, 1, ("so_addr_verify(%p, %p, %d)\n",
 543             (void *)so, (void *)name, namelen));
 544 
 545         ASSERT(name != NULL);
 546 
 547         family = so->so_family;
 548         switch (family) {
 549         case AF_INET:
 550                 if (name->sa_family != family) {
 551                         eprintsoline(so, EAFNOSUPPORT);
 552                         return (EAFNOSUPPORT);
 553                 }
 554                 if (namelen != (socklen_t)sizeof (struct sockaddr_in)) {
 555                         eprintsoline(so, EINVAL);
 556                         return (EINVAL);
 557                 }
 558                 break;
 559         case AF_INET6: {
 560 #ifdef DEBUG
 561                 struct sockaddr_in6 *sin6;
 562 #endif /* DEBUG */
 563 
 564                 if (name->sa_family != family) {
 565                         eprintsoline(so, EAFNOSUPPORT);
 566                         return (EAFNOSUPPORT);
 567                 }
 568                 if (namelen != (socklen_t)sizeof (struct sockaddr_in6)) {
 569                         eprintsoline(so, EINVAL);
 570                         return (EINVAL);
 571                 }
 572 #ifdef DEBUG
 573                 /* Verify that apps don't forget to clear sin6_scope_id etc */
 574                 sin6 = (struct sockaddr_in6 *)name;
 575                 if (sin6->sin6_scope_id != 0 &&
 576                     !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
 577                         zcmn_err(getzoneid(), CE_WARN,
 578                             "connect/send* with uninitialized sin6_scope_id "
 579                             "(%d) on socket. Pid = %d\n",
 580                             (int)sin6->sin6_scope_id, (int)curproc->p_pid);
 581                 }
 582 #endif /* DEBUG */
 583                 break;
 584         }
 585         case AF_UNIX:
 586                 if (SOTOTPI(so)->sti_faddr_noxlate) {
 587                         return (0);
 588                 }
 589                 if (namelen < (socklen_t)sizeof (short)) {
 590                         eprintsoline(so, ENOENT);
 591                         return (ENOENT);
 592                 }
 593                 if (name->sa_family != family) {
 594                         eprintsoline(so, EAFNOSUPPORT);
 595                         return (EAFNOSUPPORT);
 596                 }
 597                 /* MAXPATHLEN + soun_family + nul termination */
 598                 if (namelen > (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
 599                         eprintsoline(so, ENAMETOOLONG);
 600                         return (ENAMETOOLONG);
 601                 }
 602 
 603                 break;
 604 
 605         default:
 606                 /*
 607                  * Default is don't do any length or sa_family check
 608                  * to allow non-sockaddr style addresses.
 609                  */
 610                 break;
 611         }
 612 
 613         return (0);
 614 }
 615 
 616 
 617 /*
 618  * Translate an AF_UNIX sockaddr_un to the transport internal name.
 619  * Assumes caller has called so_addr_verify first.  The translated
 620  * (internal form) address is stored in sti->sti_ux_taddr.
 621  */
 622 /*ARGSUSED*/
 623 int
 624 so_ux_addr_xlate(struct sonode *so, struct sockaddr *name,
 625     socklen_t namelen, int checkaccess,
 626     void **addrp, socklen_t *addrlenp)
 627 {
 628         int                     error;
 629         struct sockaddr_un      *soun;
 630         vnode_t                 *vp;
 631         void                    *addr;
 632         socklen_t               addrlen;
 633         sotpi_info_t            *sti = SOTOTPI(so);
 634 
 635         dprintso(so, 1, ("so_ux_addr_xlate(%p, %p, %d, %d)\n",
 636             (void *)so, (void *)name, namelen, checkaccess));
 637 
 638         ASSERT(name != NULL);
 639         ASSERT(so->so_family == AF_UNIX);
 640         ASSERT(!sti->sti_faddr_noxlate);
 641         ASSERT(namelen >= (socklen_t)sizeof (short));
 642         ASSERT(name->sa_family == AF_UNIX);
 643         soun = (struct sockaddr_un *)name;
 644         /*
 645          * Lookup vnode for the specified path name and verify that
 646          * it is a socket.
 647          */
 648         error = so_ux_lookup(so, soun, checkaccess, &vp);
 649         if (error) {
 650                 eprintsoline(so, error);
 651                 return (error);
 652         }
 653         /*
 654          * Use the address of the peer vnode as the address to send
 655          * to. We release the peer vnode here. In case it has been
 656          * closed by the time the T_CONN_REQ or T_UNITDATA_REQ reaches the
 657          * transport the message will get an error or be dropped.
 658          * Note that that soua_vp is never dereferenced; it's just a
 659          * convenient value by which we can identify the peer.
 660          */
 661         sti->sti_ux_taddr.soua_vp = vp;
 662         sti->sti_ux_taddr.soua_magic = SOU_MAGIC_EXPLICIT;
 663         addr = &sti->sti_ux_taddr;
 664         addrlen = (socklen_t)sizeof (sti->sti_ux_taddr);
 665         dprintso(so, 1, ("ux_xlate UNIX: addrlen %d, vp %p\n",
 666             addrlen, (void *)vp));
 667         VN_RELE(vp);
 668         *addrp = addr;
 669         *addrlenp = (socklen_t)addrlen;
 670         return (0);
 671 }
 672 
 673 /*
 674  * Esballoc free function for messages that contain SO_FILEP option.
 675  * Decrement the reference count on the file pointers using closef.
 676  */
 677 void
 678 fdbuf_free(struct fdbuf *fdbuf)
 679 {
 680         int     i;
 681         struct file *fp;
 682 
 683         dprint(1, ("fdbuf_free: %d fds\n", fdbuf->fd_numfd));
 684         for (i = 0; i < fdbuf->fd_numfd; i++) {
 685                 /*
 686                  * We need pointer size alignment for fd_fds. On a LP64
 687                  * kernel, the required alignment is 8 bytes while
 688                  * the option headers and values are only 4 bytes
 689                  * aligned. So its safer to do a bcopy compared to
 690                  * assigning fdbuf->fd_fds[i] to fp.
 691                  */
 692                 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp));
 693                 dprint(1, ("fdbuf_free: [%d] = %p\n", i, (void *)fp));
 694                 (void) closef(fp);
 695         }
 696         if (fdbuf->fd_ebuf != NULL)
 697                 kmem_free(fdbuf->fd_ebuf, fdbuf->fd_ebuflen);
 698         kmem_free(fdbuf, fdbuf->fd_size);
 699 }
 700 
 701 /*
 702  * Allocate an esballoc'ed message for AF_UNIX file descriptor passing.
 703  * Waits if memory is not available.
 704  */
 705 mblk_t *
 706 fdbuf_allocmsg(int size, struct fdbuf *fdbuf)
 707 {
 708         uchar_t *buf;
 709         mblk_t  *mp;
 710 
 711         dprint(1, ("fdbuf_allocmsg: size %d, %d fds\n", size, fdbuf->fd_numfd));
 712         buf = kmem_alloc(size, KM_SLEEP);
 713         fdbuf->fd_ebuf = (caddr_t)buf;
 714         fdbuf->fd_ebuflen = size;
 715         fdbuf->fd_frtn.free_func = fdbuf_free;
 716         fdbuf->fd_frtn.free_arg = (caddr_t)fdbuf;
 717 
 718         mp = esballoc_wait(buf, size, BPRI_MED, &fdbuf->fd_frtn);
 719         mp->b_datap->db_type = M_PROTO;
 720         return (mp);
 721 }
 722 
 723 /*
 724  * Extract file descriptors from a fdbuf.
 725  * Return list in rights/rightslen.
 726  */
 727 /*ARGSUSED*/
 728 static int
 729 fdbuf_extract(struct fdbuf *fdbuf, void *rights, int rightslen)
 730 {
 731         int     i, fd;
 732         int     *rp;
 733         struct file *fp;
 734         int     numfd;
 735 
 736         dprint(1, ("fdbuf_extract: %d fds, len %d\n",
 737             fdbuf->fd_numfd, rightslen));
 738 
 739         numfd = fdbuf->fd_numfd;
 740         ASSERT(rightslen == numfd * (int)sizeof (int));
 741 
 742         /*
 743          * Allocate a file descriptor and increment the f_count.
 744          * The latter is needed since we always call fdbuf_free
 745          * which performs a closef.
 746          */
 747         rp = (int *)rights;
 748         for (i = 0; i < numfd; i++) {
 749                 if ((fd = ufalloc(0)) == -1)
 750                         goto cleanup;
 751                 /*
 752                  * We need pointer size alignment for fd_fds. On a LP64
 753                  * kernel, the required alignment is 8 bytes while
 754                  * the option headers and values are only 4 bytes
 755                  * aligned. So its safer to do a bcopy compared to
 756                  * assigning fdbuf->fd_fds[i] to fp.
 757                  */
 758                 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp));
 759                 mutex_enter(&fp->f_tlock);
 760                 fp->f_count++;
 761                 mutex_exit(&fp->f_tlock);
 762                 setf(fd, fp);
 763                 *rp++ = fd;
 764                 if (AU_AUDITING())
 765                         audit_fdrecv(fd, fp);
 766                 dprint(1, ("fdbuf_extract: [%d] = %d, %p refcnt %d\n",
 767                     i, fd, (void *)fp, fp->f_count));
 768         }
 769         return (0);
 770 
 771 cleanup:
 772         /*
 773          * Undo whatever partial work the loop above has done.
 774          */
 775         {
 776                 int j;
 777 
 778                 rp = (int *)rights;
 779                 for (j = 0; j < i; j++) {
 780                         dprint(0,
 781                             ("fdbuf_extract: cleanup[%d] = %d\n", j, *rp));
 782                         (void) closeandsetf(*rp++, NULL);
 783                 }
 784         }
 785 
 786         return (EMFILE);
 787 }
 788 
 789 /*
 790  * Insert file descriptors into an fdbuf.
 791  * Returns a kmem_alloc'ed fdbuf. The fdbuf should be freed
 792  * by calling fdbuf_free().
 793  */
 794 int
 795 fdbuf_create(void *rights, int rightslen, struct fdbuf **fdbufp)
 796 {
 797         int             numfd, i;
 798         int             *fds;
 799         struct file     *fp;
 800         struct fdbuf    *fdbuf;
 801         int             fdbufsize;
 802 
 803         dprint(1, ("fdbuf_create: len %d\n", rightslen));
 804 
 805         numfd = rightslen / (int)sizeof (int);
 806 
 807         fdbufsize = (int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *));
 808         fdbuf = kmem_alloc(fdbufsize, KM_SLEEP);
 809         fdbuf->fd_size = fdbufsize;
 810         fdbuf->fd_numfd = 0;
 811         fdbuf->fd_ebuf = NULL;
 812         fdbuf->fd_ebuflen = 0;
 813         fds = (int *)rights;
 814         for (i = 0; i < numfd; i++) {
 815                 if ((fp = getf(fds[i])) == NULL) {
 816                         fdbuf_free(fdbuf);
 817                         return (EBADF);
 818                 }
 819                 dprint(1, ("fdbuf_create: [%d] = %d, %p refcnt %d\n",
 820                     i, fds[i], (void *)fp, fp->f_count));
 821                 mutex_enter(&fp->f_tlock);
 822                 fp->f_count++;
 823                 mutex_exit(&fp->f_tlock);
 824                 /*
 825                  * The maximum alignment for fdbuf (or any option header
 826                  * and its value) it 4 bytes. On a LP64 kernel, the alignment
 827                  * is not sufficient for pointers (fd_fds in this case). Since
 828                  * we just did a kmem_alloc (we get a double word alignment),
 829                  * we don't need to do anything on the send side (we loose
 830                  * the double word alignment because fdbuf goes after an
 831                  * option header (eg T_unitdata_req) which is only 4 byte
 832                  * aligned). We take care of this when we extract the file
 833                  * descriptor in fdbuf_extract or fdbuf_free.
 834                  */
 835                 fdbuf->fd_fds[i] = fp;
 836                 fdbuf->fd_numfd++;
 837                 releasef(fds[i]);
 838                 if (AU_AUDITING())
 839                         audit_fdsend(fds[i], fp, 0);
 840         }
 841         *fdbufp = fdbuf;
 842         return (0);
 843 }
 844 
 845 static int
 846 fdbuf_optlen(int rightslen)
 847 {
 848         int numfd;
 849 
 850         numfd = rightslen / (int)sizeof (int);
 851 
 852         return ((int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *)));
 853 }
 854 
 855 static t_uscalar_t
 856 fdbuf_cmsglen(int fdbuflen)
 857 {
 858         return (t_uscalar_t)((fdbuflen - FDBUF_HDRSIZE) /
 859             (int)sizeof (struct file *) * (int)sizeof (int));
 860 }
 861 
 862 
 863 /*
 864  * Return non-zero if the mblk and fdbuf are consistent.
 865  */
 866 static int
 867 fdbuf_verify(mblk_t *mp, struct fdbuf *fdbuf, int fdbuflen)
 868 {
 869         if (fdbuflen >= FDBUF_HDRSIZE &&
 870             fdbuflen == fdbuf->fd_size) {
 871                 frtn_t *frp = mp->b_datap->db_frtnp;
 872                 /*
 873                  * Check that the SO_FILEP portion of the
 874                  * message has not been modified by
 875                  * the loopback transport. The sending sockfs generates
 876                  * a message that is esballoc'ed with the free function
 877                  * being fdbuf_free() and where free_arg contains the
 878                  * identical information as the SO_FILEP content.
 879                  *
 880                  * If any of these constraints are not satisfied we
 881                  * silently ignore the option.
 882                  */
 883                 ASSERT(mp);
 884                 if (frp != NULL &&
 885                     frp->free_func == fdbuf_free &&
 886                     frp->free_arg != NULL &&
 887                     bcmp(frp->free_arg, fdbuf, fdbuflen) == 0) {
 888                         dprint(1, ("fdbuf_verify: fdbuf %p len %d\n",
 889                             (void *)fdbuf, fdbuflen));
 890                         return (1);
 891                 } else {
 892                         zcmn_err(getzoneid(), CE_WARN,
 893                             "sockfs: mismatched fdbuf content (%p)",
 894                             (void *)mp);
 895                         return (0);
 896                 }
 897         } else {
 898                 zcmn_err(getzoneid(), CE_WARN,
 899                     "sockfs: mismatched fdbuf len %d, %d\n",
 900                     fdbuflen, fdbuf->fd_size);
 901                 return (0);
 902         }
 903 }
 904 
 905 /*
 906  * When the file descriptors returned by sorecvmsg can not be passed
 907  * to the application this routine will cleanup the references on
 908  * the files. Start at startoff bytes into the buffer.
 909  */
 910 static void
 911 close_fds(void *fdbuf, int fdbuflen, int startoff)
 912 {
 913         int *fds = (int *)fdbuf;
 914         int numfd = fdbuflen / (int)sizeof (int);
 915         int i;
 916 
 917         dprint(1, ("close_fds(%p, %d, %d)\n", fdbuf, fdbuflen, startoff));
 918 
 919         for (i = 0; i < numfd; i++) {
 920                 if (startoff < 0)
 921                         startoff = 0;
 922                 if (startoff < (int)sizeof (int)) {
 923                         /*
 924                          * This file descriptor is partially or fully after
 925                          * the offset
 926                          */
 927                         dprint(0,
 928                             ("close_fds: cleanup[%d] = %d\n", i, fds[i]));
 929                         (void) closeandsetf(fds[i], NULL);
 930                 }
 931                 startoff -= (int)sizeof (int);
 932         }
 933 }
 934 
 935 /*
 936  * Close all file descriptors contained in the control part starting at
 937  * the startoffset.
 938  */
 939 void
 940 so_closefds(void *control, t_uscalar_t controllen, int oldflg,
 941     int startoff)
 942 {
 943         struct cmsghdr *cmsg;
 944 
 945         if (control == NULL)
 946                 return;
 947 
 948         if (oldflg) {
 949                 close_fds(control, controllen, startoff);
 950                 return;
 951         }
 952         /* Scan control part for file descriptors. */
 953         for (cmsg = (struct cmsghdr *)control;
 954             CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
 955             cmsg = CMSG_NEXT(cmsg)) {
 956                 if (cmsg->cmsg_level == SOL_SOCKET &&
 957                     cmsg->cmsg_type == SCM_RIGHTS) {
 958                         close_fds(CMSG_CONTENT(cmsg),
 959                             (int)CMSG_CONTENTLEN(cmsg),
 960                             startoff - (int)sizeof (struct cmsghdr));
 961                 }
 962                 startoff -= ROUNDUP_cmsglen(cmsg->cmsg_len);
 963         }
 964 }
 965 
 966 /*
 967  * Handle truncation of a cmsg when the receive buffer is not big enough.
 968  * Adjust the cmsg_len header field in the last cmsg that will be included in
 969  * the buffer to reflect the number of bytes included.
 970  */
 971 void
 972 so_truncatecmsg(void *control, t_uscalar_t controllen, uint_t maxlen)
 973 {
 974         struct cmsghdr *cmsg;
 975         uint_t len = 0;
 976 
 977         if (control == NULL)
 978                 return;
 979 
 980         for (cmsg = control;
 981             CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
 982             cmsg = CMSG_NEXT(cmsg)) {
 983 
 984                 len += ROUNDUP_cmsglen(cmsg->cmsg_len);
 985 
 986                 if (len > maxlen) {
 987                         /*
 988                          * This cmsg is the last one that will be included in
 989                          * the truncated buffer.
 990                          */
 991                         socklen_t diff = len - maxlen;
 992 
 993                         if (diff < CMSG_CONTENTLEN(cmsg)) {
 994                                 dprint(1, ("so_truncatecmsg: %d -> %d\n",
 995                                     cmsg->cmsg_len, cmsg->cmsg_len - diff));
 996                                 cmsg->cmsg_len -= diff;
 997                         } else {
 998                                 cmsg->cmsg_len = sizeof (struct cmsghdr);
 999                         }
1000                         break;
1001                 }
1002         }
1003 }
1004 
1005 /*
1006  * Returns a pointer/length for the file descriptors contained
1007  * in the control buffer. Returns with *fdlenp == -1 if there are no
1008  * file descriptor options present. This is different than there being
1009  * a zero-length file descriptor option.
1010  * Fail if there are multiple SCM_RIGHT cmsgs.
1011  */
1012 int
1013 so_getfdopt(void *control, t_uscalar_t controllen, int oldflg,
1014     void **fdsp, int *fdlenp)
1015 {
1016         struct cmsghdr *cmsg;
1017         void *fds;
1018         int fdlen;
1019 
1020         if (control == NULL) {
1021                 *fdsp = NULL;
1022                 *fdlenp = -1;
1023                 return (0);
1024         }
1025 
1026         if (oldflg) {
1027                 *fdsp = control;
1028                 if (controllen == 0)
1029                         *fdlenp = -1;
1030                 else
1031                         *fdlenp = controllen;
1032                 dprint(1, ("so_getfdopt: old %d\n", *fdlenp));
1033                 return (0);
1034         }
1035 
1036         fds = NULL;
1037         fdlen = 0;
1038 
1039         for (cmsg = (struct cmsghdr *)control;
1040             CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
1041             cmsg = CMSG_NEXT(cmsg)) {
1042                 if (cmsg->cmsg_level == SOL_SOCKET &&
1043                     cmsg->cmsg_type == SCM_RIGHTS) {
1044                         if (fds != NULL)
1045                                 return (EINVAL);
1046                         fds = CMSG_CONTENT(cmsg);
1047                         fdlen = (int)CMSG_CONTENTLEN(cmsg);
1048                         dprint(1, ("so_getfdopt: new %lu\n",
1049                             (size_t)CMSG_CONTENTLEN(cmsg)));
1050                 }
1051         }
1052         if (fds == NULL) {
1053                 dprint(1, ("so_getfdopt: NONE\n"));
1054                 *fdlenp = -1;
1055         } else
1056                 *fdlenp = fdlen;
1057         *fdsp = fds;
1058         return (0);
1059 }
1060 
1061 /*
1062  * Return the length of the options including any file descriptor options.
1063  */
1064 t_uscalar_t
1065 so_optlen(void *control, t_uscalar_t controllen, int oldflg)
1066 {
1067         struct cmsghdr *cmsg;
1068         t_uscalar_t optlen = 0;
1069         t_uscalar_t len;
1070 
1071         if (control == NULL)
1072                 return (0);
1073 
1074         if (oldflg)
1075                 return ((t_uscalar_t)(sizeof (struct T_opthdr) +
1076                     fdbuf_optlen(controllen)));
1077 
1078         for (cmsg = (struct cmsghdr *)control;
1079             CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
1080             cmsg = CMSG_NEXT(cmsg)) {
1081                 if (cmsg->cmsg_level == SOL_SOCKET &&
1082                     cmsg->cmsg_type == SCM_RIGHTS) {
1083                         len = fdbuf_optlen((int)CMSG_CONTENTLEN(cmsg));
1084                 } else {
1085                         len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg);
1086                 }
1087                 optlen += (t_uscalar_t)(_TPI_ALIGN_TOPT(len) +
1088                     sizeof (struct T_opthdr));
1089         }
1090         dprint(1, ("so_optlen: controllen %d, flg %d -> optlen %d\n",
1091             controllen, oldflg, optlen));
1092         return (optlen);
1093 }
1094 
1095 /*
1096  * Copy options from control to the mblk. Skip any file descriptor options.
1097  */
1098 void
1099 so_cmsg2opt(void *control, t_uscalar_t controllen, int oldflg, mblk_t *mp)
1100 {
1101         struct T_opthdr toh;
1102         struct cmsghdr *cmsg;
1103 
1104         if (control == NULL)
1105                 return;
1106 
1107         if (oldflg) {
1108                 /* No real options - caller has handled file descriptors */
1109                 return;
1110         }
1111         for (cmsg = (struct cmsghdr *)control;
1112             CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
1113             cmsg = CMSG_NEXT(cmsg)) {
1114                 /*
1115                  * Note: The caller handles file descriptors prior
1116                  * to calling this function.
1117                  */
1118                 t_uscalar_t len;
1119 
1120                 if (cmsg->cmsg_level == SOL_SOCKET &&
1121                     cmsg->cmsg_type == SCM_RIGHTS)
1122                         continue;
1123 
1124                 len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg);
1125                 toh.level = cmsg->cmsg_level;
1126                 toh.name = cmsg->cmsg_type;
1127                 toh.len = len + (t_uscalar_t)sizeof (struct T_opthdr);
1128                 toh.status = 0;
1129 
1130                 soappendmsg(mp, &toh, sizeof (toh));
1131                 soappendmsg(mp, CMSG_CONTENT(cmsg), len);
1132                 mp->b_wptr += _TPI_ALIGN_TOPT(len) - len;
1133                 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
1134         }
1135 }
1136 
1137 /*
1138  * Return the length of the control message derived from the options.
1139  * Exclude SO_SRCADDR and SO_UNIX_CLOSE options. Include SO_FILEP.
1140  * When oldflg is set only include SO_FILEP.
1141  * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen
1142  * allocates the space that so_opt2cmsg fills. If one changes, the other should
1143  * also be checked for any possible impacts.
1144  */
1145 t_uscalar_t
1146 so_cmsglen(mblk_t *mp, void *opt, t_uscalar_t optlen, int oldflg)
1147 {
1148         t_uscalar_t cmsglen = 0;
1149         struct T_opthdr *tohp;
1150         t_uscalar_t len;
1151         t_uscalar_t last_roundup = 0;
1152 
1153         ASSERT(__TPI_TOPT_ISALIGNED(opt));
1154 
1155         for (tohp = (struct T_opthdr *)opt;
1156             tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
1157             tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
1158                 dprint(1, ("so_cmsglen: level 0x%x, name %d, len %d\n",
1159                     tohp->level, tohp->name, tohp->len));
1160                 if (tohp->level == SOL_SOCKET &&
1161                     (tohp->name == SO_SRCADDR ||
1162                     tohp->name == SO_UNIX_CLOSE)) {
1163                         continue;
1164                 }
1165                 if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) {
1166                         struct fdbuf *fdbuf;
1167                         int fdbuflen;
1168 
1169                         fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp);
1170                         fdbuflen = (int)_TPI_TOPT_DATALEN(tohp);
1171 
1172                         if (!fdbuf_verify(mp, fdbuf, fdbuflen))
1173                                 continue;
1174                         if (oldflg) {
1175                                 cmsglen += fdbuf_cmsglen(fdbuflen);
1176                                 continue;
1177                         }
1178                         len = fdbuf_cmsglen(fdbuflen);
1179                 } else if (tohp->level == SOL_SOCKET &&
1180                     tohp->name == SCM_TIMESTAMP) {
1181                         if (oldflg)
1182                                 continue;
1183 
1184                         if (get_udatamodel() == DATAMODEL_NATIVE) {
1185                                 len = sizeof (struct timeval);
1186                         } else {
1187                                 len = sizeof (struct timeval32);
1188                         }
1189                 } else {
1190                         if (oldflg)
1191                                 continue;
1192                         len = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp);
1193                 }
1194                 /*
1195                  * Exclude roundup for last option to not set
1196                  * MSG_CTRUNC when the cmsg fits but the padding doesn't fit.
1197                  */
1198                 last_roundup = (t_uscalar_t)
1199                     (ROUNDUP_cmsglen(len + (int)sizeof (struct cmsghdr)) -
1200                     (len + (int)sizeof (struct cmsghdr)));
1201                 cmsglen += (t_uscalar_t)(len + (int)sizeof (struct cmsghdr)) +
1202                     last_roundup;
1203         }
1204         cmsglen -= last_roundup;
1205         dprint(1, ("so_cmsglen: optlen %d, flg %d -> cmsglen %d\n",
1206             optlen, oldflg, cmsglen));
1207         return (cmsglen);
1208 }
1209 
1210 /*
1211  * Copy options from options to the control. Convert SO_FILEP to
1212  * file descriptors.
1213  * Returns errno or zero.
1214  * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen
1215  * allocates the space that so_opt2cmsg fills. If one changes, the other should
1216  * also be checked for any possible impacts.
1217  */
1218 int
1219 so_opt2cmsg(mblk_t *mp, void *opt, t_uscalar_t optlen, int oldflg,
1220     void *control, t_uscalar_t controllen)
1221 {
1222         struct T_opthdr *tohp;
1223         struct cmsghdr *cmsg;
1224         struct fdbuf *fdbuf;
1225         int fdbuflen;
1226         int error;
1227 #if defined(DEBUG) || defined(__lint)
1228         struct cmsghdr *cend = (struct cmsghdr *)
1229             (((uint8_t *)control) + ROUNDUP_cmsglen(controllen));
1230 #endif
1231         cmsg = (struct cmsghdr *)control;
1232 
1233         ASSERT(__TPI_TOPT_ISALIGNED(opt));
1234 
1235         for (tohp = (struct T_opthdr *)opt;
1236             tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
1237             tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
1238                 dprint(1, ("so_opt2cmsg: level 0x%x, name %d, len %d\n",
1239                     tohp->level, tohp->name, tohp->len));
1240 
1241                 if (tohp->level == SOL_SOCKET &&
1242                     (tohp->name == SO_SRCADDR ||
1243                     tohp->name == SO_UNIX_CLOSE)) {
1244                         continue;
1245                 }
1246                 ASSERT((uintptr_t)cmsg <= (uintptr_t)control + controllen);
1247                 if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) {
1248                         fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp);
1249                         fdbuflen = (int)_TPI_TOPT_DATALEN(tohp);
1250 
1251                         if (!fdbuf_verify(mp, fdbuf, fdbuflen))
1252                                 return (EPROTO);
1253                         if (oldflg) {
1254                                 error = fdbuf_extract(fdbuf, control,
1255                                     (int)controllen);
1256                                 if (error != 0)
1257                                         return (error);
1258                                 continue;
1259                         } else {
1260                                 int fdlen;
1261 
1262                                 fdlen = (int)fdbuf_cmsglen(
1263                                     (int)_TPI_TOPT_DATALEN(tohp));
1264 
1265                                 cmsg->cmsg_level = tohp->level;
1266                                 cmsg->cmsg_type = SCM_RIGHTS;
1267                                 cmsg->cmsg_len = (socklen_t)(fdlen +
1268                                     sizeof (struct cmsghdr));
1269 
1270                                 error = fdbuf_extract(fdbuf,
1271                                     CMSG_CONTENT(cmsg), fdlen);
1272                                 if (error != 0)
1273                                         return (error);
1274                         }
1275                 } else if (tohp->level == SOL_SOCKET &&
1276                     tohp->name == SCM_TIMESTAMP) {
1277                         timestruc_t *timestamp;
1278 
1279                         if (oldflg)
1280                                 continue;
1281 
1282                         cmsg->cmsg_level = tohp->level;
1283                         cmsg->cmsg_type = tohp->name;
1284 
1285                         timestamp =
1286                             (timestruc_t *)P2ROUNDUP((intptr_t)&tohp[1],
1287                             sizeof (intptr_t));
1288 
1289                         if (get_udatamodel() == DATAMODEL_NATIVE) {
1290                                 struct timeval tv;
1291 
1292                                 cmsg->cmsg_len = sizeof (struct timeval) +
1293                                     sizeof (struct cmsghdr);
1294                                 tv.tv_sec = timestamp->tv_sec;
1295                                 tv.tv_usec = timestamp->tv_nsec /
1296                                     (NANOSEC / MICROSEC);
1297                                 /*
1298                                  * on LP64 systems, the struct timeval in
1299                                  * the destination will not be 8-byte aligned,
1300                                  * so use bcopy to avoid alignment trouble
1301                                  */
1302                                 bcopy(&tv, CMSG_CONTENT(cmsg), sizeof (tv));
1303                         } else {
1304                                 struct timeval32 *time32;
1305 
1306                                 cmsg->cmsg_len = sizeof (struct timeval32) +
1307                                     sizeof (struct cmsghdr);
1308                                 time32 = (struct timeval32 *)CMSG_CONTENT(cmsg);
1309                                 time32->tv_sec = (time32_t)timestamp->tv_sec;
1310                                 time32->tv_usec =
1311                                     (int32_t)(timestamp->tv_nsec /
1312                                     (NANOSEC / MICROSEC));
1313                         }
1314 
1315                 } else {
1316                         if (oldflg)
1317                                 continue;
1318 
1319                         cmsg->cmsg_level = tohp->level;
1320                         cmsg->cmsg_type = tohp->name;
1321                         cmsg->cmsg_len = (socklen_t)sizeof (struct cmsghdr);
1322                         if (tohp->level == IPPROTO_IP &&
1323                             (tohp->name == IP_RECVTOS ||
1324                             tohp->name == IP_RECVTTL)) {
1325                                 /*
1326                                  * The data for these is a uint8_t but, in
1327                                  * order to maintain alignment for any
1328                                  * following TPI primitives in the message,
1329                                  * there will be some trailing padding bytes
1330                                  * which are included in the TPI_TOPT_DATALEN.
1331                                  * For these types, we set the cmsg_len
1332                                  * explicitly to the correct value.
1333                                  */
1334                                 cmsg->cmsg_len += (socklen_t)sizeof (uint8_t);
1335                         } else {
1336                                 cmsg->cmsg_len +=
1337                                     (socklen_t)(_TPI_TOPT_DATALEN(tohp));
1338                         }
1339 
1340                         /* copy content to control data part */
1341                         bcopy(&tohp[1], CMSG_CONTENT(cmsg),
1342                             CMSG_CONTENTLEN(cmsg));
1343                 }
1344                 /* move to next CMSG structure! */
1345                 cmsg = CMSG_NEXT(cmsg);
1346         }
1347         dprint(1, ("so_opt2cmsg: buf %p len %d; cend %p; final cmsg %p\n",
1348             control, controllen, (void *)cend, (void *)cmsg));
1349         ASSERT(cmsg <= cend);
1350         return (0);
1351 }
1352 
1353 /*
1354  * Extract the SO_SRCADDR option value if present.
1355  */
1356 void
1357 so_getopt_srcaddr(void *opt, t_uscalar_t optlen, void **srcp,
1358     t_uscalar_t *srclenp)
1359 {
1360         struct T_opthdr         *tohp;
1361 
1362         ASSERT(__TPI_TOPT_ISALIGNED(opt));
1363 
1364         ASSERT(srcp != NULL && srclenp != NULL);
1365         *srcp = NULL;
1366         *srclenp = 0;
1367 
1368         for (tohp = (struct T_opthdr *)opt;
1369             tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
1370             tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
1371                 dprint(1, ("so_getopt_srcaddr: level 0x%x, name %d, len %d\n",
1372                     tohp->level, tohp->name, tohp->len));
1373                 if (tohp->level == SOL_SOCKET &&
1374                     tohp->name == SO_SRCADDR) {
1375                         *srcp = _TPI_TOPT_DATA(tohp);
1376                         *srclenp = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp);
1377                 }
1378         }
1379 }
1380 
1381 /*
1382  * Verify if the SO_UNIX_CLOSE option is present.
1383  */
1384 int
1385 so_getopt_unix_close(void *opt, t_uscalar_t optlen)
1386 {
1387         struct T_opthdr         *tohp;
1388 
1389         ASSERT(__TPI_TOPT_ISALIGNED(opt));
1390 
1391         for (tohp = (struct T_opthdr *)opt;
1392             tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
1393             tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
1394                 dprint(1,
1395                     ("so_getopt_unix_close: level 0x%x, name %d, len %d\n",
1396                     tohp->level, tohp->name, tohp->len));
1397                 if (tohp->level == SOL_SOCKET &&
1398                     tohp->name == SO_UNIX_CLOSE)
1399                         return (1);
1400         }
1401         return (0);
1402 }
1403 
1404 /*
1405  * Allocate an M_PROTO message.
1406  *
1407  * If allocation fails the behavior depends on sleepflg:
1408  *      _ALLOC_NOSLEEP  fail immediately
1409  *      _ALLOC_INTR     sleep for memory until a signal is caught
1410  *      _ALLOC_SLEEP    sleep forever. Don't return NULL.
1411  */
1412 mblk_t *
1413 soallocproto(size_t size, int sleepflg, cred_t *cr)
1414 {
1415         mblk_t  *mp;
1416 
1417         /* Round up size for reuse */
1418         size = MAX(size, 64);
1419         if (cr != NULL)
1420                 mp = allocb_cred(size, cr, curproc->p_pid);
1421         else
1422                 mp = allocb(size, BPRI_MED);
1423 
1424         if (mp == NULL) {
1425                 int error;      /* Dummy - error not returned to caller */
1426 
1427                 switch (sleepflg) {
1428                 case _ALLOC_SLEEP:
1429                         if (cr != NULL) {
1430                                 mp = allocb_cred_wait(size, STR_NOSIG, &error,
1431                                     cr, curproc->p_pid);
1432                         } else {
1433                                 mp = allocb_wait(size, BPRI_MED, STR_NOSIG,
1434                                     &error);
1435                         }
1436                         ASSERT(mp);
1437                         break;
1438                 case _ALLOC_INTR:
1439                         if (cr != NULL) {
1440                                 mp = allocb_cred_wait(size, 0, &error, cr,
1441                                     curproc->p_pid);
1442                         } else {
1443                                 mp = allocb_wait(size, BPRI_MED, 0, &error);
1444                         }
1445                         if (mp == NULL) {
1446                                 /* Caught signal while sleeping for memory */
1447                                 eprintline(ENOBUFS);
1448                                 return (NULL);
1449                         }
1450                         break;
1451                 case _ALLOC_NOSLEEP:
1452                 default:
1453                         eprintline(ENOBUFS);
1454                         return (NULL);
1455                 }
1456         }
1457         DB_TYPE(mp) = M_PROTO;
1458         return (mp);
1459 }
1460 
1461 /*
1462  * Allocate an M_PROTO message with a single component.
1463  * len is the length of buf. size is the amount to allocate.
1464  *
1465  * buf can be NULL with a non-zero len.
1466  * This results in a bzero'ed chunk being placed the message.
1467  */
1468 mblk_t *
1469 soallocproto1(const void *buf, ssize_t len, ssize_t size, int sleepflg,
1470     cred_t *cr)
1471 {
1472         mblk_t  *mp;
1473 
1474         if (size == 0)
1475                 size = len;
1476 
1477         ASSERT(size >= len);
1478         /* Round up size for reuse */
1479         size = MAX(size, 64);
1480         mp = soallocproto(size, sleepflg, cr);
1481         if (mp == NULL)
1482                 return (NULL);
1483         mp->b_datap->db_type = M_PROTO;
1484         if (len != 0) {
1485                 if (buf != NULL)
1486                         bcopy(buf, mp->b_wptr, len);
1487                 else
1488                         bzero(mp->b_wptr, len);
1489                 mp->b_wptr += len;
1490         }
1491         return (mp);
1492 }
1493 
1494 /*
1495  * Append buf/len to mp.
1496  * The caller has to ensure that there is enough room in the mblk.
1497  *
1498  * buf can be NULL with a non-zero len.
1499  * This results in a bzero'ed chunk being placed the message.
1500  */
1501 void
1502 soappendmsg(mblk_t *mp, const void *buf, ssize_t len)
1503 {
1504         ASSERT(mp);
1505 
1506         if (len != 0) {
1507                 /* Assert for room left */
1508                 ASSERT(mp->b_datap->db_lim - mp->b_wptr >= len);
1509                 if (buf != NULL)
1510                         bcopy(buf, mp->b_wptr, len);
1511                 else
1512                         bzero(mp->b_wptr, len);
1513         }
1514         mp->b_wptr += len;
1515 }
1516 
1517 /*
1518  * Create a message using two kernel buffers.
1519  * If size is set that will determine the allocation size (e.g. for future
1520  * soappendmsg calls). If size is zero it is derived from the buffer
1521  * lengths.
1522  */
1523 mblk_t *
1524 soallocproto2(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2,
1525     ssize_t size, int sleepflg, cred_t *cr)
1526 {
1527         mblk_t *mp;
1528 
1529         if (size == 0)
1530                 size = len1 + len2;
1531         ASSERT(size >= len1 + len2);
1532 
1533         mp = soallocproto1(buf1, len1, size, sleepflg, cr);
1534         if (mp)
1535                 soappendmsg(mp, buf2, len2);
1536         return (mp);
1537 }
1538 
1539 /*
1540  * Create a message using three kernel buffers.
1541  * If size is set that will determine the allocation size (for future
1542  * soappendmsg calls). If size is zero it is derived from the buffer
1543  * lengths.
1544  */
1545 mblk_t *
1546 soallocproto3(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2,
1547     const void *buf3, ssize_t len3, ssize_t size, int sleepflg, cred_t *cr)
1548 {
1549         mblk_t *mp;
1550 
1551         if (size == 0)
1552                 size = len1 + len2 +len3;
1553         ASSERT(size >= len1 + len2 + len3);
1554 
1555         mp = soallocproto1(buf1, len1, size, sleepflg, cr);
1556         if (mp != NULL) {
1557                 soappendmsg(mp, buf2, len2);
1558                 soappendmsg(mp, buf3, len3);
1559         }
1560         return (mp);
1561 }
1562 
1563 #ifdef DEBUG
1564 char *
1565 pr_state(uint_t state, uint_t mode)
1566 {
1567         static char buf[1024];
1568 
1569         buf[0] = 0;
1570         if (state & SS_ISCONNECTED)
1571                 (void) strcat(buf, "ISCONNECTED ");
1572         if (state & SS_ISCONNECTING)
1573                 (void) strcat(buf, "ISCONNECTING ");
1574         if (state & SS_ISDISCONNECTING)
1575                 (void) strcat(buf, "ISDISCONNECTING ");
1576         if (state & SS_CANTSENDMORE)
1577                 (void) strcat(buf, "CANTSENDMORE ");
1578 
1579         if (state & SS_CANTRCVMORE)
1580                 (void) strcat(buf, "CANTRCVMORE ");
1581         if (state & SS_ISBOUND)
1582                 (void) strcat(buf, "ISBOUND ");
1583         if (state & SS_NDELAY)
1584                 (void) strcat(buf, "NDELAY ");
1585         if (state & SS_NONBLOCK)
1586                 (void) strcat(buf, "NONBLOCK ");
1587 
1588         if (state & SS_ASYNC)
1589                 (void) strcat(buf, "ASYNC ");
1590         if (state & SS_ACCEPTCONN)
1591                 (void) strcat(buf, "ACCEPTCONN ");
1592         if (state & SS_SAVEDEOR)
1593                 (void) strcat(buf, "SAVEDEOR ");
1594 
1595         if (state & SS_RCVATMARK)
1596                 (void) strcat(buf, "RCVATMARK ");
1597         if (state & SS_OOBPEND)
1598                 (void) strcat(buf, "OOBPEND ");
1599         if (state & SS_HAVEOOBDATA)
1600                 (void) strcat(buf, "HAVEOOBDATA ");
1601         if (state & SS_HADOOBDATA)
1602                 (void) strcat(buf, "HADOOBDATA ");
1603 
1604         if (mode & SM_PRIV)
1605                 (void) strcat(buf, "PRIV ");
1606         if (mode & SM_ATOMIC)
1607                 (void) strcat(buf, "ATOMIC ");
1608         if (mode & SM_ADDR)
1609                 (void) strcat(buf, "ADDR ");
1610         if (mode & SM_CONNREQUIRED)
1611                 (void) strcat(buf, "CONNREQUIRED ");
1612 
1613         if (mode & SM_FDPASSING)
1614                 (void) strcat(buf, "FDPASSING ");
1615         if (mode & SM_EXDATA)
1616                 (void) strcat(buf, "EXDATA ");
1617         if (mode & SM_OPTDATA)
1618                 (void) strcat(buf, "OPTDATA ");
1619         if (mode & SM_BYTESTREAM)
1620                 (void) strcat(buf, "BYTESTREAM ");
1621         return (buf);
1622 }
1623 
1624 char *
1625 pr_addr(int family, struct sockaddr *addr, t_uscalar_t addrlen)
1626 {
1627         static char buf[1024];
1628 
1629         if (addr == NULL || addrlen == 0) {
1630                 (void) sprintf(buf, "(len %d) %p", addrlen, (void *)addr);
1631                 return (buf);
1632         }
1633         switch (family) {
1634         case AF_INET: {
1635                 struct sockaddr_in sin;
1636 
1637                 bcopy(addr, &sin, sizeof (sin));
1638 
1639                 (void) sprintf(buf, "(len %d) %x/%d",
1640                     addrlen, ntohl(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1641                 break;
1642         }
1643         case AF_INET6: {
1644                 struct sockaddr_in6 sin6;
1645                 uint16_t *piece = (uint16_t *)&sin6.sin6_addr;
1646 
1647                 bcopy((char *)addr, (char *)&sin6, sizeof (sin6));
1648                 (void) sprintf(buf, "(len %d) %x:%x:%x:%x:%x:%x:%x:%x/%d",
1649                     addrlen,
1650                     ntohs(piece[0]), ntohs(piece[1]),
1651                     ntohs(piece[2]), ntohs(piece[3]),
1652                     ntohs(piece[4]), ntohs(piece[5]),
1653                     ntohs(piece[6]), ntohs(piece[7]),
1654                     ntohs(sin6.sin6_port));
1655                 break;
1656         }
1657         case AF_UNIX: {
1658                 struct sockaddr_un *soun = (struct sockaddr_un *)addr;
1659 
1660                 (void) sprintf(buf, "(len %d) %s", addrlen,
1661                     (soun == NULL) ? "(none)" : soun->sun_path);
1662                 break;
1663         }
1664         default:
1665                 (void) sprintf(buf, "(unknown af %d)", family);
1666                 break;
1667         }
1668         return (buf);
1669 }
1670 
1671 /* The logical equivalence operator (a if-and-only-if b) */
1672 #define EQUIVALENT(a, b)        (((a) && (b)) || (!(a) && (!(b))))
1673 
1674 /*
1675  * Verify limitations and invariants on oob state.
1676  * Return 1 if OK, otherwise 0 so that it can be used as
1677  *      ASSERT(verify_oobstate(so));
1678  */
1679 int
1680 so_verify_oobstate(struct sonode *so)
1681 {
1682         boolean_t havemark;
1683 
1684         ASSERT(MUTEX_HELD(&so->so_lock));
1685 
1686         /*
1687          * The possible state combinations are:
1688          *      0
1689          *      SS_OOBPEND
1690          *      SS_OOBPEND|SS_HAVEOOBDATA
1691          *      SS_OOBPEND|SS_HADOOBDATA
1692          *      SS_HADOOBDATA
1693          */
1694         switch (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA)) {
1695         case 0:
1696         case SS_OOBPEND:
1697         case SS_OOBPEND|SS_HAVEOOBDATA:
1698         case SS_OOBPEND|SS_HADOOBDATA:
1699         case SS_HADOOBDATA:
1700                 break;
1701         default:
1702                 printf("Bad oob state 1 (%p): state %s\n",
1703                     (void *)so, pr_state(so->so_state, so->so_mode));
1704                 return (0);
1705         }
1706 
1707         /* SS_RCVATMARK should only be set when SS_OOBPEND is set */
1708         if ((so->so_state & (SS_RCVATMARK|SS_OOBPEND)) == SS_RCVATMARK) {
1709                 printf("Bad oob state 2 (%p): state %s\n",
1710                     (void *)so, pr_state(so->so_state, so->so_mode));
1711                 return (0);
1712         }
1713 
1714         /*
1715          * (havemark != 0 or SS_RCVATMARK) iff SS_OOBPEND
1716          * For TPI, the presence of a "mark" is indicated by sti_oobsigcnt.
1717          */
1718         havemark = (SOCK_IS_NONSTR(so)) ? so->so_oobmark > 0 :
1719             SOTOTPI(so)->sti_oobsigcnt > 0;
1720 
1721         if (!EQUIVALENT(havemark || (so->so_state & SS_RCVATMARK),
1722             so->so_state & SS_OOBPEND)) {
1723                 printf("Bad oob state 3 (%p): state %s\n",
1724                     (void *)so, pr_state(so->so_state, so->so_mode));
1725                 return (0);
1726         }
1727 
1728         /*
1729          * Unless SO_OOBINLINE we have so_oobmsg != NULL iff SS_HAVEOOBDATA
1730          */
1731         if (!(so->so_options & SO_OOBINLINE) &&
1732             !EQUIVALENT(so->so_oobmsg != NULL, so->so_state & SS_HAVEOOBDATA)) {
1733                 printf("Bad oob state 4 (%p): state %s\n",
1734                     (void *)so, pr_state(so->so_state, so->so_mode));
1735                 return (0);
1736         }
1737 
1738         if (!SOCK_IS_NONSTR(so) &&
1739             SOTOTPI(so)->sti_oobsigcnt < SOTOTPI(so)->sti_oobcnt) {
1740                 printf("Bad oob state 5 (%p): counts %d/%d state %s\n",
1741                     (void *)so, SOTOTPI(so)->sti_oobsigcnt,
1742                     SOTOTPI(so)->sti_oobcnt,
1743                     pr_state(so->so_state, so->so_mode));
1744                 return (0);
1745         }
1746 
1747         return (1);
1748 }
1749 #undef  EQUIVALENT
1750 #endif /* DEBUG */
1751 
1752 /* initialize sockfs zone specific kstat related items                  */
1753 void *
1754 sock_kstat_init(zoneid_t zoneid)
1755 {
1756         kstat_t *ksp;
1757 
1758         ksp = kstat_create_zone("sockfs", 0, "sock_unix_list", "misc",
1759             KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE|KSTAT_FLAG_VIRTUAL, zoneid);
1760 
1761         if (ksp != NULL) {
1762                 ksp->ks_update = sockfs_update;
1763                 ksp->ks_snapshot = sockfs_snapshot;
1764                 ksp->ks_lock = &socklist.sl_lock;
1765                 ksp->ks_private = (void *)(uintptr_t)zoneid;
1766                 kstat_install(ksp);
1767         }
1768 
1769         return (ksp);
1770 }
1771 
1772 /* tear down sockfs zone specific kstat related items                   */
1773 /*ARGSUSED*/
1774 void
1775 sock_kstat_fini(zoneid_t zoneid, void *arg)
1776 {
1777         kstat_t *ksp = (kstat_t *)arg;
1778 
1779         if (ksp != NULL) {
1780                 ASSERT(zoneid == (zoneid_t)(uintptr_t)ksp->ks_private);
1781                 kstat_delete(ksp);
1782         }
1783 }
1784 
1785 /*
1786  * Zones:
1787  * Note that nactive is going to be different for each zone.
1788  * This means we require kstat to call sockfs_update and then sockfs_snapshot
1789  * for the same zone, or sockfs_snapshot will be taken into the wrong size
1790  * buffer. This is safe, but if the buffer is too small, user will not be
1791  * given details of all sockets. However, as this kstat has a ks_lock, kstat
1792  * driver will keep it locked between the update and the snapshot, so no
1793  * other process (zone) can currently get inbetween resulting in a wrong size
1794  * buffer allocation.
1795  */
1796 static int
1797 sockfs_update(kstat_t *ksp, int rw)
1798 {
1799         uint_t  nactive = 0;            /* # of active AF_UNIX sockets  */
1800         struct sonode   *so;            /* current sonode on socklist   */
1801         zoneid_t        myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private;
1802 
1803         ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid());
1804 
1805         if (rw == KSTAT_WRITE) {        /* bounce all writes            */
1806                 return (EACCES);
1807         }
1808 
1809         for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) {
1810                 if (so->so_count != 0 && so->so_zoneid == myzoneid) {
1811                         nactive++;
1812                 }
1813         }
1814         ksp->ks_ndata = nactive;
1815         ksp->ks_data_size = nactive * sizeof (struct sockinfo);
1816 
1817         return (0);
1818 }
1819 
1820 static int
1821 sockfs_snapshot(kstat_t *ksp, void *buf, int rw)
1822 {
1823         int                     ns;     /* # of sonodes we've copied    */
1824         struct sonode           *so;    /* current sonode on socklist   */
1825         struct sockinfo         *psi;   /* where we put sockinfo data   */
1826         t_uscalar_t             sn_len; /* soa_len                      */
1827         zoneid_t                myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private;
1828         sotpi_info_t            *sti;
1829 
1830         ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid());
1831 
1832         ksp->ks_snaptime = gethrtime();
1833 
1834         if (rw == KSTAT_WRITE) {        /* bounce all writes            */
1835                 return (EACCES);
1836         }
1837 
1838         /*
1839          * For each sonode on the socklist, we massage the important
1840          * info into buf, in sockinfo format.
1841          */
1842         psi = (struct sockinfo *)buf;
1843         ns = 0;
1844         for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) {
1845                 vattr_t attr;
1846 
1847                 /* only stuff active sonodes and the same zone:         */
1848                 if (so->so_count == 0 || so->so_zoneid != myzoneid) {
1849                         continue;
1850                 }
1851 
1852                 /*
1853                  * If the sonode was activated between the update and the
1854                  * snapshot, we're done - as this is only a snapshot.
1855                  */
1856                 if ((caddr_t)(psi) >= (caddr_t)buf + ksp->ks_data_size) {
1857                         break;
1858                 }
1859 
1860                 sti = SOTOTPI(so);
1861                 /* copy important info into buf:                        */
1862                 psi->si_size = sizeof (struct sockinfo);
1863                 psi->si_family = so->so_family;
1864                 psi->si_type = so->so_type;
1865                 psi->si_flag = so->so_flag;
1866                 psi->si_state = so->so_state;
1867                 psi->si_serv_type = sti->sti_serv_type;
1868                 psi->si_ux_laddr_sou_magic = sti->sti_ux_laddr.soua_magic;
1869                 psi->si_ux_faddr_sou_magic = sti->sti_ux_faddr.soua_magic;
1870                 psi->si_laddr_soa_len = sti->sti_laddr.soa_len;
1871                 psi->si_faddr_soa_len = sti->sti_faddr.soa_len;
1872                 psi->si_szoneid = so->so_zoneid;
1873                 psi->si_faddr_noxlate = sti->sti_faddr_noxlate;
1874 
1875                 /*
1876                  * Grab the inode, if possible.
1877                  * This must be done before entering so_lock as VOP_GETATTR
1878                  * will acquire it.
1879                  */
1880                 if (so->so_vnode == NULL ||
1881                     VOP_GETATTR(so->so_vnode, &attr, 0, CRED(), NULL) != 0)
1882                         attr.va_nodeid = 0;
1883 
1884                 psi->si_inode = attr.va_nodeid;
1885 
1886                 mutex_enter(&so->so_lock);
1887 
1888                 if (sti->sti_laddr_sa != NULL) {
1889                         ASSERT(sti->sti_laddr_sa->sa_data != NULL);
1890                         sn_len = sti->sti_laddr_len;
1891                         ASSERT(sn_len <= sizeof (short) +
1892                             sizeof (psi->si_laddr_sun_path));
1893 
1894                         psi->si_laddr_family =
1895                             sti->sti_laddr_sa->sa_family;
1896                         if (sn_len != 0) {
1897                                 /* AF_UNIX socket names are NULL terminated */
1898                                 (void) strncpy(psi->si_laddr_sun_path,
1899                                     sti->sti_laddr_sa->sa_data,
1900                                     sizeof (psi->si_laddr_sun_path));
1901                                 sn_len = strlen(psi->si_laddr_sun_path);
1902                         }
1903                         psi->si_laddr_sun_path[sn_len] = 0;
1904                 }
1905 
1906                 if (sti->sti_faddr_sa != NULL) {
1907                         ASSERT(sti->sti_faddr_sa->sa_data != NULL);
1908                         sn_len = sti->sti_faddr_len;
1909                         ASSERT(sn_len <= sizeof (short) +
1910                             sizeof (psi->si_faddr_sun_path));
1911 
1912                         psi->si_faddr_family =
1913                             sti->sti_faddr_sa->sa_family;
1914                         if (sn_len != 0) {
1915                                 (void) strncpy(psi->si_faddr_sun_path,
1916                                     sti->sti_faddr_sa->sa_data,
1917                                     sizeof (psi->si_faddr_sun_path));
1918                                 sn_len = strlen(psi->si_faddr_sun_path);
1919                         }
1920                         psi->si_faddr_sun_path[sn_len] = 0;
1921                 }
1922 
1923                 mutex_exit(&so->so_lock);
1924 
1925                 (void) snprintf(psi->si_son_straddr,
1926                     sizeof (psi->si_son_straddr), "%p", (void *)so);
1927                 (void) snprintf(psi->si_lvn_straddr,
1928                     sizeof (psi->si_lvn_straddr), "%p",
1929                     (void *)sti->sti_ux_laddr.soua_vp);
1930                 (void) snprintf(psi->si_fvn_straddr,
1931                     sizeof (psi->si_fvn_straddr), "%p",
1932                     (void *)sti->sti_ux_faddr.soua_vp);
1933 
1934                 ns++;
1935                 psi++;
1936         }
1937 
1938         ksp->ks_ndata = ns;
1939         return (0);
1940 }
1941 
1942 ssize_t
1943 soreadfile(file_t *fp, uchar_t *buf, u_offset_t fileoff, int *err, size_t size)
1944 {
1945         struct uio auio;
1946         struct iovec aiov[1];
1947         register vnode_t *vp;
1948         int ioflag, rwflag;
1949         ssize_t cnt;
1950         int error = 0;
1951         int iovcnt = 0;
1952         short fflag;
1953 
1954         vp = fp->f_vnode;
1955         fflag = fp->f_flag;
1956 
1957         rwflag = 0;
1958         aiov[0].iov_base = (caddr_t)buf;
1959         aiov[0].iov_len = size;
1960         iovcnt = 1;
1961         cnt = (ssize_t)size;
1962         (void) VOP_RWLOCK(vp, rwflag, NULL);
1963 
1964         auio.uio_loffset = fileoff;
1965         auio.uio_iov = aiov;
1966         auio.uio_iovcnt = iovcnt;
1967         auio.uio_resid = cnt;
1968         auio.uio_segflg = UIO_SYSSPACE;
1969         auio.uio_llimit = MAXOFFSET_T;
1970         auio.uio_fmode = fflag;
1971         auio.uio_extflg = UIO_COPY_CACHED;
1972 
1973         ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1974 
1975         /* If read sync is not asked for, filter sync flags */
1976         if ((ioflag & FRSYNC) == 0)
1977                 ioflag &= ~(FSYNC|FDSYNC);
1978         error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
1979         cnt -= auio.uio_resid;
1980 
1981         VOP_RWUNLOCK(vp, rwflag, NULL);
1982 
1983         if (error == EINTR && cnt != 0)
1984                 error = 0;
1985 out:
1986         if (error != 0) {
1987                 *err = error;
1988                 return (0);
1989         } else {
1990                 *err = 0;
1991                 return (cnt);
1992         }
1993 }
1994 
1995 int
1996 so_copyin(const void *from, void *to, size_t size, int fromkernel)
1997 {
1998         if (fromkernel) {
1999                 bcopy(from, to, size);
2000                 return (0);
2001         }
2002         return (xcopyin(from, to, size));
2003 }
2004 
2005 int
2006 so_copyout(const void *from, void *to, size_t size, int tokernel)
2007 {
2008         if (tokernel) {
2009                 bcopy(from, to, size);
2010                 return (0);
2011         }
2012         return (xcopyout(from, to, size));
2013 }