1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 #include <sys/types.h>
  27 #include <sys/t_lock.h>
  28 #include <sys/param.h>
  29 #include <sys/systm.h>
  30 #include <sys/buf.h>
  31 #include <sys/conf.h>
  32 #include <sys/cred.h>
  33 #include <sys/kmem.h>
  34 #include <sys/sysmacros.h>
  35 #include <sys/vfs.h>
  36 #include <sys/vfs_opreg.h>
  37 #include <sys/vnode.h>
  38 #include <sys/debug.h>
  39 #include <sys/errno.h>
  40 #include <sys/time.h>
  41 #include <sys/file.h>
  42 #include <sys/open.h>
  43 #include <sys/user.h>
  44 #include <sys/termios.h>
  45 #include <sys/stream.h>
  46 #include <sys/strsubr.h>
  47 #include <sys/strsun.h>
  48 #include <sys/esunddi.h>
  49 #include <sys/flock.h>
  50 #include <sys/modctl.h>
  51 #include <sys/cmn_err.h>
  52 #include <sys/mkdev.h>
  53 #include <sys/pathname.h>
  54 #include <sys/ddi.h>
  55 #include <sys/stat.h>
  56 #include <sys/fs/snode.h>
  57 #include <sys/fs/dv_node.h>
  58 #include <sys/zone.h>
  59 
  60 #include <sys/socket.h>
  61 #include <sys/socketvar.h>
  62 #include <netinet/in.h>
  63 #include <sys/un.h>
  64 #include <sys/ucred.h>
  65 
  66 #include <sys/tiuser.h>
  67 #define _SUN_TPI_VERSION        2
  68 #include <sys/tihdr.h>
  69 
  70 #include <c2/audit.h>
  71 
  72 #include <fs/sockfs/nl7c.h>
  73 #include <fs/sockfs/sockcommon.h>
  74 #include <fs/sockfs/sockfilter_impl.h>
  75 #include <fs/sockfs/socktpi.h>
  76 #include <fs/sockfs/socktpi_impl.h>
  77 #include <fs/sockfs/sodirect.h>
  78 
/*
 * Macros that operate on struct cmsghdr.
 * The CMSG_VALID macro does not assume that the last option buffer is padded.
 */
/* Address of the first byte following the cmsghdr, i.e. where its data begins. */
#define CMSG_CONTENT(cmsg)      (&((cmsg)[1]))
/*
 * Number of data bytes that follow the header. The subtraction is not
 * guarded against cmsg_len being smaller than the header; CMSG_VALID
 * performs that check.
 */
#define CMSG_CONTENTLEN(cmsg)   ((cmsg)->cmsg_len - sizeof (struct cmsghdr))
/*
 * True when cmsg is suitably aligned, its address lies in [start, end),
 * cmsg_len is at least the size of the header, and the header plus its
 * data (cmsg_len bytes in total) ends at or before end.
 */
#define CMSG_VALID(cmsg, start, end)                                    \
        (ISALIGNED_cmsghdr(cmsg) &&                                     \
        ((uintptr_t)(cmsg) >= (uintptr_t)(start)) &&                 \
        ((uintptr_t)(cmsg) < (uintptr_t)(end)) &&                    \
        ((ssize_t)(cmsg)->cmsg_len >= sizeof (struct cmsghdr)) && \
        ((uintptr_t)(cmsg) + (cmsg)->cmsg_len <= (uintptr_t)(end)))
/* Passed to cv_wait_stop() by so_lock_single() and so_lock_read(). */
#define SO_LOCK_WAKEUP_TIME     3000    /* Wakeup time in milliseconds */
  92 
/* Built by sockinit() from a major number obtained with getudev(). */
dev_t sockdev;  /* For fsid in getattr */
/*
 * Set to 1 by sockinit() when modrootloaded is zero; in that case
 * sockinit() does not call nl7c_init() itself.
 */
int sockfs_defer_nl7c_init = 0;

/* The sl_lock mutex of this list is initialized by sockinit(). */
struct socklist socklist;

/* kmem cache of struct sonode objects; created by sockinit(). */
struct kmem_cache *socket_cache;

/*
 * sockconf_lock protects the socket configuration (socket types and
 * socket filters) which is changed via the sockconfig system call.
 * It is initialized by sockinit().
 */
krwlock_t sockconf_lock;

/*
 * NOTE(review): presumably the kstat update/snapshot callbacks; their
 * definitions are not visible from here -- confirm.
 */
static int sockfs_update(kstat_t *, int);
static int sockfs_snapshot(kstat_t *, void *, int);
/* Its result is handed to smod_add() by sockinit(). */
extern smod_info_t *sotpi_smod_create(void);

/* Called once from sockinit(). */
extern void sendfile_init();

/* Called from sockinit() only when modrootloaded is non-zero. */
extern void nl7c_init(void);

extern int modrootloaded;
 115 
/*
 * Size of one address string buffer: two characters per byte of a
 * pointer plus one more (presumably hex digits and the terminating
 * NUL -- TODO confirm against the code that fills ks_straddr).
 */
#define ADRSTRLEN (2 * sizeof (void *) + 1)
/*
 * kernel structure for passing the sockinfo data back up to the user.
 * the strings array allows us to convert AF_UNIX addresses into strings
 * with a common method regardless of which n-bit kernel we're running.
 */
struct k_sockinfo {
        /* The sockinfo record itself. */
        struct sockinfo ks_si;
        /*
         * Three string buffers of ADRSTRLEN bytes each.
         * NOTE(review): which address each slot holds is not visible
         * in this part of the file -- confirm.
         */
        char            ks_straddr[3][ADRSTRLEN];
};
 126 
 127 /*
 128  * Translate from a device pathname (e.g. "/dev/tcp") to a vnode.
 129  * Returns with the vnode held.
 130  */
 131 int
 132 sogetvp(char *devpath, vnode_t **vpp, int uioflag)
 133 {
 134         struct snode *csp;
 135         vnode_t *vp, *dvp;
 136         major_t maj;
 137         int error;
 138 
 139         ASSERT(uioflag == UIO_SYSSPACE || uioflag == UIO_USERSPACE);
 140 
 141         /*
 142          * Lookup the underlying filesystem vnode.
 143          */
 144         error = lookupname(devpath, uioflag, FOLLOW, NULLVPP, &vp);
 145         if (error)
 146                 return (error);
 147 
 148         /* Check that it is the correct vnode */
 149         if (vp->v_type != VCHR) {
 150                 VN_RELE(vp);
 151                 return (ENOTSOCK);
 152         }
 153 
 154         /*
 155          * If devpath went through devfs, the device should already
 156          * be configured. If devpath is a mknod file, however, we
 157          * need to make sure the device is properly configured.
 158          * To do this, we do something similar to spec_open()
 159          * except that we resolve to the minor/leaf level since
 160          * we need to return a vnode.
 161          */
 162         csp = VTOS(VTOS(vp)->s_commonvp);
 163         if (!(csp->s_flag & SDIPSET)) {
 164                 char *pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 165                 error = ddi_dev_pathname(vp->v_rdev, S_IFCHR, pathname);
 166                 if (error == 0)
 167                         error = devfs_lookupname(pathname, NULLVPP, &dvp);
 168                 VN_RELE(vp);
 169                 kmem_free(pathname, MAXPATHLEN);
 170                 if (error != 0)
 171                         return (ENXIO);
 172                 vp = dvp;       /* use the devfs vp */
 173         }
 174 
 175         /* device is configured at this point */
 176         maj = getmajor(vp->v_rdev);
 177         if (!STREAMSTAB(maj)) {
 178                 VN_RELE(vp);
 179                 return (ENOSTR);
 180         }
 181 
 182         *vpp = vp;
 183         return (0);
 184 }
 185 
 186 /*
 187  * Update the accessed, updated, or changed times in an sonode
 188  * with the current time.
 189  *
 190  * Note that both SunOS 4.X and 4.4BSD sockets do not present reasonable
 191  * attributes in a fstat call. (They return the current time and 0 for
 192  * all timestamps, respectively.) We maintain the current timestamps
 193  * here primarily so that should sockmod be popped the resulting
 194  * file descriptor will behave like a stream w.r.t. the timestamps.
 195  */
 196 void
 197 so_update_attrs(struct sonode *so, int flag)
 198 {
 199         time_t now = gethrestime_sec();
 200 
 201         if (SOCK_IS_NONSTR(so))
 202                 return;
 203 
 204         mutex_enter(&so->so_lock);
 205         so->so_flag |= flag;
 206         if (flag & SOACC)
 207                 SOTOTPI(so)->sti_atime = now;
 208         if (flag & SOMOD)
 209                 SOTOTPI(so)->sti_mtime = now;
 210         mutex_exit(&so->so_lock);
 211 }
 212 
 213 extern so_create_func_t sock_comm_create_function;
 214 extern so_destroy_func_t sock_comm_destroy_function;
 215 /*
 216  * Init function called when sockfs is loaded.
 217  */
 218 int
 219 sockinit(int fstype, char *name)
 220 {
 221         static const fs_operation_def_t sock_vfsops_template[] = {
 222                 NULL, NULL
 223         };
 224         int error;
 225         major_t dev;
 226         char *err_str;
 227 
 228         error = vfs_setfsops(fstype, sock_vfsops_template, NULL);
 229         if (error != 0) {
 230                 zcmn_err(GLOBAL_ZONEID, CE_WARN,
 231                     "sockinit: bad vfs ops template");
 232                 return (error);
 233         }
 234 
 235         error = vn_make_ops(name, socket_vnodeops_template,
 236             &socket_vnodeops);
 237         if (error != 0) {
 238                 err_str = "sockinit: bad socket vnode ops template";
 239                 /* vn_make_ops() does not reset socktpi_vnodeops on failure. */
 240                 socket_vnodeops = NULL;
 241                 goto failure;
 242         }
 243 
 244         socket_cache = kmem_cache_create("socket_cache",
 245             sizeof (struct sonode), 0, sonode_constructor,
 246             sonode_destructor, NULL, NULL, NULL, 0);
 247 
 248         rw_init(&sockconf_lock, NULL, RW_DEFAULT, NULL);
 249 
 250         error = socktpi_init();
 251         if (error != 0) {
 252                 err_str = NULL;
 253                 goto failure;
 254         }
 255 
 256         error = sod_init();
 257         if (error != 0) {
 258                 err_str = NULL;
 259                 goto failure;
 260         }
 261 
 262         /*
 263          * Set up the default create and destroy functions
 264          */
 265         sock_comm_create_function = socket_sonode_create;
 266         sock_comm_destroy_function = socket_sonode_destroy;
 267 
 268         /*
 269          * Build initial list mapping socket parameters to vnode.
 270          */
 271         smod_init();
 272         smod_add(sotpi_smod_create());
 273 
 274         sockparams_init();
 275 
 276         /*
 277          * If sockets are needed before init runs /sbin/soconfig
 278          * it is possible to preload the sockparams list here using
 279          * calls like:
 280          *      sockconfig(1,2,3, "/dev/tcp", 0);
 281          */
 282 
 283         /*
 284          * Create a unique dev_t for use in so_fsid.
 285          */
 286 
 287         if ((dev = getudev()) == (major_t)-1)
 288                 dev = 0;
 289         sockdev = makedevice(dev, 0);
 290 
 291         mutex_init(&socklist.sl_lock, NULL, MUTEX_DEFAULT, NULL);
 292         sendfile_init();
 293         if (!modrootloaded) {
 294                 sockfs_defer_nl7c_init = 1;
 295         } else {
 296                 nl7c_init();
 297         }
 298 
 299         /* Initialize socket filters */
 300         sof_init();
 301 
 302         return (0);
 303 
 304 failure:
 305         (void) vfs_freevfsops_by_type(fstype);
 306         if (socket_vnodeops != NULL)
 307                 vn_freevnodeops(socket_vnodeops);
 308         if (err_str != NULL)
 309                 zcmn_err(GLOBAL_ZONEID, CE_WARN, err_str);
 310         return (error);
 311 }
 312 
 313 /*
 314  * Caller must hold the mutex. Used to set SOLOCKED.
 315  */
 316 void
 317 so_lock_single(struct sonode *so)
 318 {
 319         ASSERT(MUTEX_HELD(&so->so_lock));
 320 
 321         while (so->so_flag & (SOLOCKED | SOASYNC_UNBIND)) {
 322                 cv_wait_stop(&so->so_single_cv, &so->so_lock,
 323                     SO_LOCK_WAKEUP_TIME);
 324         }
 325         so->so_flag |= SOLOCKED;
 326 }
 327 
 328 /*
 329  * Caller must hold the mutex and pass in SOLOCKED or SOASYNC_UNBIND.
 330  * Used to clear SOLOCKED or SOASYNC_UNBIND.
 331  */
 332 void
 333 so_unlock_single(struct sonode *so, int flag)
 334 {
 335         ASSERT(MUTEX_HELD(&so->so_lock));
 336         ASSERT(flag & (SOLOCKED|SOASYNC_UNBIND));
 337         ASSERT((flag & ~(SOLOCKED|SOASYNC_UNBIND)) == 0);
 338         ASSERT(so->so_flag & flag);
 339         /*
 340          * Process the T_DISCON_IND on sti_discon_ind_mp.
 341          *
 342          * Call to so_drain_discon_ind will result in so_lock
 343          * being dropped and re-acquired later.
 344          */
 345         if (!SOCK_IS_NONSTR(so)) {
 346                 sotpi_info_t *sti = SOTOTPI(so);
 347 
 348                 if (sti->sti_discon_ind_mp != NULL)
 349                         so_drain_discon_ind(so);
 350         }
 351 
 352         cv_signal(&so->so_single_cv);
 353         so->so_flag &= ~flag;
 354 }
 355 
 356 /*
 357  * Caller must hold the mutex. Used to set SOREADLOCKED.
 358  * If the caller wants nonblocking behavior it should set fmode.
 359  */
 360 int
 361 so_lock_read(struct sonode *so, int fmode)
 362 {
 363         ASSERT(MUTEX_HELD(&so->so_lock));
 364 
 365         while (so->so_flag & SOREADLOCKED) {
 366                 if (fmode & (FNDELAY|FNONBLOCK))
 367                         return (EWOULDBLOCK);
 368                 cv_wait_stop(&so->so_read_cv, &so->so_lock,
 369                     SO_LOCK_WAKEUP_TIME);
 370         }
 371         so->so_flag |= SOREADLOCKED;
 372         return (0);
 373 }
 374 
 375 /*
 376  * Like so_lock_read above but allows signals.
 377  */
 378 int
 379 so_lock_read_intr(struct sonode *so, int fmode)
 380 {
 381         ASSERT(MUTEX_HELD(&so->so_lock));
 382 
 383         while (so->so_flag & SOREADLOCKED) {
 384                 if (fmode & (FNDELAY|FNONBLOCK))
 385                         return (EWOULDBLOCK);
 386                 if (!cv_wait_sig(&so->so_read_cv, &so->so_lock))
 387                         return (EINTR);
 388         }
 389         so->so_flag |= SOREADLOCKED;
 390         return (0);
 391 }
 392 
 393 /*
 394  * Caller must hold the mutex. Used to clear SOREADLOCKED,
 395  * set in so_lock_read() or so_lock_read_intr().
 396  */
 397 void
 398 so_unlock_read(struct sonode *so)
 399 {
 400         ASSERT(MUTEX_HELD(&so->so_lock));
 401         ASSERT(so->so_flag & SOREADLOCKED);
 402 
 403         cv_signal(&so->so_read_cv);
 404         so->so_flag &= ~SOREADLOCKED;
 405 }
 406 
 407 /*
 408  * Verify that the specified offset falls within the mblk and
 409  * that the resulting pointer is aligned.
 410  * Returns NULL if not.
 411  */
 412 void *
 413 sogetoff(mblk_t *mp, t_uscalar_t offset,
 414     t_uscalar_t length, uint_t align_size)
 415 {
 416         uintptr_t ptr1, ptr2;
 417 
 418         ASSERT(mp && mp->b_wptr >= mp->b_rptr);
 419         ptr1 = (uintptr_t)mp->b_rptr + offset;
 420         ptr2 = (uintptr_t)ptr1 + length;
 421         if (ptr1 < (uintptr_t)mp->b_rptr || ptr2 > (uintptr_t)mp->b_wptr) {
 422                 eprintline(0);
 423                 return (NULL);
 424         }
 425         if ((ptr1 & (align_size - 1)) != 0) {
 426                 eprintline(0);
 427                 return (NULL);
 428         }
 429         return ((void *)ptr1);
 430 }
 431 
 432 /*
 433  * Return the AF_UNIX underlying filesystem vnode matching a given name.
 434  * Makes sure the sending and the destination sonodes are compatible.
 435  * The vnode is returned held.
 436  *
 437  * The underlying filesystem VSOCK vnode has a v_stream pointer that
 438  * references the actual stream head (hence indirectly the actual sonode).
 439  */
 440 static int
 441 so_ux_lookup(struct sonode *so, struct sockaddr_un *soun, int checkaccess,
 442                 vnode_t **vpp)
 443 {
 444         vnode_t         *vp;    /* Underlying filesystem vnode */
 445         vnode_t         *rvp;   /* real vnode */
 446         vnode_t         *svp;   /* sockfs vnode */
 447         struct sonode   *so2;
 448         int             error;
 449 
 450         dprintso(so, 1, ("so_ux_lookup(%p) name <%s>\n", (void *)so,
 451             soun->sun_path));
 452 
 453         error = lookupname(soun->sun_path, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
 454         if (error) {
 455                 eprintsoline(so, error);
 456                 return (error);
 457         }
 458 
 459         /*
 460          * Traverse lofs mounts get the real vnode
 461          */
 462         if (VOP_REALVP(vp, &rvp, NULL) == 0) {
 463                 VN_HOLD(rvp);           /* hold the real vnode */
 464                 VN_RELE(vp);            /* release hold from lookup */
 465                 vp = rvp;
 466         }
 467 
 468         if (vp->v_type != VSOCK) {
 469                 error = ENOTSOCK;
 470                 eprintsoline(so, error);
 471                 goto done2;
 472         }
 473 
 474         if (checkaccess) {
 475                 /*
 476                  * Check that we have permissions to access the destination
 477                  * vnode. This check is not done in BSD but it is required
 478                  * by X/Open.
 479                  */
 480                 if (error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED(), NULL)) {
 481                         eprintsoline(so, error);
 482                         goto done2;
 483                 }
 484         }
 485 
 486         /*
 487          * Check if the remote socket has been closed.
 488          *
 489          * Synchronize with vn_rele_stream by holding v_lock while traversing
 490          * v_stream->sd_vnode.
 491          */
 492         mutex_enter(&vp->v_lock);
 493         if (vp->v_stream == NULL) {
 494                 mutex_exit(&vp->v_lock);
 495                 if (so->so_type == SOCK_DGRAM)
 496                         error = EDESTADDRREQ;
 497                 else
 498                         error = ECONNREFUSED;
 499 
 500                 eprintsoline(so, error);
 501                 goto done2;
 502         }
 503         ASSERT(vp->v_stream->sd_vnode);
 504         svp = vp->v_stream->sd_vnode;
 505         /*
 506          * holding v_lock on underlying filesystem vnode and acquiring
 507          * it on sockfs vnode. Assumes that no code ever attempts to
 508          * acquire these locks in the reverse order.
 509          */
 510         VN_HOLD(svp);
 511         mutex_exit(&vp->v_lock);
 512 
 513         if (svp->v_type != VSOCK) {
 514                 error = ENOTSOCK;
 515                 eprintsoline(so, error);
 516                 goto done;
 517         }
 518 
 519         so2 = VTOSO(svp);
 520 
 521         if (so->so_type != so2->so_type) {
 522                 error = EPROTOTYPE;
 523                 eprintsoline(so, error);
 524                 goto done;
 525         }
 526 
 527         VN_RELE(svp);
 528         *vpp = vp;
 529         return (0);
 530 
 531 done:
 532         VN_RELE(svp);
 533 done2:
 534         VN_RELE(vp);
 535         return (error);
 536 }
 537 
 538 /*
 539  * Verify peer address for connect and sendto/sendmsg.
 540  * Since sendto/sendmsg would not get synchronous errors from the transport
 541  * provider we have to do these ugly checks in the socket layer to
 542  * preserve compatibility with SunOS 4.X.
 543  */
 544 int
 545 so_addr_verify(struct sonode *so, const struct sockaddr *name,
 546     socklen_t namelen)
 547 {
 548         int             family;
 549 
 550         dprintso(so, 1, ("so_addr_verify(%p, %p, %d)\n",
 551             (void *)so, (void *)name, namelen));
 552 
 553         ASSERT(name != NULL);
 554 
 555         family = so->so_family;
 556         switch (family) {
 557         case AF_INET:
 558                 if (name->sa_family != family) {
 559                         eprintsoline(so, EAFNOSUPPORT);
 560                         return (EAFNOSUPPORT);
 561                 }
 562                 if (namelen != (socklen_t)sizeof (struct sockaddr_in)) {
 563                         eprintsoline(so, EINVAL);
 564                         return (EINVAL);
 565                 }
 566                 break;
 567         case AF_INET6: {
 568 #ifdef DEBUG
 569                 struct sockaddr_in6 *sin6;
 570 #endif /* DEBUG */
 571 
 572                 if (name->sa_family != family) {
 573                         eprintsoline(so, EAFNOSUPPORT);
 574                         return (EAFNOSUPPORT);
 575                 }
 576                 if (namelen != (socklen_t)sizeof (struct sockaddr_in6)) {
 577                         eprintsoline(so, EINVAL);
 578                         return (EINVAL);
 579                 }
 580 #ifdef DEBUG
 581                 /* Verify that apps don't forget to clear sin6_scope_id etc */
 582                 sin6 = (struct sockaddr_in6 *)name;
 583                 if (sin6->sin6_scope_id != 0 &&
 584                     !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
 585                         zcmn_err(getzoneid(), CE_WARN,
 586                             "connect/send* with uninitialized sin6_scope_id "
 587                             "(%d) on socket. Pid = %d\n",
 588                             (int)sin6->sin6_scope_id, (int)curproc->p_pid);
 589                 }
 590 #endif /* DEBUG */
 591                 break;
 592         }
 593         case AF_UNIX:
 594                 if (SOTOTPI(so)->sti_faddr_noxlate) {
 595                         return (0);
 596                 }
 597                 if (namelen < (socklen_t)sizeof (short)) {
 598                         eprintsoline(so, ENOENT);
 599                         return (ENOENT);
 600                 }
 601                 if (name->sa_family != family) {
 602                         eprintsoline(so, EAFNOSUPPORT);
 603                         return (EAFNOSUPPORT);
 604                 }
 605                 /* MAXPATHLEN + soun_family + nul termination */
 606                 if (namelen > (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
 607                         eprintsoline(so, ENAMETOOLONG);
 608                         return (ENAMETOOLONG);
 609                 }
 610 
 611                 break;
 612 
 613         default:
 614                 /*
 615                  * Default is don't do any length or sa_family check
 616                  * to allow non-sockaddr style addresses.
 617                  */
 618                 break;
 619         }
 620 
 621         return (0);
 622 }
 623 
 624 
 625 /*
 626  * Translate an AF_UNIX sockaddr_un to the transport internal name.
 627  * Assumes caller has called so_addr_verify first.
 628  */
 629 /*ARGSUSED*/
 630 int
 631 so_ux_addr_xlate(struct sonode *so, struct sockaddr *name,
 632     socklen_t namelen, int checkaccess,
 633     void **addrp, socklen_t *addrlenp)
 634 {
 635         int                     error;
 636         struct sockaddr_un      *soun;
 637         vnode_t                 *vp;
 638         void                    *addr;
 639         socklen_t               addrlen;
 640         sotpi_info_t            *sti = SOTOTPI(so);
 641 
 642         dprintso(so, 1, ("so_ux_addr_xlate(%p, %p, %d, %d)\n",
 643             (void *)so, (void *)name, namelen, checkaccess));
 644 
 645         ASSERT(name != NULL);
 646         ASSERT(so->so_family == AF_UNIX);
 647         ASSERT(!sti->sti_faddr_noxlate);
 648         ASSERT(namelen >= (socklen_t)sizeof (short));
 649         ASSERT(name->sa_family == AF_UNIX);
 650         soun = (struct sockaddr_un *)name;
 651         /*
 652          * Lookup vnode for the specified path name and verify that
 653          * it is a socket.
 654          */
 655         error = so_ux_lookup(so, soun, checkaccess, &vp);
 656         if (error) {
 657                 eprintsoline(so, error);
 658                 return (error);
 659         }
 660         /*
 661          * Use the address of the peer vnode as the address to send
 662          * to. We release the peer vnode here. In case it has been
 663          * closed by the time the T_CONN_REQ or T_UNIDATA_REQ reaches the
 664          * transport the message will get an error or be dropped.
 665          */
 666         sti->sti_ux_faddr.soua_vp = vp;
 667         sti->sti_ux_faddr.soua_magic = SOU_MAGIC_EXPLICIT;
 668         addr = &sti->sti_ux_faddr;
 669         addrlen = (socklen_t)sizeof (sti->sti_ux_faddr);
 670         dprintso(so, 1, ("ux_xlate UNIX: addrlen %d, vp %p\n",
 671             addrlen, (void *)vp));
 672         VN_RELE(vp);
 673         *addrp = addr;
 674         *addrlenp = (socklen_t)addrlen;
 675         return (0);
 676 }
 677 
 678 /*
 679  * Esballoc free function for messages that contain SO_FILEP option.
 680  * Decrement the reference count on the file pointers using closef.
 681  */
 682 void
 683 fdbuf_free(struct fdbuf *fdbuf)
 684 {
 685         int     i;
 686         struct file *fp;
 687 
 688         dprint(1, ("fdbuf_free: %d fds\n", fdbuf->fd_numfd));
 689         for (i = 0; i < fdbuf->fd_numfd; i++) {
 690                 /*
 691                  * We need pointer size alignment for fd_fds. On a LP64
 692                  * kernel, the required alignment is 8 bytes while
 693                  * the option headers and values are only 4 bytes
 694                  * aligned. So its safer to do a bcopy compared to
 695                  * assigning fdbuf->fd_fds[i] to fp.
 696                  */
 697                 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp));
 698                 dprint(1, ("fdbuf_free: [%d] = %p\n", i, (void *)fp));
 699                 (void) closef(fp);
 700         }
 701         if (fdbuf->fd_ebuf != NULL)
 702                 kmem_free(fdbuf->fd_ebuf, fdbuf->fd_ebuflen);
 703         kmem_free(fdbuf, fdbuf->fd_size);
 704 }
 705 
 706 /*
 707  * Allocate an esballoc'ed message for AF_UNIX file descriptor passing.
 708  * Waits if memory is not available.
 709  */
 710 mblk_t *
 711 fdbuf_allocmsg(int size, struct fdbuf *fdbuf)
 712 {
 713         uchar_t *buf;
 714         mblk_t  *mp;
 715 
 716         dprint(1, ("fdbuf_allocmsg: size %d, %d fds\n", size, fdbuf->fd_numfd));
 717         buf = kmem_alloc(size, KM_SLEEP);
 718         fdbuf->fd_ebuf = (caddr_t)buf;
 719         fdbuf->fd_ebuflen = size;
 720         fdbuf->fd_frtn.free_func = fdbuf_free;
 721         fdbuf->fd_frtn.free_arg = (caddr_t)fdbuf;
 722 
 723         mp = esballoc_wait(buf, size, BPRI_MED, &fdbuf->fd_frtn);
 724         mp->b_datap->db_type = M_PROTO;
 725         return (mp);
 726 }
 727 
 728 /*
 729  * Extract file descriptors from a fdbuf.
 730  * Return list in rights/rightslen.
 731  */
 732 /*ARGSUSED*/
 733 static int
 734 fdbuf_extract(struct fdbuf *fdbuf, void *rights, int rightslen)
 735 {
 736         int     i, fd;
 737         int     *rp;
 738         struct file *fp;
 739         int     numfd;
 740 
 741         dprint(1, ("fdbuf_extract: %d fds, len %d\n",
 742             fdbuf->fd_numfd, rightslen));
 743 
 744         numfd = fdbuf->fd_numfd;
 745         ASSERT(rightslen == numfd * (int)sizeof (int));
 746 
 747         /*
 748          * Allocate a file descriptor and increment the f_count.
 749          * The latter is needed since we always call fdbuf_free
 750          * which performs a closef.
 751          */
 752         rp = (int *)rights;
 753         for (i = 0; i < numfd; i++) {
 754                 if ((fd = ufalloc(0)) == -1)
 755                         goto cleanup;
 756                 /*
 757                  * We need pointer size alignment for fd_fds. On a LP64
 758                  * kernel, the required alignment is 8 bytes while
 759                  * the option headers and values are only 4 bytes
 760                  * aligned. So its safer to do a bcopy compared to
 761                  * assigning fdbuf->fd_fds[i] to fp.
 762                  */
 763                 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp));
 764                 mutex_enter(&fp->f_tlock);
 765                 fp->f_count++;
 766                 mutex_exit(&fp->f_tlock);
 767                 setf(fd, fp);
 768                 *rp++ = fd;
 769                 if (AU_AUDITING())
 770                         audit_fdrecv(fd, fp);
 771                 dprint(1, ("fdbuf_extract: [%d] = %d, %p refcnt %d\n",
 772                     i, fd, (void *)fp, fp->f_count));
 773         }
 774         return (0);
 775 
 776 cleanup:
 777         /*
 778          * Undo whatever partial work the loop above has done.
 779          */
 780         {
 781                 int j;
 782 
 783                 rp = (int *)rights;
 784                 for (j = 0; j < i; j++) {
 785                         dprint(0,
 786                             ("fdbuf_extract: cleanup[%d] = %d\n", j, *rp));
 787                         (void) closeandsetf(*rp++, NULL);
 788                 }
 789         }
 790 
 791         return (EMFILE);
 792 }
 793 
 794 /*
 795  * Insert file descriptors into an fdbuf.
 796  * Returns a kmem_alloc'ed fdbuf. The fdbuf should be freed
 797  * by calling fdbuf_free().
 798  */
 799 int
 800 fdbuf_create(void *rights, int rightslen, struct fdbuf **fdbufp)
 801 {
 802         int             numfd, i;
 803         int             *fds;
 804         struct file     *fp;
 805         struct fdbuf    *fdbuf;
 806         int             fdbufsize;
 807 
 808         dprint(1, ("fdbuf_create: len %d\n", rightslen));
 809 
 810         numfd = rightslen / (int)sizeof (int);
 811 
 812         fdbufsize = (int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *));
 813         fdbuf = kmem_alloc(fdbufsize, KM_SLEEP);
 814         fdbuf->fd_size = fdbufsize;
 815         fdbuf->fd_numfd = 0;
 816         fdbuf->fd_ebuf = NULL;
 817         fdbuf->fd_ebuflen = 0;
 818         fds = (int *)rights;
 819         for (i = 0; i < numfd; i++) {
 820                 if ((fp = getf(fds[i])) == NULL) {
 821                         fdbuf_free(fdbuf);
 822                         return (EBADF);
 823                 }
 824                 dprint(1, ("fdbuf_create: [%d] = %d, %p refcnt %d\n",
 825                     i, fds[i], (void *)fp, fp->f_count));
 826                 mutex_enter(&fp->f_tlock);
 827                 fp->f_count++;
 828                 mutex_exit(&fp->f_tlock);
 829                 /*
 830                  * The maximum alignment for fdbuf (or any option header
 831                  * and its value) it 4 bytes. On a LP64 kernel, the alignment
 832                  * is not sufficient for pointers (fd_fds in this case). Since
 833                  * we just did a kmem_alloc (we get a double word alignment),
 834                  * we don't need to do anything on the send side (we loose
 835                  * the double word alignment because fdbuf goes after an
 836                  * option header (eg T_unitdata_req) which is only 4 byte
 837                  * aligned). We take care of this when we extract the file
 838                  * descriptor in fdbuf_extract or fdbuf_free.
 839                  */
 840                 fdbuf->fd_fds[i] = fp;
 841                 fdbuf->fd_numfd++;
 842                 releasef(fds[i]);
 843                 if (AU_AUDITING())
 844                         audit_fdsend(fds[i], fp, 0);
 845         }
 846         *fdbufp = fdbuf;
 847         return (0);
 848 }
 849 
 850 static int
 851 fdbuf_optlen(int rightslen)
 852 {
 853         int numfd;
 854 
 855         numfd = rightslen / (int)sizeof (int);
 856 
 857         return ((int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *)));
 858 }
 859 
 860 static t_uscalar_t
 861 fdbuf_cmsglen(int fdbuflen)
 862 {
 863         return (t_uscalar_t)((fdbuflen - FDBUF_HDRSIZE) /
 864             (int)sizeof (struct file *) * (int)sizeof (int));
 865 }
 866 
 867 
 868 /*
 869  * Return non-zero if the mblk and fdbuf are consistent.
 870  */
 871 static int
 872 fdbuf_verify(mblk_t *mp, struct fdbuf *fdbuf, int fdbuflen)
 873 {
 874         if (fdbuflen >= FDBUF_HDRSIZE &&
 875             fdbuflen == fdbuf->fd_size) {
 876                 frtn_t *frp = mp->b_datap->db_frtnp;
 877                 /*
 878                  * Check that the SO_FILEP portion of the
 879                  * message has not been modified by
 880                  * the loopback transport. The sending sockfs generates
 881                  * a message that is esballoc'ed with the free function
 882                  * being fdbuf_free() and where free_arg contains the
 883                  * identical information as the SO_FILEP content.
 884                  *
 885                  * If any of these constraints are not satisfied we
 886                  * silently ignore the option.
 887                  */
 888                 ASSERT(mp);
 889                 if (frp != NULL &&
 890                     frp->free_func == fdbuf_free &&
 891                     frp->free_arg != NULL &&
 892                     bcmp(frp->free_arg, fdbuf, fdbuflen) == 0) {
 893                         dprint(1, ("fdbuf_verify: fdbuf %p len %d\n",
 894                             (void *)fdbuf, fdbuflen));
 895                         return (1);
 896                 } else {
 897                         zcmn_err(getzoneid(), CE_WARN,
 898                             "sockfs: mismatched fdbuf content (%p)",
 899                             (void *)mp);
 900                         return (0);
 901                 }
 902         } else {
 903                 zcmn_err(getzoneid(), CE_WARN,
 904                     "sockfs: mismatched fdbuf len %d, %d\n",
 905                     fdbuflen, fdbuf->fd_size);
 906                 return (0);
 907         }
 908 }
 909 
 910 /*
 911  * When the file descriptors returned by sorecvmsg can not be passed
 912  * to the application this routine will cleanup the references on
 913  * the files. Start at startoff bytes into the buffer.
 914  */
 915 static void
 916 close_fds(void *fdbuf, int fdbuflen, int startoff)
 917 {
 918         int *fds = (int *)fdbuf;
 919         int numfd = fdbuflen / (int)sizeof (int);
 920         int i;
 921 
 922         dprint(1, ("close_fds(%p, %d, %d)\n", fdbuf, fdbuflen, startoff));
 923 
 924         for (i = 0; i < numfd; i++) {
 925                 if (startoff < 0)
 926                         startoff = 0;
 927                 if (startoff < (int)sizeof (int)) {
 928                         /*
 929                          * This file descriptor is partially or fully after
 930                          * the offset
 931                          */
 932                         dprint(0,
 933                             ("close_fds: cleanup[%d] = %d\n", i, fds[i]));
 934                         (void) closeandsetf(fds[i], NULL);
 935                 }
 936                 startoff -= (int)sizeof (int);
 937         }
 938 }
 939 
 940 /*
 941  * Close all file descriptors contained in the control part starting at
 942  * the startoffset.
 943  */
 944 void
 945 so_closefds(void *control, t_uscalar_t controllen, int oldflg,
 946     int startoff)
 947 {
 948         struct cmsghdr *cmsg;
 949 
 950         if (control == NULL)
 951                 return;
 952 
 953         if (oldflg) {
 954                 close_fds(control, controllen, startoff);
 955                 return;
 956         }
 957         /* Scan control part for file descriptors. */
 958         for (cmsg = (struct cmsghdr *)control;
 959             CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
 960             cmsg = CMSG_NEXT(cmsg)) {
 961                 if (cmsg->cmsg_level == SOL_SOCKET &&
 962                     cmsg->cmsg_type == SCM_RIGHTS) {
 963                         close_fds(CMSG_CONTENT(cmsg),
 964                             (int)CMSG_CONTENTLEN(cmsg),
 965                             startoff - (int)sizeof (struct cmsghdr));
 966                 }
 967                 startoff -= cmsg->cmsg_len;
 968         }
 969 }
 970 
 971 /*
 972  * Returns a pointer/length for the file descriptors contained
 973  * in the control buffer. Returns with *fdlenp == -1 if there are no
 974  * file descriptor options present. This is different than there being
 975  * a zero-length file descriptor option.
 976  * Fail if there are multiple SCM_RIGHT cmsgs.
 977  */
 978 int
 979 so_getfdopt(void *control, t_uscalar_t controllen, int oldflg,
 980     void **fdsp, int *fdlenp)
 981 {
 982         struct cmsghdr *cmsg;
 983         void *fds;
 984         int fdlen;
 985 
 986         if (control == NULL) {
 987                 *fdsp = NULL;
 988                 *fdlenp = -1;
 989                 return (0);
 990         }
 991 
 992         if (oldflg) {
 993                 *fdsp = control;
 994                 if (controllen == 0)
 995                         *fdlenp = -1;
 996                 else
 997                         *fdlenp = controllen;
 998                 dprint(1, ("so_getfdopt: old %d\n", *fdlenp));
 999                 return (0);
1000         }
1001 
1002         fds = NULL;
1003         fdlen = 0;
1004 
1005         for (cmsg = (struct cmsghdr *)control;
1006             CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
1007             cmsg = CMSG_NEXT(cmsg)) {
1008                 if (cmsg->cmsg_level == SOL_SOCKET &&
1009                     cmsg->cmsg_type == SCM_RIGHTS) {
1010                         if (fds != NULL)
1011                                 return (EINVAL);
1012                         fds = CMSG_CONTENT(cmsg);
1013                         fdlen = (int)CMSG_CONTENTLEN(cmsg);
1014                         dprint(1, ("so_getfdopt: new %lu\n",
1015                             (size_t)CMSG_CONTENTLEN(cmsg)));
1016                 }
1017         }
1018         if (fds == NULL) {
1019                 dprint(1, ("so_getfdopt: NONE\n"));
1020                 *fdlenp = -1;
1021         } else
1022                 *fdlenp = fdlen;
1023         *fdsp = fds;
1024         return (0);
1025 }
1026 
1027 /*
1028  * Return the length of the options including any file descriptor options.
1029  */
1030 t_uscalar_t
1031 so_optlen(void *control, t_uscalar_t controllen, int oldflg)
1032 {
1033         struct cmsghdr *cmsg;
1034         t_uscalar_t optlen = 0;
1035         t_uscalar_t len;
1036 
1037         if (control == NULL)
1038                 return (0);
1039 
1040         if (oldflg)
1041                 return ((t_uscalar_t)(sizeof (struct T_opthdr) +
1042                     fdbuf_optlen(controllen)));
1043 
1044         for (cmsg = (struct cmsghdr *)control;
1045             CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
1046             cmsg = CMSG_NEXT(cmsg)) {
1047                 if (cmsg->cmsg_level == SOL_SOCKET &&
1048                     cmsg->cmsg_type == SCM_RIGHTS) {
1049                         len = fdbuf_optlen((int)CMSG_CONTENTLEN(cmsg));
1050                 } else {
1051                         len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg);
1052                 }
1053                 optlen += (t_uscalar_t)(_TPI_ALIGN_TOPT(len) +
1054                     sizeof (struct T_opthdr));
1055         }
1056         dprint(1, ("so_optlen: controllen %d, flg %d -> optlen %d\n",
1057             controllen, oldflg, optlen));
1058         return (optlen);
1059 }
1060 
1061 /*
1062  * Copy options from control to the mblk. Skip any file descriptor options.
1063  */
1064 void
1065 so_cmsg2opt(void *control, t_uscalar_t controllen, int oldflg, mblk_t *mp)
1066 {
1067         struct T_opthdr toh;
1068         struct cmsghdr *cmsg;
1069 
1070         if (control == NULL)
1071                 return;
1072 
1073         if (oldflg) {
1074                 /* No real options - caller has handled file descriptors */
1075                 return;
1076         }
1077         for (cmsg = (struct cmsghdr *)control;
1078             CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
1079             cmsg = CMSG_NEXT(cmsg)) {
1080                 /*
1081                  * Note: The caller handles file descriptors prior
1082                  * to calling this function.
1083                  */
1084                 t_uscalar_t len;
1085 
1086                 if (cmsg->cmsg_level == SOL_SOCKET &&
1087                     cmsg->cmsg_type == SCM_RIGHTS)
1088                         continue;
1089 
1090                 len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg);
1091                 toh.level = cmsg->cmsg_level;
1092                 toh.name = cmsg->cmsg_type;
1093                 toh.len = len + (t_uscalar_t)sizeof (struct T_opthdr);
1094                 toh.status = 0;
1095 
1096                 soappendmsg(mp, &toh, sizeof (toh));
1097                 soappendmsg(mp, CMSG_CONTENT(cmsg), len);
1098                 mp->b_wptr += _TPI_ALIGN_TOPT(len) - len;
1099                 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
1100         }
1101 }
1102 
1103 /*
1104  * Return the length of the control message derived from the options.
1105  * Exclude SO_SRCADDR and SO_UNIX_CLOSE options. Include SO_FILEP.
1106  * When oldflg is set only include SO_FILEP.
1107  * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen
1108  * allocates the space that so_opt2cmsg fills. If one changes, the other should
1109  * also be checked for any possible impacts.
1110  */
1111 t_uscalar_t
1112 so_cmsglen(mblk_t *mp, void *opt, t_uscalar_t optlen, int oldflg)
1113 {
1114         t_uscalar_t cmsglen = 0;
1115         struct T_opthdr *tohp;
1116         t_uscalar_t len;
1117         t_uscalar_t last_roundup = 0;
1118 
1119         ASSERT(__TPI_TOPT_ISALIGNED(opt));
1120 
1121         for (tohp = (struct T_opthdr *)opt;
1122             tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
1123             tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
1124                 dprint(1, ("so_cmsglen: level 0x%x, name %d, len %d\n",
1125                     tohp->level, tohp->name, tohp->len));
1126                 if (tohp->level == SOL_SOCKET &&
1127                     (tohp->name == SO_SRCADDR ||
1128                     tohp->name == SO_UNIX_CLOSE)) {
1129                         continue;
1130                 }
1131                 if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) {
1132                         struct fdbuf *fdbuf;
1133                         int fdbuflen;
1134 
1135                         fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp);
1136                         fdbuflen = (int)_TPI_TOPT_DATALEN(tohp);
1137 
1138                         if (!fdbuf_verify(mp, fdbuf, fdbuflen))
1139                                 continue;
1140                         if (oldflg) {
1141                                 cmsglen += fdbuf_cmsglen(fdbuflen);
1142                                 continue;
1143                         }
1144                         len = fdbuf_cmsglen(fdbuflen);
1145                 } else if (tohp->level == SOL_SOCKET &&
1146                     tohp->name == SCM_TIMESTAMP) {
1147                         if (oldflg)
1148                                 continue;
1149 
1150                         if (get_udatamodel() == DATAMODEL_NATIVE) {
1151                                 len = sizeof (struct timeval);
1152                         } else {
1153                                 len = sizeof (struct timeval32);
1154                         }
1155                 } else {
1156                         if (oldflg)
1157                                 continue;
1158                         len = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp);
1159                 }
1160                 /*
1161                  * Exclude roundup for last option to not set
1162                  * MSG_CTRUNC when the cmsg fits but the padding doesn't fit.
1163                  */
1164                 last_roundup = (t_uscalar_t)
1165                     (ROUNDUP_cmsglen(len + (int)sizeof (struct cmsghdr)) -
1166                     (len + (int)sizeof (struct cmsghdr)));
1167                 cmsglen += (t_uscalar_t)(len + (int)sizeof (struct cmsghdr)) +
1168                     last_roundup;
1169         }
1170         cmsglen -= last_roundup;
1171         dprint(1, ("so_cmsglen: optlen %d, flg %d -> cmsglen %d\n",
1172             optlen, oldflg, cmsglen));
1173         return (cmsglen);
1174 }
1175 
1176 /*
1177  * Copy options from options to the control. Convert SO_FILEP to
1178  * file descriptors.
1179  * Returns errno or zero.
1180  * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen
1181  * allocates the space that so_opt2cmsg fills. If one changes, the other should
1182  * also be checked for any possible impacts.
1183  */
1184 int
1185 so_opt2cmsg(mblk_t *mp, void *opt, t_uscalar_t optlen, int oldflg,
1186     void *control, t_uscalar_t controllen)
1187 {
1188         struct T_opthdr *tohp;
1189         struct cmsghdr *cmsg;
1190         struct fdbuf *fdbuf;
1191         int fdbuflen;
1192         int error;
1193 #if defined(DEBUG) || defined(__lint)
1194         struct cmsghdr *cend = (struct cmsghdr *)
1195             (((uint8_t *)control) + ROUNDUP_cmsglen(controllen));
1196 #endif
1197         cmsg = (struct cmsghdr *)control;
1198 
1199         ASSERT(__TPI_TOPT_ISALIGNED(opt));
1200 
1201         for (tohp = (struct T_opthdr *)opt;
1202             tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
1203             tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
1204                 dprint(1, ("so_opt2cmsg: level 0x%x, name %d, len %d\n",
1205                     tohp->level, tohp->name, tohp->len));
1206 
1207                 if (tohp->level == SOL_SOCKET &&
1208                     (tohp->name == SO_SRCADDR ||
1209                     tohp->name == SO_UNIX_CLOSE)) {
1210                         continue;
1211                 }
1212                 ASSERT((uintptr_t)cmsg <= (uintptr_t)control + controllen);
1213                 if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) {
1214                         fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp);
1215                         fdbuflen = (int)_TPI_TOPT_DATALEN(tohp);
1216 
1217                         if (!fdbuf_verify(mp, fdbuf, fdbuflen))
1218                                 return (EPROTO);
1219                         if (oldflg) {
1220                                 error = fdbuf_extract(fdbuf, control,
1221                                     (int)controllen);
1222                                 if (error != 0)
1223                                         return (error);
1224                                 continue;
1225                         } else {
1226                                 int fdlen;
1227 
1228                                 fdlen = (int)fdbuf_cmsglen(
1229                                     (int)_TPI_TOPT_DATALEN(tohp));
1230 
1231                                 cmsg->cmsg_level = tohp->level;
1232                                 cmsg->cmsg_type = SCM_RIGHTS;
1233                                 cmsg->cmsg_len = (socklen_t)(fdlen +
1234                                     sizeof (struct cmsghdr));
1235 
1236                                 error = fdbuf_extract(fdbuf,
1237                                     CMSG_CONTENT(cmsg), fdlen);
1238                                 if (error != 0)
1239                                         return (error);
1240                         }
1241                 } else if (tohp->level == SOL_SOCKET &&
1242                     tohp->name == SCM_TIMESTAMP) {
1243                         timestruc_t *timestamp;
1244 
1245                         if (oldflg)
1246                                 continue;
1247 
1248                         cmsg->cmsg_level = tohp->level;
1249                         cmsg->cmsg_type = tohp->name;
1250 
1251                         timestamp =
1252                             (timestruc_t *)P2ROUNDUP((intptr_t)&tohp[1],
1253                             sizeof (intptr_t));
1254 
1255                         if (get_udatamodel() == DATAMODEL_NATIVE) {
1256                                 struct timeval tv;
1257 
1258                                 cmsg->cmsg_len = sizeof (struct timeval) +
1259                                     sizeof (struct cmsghdr);
1260                                 tv.tv_sec = timestamp->tv_sec;
1261                                 tv.tv_usec = timestamp->tv_nsec /
1262                                     (NANOSEC / MICROSEC);
1263                                 /*
1264                                  * on LP64 systems, the struct timeval in
1265                                  * the destination will not be 8-byte aligned,
1266                                  * so use bcopy to avoid alignment trouble
1267                                  */
1268                                 bcopy(&tv, CMSG_CONTENT(cmsg), sizeof (tv));
1269                         } else {
1270                                 struct timeval32 *time32;
1271 
1272                                 cmsg->cmsg_len = sizeof (struct timeval32) +
1273                                     sizeof (struct cmsghdr);
1274                                 time32 = (struct timeval32 *)CMSG_CONTENT(cmsg);
1275                                 time32->tv_sec = (time32_t)timestamp->tv_sec;
1276                                 time32->tv_usec =
1277                                     (int32_t)(timestamp->tv_nsec /
1278                                     (NANOSEC / MICROSEC));
1279                         }
1280 
1281                 } else {
1282                         if (oldflg)
1283                                 continue;
1284 
1285                         cmsg->cmsg_level = tohp->level;
1286                         cmsg->cmsg_type = tohp->name;
1287                         cmsg->cmsg_len = (socklen_t)(_TPI_TOPT_DATALEN(tohp) +
1288                             sizeof (struct cmsghdr));
1289 
1290                         /* copy content to control data part */
1291                         bcopy(&tohp[1], CMSG_CONTENT(cmsg),
1292                             CMSG_CONTENTLEN(cmsg));
1293                 }
1294                 /* move to next CMSG structure! */
1295                 cmsg = CMSG_NEXT(cmsg);
1296         }
1297         dprint(1, ("so_opt2cmsg: buf %p len %d; cend %p; final cmsg %p\n",
1298             control, controllen, (void *)cend, (void *)cmsg));
1299         ASSERT(cmsg <= cend);
1300         return (0);
1301 }
1302 
1303 /*
1304  * Extract the SO_SRCADDR option value if present.
1305  */
1306 void
1307 so_getopt_srcaddr(void *opt, t_uscalar_t optlen, void **srcp,
1308     t_uscalar_t *srclenp)
1309 {
1310         struct T_opthdr         *tohp;
1311 
1312         ASSERT(__TPI_TOPT_ISALIGNED(opt));
1313 
1314         ASSERT(srcp != NULL && srclenp != NULL);
1315         *srcp = NULL;
1316         *srclenp = 0;
1317 
1318         for (tohp = (struct T_opthdr *)opt;
1319             tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
1320             tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
1321                 dprint(1, ("so_getopt_srcaddr: level 0x%x, name %d, len %d\n",
1322                     tohp->level, tohp->name, tohp->len));
1323                 if (tohp->level == SOL_SOCKET &&
1324                     tohp->name == SO_SRCADDR) {
1325                         *srcp = _TPI_TOPT_DATA(tohp);
1326                         *srclenp = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp);
1327                 }
1328         }
1329 }
1330 
1331 /*
1332  * Verify if the SO_UNIX_CLOSE option is present.
1333  */
1334 int
1335 so_getopt_unix_close(void *opt, t_uscalar_t optlen)
1336 {
1337         struct T_opthdr         *tohp;
1338 
1339         ASSERT(__TPI_TOPT_ISALIGNED(opt));
1340 
1341         for (tohp = (struct T_opthdr *)opt;
1342             tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
1343             tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
1344                 dprint(1,
1345                     ("so_getopt_unix_close: level 0x%x, name %d, len %d\n",
1346                     tohp->level, tohp->name, tohp->len));
1347                 if (tohp->level == SOL_SOCKET &&
1348                     tohp->name == SO_UNIX_CLOSE)
1349                         return (1);
1350         }
1351         return (0);
1352 }
1353 
1354 /*
1355  * Allocate an M_PROTO message.
1356  *
1357  * If allocation fails the behavior depends on sleepflg:
1358  *      _ALLOC_NOSLEEP  fail immediately
1359  *      _ALLOC_INTR     sleep for memory until a signal is caught
1360  *      _ALLOC_SLEEP    sleep forever. Don't return NULL.
1361  */
1362 mblk_t *
1363 soallocproto(size_t size, int sleepflg, cred_t *cr)
1364 {
1365         mblk_t  *mp;
1366 
1367         /* Round up size for reuse */
1368         size = MAX(size, 64);
1369         if (cr != NULL)
1370                 mp = allocb_cred(size, cr, curproc->p_pid);
1371         else
1372                 mp = allocb(size, BPRI_MED);
1373 
1374         if (mp == NULL) {
1375                 int error;      /* Dummy - error not returned to caller */
1376 
1377                 switch (sleepflg) {
1378                 case _ALLOC_SLEEP:
1379                         if (cr != NULL) {
1380                                 mp = allocb_cred_wait(size, STR_NOSIG, &error,
1381                                     cr, curproc->p_pid);
1382                         } else {
1383                                 mp = allocb_wait(size, BPRI_MED, STR_NOSIG,
1384                                     &error);
1385                         }
1386                         ASSERT(mp);
1387                         break;
1388                 case _ALLOC_INTR:
1389                         if (cr != NULL) {
1390                                 mp = allocb_cred_wait(size, 0, &error, cr,
1391                                     curproc->p_pid);
1392                         } else {
1393                                 mp = allocb_wait(size, BPRI_MED, 0, &error);
1394                         }
1395                         if (mp == NULL) {
1396                                 /* Caught signal while sleeping for memory */
1397                                 eprintline(ENOBUFS);
1398                                 return (NULL);
1399                         }
1400                         break;
1401                 case _ALLOC_NOSLEEP:
1402                 default:
1403                         eprintline(ENOBUFS);
1404                         return (NULL);
1405                 }
1406         }
1407         DB_TYPE(mp) = M_PROTO;
1408         return (mp);
1409 }
1410 
1411 /*
1412  * Allocate an M_PROTO message with a single component.
1413  * len is the length of buf. size is the amount to allocate.
1414  *
1415  * buf can be NULL with a non-zero len.
1416  * This results in a bzero'ed chunk being placed the message.
1417  */
1418 mblk_t *
1419 soallocproto1(const void *buf, ssize_t len, ssize_t size, int sleepflg,
1420     cred_t *cr)
1421 {
1422         mblk_t  *mp;
1423 
1424         if (size == 0)
1425                 size = len;
1426 
1427         ASSERT(size >= len);
1428         /* Round up size for reuse */
1429         size = MAX(size, 64);
1430         mp = soallocproto(size, sleepflg, cr);
1431         if (mp == NULL)
1432                 return (NULL);
1433         mp->b_datap->db_type = M_PROTO;
1434         if (len != 0) {
1435                 if (buf != NULL)
1436                         bcopy(buf, mp->b_wptr, len);
1437                 else
1438                         bzero(mp->b_wptr, len);
1439                 mp->b_wptr += len;
1440         }
1441         return (mp);
1442 }
1443 
1444 /*
1445  * Append buf/len to mp.
1446  * The caller has to ensure that there is enough room in the mblk.
1447  *
1448  * buf can be NULL with a non-zero len.
1449  * This results in a bzero'ed chunk being placed the message.
1450  */
1451 void
1452 soappendmsg(mblk_t *mp, const void *buf, ssize_t len)
1453 {
1454         ASSERT(mp);
1455 
1456         if (len != 0) {
1457                 /* Assert for room left */
1458                 ASSERT(mp->b_datap->db_lim - mp->b_wptr >= len);
1459                 if (buf != NULL)
1460                         bcopy(buf, mp->b_wptr, len);
1461                 else
1462                         bzero(mp->b_wptr, len);
1463         }
1464         mp->b_wptr += len;
1465 }
1466 
1467 /*
1468  * Create a message using two kernel buffers.
1469  * If size is set that will determine the allocation size (e.g. for future
1470  * soappendmsg calls). If size is zero it is derived from the buffer
1471  * lengths.
1472  */
1473 mblk_t *
1474 soallocproto2(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2,
1475     ssize_t size, int sleepflg, cred_t *cr)
1476 {
1477         mblk_t *mp;
1478 
1479         if (size == 0)
1480                 size = len1 + len2;
1481         ASSERT(size >= len1 + len2);
1482 
1483         mp = soallocproto1(buf1, len1, size, sleepflg, cr);
1484         if (mp)
1485                 soappendmsg(mp, buf2, len2);
1486         return (mp);
1487 }
1488 
1489 /*
1490  * Create a message using three kernel buffers.
1491  * If size is set that will determine the allocation size (for future
1492  * soappendmsg calls). If size is zero it is derived from the buffer
1493  * lengths.
1494  */
1495 mblk_t *
1496 soallocproto3(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2,
1497     const void *buf3, ssize_t len3, ssize_t size, int sleepflg, cred_t *cr)
1498 {
1499         mblk_t *mp;
1500 
1501         if (size == 0)
1502                 size = len1 + len2 +len3;
1503         ASSERT(size >= len1 + len2 + len3);
1504 
1505         mp = soallocproto1(buf1, len1, size, sleepflg, cr);
1506         if (mp != NULL) {
1507                 soappendmsg(mp, buf2, len2);
1508                 soappendmsg(mp, buf3, len3);
1509         }
1510         return (mp);
1511 }
1512 
1513 #ifdef DEBUG
1514 char *
1515 pr_state(uint_t state, uint_t mode)
1516 {
1517         static char buf[1024];
1518 
1519         buf[0] = 0;
1520         if (state & SS_ISCONNECTED)
1521                 (void) strcat(buf, "ISCONNECTED ");
1522         if (state & SS_ISCONNECTING)
1523                 (void) strcat(buf, "ISCONNECTING ");
1524         if (state & SS_ISDISCONNECTING)
1525                 (void) strcat(buf, "ISDISCONNECTING ");
1526         if (state & SS_CANTSENDMORE)
1527                 (void) strcat(buf, "CANTSENDMORE ");
1528 
1529         if (state & SS_CANTRCVMORE)
1530                 (void) strcat(buf, "CANTRCVMORE ");
1531         if (state & SS_ISBOUND)
1532                 (void) strcat(buf, "ISBOUND ");
1533         if (state & SS_NDELAY)
1534                 (void) strcat(buf, "NDELAY ");
1535         if (state & SS_NONBLOCK)
1536                 (void) strcat(buf, "NONBLOCK ");
1537 
1538         if (state & SS_ASYNC)
1539                 (void) strcat(buf, "ASYNC ");
1540         if (state & SS_ACCEPTCONN)
1541                 (void) strcat(buf, "ACCEPTCONN ");
1542         if (state & SS_SAVEDEOR)
1543                 (void) strcat(buf, "SAVEDEOR ");
1544 
1545         if (state & SS_RCVATMARK)
1546                 (void) strcat(buf, "RCVATMARK ");
1547         if (state & SS_OOBPEND)
1548                 (void) strcat(buf, "OOBPEND ");
1549         if (state & SS_HAVEOOBDATA)
1550                 (void) strcat(buf, "HAVEOOBDATA ");
1551         if (state & SS_HADOOBDATA)
1552                 (void) strcat(buf, "HADOOBDATA ");
1553 
1554         if (mode & SM_PRIV)
1555                 (void) strcat(buf, "PRIV ");
1556         if (mode & SM_ATOMIC)
1557                 (void) strcat(buf, "ATOMIC ");
1558         if (mode & SM_ADDR)
1559                 (void) strcat(buf, "ADDR ");
1560         if (mode & SM_CONNREQUIRED)
1561                 (void) strcat(buf, "CONNREQUIRED ");
1562 
1563         if (mode & SM_FDPASSING)
1564                 (void) strcat(buf, "FDPASSING ");
1565         if (mode & SM_EXDATA)
1566                 (void) strcat(buf, "EXDATA ");
1567         if (mode & SM_OPTDATA)
1568                 (void) strcat(buf, "OPTDATA ");
1569         if (mode & SM_BYTESTREAM)
1570                 (void) strcat(buf, "BYTESTREAM ");
1571         return (buf);
1572 }
1573 
1574 char *
1575 pr_addr(int family, struct sockaddr *addr, t_uscalar_t addrlen)
1576 {
1577         static char buf[1024];
1578 
1579         if (addr == NULL || addrlen == 0) {
1580                 (void) sprintf(buf, "(len %d) %p", addrlen, (void *)addr);
1581                 return (buf);
1582         }
1583         switch (family) {
1584         case AF_INET: {
1585                 struct sockaddr_in sin;
1586 
1587                 bcopy(addr, &sin, sizeof (sin));
1588 
1589                 (void) sprintf(buf, "(len %d) %x/%d",
1590                     addrlen, ntohl(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1591                 break;
1592         }
1593         case AF_INET6: {
1594                 struct sockaddr_in6 sin6;
1595                 uint16_t *piece = (uint16_t *)&sin6.sin6_addr;
1596 
1597                 bcopy((char *)addr, (char *)&sin6, sizeof (sin6));
1598                 (void) sprintf(buf, "(len %d) %x:%x:%x:%x:%x:%x:%x:%x/%d",
1599                     addrlen,
1600                     ntohs(piece[0]), ntohs(piece[1]),
1601                     ntohs(piece[2]), ntohs(piece[3]),
1602                     ntohs(piece[4]), ntohs(piece[5]),
1603                     ntohs(piece[6]), ntohs(piece[7]),
1604                     ntohs(sin6.sin6_port));
1605                 break;
1606         }
1607         case AF_UNIX: {
1608                 struct sockaddr_un *soun = (struct sockaddr_un *)addr;
1609 
1610                 (void) sprintf(buf, "(len %d) %s", addrlen,
1611                     (soun == NULL) ? "(none)" : soun->sun_path);
1612                 break;
1613         }
1614         default:
1615                 (void) sprintf(buf, "(unknown af %d)", family);
1616                 break;
1617         }
1618         return (buf);
1619 }
1620 
1621 /* The logical equivalence operator (a if-and-only-if b) */
1622 #define EQUIVALENT(a, b)        (((a) && (b)) || (!(a) && (!(b))))
1623 
1624 /*
1625  * Verify limitations and invariants on oob state.
1626  * Return 1 if OK, otherwise 0 so that it can be used as
1627  *      ASSERT(verify_oobstate(so));
1628  */
1629 int
1630 so_verify_oobstate(struct sonode *so)
1631 {
1632         boolean_t havemark;
1633 
1634         ASSERT(MUTEX_HELD(&so->so_lock));
1635 
1636         /*
1637          * The possible state combinations are:
1638          *      0
1639          *      SS_OOBPEND
1640          *      SS_OOBPEND|SS_HAVEOOBDATA
1641          *      SS_OOBPEND|SS_HADOOBDATA
1642          *      SS_HADOOBDATA
1643          */
1644         switch (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA)) {
1645         case 0:
1646         case SS_OOBPEND:
1647         case SS_OOBPEND|SS_HAVEOOBDATA:
1648         case SS_OOBPEND|SS_HADOOBDATA:
1649         case SS_HADOOBDATA:
1650                 break;
1651         default:
1652                 printf("Bad oob state 1 (%p): state %s\n",
1653                     (void *)so, pr_state(so->so_state, so->so_mode));
1654                 return (0);
1655         }
1656 
1657         /* SS_RCVATMARK should only be set when SS_OOBPEND is set */
1658         if ((so->so_state & (SS_RCVATMARK|SS_OOBPEND)) == SS_RCVATMARK) {
1659                 printf("Bad oob state 2 (%p): state %s\n",
1660                     (void *)so, pr_state(so->so_state, so->so_mode));
1661                 return (0);
1662         }
1663 
1664         /*
1665          * (havemark != 0 or SS_RCVATMARK) iff SS_OOBPEND
1666          * For TPI, the presence of a "mark" is indicated by sti_oobsigcnt.
1667          */
1668         havemark = (SOCK_IS_NONSTR(so)) ? so->so_oobmark > 0 :
1669             SOTOTPI(so)->sti_oobsigcnt > 0;
1670 
1671         if (!EQUIVALENT(havemark || (so->so_state & SS_RCVATMARK),
1672             so->so_state & SS_OOBPEND)) {
1673                 printf("Bad oob state 3 (%p): state %s\n",
1674                     (void *)so, pr_state(so->so_state, so->so_mode));
1675                 return (0);
1676         }
1677 
1678         /*
1679          * Unless SO_OOBINLINE we have so_oobmsg != NULL iff SS_HAVEOOBDATA
1680          */
1681         if (!(so->so_options & SO_OOBINLINE) &&
1682             !EQUIVALENT(so->so_oobmsg != NULL, so->so_state & SS_HAVEOOBDATA)) {
1683                 printf("Bad oob state 4 (%p): state %s\n",
1684                     (void *)so, pr_state(so->so_state, so->so_mode));
1685                 return (0);
1686         }
1687 
1688         if (!SOCK_IS_NONSTR(so) &&
1689             SOTOTPI(so)->sti_oobsigcnt < SOTOTPI(so)->sti_oobcnt) {
1690                 printf("Bad oob state 5 (%p): counts %d/%d state %s\n",
1691                     (void *)so, SOTOTPI(so)->sti_oobsigcnt,
1692                     SOTOTPI(so)->sti_oobcnt,
1693                     pr_state(so->so_state, so->so_mode));
1694                 return (0);
1695         }
1696 
1697         return (1);
1698 }
1699 #undef  EQUIVALENT
1700 #endif /* DEBUG */
1701 
1702 /* initialize sockfs zone specific kstat related items                  */
1703 void *
1704 sock_kstat_init(zoneid_t zoneid)
1705 {
1706         kstat_t *ksp;
1707 
1708         ksp = kstat_create_zone("sockfs", 0, "sock_unix_list", "misc",
1709             KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE|KSTAT_FLAG_VIRTUAL, zoneid);
1710 
1711         if (ksp != NULL) {
1712                 ksp->ks_update = sockfs_update;
1713                 ksp->ks_snapshot = sockfs_snapshot;
1714                 ksp->ks_lock = &socklist.sl_lock;
1715                 ksp->ks_private = (void *)(uintptr_t)zoneid;
1716                 kstat_install(ksp);
1717         }
1718 
1719         return (ksp);
1720 }
1721 
1722 /* tear down sockfs zone specific kstat related items                   */
1723 /*ARGSUSED*/
1724 void
1725 sock_kstat_fini(zoneid_t zoneid, void *arg)
1726 {
1727         kstat_t *ksp = (kstat_t *)arg;
1728 
1729         if (ksp != NULL) {
1730                 ASSERT(zoneid == (zoneid_t)(uintptr_t)ksp->ks_private);
1731                 kstat_delete(ksp);
1732         }
1733 }
1734 
1735 /*
1736  * Zones:
1737  * Note that nactive is going to be different for each zone.
1738  * This means we require kstat to call sockfs_update and then sockfs_snapshot
1739  * for the same zone, or sockfs_snapshot will be taken into the wrong size
1740  * buffer. This is safe, but if the buffer is too small, user will not be
1741  * given details of all sockets. However, as this kstat has a ks_lock, kstat
1742  * driver will keep it locked between the update and the snapshot, so no
1743  * other process (zone) can currently get inbetween resulting in a wrong size
1744  * buffer allocation.
1745  */
1746 static int
1747 sockfs_update(kstat_t *ksp, int rw)
1748 {
1749         uint_t  nactive = 0;            /* # of active AF_UNIX sockets  */
1750         struct sonode   *so;            /* current sonode on socklist   */
1751         zoneid_t        myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private;
1752 
1753         ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid());
1754 
1755         if (rw == KSTAT_WRITE) {        /* bounce all writes            */
1756                 return (EACCES);
1757         }
1758 
1759         for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) {
1760                 if (so->so_count != 0 && so->so_zoneid == myzoneid) {
1761                         nactive++;
1762                 }
1763         }
1764         ksp->ks_ndata = nactive;
1765         ksp->ks_data_size = nactive * sizeof (struct k_sockinfo);
1766 
1767         return (0);
1768 }
1769 
1770 static int
1771 sockfs_snapshot(kstat_t *ksp, void *buf, int rw)
1772 {
1773         int                     ns;     /* # of sonodes we've copied    */
1774         struct sonode           *so;    /* current sonode on socklist   */
1775         struct k_sockinfo       *pksi;  /* where we put sockinfo data   */
1776         t_uscalar_t             sn_len; /* soa_len                      */
1777         zoneid_t                myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private;
1778         sotpi_info_t            *sti;
1779 
1780         ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid());
1781 
1782         ksp->ks_snaptime = gethrtime();
1783 
1784         if (rw == KSTAT_WRITE) {        /* bounce all writes            */
1785                 return (EACCES);
1786         }
1787 
1788         /*
1789          * for each sonode on the socklist, we massage the important
1790          * info into buf, in k_sockinfo format.
1791          */
1792         pksi = (struct k_sockinfo *)buf;
1793         ns = 0;
1794         for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) {
1795                 /* only stuff active sonodes and the same zone:         */
1796                 if (so->so_count == 0 || so->so_zoneid != myzoneid) {
1797                         continue;
1798                 }
1799 
1800                 /*
1801                  * If the sonode was activated between the update and the
1802                  * snapshot, we're done - as this is only a snapshot.
1803                  */
1804                 if ((caddr_t)(pksi) >= (caddr_t)buf + ksp->ks_data_size) {
1805                         break;
1806                 }
1807 
1808                 sti = SOTOTPI(so);
1809                 /* copy important info into buf:                        */
1810                 pksi->ks_si.si_size = sizeof (struct k_sockinfo);
1811                 pksi->ks_si.si_family = so->so_family;
1812                 pksi->ks_si.si_type = so->so_type;
1813                 pksi->ks_si.si_flag = so->so_flag;
1814                 pksi->ks_si.si_state = so->so_state;
1815                 pksi->ks_si.si_serv_type = sti->sti_serv_type;
1816                 pksi->ks_si.si_ux_laddr_sou_magic =
1817                     sti->sti_ux_laddr.soua_magic;
1818                 pksi->ks_si.si_ux_faddr_sou_magic =
1819                     sti->sti_ux_faddr.soua_magic;
1820                 pksi->ks_si.si_laddr_soa_len = sti->sti_laddr.soa_len;
1821                 pksi->ks_si.si_faddr_soa_len = sti->sti_faddr.soa_len;
1822                 pksi->ks_si.si_szoneid = so->so_zoneid;
1823                 pksi->ks_si.si_faddr_noxlate = sti->sti_faddr_noxlate;
1824 
1825                 mutex_enter(&so->so_lock);
1826 
1827                 if (sti->sti_laddr_sa != NULL) {
1828                         ASSERT(sti->sti_laddr_sa->sa_data != NULL);
1829                         sn_len = sti->sti_laddr_len;
1830                         ASSERT(sn_len <= sizeof (short) +
1831                             sizeof (pksi->ks_si.si_laddr_sun_path));
1832 
1833                         pksi->ks_si.si_laddr_family =
1834                             sti->sti_laddr_sa->sa_family;
1835                         if (sn_len != 0) {
1836                                 /* AF_UNIX socket names are NULL terminated */
1837                                 (void) strncpy(pksi->ks_si.si_laddr_sun_path,
1838                                     sti->sti_laddr_sa->sa_data,
1839                                     sizeof (pksi->ks_si.si_laddr_sun_path));
1840                                 sn_len = strlen(pksi->ks_si.si_laddr_sun_path);
1841                         }
1842                         pksi->ks_si.si_laddr_sun_path[sn_len] = 0;
1843                 }
1844 
1845                 if (sti->sti_faddr_sa != NULL) {
1846                         ASSERT(sti->sti_faddr_sa->sa_data != NULL);
1847                         sn_len = sti->sti_faddr_len;
1848                         ASSERT(sn_len <= sizeof (short) +
1849                             sizeof (pksi->ks_si.si_faddr_sun_path));
1850 
1851                         pksi->ks_si.si_faddr_family =
1852                             sti->sti_faddr_sa->sa_family;
1853                         if (sn_len != 0) {
1854                                 (void) strncpy(pksi->ks_si.si_faddr_sun_path,
1855                                     sti->sti_faddr_sa->sa_data,
1856                                     sizeof (pksi->ks_si.si_faddr_sun_path));
1857                                 sn_len = strlen(pksi->ks_si.si_faddr_sun_path);
1858                         }
1859                         pksi->ks_si.si_faddr_sun_path[sn_len] = 0;
1860                 }
1861 
1862                 mutex_exit(&so->so_lock);
1863 
1864                 (void) sprintf(pksi->ks_straddr[0], "%p", (void *)so);
1865                 (void) sprintf(pksi->ks_straddr[1], "%p",
1866                     (void *)sti->sti_ux_laddr.soua_vp);
1867                 (void) sprintf(pksi->ks_straddr[2], "%p",
1868                     (void *)sti->sti_ux_faddr.soua_vp);
1869 
1870                 ns++;
1871                 pksi++;
1872         }
1873 
1874         ksp->ks_ndata = ns;
1875         return (0);
1876 }
1877 
1878 ssize_t
1879 soreadfile(file_t *fp, uchar_t *buf, u_offset_t fileoff, int *err, size_t size)
1880 {
1881         struct uio auio;
1882         struct iovec aiov[MSG_MAXIOVLEN];
1883         register vnode_t *vp;
1884         int ioflag, rwflag;
1885         ssize_t cnt;
1886         int error = 0;
1887         int iovcnt = 0;
1888         short fflag;
1889 
1890         vp = fp->f_vnode;
1891         fflag = fp->f_flag;
1892 
1893         rwflag = 0;
1894         aiov[0].iov_base = (caddr_t)buf;
1895         aiov[0].iov_len = size;
1896         iovcnt = 1;
1897         cnt = (ssize_t)size;
1898         (void) VOP_RWLOCK(vp, rwflag, NULL);
1899 
1900         auio.uio_loffset = fileoff;
1901         auio.uio_iov = aiov;
1902         auio.uio_iovcnt = iovcnt;
1903         auio.uio_resid = cnt;
1904         auio.uio_segflg = UIO_SYSSPACE;
1905         auio.uio_llimit = MAXOFFSET_T;
1906         auio.uio_fmode = fflag;
1907         auio.uio_extflg = UIO_COPY_CACHED;
1908 
1909         ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1910 
1911         /* If read sync is not asked for, filter sync flags */
1912         if ((ioflag & FRSYNC) == 0)
1913                 ioflag &= ~(FSYNC|FDSYNC);
1914         error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
1915         cnt -= auio.uio_resid;
1916 
1917         VOP_RWUNLOCK(vp, rwflag, NULL);
1918 
1919         if (error == EINTR && cnt != 0)
1920                 error = 0;
1921 out:
1922         if (error != 0) {
1923                 *err = error;
1924                 return (0);
1925         } else {
1926                 *err = 0;
1927                 return (cnt);
1928         }
1929 }
1930 
/*
 * Copy size bytes from `from' to `to'.
 *
 * When fromkernel is non-zero the copy is a plain bcopy() and 0 is
 * returned; otherwise the copy is delegated to xcopyin() and its return
 * value is passed back to the caller.
 */
int
so_copyin(const void *from, void *to, size_t size, int fromkernel)
{
        if (!fromkernel)
                return (xcopyin(from, to, size));

        bcopy(from, to, size);
        return (0);
}
1940 
/*
 * Copy size bytes from `from' to `to'.
 *
 * When tokernel is non-zero the copy is a plain bcopy() and 0 is returned;
 * otherwise the copy is delegated to xcopyout() and its return value is
 * passed back to the caller.
 */
int
so_copyout(const void *from, void *to, size_t size, int tokernel)
{
        if (!tokernel)
                return (xcopyout(from, to, size));

        bcopy(from, to, size);
        return (0);
}