1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  25  * Copyright 2019 OmniOS Community Edition (OmniOSce) Association.
  26  */
  27 
  28 #include <sys/types.h>
  29 #include <sys/t_lock.h>
  30 #include <sys/param.h>
  31 #include <sys/systm.h>
  32 #include <sys/buf.h>
  33 #include <sys/conf.h>
  34 #include <sys/cred.h>
  35 #include <sys/kmem.h>
  36 #include <sys/sysmacros.h>
  37 #include <sys/vfs.h>
  38 #include <sys/vfs_opreg.h>
  39 #include <sys/vnode.h>
  40 #include <sys/debug.h>
  41 #include <sys/errno.h>
  42 #include <sys/time.h>
  43 #include <sys/file.h>
  44 #include <sys/open.h>
  45 #include <sys/user.h>
  46 #include <sys/termios.h>
  47 #include <sys/stream.h>
  48 #include <sys/strsubr.h>
  49 #include <sys/strsun.h>
  50 #include <sys/esunddi.h>
  51 #include <sys/flock.h>
  52 #include <sys/modctl.h>
  53 #include <sys/cmn_err.h>
  54 #include <sys/mkdev.h>
  55 #include <sys/pathname.h>
  56 #include <sys/ddi.h>
  57 #include <sys/stat.h>
  58 #include <sys/fs/snode.h>
  59 #include <sys/fs/dv_node.h>
  60 #include <sys/zone.h>
  61 
  62 #include <sys/socket.h>
  63 #include <sys/socketvar.h>
  64 #include <netinet/in.h>
  65 #include <sys/un.h>
  66 #include <sys/ucred.h>
  67 
  68 #include <sys/tiuser.h>
  69 #define _SUN_TPI_VERSION        2
  70 #include <sys/tihdr.h>
  71 
  72 #include <c2/audit.h>
  73 
  74 #include <fs/sockfs/nl7c.h>
  75 #include <fs/sockfs/sockcommon.h>
  76 #include <fs/sockfs/sockfilter_impl.h>
  77 #include <fs/sockfs/socktpi.h>
  78 #include <fs/sockfs/socktpi_impl.h>
  79 #include <fs/sockfs/sodirect.h>
  80 
/*
 * Macros that operate on struct cmsghdr.
 *
 * CMSG_CONTENT(cmsg)     - address immediately following the header, i.e.
 *                          where the control message payload starts.
 * CMSG_CONTENTLEN(cmsg)  - cmsg_len minus the size of the header.
 * CMSG_VALID(cmsg, start, end)
 *                        - non-zero when cmsg is aligned for a cmsghdr,
 *                          lies within [start, end), has a cmsg_len that
 *                          covers at least the header, and header plus
 *                          payload end at or before end.
 *
 * The CMSG_VALID macro does not assume that the last option buffer is padded.
 *
 * All three macros evaluate their cmsg argument more than once (or could,
 * in the case of CMSG_VALID), so arguments with side effects must be
 * avoided.
 *
 * SO_LOCK_WAKEUP_TIME is the timeout value handed to cv_wait_stop() by
 * so_lock_single() and so_lock_read().
 */
#define CMSG_CONTENT(cmsg)      (&((cmsg)[1]))
#define CMSG_CONTENTLEN(cmsg)   ((cmsg)->cmsg_len - sizeof (struct cmsghdr))
#define CMSG_VALID(cmsg, start, end)                                    \
        (ISALIGNED_cmsghdr(cmsg) &&                                     \
        ((uintptr_t)(cmsg) >= (uintptr_t)(start)) &&                 \
        ((uintptr_t)(cmsg) < (uintptr_t)(end)) &&                    \
        ((ssize_t)(cmsg)->cmsg_len >= sizeof (struct cmsghdr)) && \
        ((uintptr_t)(cmsg) + (cmsg)->cmsg_len <= (uintptr_t)(end)))
#define SO_LOCK_WAKEUP_TIME     3000    /* Wakeup time in milliseconds */
  94 
/* Device number built by sockinit() with makedevice(). */
dev_t sockdev;  /* For fsid in getattr */

/*
 * Set to 1 by sockinit() when modrootloaded is zero; in that case
 * sockinit() does not call nl7c_init() itself.
 */
int sockfs_defer_nl7c_init = 0;

/* Global socket list; its sl_lock mutex is initialized in sockinit(). */
struct socklist socklist;

/* kmem cache of struct sonode objects, created in sockinit(). */
struct kmem_cache *socket_cache;

/*
 * sockconf_lock protects the socket configuration (socket types and
 * socket filters) which is changed via the sockconfig system call.
 * The lock itself is initialized in sockinit().
 */
krwlock_t sockconf_lock;

/* kstat callbacks; defined elsewhere in this file. */
static int sockfs_update(kstat_t *, int);
static int sockfs_snapshot(kstat_t *, void *, int);

/* Produces the smod_info_t that sockinit() registers via smod_add(). */
extern smod_info_t *sotpi_smod_create(void);

/* Called once from sockinit(). */
extern void sendfile_init();

/* Called from sockinit() only when modrootloaded is non-zero. */
extern void nl7c_init(void);

/* Tested by sockinit() to decide whether nl7c_init() must be deferred. */
extern int modrootloaded;
 117 
 118 /*
 119  * Translate from a device pathname (e.g. "/dev/tcp") to a vnode.
 120  * Returns with the vnode held.
 121  */
 122 int
 123 sogetvp(char *devpath, vnode_t **vpp, int uioflag)
 124 {
 125         struct snode *csp;
 126         vnode_t *vp, *dvp;
 127         major_t maj;
 128         int error;
 129 
 130         ASSERT(uioflag == UIO_SYSSPACE || uioflag == UIO_USERSPACE);
 131 
 132         /*
 133          * Lookup the underlying filesystem vnode.
 134          */
 135         error = lookupname(devpath, uioflag, FOLLOW, NULLVPP, &vp);
 136         if (error)
 137                 return (error);
 138 
 139         /* Check that it is the correct vnode */
 140         if (vp->v_type != VCHR) {
 141                 VN_RELE(vp);
 142                 return (ENOTSOCK);
 143         }
 144 
 145         /*
 146          * If devpath went through devfs, the device should already
 147          * be configured. If devpath is a mknod file, however, we
 148          * need to make sure the device is properly configured.
 149          * To do this, we do something similar to spec_open()
 150          * except that we resolve to the minor/leaf level since
 151          * we need to return a vnode.
 152          */
 153         csp = VTOS(VTOS(vp)->s_commonvp);
 154         if (!(csp->s_flag & SDIPSET)) {
 155                 char *pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 156                 error = ddi_dev_pathname(vp->v_rdev, S_IFCHR, pathname);
 157                 if (error == 0)
 158                         error = devfs_lookupname(pathname, NULLVPP, &dvp);
 159                 VN_RELE(vp);
 160                 kmem_free(pathname, MAXPATHLEN);
 161                 if (error != 0)
 162                         return (ENXIO);
 163                 vp = dvp;       /* use the devfs vp */
 164         }
 165 
 166         /* device is configured at this point */
 167         maj = getmajor(vp->v_rdev);
 168         if (!STREAMSTAB(maj)) {
 169                 VN_RELE(vp);
 170                 return (ENOSTR);
 171         }
 172 
 173         *vpp = vp;
 174         return (0);
 175 }
 176 
 177 /*
 178  * Update the accessed, updated, or changed times in an sonode
 179  * with the current time.
 180  *
 181  * Note that both SunOS 4.X and 4.4BSD sockets do not present reasonable
 182  * attributes in a fstat call. (They return the current time and 0 for
 183  * all timestamps, respectively.) We maintain the current timestamps
 184  * here primarily so that should sockmod be popped the resulting
 185  * file descriptor will behave like a stream w.r.t. the timestamps.
 186  */
 187 void
 188 so_update_attrs(struct sonode *so, int flag)
 189 {
 190         time_t now = gethrestime_sec();
 191 
 192         if (SOCK_IS_NONSTR(so))
 193                 return;
 194 
 195         mutex_enter(&so->so_lock);
 196         so->so_flag |= flag;
 197         if (flag & SOACC)
 198                 SOTOTPI(so)->sti_atime = now;
 199         if (flag & SOMOD)
 200                 SOTOTPI(so)->sti_mtime = now;
 201         mutex_exit(&so->so_lock);
 202 }
 203 
 204 extern so_create_func_t sock_comm_create_function;
 205 extern so_destroy_func_t sock_comm_destroy_function;
 206 /*
 207  * Init function called when sockfs is loaded.
 208  */
 209 int
 210 sockinit(int fstype, char *name)
 211 {
 212         static const fs_operation_def_t sock_vfsops_template[] = {
 213                 NULL, NULL
 214         };
 215         int error;
 216         major_t dev;
 217         char *err_str;
 218 
 219         error = vfs_setfsops(fstype, sock_vfsops_template, NULL);
 220         if (error != 0) {
 221                 zcmn_err(GLOBAL_ZONEID, CE_WARN,
 222                     "sockinit: bad vfs ops template");
 223                 return (error);
 224         }
 225 
 226         error = vn_make_ops(name, socket_vnodeops_template,
 227             &socket_vnodeops);
 228         if (error != 0) {
 229                 err_str = "sockinit: bad socket vnode ops template";
 230                 /* vn_make_ops() does not reset socktpi_vnodeops on failure. */
 231                 socket_vnodeops = NULL;
 232                 goto failure;
 233         }
 234 
 235         socket_cache = kmem_cache_create("socket_cache",
 236             sizeof (struct sonode), 0, sonode_constructor,
 237             sonode_destructor, NULL, NULL, NULL, 0);
 238 
 239         rw_init(&sockconf_lock, NULL, RW_DEFAULT, NULL);
 240 
 241         error = socktpi_init();
 242         if (error != 0) {
 243                 err_str = NULL;
 244                 goto failure;
 245         }
 246 
 247         error = sod_init();
 248         if (error != 0) {
 249                 err_str = NULL;
 250                 goto failure;
 251         }
 252 
 253         /*
 254          * Set up the default create and destroy functions
 255          */
 256         sock_comm_create_function = socket_sonode_create;
 257         sock_comm_destroy_function = socket_sonode_destroy;
 258 
 259         /*
 260          * Build initial list mapping socket parameters to vnode.
 261          */
 262         smod_init();
 263         smod_add(sotpi_smod_create());
 264 
 265         sockparams_init();
 266 
 267         /*
 268          * If sockets are needed before init runs /sbin/soconfig
 269          * it is possible to preload the sockparams list here using
 270          * calls like:
 271          *      sockconfig(1,2,3, "/dev/tcp", 0);
 272          */
 273 
 274         /*
 275          * Create a unique dev_t for use in so_fsid.
 276          */
 277 
 278         if ((dev = getudev()) == (major_t)-1)
 279                 dev = 0;
 280         sockdev = makedevice(dev, 0);
 281 
 282         mutex_init(&socklist.sl_lock, NULL, MUTEX_DEFAULT, NULL);
 283         sendfile_init();
 284         if (!modrootloaded) {
 285                 sockfs_defer_nl7c_init = 1;
 286         } else {
 287                 nl7c_init();
 288         }
 289 
 290         /* Initialize socket filters */
 291         sof_init();
 292 
 293         return (0);
 294 
 295 failure:
 296         (void) vfs_freevfsops_by_type(fstype);
 297         if (socket_vnodeops != NULL)
 298                 vn_freevnodeops(socket_vnodeops);
 299         if (err_str != NULL)
 300                 zcmn_err(GLOBAL_ZONEID, CE_WARN, err_str);
 301         return (error);
 302 }
 303 
 304 /*
 305  * Caller must hold the mutex. Used to set SOLOCKED.
 306  */
 307 void
 308 so_lock_single(struct sonode *so)
 309 {
 310         ASSERT(MUTEX_HELD(&so->so_lock));
 311 
 312         while (so->so_flag & (SOLOCKED | SOASYNC_UNBIND)) {
 313                 cv_wait_stop(&so->so_single_cv, &so->so_lock,
 314                     SO_LOCK_WAKEUP_TIME);
 315         }
 316         so->so_flag |= SOLOCKED;
 317 }
 318 
 319 /*
 320  * Caller must hold the mutex and pass in SOLOCKED or SOASYNC_UNBIND.
 321  * Used to clear SOLOCKED or SOASYNC_UNBIND.
 322  */
 323 void
 324 so_unlock_single(struct sonode *so, int flag)
 325 {
 326         ASSERT(MUTEX_HELD(&so->so_lock));
 327         ASSERT(flag & (SOLOCKED|SOASYNC_UNBIND));
 328         ASSERT((flag & ~(SOLOCKED|SOASYNC_UNBIND)) == 0);
 329         ASSERT(so->so_flag & flag);
 330         /*
 331          * Process the T_DISCON_IND on sti_discon_ind_mp.
 332          *
 333          * Call to so_drain_discon_ind will result in so_lock
 334          * being dropped and re-acquired later.
 335          */
 336         if (!SOCK_IS_NONSTR(so)) {
 337                 sotpi_info_t *sti = SOTOTPI(so);
 338 
 339                 if (sti->sti_discon_ind_mp != NULL)
 340                         so_drain_discon_ind(so);
 341         }
 342 
 343         cv_signal(&so->so_single_cv);
 344         so->so_flag &= ~flag;
 345 }
 346 
 347 /*
 348  * Caller must hold the mutex. Used to set SOREADLOCKED.
 349  * If the caller wants nonblocking behavior it should set fmode.
 350  */
 351 int
 352 so_lock_read(struct sonode *so, int fmode)
 353 {
 354         ASSERT(MUTEX_HELD(&so->so_lock));
 355 
 356         while (so->so_flag & SOREADLOCKED) {
 357                 if (fmode & (FNDELAY|FNONBLOCK))
 358                         return (EWOULDBLOCK);
 359                 cv_wait_stop(&so->so_read_cv, &so->so_lock,
 360                     SO_LOCK_WAKEUP_TIME);
 361         }
 362         so->so_flag |= SOREADLOCKED;
 363         return (0);
 364 }
 365 
 366 /*
 367  * Like so_lock_read above but allows signals.
 368  */
 369 int
 370 so_lock_read_intr(struct sonode *so, int fmode)
 371 {
 372         ASSERT(MUTEX_HELD(&so->so_lock));
 373 
 374         while (so->so_flag & SOREADLOCKED) {
 375                 if (fmode & (FNDELAY|FNONBLOCK))
 376                         return (EWOULDBLOCK);
 377                 if (!cv_wait_sig(&so->so_read_cv, &so->so_lock))
 378                         return (EINTR);
 379         }
 380         so->so_flag |= SOREADLOCKED;
 381         return (0);
 382 }
 383 
 384 /*
 385  * Caller must hold the mutex. Used to clear SOREADLOCKED,
 386  * set in so_lock_read() or so_lock_read_intr().
 387  */
 388 void
 389 so_unlock_read(struct sonode *so)
 390 {
 391         ASSERT(MUTEX_HELD(&so->so_lock));
 392         ASSERT(so->so_flag & SOREADLOCKED);
 393 
 394         cv_signal(&so->so_read_cv);
 395         so->so_flag &= ~SOREADLOCKED;
 396 }
 397 
 398 /*
 399  * Verify that the specified offset falls within the mblk and
 400  * that the resulting pointer is aligned.
 401  * Returns NULL if not.
 402  */
 403 void *
 404 sogetoff(mblk_t *mp, t_uscalar_t offset,
 405     t_uscalar_t length, uint_t align_size)
 406 {
 407         uintptr_t ptr1, ptr2;
 408 
 409         ASSERT(mp && mp->b_wptr >= mp->b_rptr);
 410         ptr1 = (uintptr_t)mp->b_rptr + offset;
 411         ptr2 = (uintptr_t)ptr1 + length;
 412         if (ptr1 < (uintptr_t)mp->b_rptr || ptr2 > (uintptr_t)mp->b_wptr) {
 413                 eprintline(0);
 414                 return (NULL);
 415         }
 416         if ((ptr1 & (align_size - 1)) != 0) {
 417                 eprintline(0);
 418                 return (NULL);
 419         }
 420         return ((void *)ptr1);
 421 }
 422 
 423 /*
 424  * Return the AF_UNIX underlying filesystem vnode matching a given name.
 425  * Makes sure the sending and the destination sonodes are compatible.
 426  * The vnode is returned held.
 427  *
 428  * The underlying filesystem VSOCK vnode has a v_stream pointer that
 429  * references the actual stream head (hence indirectly the actual sonode).
 430  */
 431 static int
 432 so_ux_lookup(struct sonode *so, struct sockaddr_un *soun, int checkaccess,
 433     vnode_t **vpp)
 434 {
 435         vnode_t         *vp;    /* Underlying filesystem vnode */
 436         vnode_t         *rvp;   /* real vnode */
 437         vnode_t         *svp;   /* sockfs vnode */
 438         struct sonode   *so2;
 439         int             error;
 440 
 441         dprintso(so, 1, ("so_ux_lookup(%p) name <%s>\n", (void *)so,
 442             soun->sun_path));
 443 
 444         error = lookupname(soun->sun_path, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
 445         if (error) {
 446                 eprintsoline(so, error);
 447                 return (error);
 448         }
 449 
 450         /*
 451          * Traverse lofs mounts get the real vnode
 452          */
 453         if (VOP_REALVP(vp, &rvp, NULL) == 0) {
 454                 VN_HOLD(rvp);           /* hold the real vnode */
 455                 VN_RELE(vp);            /* release hold from lookup */
 456                 vp = rvp;
 457         }
 458 
 459         if (vp->v_type != VSOCK) {
 460                 error = ENOTSOCK;
 461                 eprintsoline(so, error);
 462                 goto done2;
 463         }
 464 
 465         if (checkaccess) {
 466                 /*
 467                  * Check that we have permissions to access the destination
 468                  * vnode. This check is not done in BSD but it is required
 469                  * by X/Open.
 470                  */
 471                 if (error = VOP_ACCESS(vp, VREAD|VWRITE, 0, CRED(), NULL)) {
 472                         eprintsoline(so, error);
 473                         goto done2;
 474                 }
 475         }
 476 
 477         /*
 478          * Check if the remote socket has been closed.
 479          *
 480          * Synchronize with vn_rele_stream by holding v_lock while traversing
 481          * v_stream->sd_vnode.
 482          */
 483         mutex_enter(&vp->v_lock);
 484         if (vp->v_stream == NULL) {
 485                 mutex_exit(&vp->v_lock);
 486                 if (so->so_type == SOCK_DGRAM)
 487                         error = EDESTADDRREQ;
 488                 else
 489                         error = ECONNREFUSED;
 490 
 491                 eprintsoline(so, error);
 492                 goto done2;
 493         }
 494         ASSERT(vp->v_stream->sd_vnode);
 495         svp = vp->v_stream->sd_vnode;
 496         /*
 497          * holding v_lock on underlying filesystem vnode and acquiring
 498          * it on sockfs vnode. Assumes that no code ever attempts to
 499          * acquire these locks in the reverse order.
 500          */
 501         VN_HOLD(svp);
 502         mutex_exit(&vp->v_lock);
 503 
 504         if (svp->v_type != VSOCK) {
 505                 error = ENOTSOCK;
 506                 eprintsoline(so, error);
 507                 goto done;
 508         }
 509 
 510         so2 = VTOSO(svp);
 511 
 512         if (so->so_type != so2->so_type) {
 513                 error = EPROTOTYPE;
 514                 eprintsoline(so, error);
 515                 goto done;
 516         }
 517 
 518         VN_RELE(svp);
 519         *vpp = vp;
 520         return (0);
 521 
 522 done:
 523         VN_RELE(svp);
 524 done2:
 525         VN_RELE(vp);
 526         return (error);
 527 }
 528 
 529 /*
 530  * Verify peer address for connect and sendto/sendmsg.
 531  * Since sendto/sendmsg would not get synchronous errors from the transport
 532  * provider we have to do these ugly checks in the socket layer to
 533  * preserve compatibility with SunOS 4.X.
 534  */
 535 int
 536 so_addr_verify(struct sonode *so, const struct sockaddr *name,
 537     socklen_t namelen)
 538 {
 539         int             family;
 540 
 541         dprintso(so, 1, ("so_addr_verify(%p, %p, %d)\n",
 542             (void *)so, (void *)name, namelen));
 543 
 544         ASSERT(name != NULL);
 545 
 546         family = so->so_family;
 547         switch (family) {
 548         case AF_INET:
 549                 if (name->sa_family != family) {
 550                         eprintsoline(so, EAFNOSUPPORT);
 551                         return (EAFNOSUPPORT);
 552                 }
 553                 if (namelen != (socklen_t)sizeof (struct sockaddr_in)) {
 554                         eprintsoline(so, EINVAL);
 555                         return (EINVAL);
 556                 }
 557                 break;
 558         case AF_INET6: {
 559 #ifdef DEBUG
 560                 struct sockaddr_in6 *sin6;
 561 #endif /* DEBUG */
 562 
 563                 if (name->sa_family != family) {
 564                         eprintsoline(so, EAFNOSUPPORT);
 565                         return (EAFNOSUPPORT);
 566                 }
 567                 if (namelen != (socklen_t)sizeof (struct sockaddr_in6)) {
 568                         eprintsoline(so, EINVAL);
 569                         return (EINVAL);
 570                 }
 571 #ifdef DEBUG
 572                 /* Verify that apps don't forget to clear sin6_scope_id etc */
 573                 sin6 = (struct sockaddr_in6 *)name;
 574                 if (sin6->sin6_scope_id != 0 &&
 575                     !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
 576                         zcmn_err(getzoneid(), CE_WARN,
 577                             "connect/send* with uninitialized sin6_scope_id "
 578                             "(%d) on socket. Pid = %d\n",
 579                             (int)sin6->sin6_scope_id, (int)curproc->p_pid);
 580                 }
 581 #endif /* DEBUG */
 582                 break;
 583         }
 584         case AF_UNIX:
 585                 if (SOTOTPI(so)->sti_faddr_noxlate) {
 586                         return (0);
 587                 }
 588                 if (namelen < (socklen_t)sizeof (short)) {
 589                         eprintsoline(so, ENOENT);
 590                         return (ENOENT);
 591                 }
 592                 if (name->sa_family != family) {
 593                         eprintsoline(so, EAFNOSUPPORT);
 594                         return (EAFNOSUPPORT);
 595                 }
 596                 /* MAXPATHLEN + soun_family + nul termination */
 597                 if (namelen > (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
 598                         eprintsoline(so, ENAMETOOLONG);
 599                         return (ENAMETOOLONG);
 600                 }
 601 
 602                 break;
 603 
 604         default:
 605                 /*
 606                  * Default is don't do any length or sa_family check
 607                  * to allow non-sockaddr style addresses.
 608                  */
 609                 break;
 610         }
 611 
 612         return (0);
 613 }
 614 
 615 
 616 /*
 617  * Translate an AF_UNIX sockaddr_un to the transport internal name.
 618  * Assumes caller has called so_addr_verify first.  The translated
 619  * (internal form) address is stored in sti->sti_ux_taddr.
 620  */
 621 /*ARGSUSED*/
 622 int
 623 so_ux_addr_xlate(struct sonode *so, struct sockaddr *name,
 624     socklen_t namelen, int checkaccess,
 625     void **addrp, socklen_t *addrlenp)
 626 {
 627         int                     error;
 628         struct sockaddr_un      *soun;
 629         vnode_t                 *vp;
 630         void                    *addr;
 631         socklen_t               addrlen;
 632         sotpi_info_t            *sti = SOTOTPI(so);
 633 
 634         dprintso(so, 1, ("so_ux_addr_xlate(%p, %p, %d, %d)\n",
 635             (void *)so, (void *)name, namelen, checkaccess));
 636 
 637         ASSERT(name != NULL);
 638         ASSERT(so->so_family == AF_UNIX);
 639         ASSERT(!sti->sti_faddr_noxlate);
 640         ASSERT(namelen >= (socklen_t)sizeof (short));
 641         ASSERT(name->sa_family == AF_UNIX);
 642         soun = (struct sockaddr_un *)name;
 643         /*
 644          * Lookup vnode for the specified path name and verify that
 645          * it is a socket.
 646          */
 647         error = so_ux_lookup(so, soun, checkaccess, &vp);
 648         if (error) {
 649                 eprintsoline(so, error);
 650                 return (error);
 651         }
 652         /*
 653          * Use the address of the peer vnode as the address to send
 654          * to. We release the peer vnode here. In case it has been
 655          * closed by the time the T_CONN_REQ or T_UNITDATA_REQ reaches the
 656          * transport the message will get an error or be dropped.
 657          * Note that that soua_vp is never dereferenced; it's just a
 658          * convenient value by which we can identify the peer.
 659          */
 660         sti->sti_ux_taddr.soua_vp = vp;
 661         sti->sti_ux_taddr.soua_magic = SOU_MAGIC_EXPLICIT;
 662         addr = &sti->sti_ux_taddr;
 663         addrlen = (socklen_t)sizeof (sti->sti_ux_taddr);
 664         dprintso(so, 1, ("ux_xlate UNIX: addrlen %d, vp %p\n",
 665             addrlen, (void *)vp));
 666         VN_RELE(vp);
 667         *addrp = addr;
 668         *addrlenp = (socklen_t)addrlen;
 669         return (0);
 670 }
 671 
 672 /*
 673  * Esballoc free function for messages that contain SO_FILEP option.
 674  * Decrement the reference count on the file pointers using closef.
 675  */
 676 void
 677 fdbuf_free(struct fdbuf *fdbuf)
 678 {
 679         int     i;
 680         struct file *fp;
 681 
 682         dprint(1, ("fdbuf_free: %d fds\n", fdbuf->fd_numfd));
 683         for (i = 0; i < fdbuf->fd_numfd; i++) {
 684                 /*
 685                  * We need pointer size alignment for fd_fds. On a LP64
 686                  * kernel, the required alignment is 8 bytes while
 687                  * the option headers and values are only 4 bytes
 688                  * aligned. So its safer to do a bcopy compared to
 689                  * assigning fdbuf->fd_fds[i] to fp.
 690                  */
 691                 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp));
 692                 dprint(1, ("fdbuf_free: [%d] = %p\n", i, (void *)fp));
 693                 (void) closef(fp);
 694         }
 695         if (fdbuf->fd_ebuf != NULL)
 696                 kmem_free(fdbuf->fd_ebuf, fdbuf->fd_ebuflen);
 697         kmem_free(fdbuf, fdbuf->fd_size);
 698 }
 699 
 700 /*
 701  * Allocate an esballoc'ed message for AF_UNIX file descriptor passing.
 702  * Waits if memory is not available.
 703  */
 704 mblk_t *
 705 fdbuf_allocmsg(int size, struct fdbuf *fdbuf)
 706 {
 707         uchar_t *buf;
 708         mblk_t  *mp;
 709 
 710         dprint(1, ("fdbuf_allocmsg: size %d, %d fds\n", size, fdbuf->fd_numfd));
 711         buf = kmem_alloc(size, KM_SLEEP);
 712         fdbuf->fd_ebuf = (caddr_t)buf;
 713         fdbuf->fd_ebuflen = size;
 714         fdbuf->fd_frtn.free_func = fdbuf_free;
 715         fdbuf->fd_frtn.free_arg = (caddr_t)fdbuf;
 716 
 717         mp = esballoc_wait(buf, size, BPRI_MED, &fdbuf->fd_frtn);
 718         mp->b_datap->db_type = M_PROTO;
 719         return (mp);
 720 }
 721 
 722 /*
 723  * Extract file descriptors from a fdbuf.
 724  * Return list in rights/rightslen.
 725  */
 726 /*ARGSUSED*/
 727 static int
 728 fdbuf_extract(struct fdbuf *fdbuf, void *rights, int rightslen)
 729 {
 730         int     i, fd;
 731         int     *rp;
 732         struct file *fp;
 733         int     numfd;
 734 
 735         dprint(1, ("fdbuf_extract: %d fds, len %d\n",
 736             fdbuf->fd_numfd, rightslen));
 737 
 738         numfd = fdbuf->fd_numfd;
 739         ASSERT(rightslen == numfd * (int)sizeof (int));
 740 
 741         /*
 742          * Allocate a file descriptor and increment the f_count.
 743          * The latter is needed since we always call fdbuf_free
 744          * which performs a closef.
 745          */
 746         rp = (int *)rights;
 747         for (i = 0; i < numfd; i++) {
 748                 if ((fd = ufalloc(0)) == -1)
 749                         goto cleanup;
 750                 /*
 751                  * We need pointer size alignment for fd_fds. On a LP64
 752                  * kernel, the required alignment is 8 bytes while
 753                  * the option headers and values are only 4 bytes
 754                  * aligned. So its safer to do a bcopy compared to
 755                  * assigning fdbuf->fd_fds[i] to fp.
 756                  */
 757                 bcopy((char *)&fdbuf->fd_fds[i], (char *)&fp, sizeof (fp));
 758                 mutex_enter(&fp->f_tlock);
 759                 fp->f_count++;
 760                 mutex_exit(&fp->f_tlock);
 761                 setf(fd, fp);
 762                 *rp++ = fd;
 763                 if (AU_AUDITING())
 764                         audit_fdrecv(fd, fp);
 765                 dprint(1, ("fdbuf_extract: [%d] = %d, %p refcnt %d\n",
 766                     i, fd, (void *)fp, fp->f_count));
 767         }
 768         return (0);
 769 
 770 cleanup:
 771         /*
 772          * Undo whatever partial work the loop above has done.
 773          */
 774         {
 775                 int j;
 776 
 777                 rp = (int *)rights;
 778                 for (j = 0; j < i; j++) {
 779                         dprint(0,
 780                             ("fdbuf_extract: cleanup[%d] = %d\n", j, *rp));
 781                         (void) closeandsetf(*rp++, NULL);
 782                 }
 783         }
 784 
 785         return (EMFILE);
 786 }
 787 
 788 /*
 789  * Insert file descriptors into an fdbuf.
 790  * Returns a kmem_alloc'ed fdbuf. The fdbuf should be freed
 791  * by calling fdbuf_free().
 792  */
 793 int
 794 fdbuf_create(void *rights, int rightslen, struct fdbuf **fdbufp)
 795 {
 796         int             numfd, i;
 797         int             *fds;
 798         struct file     *fp;
 799         struct fdbuf    *fdbuf;
 800         int             fdbufsize;
 801 
 802         dprint(1, ("fdbuf_create: len %d\n", rightslen));
 803 
 804         numfd = rightslen / (int)sizeof (int);
 805 
 806         fdbufsize = (int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *));
 807         fdbuf = kmem_alloc(fdbufsize, KM_SLEEP);
 808         fdbuf->fd_size = fdbufsize;
 809         fdbuf->fd_numfd = 0;
 810         fdbuf->fd_ebuf = NULL;
 811         fdbuf->fd_ebuflen = 0;
 812         fds = (int *)rights;
 813         for (i = 0; i < numfd; i++) {
 814                 if ((fp = getf(fds[i])) == NULL) {
 815                         fdbuf_free(fdbuf);
 816                         return (EBADF);
 817                 }
 818                 dprint(1, ("fdbuf_create: [%d] = %d, %p refcnt %d\n",
 819                     i, fds[i], (void *)fp, fp->f_count));
 820                 mutex_enter(&fp->f_tlock);
 821                 fp->f_count++;
 822                 mutex_exit(&fp->f_tlock);
 823                 /*
 824                  * The maximum alignment for fdbuf (or any option header
 825                  * and its value) it 4 bytes. On a LP64 kernel, the alignment
 826                  * is not sufficient for pointers (fd_fds in this case). Since
 827                  * we just did a kmem_alloc (we get a double word alignment),
 828                  * we don't need to do anything on the send side (we loose
 829                  * the double word alignment because fdbuf goes after an
 830                  * option header (eg T_unitdata_req) which is only 4 byte
 831                  * aligned). We take care of this when we extract the file
 832                  * descriptor in fdbuf_extract or fdbuf_free.
 833                  */
 834                 fdbuf->fd_fds[i] = fp;
 835                 fdbuf->fd_numfd++;
 836                 releasef(fds[i]);
 837                 if (AU_AUDITING())
 838                         audit_fdsend(fds[i], fp, 0);
 839         }
 840         *fdbufp = fdbuf;
 841         return (0);
 842 }
 843 
 844 static int
 845 fdbuf_optlen(int rightslen)
 846 {
 847         int numfd;
 848 
 849         numfd = rightslen / (int)sizeof (int);
 850 
 851         return ((int)FDBUF_HDRSIZE + (numfd * (int)sizeof (struct file *)));
 852 }
 853 
/*
 * Inverse of fdbuf_optlen(): given the length of an fdbuf, return the
 * number of bytes of int file descriptors it corresponds to, i.e. the
 * number of file pointers following the header times sizeof (int).
 *
 * NOTE(review): the subtraction is not guarded against fdbuflen being
 * smaller than FDBUF_HDRSIZE; presumably callers validate the length
 * first (e.g. with fdbuf_verify()) - confirm.
 */
static t_uscalar_t
fdbuf_cmsglen(int fdbuflen)
{
        return (t_uscalar_t)((fdbuflen - FDBUF_HDRSIZE) /
            (int)sizeof (struct file *) * (int)sizeof (int));
}
 860 
 861 
 862 /*
 863  * Return non-zero if the mblk and fdbuf are consistent.
 864  */
 865 static int
 866 fdbuf_verify(mblk_t *mp, struct fdbuf *fdbuf, int fdbuflen)
 867 {
 868         if (fdbuflen >= FDBUF_HDRSIZE &&
 869             fdbuflen == fdbuf->fd_size) {
 870                 frtn_t *frp = mp->b_datap->db_frtnp;
 871                 /*
 872                  * Check that the SO_FILEP portion of the
 873                  * message has not been modified by
 874                  * the loopback transport. The sending sockfs generates
 875                  * a message that is esballoc'ed with the free function
 876                  * being fdbuf_free() and where free_arg contains the
 877                  * identical information as the SO_FILEP content.
 878                  *
 879                  * If any of these constraints are not satisfied we
 880                  * silently ignore the option.
 881                  */
 882                 ASSERT(mp);
 883                 if (frp != NULL &&
 884                     frp->free_func == fdbuf_free &&
 885                     frp->free_arg != NULL &&
 886                     bcmp(frp->free_arg, fdbuf, fdbuflen) == 0) {
 887                         dprint(1, ("fdbuf_verify: fdbuf %p len %d\n",
 888                             (void *)fdbuf, fdbuflen));
 889                         return (1);
 890                 } else {
 891                         zcmn_err(getzoneid(), CE_WARN,
 892                             "sockfs: mismatched fdbuf content (%p)",
 893                             (void *)mp);
 894                         return (0);
 895                 }
 896         } else {
 897                 zcmn_err(getzoneid(), CE_WARN,
 898                     "sockfs: mismatched fdbuf len %d, %d\n",
 899                     fdbuflen, fdbuf->fd_size);
 900                 return (0);
 901         }
 902 }
 903 
 904 /*
 905  * When the file descriptors returned by sorecvmsg can not be passed
 906  * to the application this routine will cleanup the references on
 907  * the files. Start at startoff bytes into the buffer.
 908  */
 909 static void
 910 close_fds(void *fdbuf, int fdbuflen, int startoff)
 911 {
 912         int *fds = (int *)fdbuf;
 913         int numfd = fdbuflen / (int)sizeof (int);
 914         int i;
 915 
 916         dprint(1, ("close_fds(%p, %d, %d)\n", fdbuf, fdbuflen, startoff));
 917 
 918         for (i = 0; i < numfd; i++) {
 919                 if (startoff < 0)
 920                         startoff = 0;
 921                 if (startoff < (int)sizeof (int)) {
 922                         /*
 923                          * This file descriptor is partially or fully after
 924                          * the offset
 925                          */
 926                         dprint(0,
 927                             ("close_fds: cleanup[%d] = %d\n", i, fds[i]));
 928                         (void) closeandsetf(fds[i], NULL);
 929                 }
 930                 startoff -= (int)sizeof (int);
 931         }
 932 }
 933 
 934 /*
 935  * Close all file descriptors contained in the control part starting at
 936  * the startoffset.
 937  */
 938 void
 939 so_closefds(void *control, t_uscalar_t controllen, int oldflg,
 940     int startoff)
 941 {
 942         struct cmsghdr *cmsg;
 943 
 944         if (control == NULL)
 945                 return;
 946 
 947         if (oldflg) {
 948                 close_fds(control, controllen, startoff);
 949                 return;
 950         }
 951         /* Scan control part for file descriptors. */
 952         for (cmsg = (struct cmsghdr *)control;
 953             CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
 954             cmsg = CMSG_NEXT(cmsg)) {
 955                 if (cmsg->cmsg_level == SOL_SOCKET &&
 956                     cmsg->cmsg_type == SCM_RIGHTS) {
 957                         close_fds(CMSG_CONTENT(cmsg),
 958                             (int)CMSG_CONTENTLEN(cmsg),
 959                             startoff - (int)sizeof (struct cmsghdr));
 960                 }
 961                 startoff -= cmsg->cmsg_len;
 962         }
 963 }
 964 
 965 /*
 966  * Returns a pointer/length for the file descriptors contained
 967  * in the control buffer. Returns with *fdlenp == -1 if there are no
 968  * file descriptor options present. This is different than there being
 969  * a zero-length file descriptor option.
 970  * Fail if there are multiple SCM_RIGHT cmsgs.
 971  */
 972 int
 973 so_getfdopt(void *control, t_uscalar_t controllen, int oldflg,
 974     void **fdsp, int *fdlenp)
 975 {
 976         struct cmsghdr *cmsg;
 977         void *fds;
 978         int fdlen;
 979 
 980         if (control == NULL) {
 981                 *fdsp = NULL;
 982                 *fdlenp = -1;
 983                 return (0);
 984         }
 985 
 986         if (oldflg) {
 987                 *fdsp = control;
 988                 if (controllen == 0)
 989                         *fdlenp = -1;
 990                 else
 991                         *fdlenp = controllen;
 992                 dprint(1, ("so_getfdopt: old %d\n", *fdlenp));
 993                 return (0);
 994         }
 995 
 996         fds = NULL;
 997         fdlen = 0;
 998 
 999         for (cmsg = (struct cmsghdr *)control;
1000             CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
1001             cmsg = CMSG_NEXT(cmsg)) {
1002                 if (cmsg->cmsg_level == SOL_SOCKET &&
1003                     cmsg->cmsg_type == SCM_RIGHTS) {
1004                         if (fds != NULL)
1005                                 return (EINVAL);
1006                         fds = CMSG_CONTENT(cmsg);
1007                         fdlen = (int)CMSG_CONTENTLEN(cmsg);
1008                         dprint(1, ("so_getfdopt: new %lu\n",
1009                             (size_t)CMSG_CONTENTLEN(cmsg)));
1010                 }
1011         }
1012         if (fds == NULL) {
1013                 dprint(1, ("so_getfdopt: NONE\n"));
1014                 *fdlenp = -1;
1015         } else
1016                 *fdlenp = fdlen;
1017         *fdsp = fds;
1018         return (0);
1019 }
1020 
1021 /*
1022  * Return the length of the options including any file descriptor options.
1023  */
1024 t_uscalar_t
1025 so_optlen(void *control, t_uscalar_t controllen, int oldflg)
1026 {
1027         struct cmsghdr *cmsg;
1028         t_uscalar_t optlen = 0;
1029         t_uscalar_t len;
1030 
1031         if (control == NULL)
1032                 return (0);
1033 
1034         if (oldflg)
1035                 return ((t_uscalar_t)(sizeof (struct T_opthdr) +
1036                     fdbuf_optlen(controllen)));
1037 
1038         for (cmsg = (struct cmsghdr *)control;
1039             CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
1040             cmsg = CMSG_NEXT(cmsg)) {
1041                 if (cmsg->cmsg_level == SOL_SOCKET &&
1042                     cmsg->cmsg_type == SCM_RIGHTS) {
1043                         len = fdbuf_optlen((int)CMSG_CONTENTLEN(cmsg));
1044                 } else {
1045                         len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg);
1046                 }
1047                 optlen += (t_uscalar_t)(_TPI_ALIGN_TOPT(len) +
1048                     sizeof (struct T_opthdr));
1049         }
1050         dprint(1, ("so_optlen: controllen %d, flg %d -> optlen %d\n",
1051             controllen, oldflg, optlen));
1052         return (optlen);
1053 }
1054 
1055 /*
1056  * Copy options from control to the mblk. Skip any file descriptor options.
1057  */
1058 void
1059 so_cmsg2opt(void *control, t_uscalar_t controllen, int oldflg, mblk_t *mp)
1060 {
1061         struct T_opthdr toh;
1062         struct cmsghdr *cmsg;
1063 
1064         if (control == NULL)
1065                 return;
1066 
1067         if (oldflg) {
1068                 /* No real options - caller has handled file descriptors */
1069                 return;
1070         }
1071         for (cmsg = (struct cmsghdr *)control;
1072             CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
1073             cmsg = CMSG_NEXT(cmsg)) {
1074                 /*
1075                  * Note: The caller handles file descriptors prior
1076                  * to calling this function.
1077                  */
1078                 t_uscalar_t len;
1079 
1080                 if (cmsg->cmsg_level == SOL_SOCKET &&
1081                     cmsg->cmsg_type == SCM_RIGHTS)
1082                         continue;
1083 
1084                 len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg);
1085                 toh.level = cmsg->cmsg_level;
1086                 toh.name = cmsg->cmsg_type;
1087                 toh.len = len + (t_uscalar_t)sizeof (struct T_opthdr);
1088                 toh.status = 0;
1089 
1090                 soappendmsg(mp, &toh, sizeof (toh));
1091                 soappendmsg(mp, CMSG_CONTENT(cmsg), len);
1092                 mp->b_wptr += _TPI_ALIGN_TOPT(len) - len;
1093                 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
1094         }
1095 }
1096 
1097 /*
1098  * Return the length of the control message derived from the options.
1099  * Exclude SO_SRCADDR and SO_UNIX_CLOSE options. Include SO_FILEP.
1100  * When oldflg is set only include SO_FILEP.
1101  * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen
1102  * allocates the space that so_opt2cmsg fills. If one changes, the other should
1103  * also be checked for any possible impacts.
1104  */
1105 t_uscalar_t
1106 so_cmsglen(mblk_t *mp, void *opt, t_uscalar_t optlen, int oldflg)
1107 {
1108         t_uscalar_t cmsglen = 0;
1109         struct T_opthdr *tohp;
1110         t_uscalar_t len;
1111         t_uscalar_t last_roundup = 0;
1112 
1113         ASSERT(__TPI_TOPT_ISALIGNED(opt));
1114 
1115         for (tohp = (struct T_opthdr *)opt;
1116             tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
1117             tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
1118                 dprint(1, ("so_cmsglen: level 0x%x, name %d, len %d\n",
1119                     tohp->level, tohp->name, tohp->len));
1120                 if (tohp->level == SOL_SOCKET &&
1121                     (tohp->name == SO_SRCADDR ||
1122                     tohp->name == SO_UNIX_CLOSE)) {
1123                         continue;
1124                 }
1125                 if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) {
1126                         struct fdbuf *fdbuf;
1127                         int fdbuflen;
1128 
1129                         fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp);
1130                         fdbuflen = (int)_TPI_TOPT_DATALEN(tohp);
1131 
1132                         if (!fdbuf_verify(mp, fdbuf, fdbuflen))
1133                                 continue;
1134                         if (oldflg) {
1135                                 cmsglen += fdbuf_cmsglen(fdbuflen);
1136                                 continue;
1137                         }
1138                         len = fdbuf_cmsglen(fdbuflen);
1139                 } else if (tohp->level == SOL_SOCKET &&
1140                     tohp->name == SCM_TIMESTAMP) {
1141                         if (oldflg)
1142                                 continue;
1143 
1144                         if (get_udatamodel() == DATAMODEL_NATIVE) {
1145                                 len = sizeof (struct timeval);
1146                         } else {
1147                                 len = sizeof (struct timeval32);
1148                         }
1149                 } else {
1150                         if (oldflg)
1151                                 continue;
1152                         len = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp);
1153                 }
1154                 /*
1155                  * Exclude roundup for last option to not set
1156                  * MSG_CTRUNC when the cmsg fits but the padding doesn't fit.
1157                  */
1158                 last_roundup = (t_uscalar_t)
1159                     (ROUNDUP_cmsglen(len + (int)sizeof (struct cmsghdr)) -
1160                     (len + (int)sizeof (struct cmsghdr)));
1161                 cmsglen += (t_uscalar_t)(len + (int)sizeof (struct cmsghdr)) +
1162                     last_roundup;
1163         }
1164         cmsglen -= last_roundup;
1165         dprint(1, ("so_cmsglen: optlen %d, flg %d -> cmsglen %d\n",
1166             optlen, oldflg, cmsglen));
1167         return (cmsglen);
1168 }
1169 
1170 /*
1171  * Copy options from options to the control. Convert SO_FILEP to
1172  * file descriptors.
1173  * Returns errno or zero.
1174  * so_opt2cmsg and so_cmsglen are inter-related since so_cmsglen
1175  * allocates the space that so_opt2cmsg fills. If one changes, the other should
1176  * also be checked for any possible impacts.
1177  */
1178 int
1179 so_opt2cmsg(mblk_t *mp, void *opt, t_uscalar_t optlen, int oldflg,
1180     void *control, t_uscalar_t controllen)
1181 {
1182         struct T_opthdr *tohp;
1183         struct cmsghdr *cmsg;
1184         struct fdbuf *fdbuf;
1185         int fdbuflen;
1186         int error;
1187 #if defined(DEBUG) || defined(__lint)
1188         struct cmsghdr *cend = (struct cmsghdr *)
1189             (((uint8_t *)control) + ROUNDUP_cmsglen(controllen));
1190 #endif
1191         cmsg = (struct cmsghdr *)control;
1192 
1193         ASSERT(__TPI_TOPT_ISALIGNED(opt));
1194 
1195         for (tohp = (struct T_opthdr *)opt;
1196             tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
1197             tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
1198                 dprint(1, ("so_opt2cmsg: level 0x%x, name %d, len %d\n",
1199                     tohp->level, tohp->name, tohp->len));
1200 
1201                 if (tohp->level == SOL_SOCKET &&
1202                     (tohp->name == SO_SRCADDR ||
1203                     tohp->name == SO_UNIX_CLOSE)) {
1204                         continue;
1205                 }
1206                 ASSERT((uintptr_t)cmsg <= (uintptr_t)control + controllen);
1207                 if (tohp->level == SOL_SOCKET && tohp->name == SO_FILEP) {
1208                         fdbuf = (struct fdbuf *)_TPI_TOPT_DATA(tohp);
1209                         fdbuflen = (int)_TPI_TOPT_DATALEN(tohp);
1210 
1211                         if (!fdbuf_verify(mp, fdbuf, fdbuflen))
1212                                 return (EPROTO);
1213                         if (oldflg) {
1214                                 error = fdbuf_extract(fdbuf, control,
1215                                     (int)controllen);
1216                                 if (error != 0)
1217                                         return (error);
1218                                 continue;
1219                         } else {
1220                                 int fdlen;
1221 
1222                                 fdlen = (int)fdbuf_cmsglen(
1223                                     (int)_TPI_TOPT_DATALEN(tohp));
1224 
1225                                 cmsg->cmsg_level = tohp->level;
1226                                 cmsg->cmsg_type = SCM_RIGHTS;
1227                                 cmsg->cmsg_len = (socklen_t)(fdlen +
1228                                     sizeof (struct cmsghdr));
1229 
1230                                 error = fdbuf_extract(fdbuf,
1231                                     CMSG_CONTENT(cmsg), fdlen);
1232                                 if (error != 0)
1233                                         return (error);
1234                         }
1235                 } else if (tohp->level == SOL_SOCKET &&
1236                     tohp->name == SCM_TIMESTAMP) {
1237                         timestruc_t *timestamp;
1238 
1239                         if (oldflg)
1240                                 continue;
1241 
1242                         cmsg->cmsg_level = tohp->level;
1243                         cmsg->cmsg_type = tohp->name;
1244 
1245                         timestamp =
1246                             (timestruc_t *)P2ROUNDUP((intptr_t)&tohp[1],
1247                             sizeof (intptr_t));
1248 
1249                         if (get_udatamodel() == DATAMODEL_NATIVE) {
1250                                 struct timeval tv;
1251 
1252                                 cmsg->cmsg_len = sizeof (struct timeval) +
1253                                     sizeof (struct cmsghdr);
1254                                 tv.tv_sec = timestamp->tv_sec;
1255                                 tv.tv_usec = timestamp->tv_nsec /
1256                                     (NANOSEC / MICROSEC);
1257                                 /*
1258                                  * on LP64 systems, the struct timeval in
1259                                  * the destination will not be 8-byte aligned,
1260                                  * so use bcopy to avoid alignment trouble
1261                                  */
1262                                 bcopy(&tv, CMSG_CONTENT(cmsg), sizeof (tv));
1263                         } else {
1264                                 struct timeval32 *time32;
1265 
1266                                 cmsg->cmsg_len = sizeof (struct timeval32) +
1267                                     sizeof (struct cmsghdr);
1268                                 time32 = (struct timeval32 *)CMSG_CONTENT(cmsg);
1269                                 time32->tv_sec = (time32_t)timestamp->tv_sec;
1270                                 time32->tv_usec =
1271                                     (int32_t)(timestamp->tv_nsec /
1272                                     (NANOSEC / MICROSEC));
1273                         }
1274 
1275                 } else {
1276                         if (oldflg)
1277                                 continue;
1278 
1279                         cmsg->cmsg_level = tohp->level;
1280                         cmsg->cmsg_type = tohp->name;
1281                         cmsg->cmsg_len = (socklen_t)(_TPI_TOPT_DATALEN(tohp) +
1282                             sizeof (struct cmsghdr));
1283 
1284                         /* copy content to control data part */
1285                         bcopy(&tohp[1], CMSG_CONTENT(cmsg),
1286                             CMSG_CONTENTLEN(cmsg));
1287                 }
1288                 /* move to next CMSG structure! */
1289                 cmsg = CMSG_NEXT(cmsg);
1290         }
1291         dprint(1, ("so_opt2cmsg: buf %p len %d; cend %p; final cmsg %p\n",
1292             control, controllen, (void *)cend, (void *)cmsg));
1293         ASSERT(cmsg <= cend);
1294         return (0);
1295 }
1296 
1297 /*
1298  * Extract the SO_SRCADDR option value if present.
1299  */
1300 void
1301 so_getopt_srcaddr(void *opt, t_uscalar_t optlen, void **srcp,
1302     t_uscalar_t *srclenp)
1303 {
1304         struct T_opthdr         *tohp;
1305 
1306         ASSERT(__TPI_TOPT_ISALIGNED(opt));
1307 
1308         ASSERT(srcp != NULL && srclenp != NULL);
1309         *srcp = NULL;
1310         *srclenp = 0;
1311 
1312         for (tohp = (struct T_opthdr *)opt;
1313             tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
1314             tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
1315                 dprint(1, ("so_getopt_srcaddr: level 0x%x, name %d, len %d\n",
1316                     tohp->level, tohp->name, tohp->len));
1317                 if (tohp->level == SOL_SOCKET &&
1318                     tohp->name == SO_SRCADDR) {
1319                         *srcp = _TPI_TOPT_DATA(tohp);
1320                         *srclenp = (t_uscalar_t)_TPI_TOPT_DATALEN(tohp);
1321                 }
1322         }
1323 }
1324 
1325 /*
1326  * Verify if the SO_UNIX_CLOSE option is present.
1327  */
1328 int
1329 so_getopt_unix_close(void *opt, t_uscalar_t optlen)
1330 {
1331         struct T_opthdr         *tohp;
1332 
1333         ASSERT(__TPI_TOPT_ISALIGNED(opt));
1334 
1335         for (tohp = (struct T_opthdr *)opt;
1336             tohp && _TPI_TOPT_VALID(tohp, opt, (uintptr_t)opt + optlen);
1337             tohp = _TPI_TOPT_NEXTHDR(opt, optlen, tohp)) {
1338                 dprint(1,
1339                     ("so_getopt_unix_close: level 0x%x, name %d, len %d\n",
1340                     tohp->level, tohp->name, tohp->len));
1341                 if (tohp->level == SOL_SOCKET &&
1342                     tohp->name == SO_UNIX_CLOSE)
1343                         return (1);
1344         }
1345         return (0);
1346 }
1347 
1348 /*
1349  * Allocate an M_PROTO message.
1350  *
1351  * If allocation fails the behavior depends on sleepflg:
1352  *      _ALLOC_NOSLEEP  fail immediately
1353  *      _ALLOC_INTR     sleep for memory until a signal is caught
1354  *      _ALLOC_SLEEP    sleep forever. Don't return NULL.
1355  */
1356 mblk_t *
1357 soallocproto(size_t size, int sleepflg, cred_t *cr)
1358 {
1359         mblk_t  *mp;
1360 
1361         /* Round up size for reuse */
1362         size = MAX(size, 64);
1363         if (cr != NULL)
1364                 mp = allocb_cred(size, cr, curproc->p_pid);
1365         else
1366                 mp = allocb(size, BPRI_MED);
1367 
1368         if (mp == NULL) {
1369                 int error;      /* Dummy - error not returned to caller */
1370 
1371                 switch (sleepflg) {
1372                 case _ALLOC_SLEEP:
1373                         if (cr != NULL) {
1374                                 mp = allocb_cred_wait(size, STR_NOSIG, &error,
1375                                     cr, curproc->p_pid);
1376                         } else {
1377                                 mp = allocb_wait(size, BPRI_MED, STR_NOSIG,
1378                                     &error);
1379                         }
1380                         ASSERT(mp);
1381                         break;
1382                 case _ALLOC_INTR:
1383                         if (cr != NULL) {
1384                                 mp = allocb_cred_wait(size, 0, &error, cr,
1385                                     curproc->p_pid);
1386                         } else {
1387                                 mp = allocb_wait(size, BPRI_MED, 0, &error);
1388                         }
1389                         if (mp == NULL) {
1390                                 /* Caught signal while sleeping for memory */
1391                                 eprintline(ENOBUFS);
1392                                 return (NULL);
1393                         }
1394                         break;
1395                 case _ALLOC_NOSLEEP:
1396                 default:
1397                         eprintline(ENOBUFS);
1398                         return (NULL);
1399                 }
1400         }
1401         DB_TYPE(mp) = M_PROTO;
1402         return (mp);
1403 }
1404 
1405 /*
1406  * Allocate an M_PROTO message with a single component.
1407  * len is the length of buf. size is the amount to allocate.
1408  *
1409  * buf can be NULL with a non-zero len.
1410  * This results in a bzero'ed chunk being placed the message.
1411  */
1412 mblk_t *
1413 soallocproto1(const void *buf, ssize_t len, ssize_t size, int sleepflg,
1414     cred_t *cr)
1415 {
1416         mblk_t  *mp;
1417 
1418         if (size == 0)
1419                 size = len;
1420 
1421         ASSERT(size >= len);
1422         /* Round up size for reuse */
1423         size = MAX(size, 64);
1424         mp = soallocproto(size, sleepflg, cr);
1425         if (mp == NULL)
1426                 return (NULL);
1427         mp->b_datap->db_type = M_PROTO;
1428         if (len != 0) {
1429                 if (buf != NULL)
1430                         bcopy(buf, mp->b_wptr, len);
1431                 else
1432                         bzero(mp->b_wptr, len);
1433                 mp->b_wptr += len;
1434         }
1435         return (mp);
1436 }
1437 
1438 /*
1439  * Append buf/len to mp.
1440  * The caller has to ensure that there is enough room in the mblk.
1441  *
1442  * buf can be NULL with a non-zero len.
1443  * This results in a bzero'ed chunk being placed the message.
1444  */
1445 void
1446 soappendmsg(mblk_t *mp, const void *buf, ssize_t len)
1447 {
1448         ASSERT(mp);
1449 
1450         if (len != 0) {
1451                 /* Assert for room left */
1452                 ASSERT(mp->b_datap->db_lim - mp->b_wptr >= len);
1453                 if (buf != NULL)
1454                         bcopy(buf, mp->b_wptr, len);
1455                 else
1456                         bzero(mp->b_wptr, len);
1457         }
1458         mp->b_wptr += len;
1459 }
1460 
1461 /*
1462  * Create a message using two kernel buffers.
1463  * If size is set that will determine the allocation size (e.g. for future
1464  * soappendmsg calls). If size is zero it is derived from the buffer
1465  * lengths.
1466  */
1467 mblk_t *
1468 soallocproto2(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2,
1469     ssize_t size, int sleepflg, cred_t *cr)
1470 {
1471         mblk_t *mp;
1472 
1473         if (size == 0)
1474                 size = len1 + len2;
1475         ASSERT(size >= len1 + len2);
1476 
1477         mp = soallocproto1(buf1, len1, size, sleepflg, cr);
1478         if (mp)
1479                 soappendmsg(mp, buf2, len2);
1480         return (mp);
1481 }
1482 
1483 /*
1484  * Create a message using three kernel buffers.
1485  * If size is set that will determine the allocation size (for future
1486  * soappendmsg calls). If size is zero it is derived from the buffer
1487  * lengths.
1488  */
1489 mblk_t *
1490 soallocproto3(const void *buf1, ssize_t len1, const void *buf2, ssize_t len2,
1491     const void *buf3, ssize_t len3, ssize_t size, int sleepflg, cred_t *cr)
1492 {
1493         mblk_t *mp;
1494 
1495         if (size == 0)
1496                 size = len1 + len2 +len3;
1497         ASSERT(size >= len1 + len2 + len3);
1498 
1499         mp = soallocproto1(buf1, len1, size, sleepflg, cr);
1500         if (mp != NULL) {
1501                 soappendmsg(mp, buf2, len2);
1502                 soappendmsg(mp, buf3, len3);
1503         }
1504         return (mp);
1505 }
1506 
1507 #ifdef DEBUG
1508 char *
1509 pr_state(uint_t state, uint_t mode)
1510 {
1511         static char buf[1024];
1512 
1513         buf[0] = 0;
1514         if (state & SS_ISCONNECTED)
1515                 (void) strcat(buf, "ISCONNECTED ");
1516         if (state & SS_ISCONNECTING)
1517                 (void) strcat(buf, "ISCONNECTING ");
1518         if (state & SS_ISDISCONNECTING)
1519                 (void) strcat(buf, "ISDISCONNECTING ");
1520         if (state & SS_CANTSENDMORE)
1521                 (void) strcat(buf, "CANTSENDMORE ");
1522 
1523         if (state & SS_CANTRCVMORE)
1524                 (void) strcat(buf, "CANTRCVMORE ");
1525         if (state & SS_ISBOUND)
1526                 (void) strcat(buf, "ISBOUND ");
1527         if (state & SS_NDELAY)
1528                 (void) strcat(buf, "NDELAY ");
1529         if (state & SS_NONBLOCK)
1530                 (void) strcat(buf, "NONBLOCK ");
1531 
1532         if (state & SS_ASYNC)
1533                 (void) strcat(buf, "ASYNC ");
1534         if (state & SS_ACCEPTCONN)
1535                 (void) strcat(buf, "ACCEPTCONN ");
1536         if (state & SS_SAVEDEOR)
1537                 (void) strcat(buf, "SAVEDEOR ");
1538 
1539         if (state & SS_RCVATMARK)
1540                 (void) strcat(buf, "RCVATMARK ");
1541         if (state & SS_OOBPEND)
1542                 (void) strcat(buf, "OOBPEND ");
1543         if (state & SS_HAVEOOBDATA)
1544                 (void) strcat(buf, "HAVEOOBDATA ");
1545         if (state & SS_HADOOBDATA)
1546                 (void) strcat(buf, "HADOOBDATA ");
1547 
1548         if (mode & SM_PRIV)
1549                 (void) strcat(buf, "PRIV ");
1550         if (mode & SM_ATOMIC)
1551                 (void) strcat(buf, "ATOMIC ");
1552         if (mode & SM_ADDR)
1553                 (void) strcat(buf, "ADDR ");
1554         if (mode & SM_CONNREQUIRED)
1555                 (void) strcat(buf, "CONNREQUIRED ");
1556 
1557         if (mode & SM_FDPASSING)
1558                 (void) strcat(buf, "FDPASSING ");
1559         if (mode & SM_EXDATA)
1560                 (void) strcat(buf, "EXDATA ");
1561         if (mode & SM_OPTDATA)
1562                 (void) strcat(buf, "OPTDATA ");
1563         if (mode & SM_BYTESTREAM)
1564                 (void) strcat(buf, "BYTESTREAM ");
1565         return (buf);
1566 }
1567 
1568 char *
1569 pr_addr(int family, struct sockaddr *addr, t_uscalar_t addrlen)
1570 {
1571         static char buf[1024];
1572 
1573         if (addr == NULL || addrlen == 0) {
1574                 (void) sprintf(buf, "(len %d) %p", addrlen, (void *)addr);
1575                 return (buf);
1576         }
1577         switch (family) {
1578         case AF_INET: {
1579                 struct sockaddr_in sin;
1580 
1581                 bcopy(addr, &sin, sizeof (sin));
1582 
1583                 (void) sprintf(buf, "(len %d) %x/%d",
1584                     addrlen, ntohl(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1585                 break;
1586         }
1587         case AF_INET6: {
1588                 struct sockaddr_in6 sin6;
1589                 uint16_t *piece = (uint16_t *)&sin6.sin6_addr;
1590 
1591                 bcopy((char *)addr, (char *)&sin6, sizeof (sin6));
1592                 (void) sprintf(buf, "(len %d) %x:%x:%x:%x:%x:%x:%x:%x/%d",
1593                     addrlen,
1594                     ntohs(piece[0]), ntohs(piece[1]),
1595                     ntohs(piece[2]), ntohs(piece[3]),
1596                     ntohs(piece[4]), ntohs(piece[5]),
1597                     ntohs(piece[6]), ntohs(piece[7]),
1598                     ntohs(sin6.sin6_port));
1599                 break;
1600         }
1601         case AF_UNIX: {
1602                 struct sockaddr_un *soun = (struct sockaddr_un *)addr;
1603 
1604                 (void) sprintf(buf, "(len %d) %s", addrlen,
1605                     (soun == NULL) ? "(none)" : soun->sun_path);
1606                 break;
1607         }
1608         default:
1609                 (void) sprintf(buf, "(unknown af %d)", family);
1610                 break;
1611         }
1612         return (buf);
1613 }
1614 
1615 /* The logical equivalence operator (a if-and-only-if b) */
1616 #define EQUIVALENT(a, b)        (((a) && (b)) || (!(a) && (!(b))))
1617 
1618 /*
1619  * Verify limitations and invariants on oob state.
1620  * Return 1 if OK, otherwise 0 so that it can be used as
1621  *      ASSERT(verify_oobstate(so));
1622  */
1623 int
1624 so_verify_oobstate(struct sonode *so)
1625 {
1626         boolean_t havemark;
1627 
1628         ASSERT(MUTEX_HELD(&so->so_lock));
1629 
1630         /*
1631          * The possible state combinations are:
1632          *      0
1633          *      SS_OOBPEND
1634          *      SS_OOBPEND|SS_HAVEOOBDATA
1635          *      SS_OOBPEND|SS_HADOOBDATA
1636          *      SS_HADOOBDATA
1637          */
1638         switch (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA)) {
1639         case 0:
1640         case SS_OOBPEND:
1641         case SS_OOBPEND|SS_HAVEOOBDATA:
1642         case SS_OOBPEND|SS_HADOOBDATA:
1643         case SS_HADOOBDATA:
1644                 break;
1645         default:
1646                 printf("Bad oob state 1 (%p): state %s\n",
1647                     (void *)so, pr_state(so->so_state, so->so_mode));
1648                 return (0);
1649         }
1650 
1651         /* SS_RCVATMARK should only be set when SS_OOBPEND is set */
1652         if ((so->so_state & (SS_RCVATMARK|SS_OOBPEND)) == SS_RCVATMARK) {
1653                 printf("Bad oob state 2 (%p): state %s\n",
1654                     (void *)so, pr_state(so->so_state, so->so_mode));
1655                 return (0);
1656         }
1657 
1658         /*
1659          * (havemark != 0 or SS_RCVATMARK) iff SS_OOBPEND
1660          * For TPI, the presence of a "mark" is indicated by sti_oobsigcnt.
1661          */
1662         havemark = (SOCK_IS_NONSTR(so)) ? so->so_oobmark > 0 :
1663             SOTOTPI(so)->sti_oobsigcnt > 0;
1664 
1665         if (!EQUIVALENT(havemark || (so->so_state & SS_RCVATMARK),
1666             so->so_state & SS_OOBPEND)) {
1667                 printf("Bad oob state 3 (%p): state %s\n",
1668                     (void *)so, pr_state(so->so_state, so->so_mode));
1669                 return (0);
1670         }
1671 
1672         /*
1673          * Unless SO_OOBINLINE we have so_oobmsg != NULL iff SS_HAVEOOBDATA
1674          */
1675         if (!(so->so_options & SO_OOBINLINE) &&
1676             !EQUIVALENT(so->so_oobmsg != NULL, so->so_state & SS_HAVEOOBDATA)) {
1677                 printf("Bad oob state 4 (%p): state %s\n",
1678                     (void *)so, pr_state(so->so_state, so->so_mode));
1679                 return (0);
1680         }
1681 
1682         if (!SOCK_IS_NONSTR(so) &&
1683             SOTOTPI(so)->sti_oobsigcnt < SOTOTPI(so)->sti_oobcnt) {
1684                 printf("Bad oob state 5 (%p): counts %d/%d state %s\n",
1685                     (void *)so, SOTOTPI(so)->sti_oobsigcnt,
1686                     SOTOTPI(so)->sti_oobcnt,
1687                     pr_state(so->so_state, so->so_mode));
1688                 return (0);
1689         }
1690 
1691         return (1);
1692 }
1693 #undef  EQUIVALENT
1694 #endif /* DEBUG */
1695 
1696 /* initialize sockfs zone specific kstat related items                  */
1697 void *
1698 sock_kstat_init(zoneid_t zoneid)
1699 {
1700         kstat_t *ksp;
1701 
1702         ksp = kstat_create_zone("sockfs", 0, "sock_unix_list", "misc",
1703             KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE|KSTAT_FLAG_VIRTUAL, zoneid);
1704 
1705         if (ksp != NULL) {
1706                 ksp->ks_update = sockfs_update;
1707                 ksp->ks_snapshot = sockfs_snapshot;
1708                 ksp->ks_lock = &socklist.sl_lock;
1709                 ksp->ks_private = (void *)(uintptr_t)zoneid;
1710                 kstat_install(ksp);
1711         }
1712 
1713         return (ksp);
1714 }
1715 
1716 /* tear down sockfs zone specific kstat related items                   */
1717 /*ARGSUSED*/
1718 void
1719 sock_kstat_fini(zoneid_t zoneid, void *arg)
1720 {
1721         kstat_t *ksp = (kstat_t *)arg;
1722 
1723         if (ksp != NULL) {
1724                 ASSERT(zoneid == (zoneid_t)(uintptr_t)ksp->ks_private);
1725                 kstat_delete(ksp);
1726         }
1727 }
1728 
/*
 * Zones:
 * Note that nactive is going to be different for each zone.
 * This means we require kstat to call sockfs_update and then sockfs_snapshot
 * for the same zone, or the snapshot will be taken into a buffer of the
 * wrong size. This is safe, but if the buffer is too small, the user will
 * not be given details of all sockets. However, as this kstat has a ks_lock,
 * the kstat driver will keep it locked between the update and the snapshot,
 * so no other process (zone) can currently get in between and cause a buffer
 * of the wrong size to be allocated.
 */
1740 static int
1741 sockfs_update(kstat_t *ksp, int rw)
1742 {
1743         uint_t  nactive = 0;            /* # of active AF_UNIX sockets  */
1744         struct sonode   *so;            /* current sonode on socklist   */
1745         zoneid_t        myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private;
1746 
1747         ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid());
1748 
1749         if (rw == KSTAT_WRITE) {        /* bounce all writes            */
1750                 return (EACCES);
1751         }
1752 
1753         for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) {
1754                 if (so->so_count != 0 && so->so_zoneid == myzoneid) {
1755                         nactive++;
1756                 }
1757         }
1758         ksp->ks_ndata = nactive;
1759         ksp->ks_data_size = nactive * sizeof (struct sockinfo);
1760 
1761         return (0);
1762 }
1763 
1764 static int
1765 sockfs_snapshot(kstat_t *ksp, void *buf, int rw)
1766 {
1767         int                     ns;     /* # of sonodes we've copied    */
1768         struct sonode           *so;    /* current sonode on socklist   */
1769         struct sockinfo         *psi;   /* where we put sockinfo data   */
1770         t_uscalar_t             sn_len; /* soa_len                      */
1771         zoneid_t                myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private;
1772         sotpi_info_t            *sti;
1773 
1774         ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid());
1775 
1776         ksp->ks_snaptime = gethrtime();
1777 
1778         if (rw == KSTAT_WRITE) {        /* bounce all writes            */
1779                 return (EACCES);
1780         }
1781 
1782         /*
1783          * For each sonode on the socklist, we massage the important
1784          * info into buf, in sockinfo format.
1785          */
1786         psi = (struct sockinfo *)buf;
1787         ns = 0;
1788         for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) {
1789                 vattr_t attr;
1790 
1791                 /* only stuff active sonodes and the same zone:         */
1792                 if (so->so_count == 0 || so->so_zoneid != myzoneid) {
1793                         continue;
1794                 }
1795 
1796                 /*
1797                  * If the sonode was activated between the update and the
1798                  * snapshot, we're done - as this is only a snapshot.
1799                  */
1800                 if ((caddr_t)(psi) >= (caddr_t)buf + ksp->ks_data_size) {
1801                         break;
1802                 }
1803 
1804                 sti = SOTOTPI(so);
1805                 /* copy important info into buf:                        */
1806                 psi->si_size = sizeof (struct sockinfo);
1807                 psi->si_family = so->so_family;
1808                 psi->si_type = so->so_type;
1809                 psi->si_flag = so->so_flag;
1810                 psi->si_state = so->so_state;
1811                 psi->si_serv_type = sti->sti_serv_type;
1812                 psi->si_ux_laddr_sou_magic = sti->sti_ux_laddr.soua_magic;
1813                 psi->si_ux_faddr_sou_magic = sti->sti_ux_faddr.soua_magic;
1814                 psi->si_laddr_soa_len = sti->sti_laddr.soa_len;
1815                 psi->si_faddr_soa_len = sti->sti_faddr.soa_len;
1816                 psi->si_szoneid = so->so_zoneid;
1817                 psi->si_faddr_noxlate = sti->sti_faddr_noxlate;
1818 
1819                 /*
1820                  * Grab the inode, if possible.
1821                  * This must be done before entering so_lock as VOP_GETATTR
1822                  * will acquire it.
1823                  */
1824                 if (so->so_vnode == NULL ||
1825                     VOP_GETATTR(so->so_vnode, &attr, 0, CRED(), NULL) != 0)
1826                         attr.va_nodeid = 0;
1827 
1828                 psi->si_inode = attr.va_nodeid;
1829 
1830                 mutex_enter(&so->so_lock);
1831 
1832                 if (sti->sti_laddr_sa != NULL) {
1833                         ASSERT(sti->sti_laddr_sa->sa_data != NULL);
1834                         sn_len = sti->sti_laddr_len;
1835                         ASSERT(sn_len <= sizeof (short) +
1836                             sizeof (psi->si_laddr_sun_path));
1837 
1838                         psi->si_laddr_family =
1839                             sti->sti_laddr_sa->sa_family;
1840                         if (sn_len != 0) {
1841                                 /* AF_UNIX socket names are NULL terminated */
1842                                 (void) strncpy(psi->si_laddr_sun_path,
1843                                     sti->sti_laddr_sa->sa_data,
1844                                     sizeof (psi->si_laddr_sun_path));
1845                                 sn_len = strlen(psi->si_laddr_sun_path);
1846                         }
1847                         psi->si_laddr_sun_path[sn_len] = 0;
1848                 }
1849 
1850                 if (sti->sti_faddr_sa != NULL) {
1851                         ASSERT(sti->sti_faddr_sa->sa_data != NULL);
1852                         sn_len = sti->sti_faddr_len;
1853                         ASSERT(sn_len <= sizeof (short) +
1854                             sizeof (psi->si_faddr_sun_path));
1855 
1856                         psi->si_faddr_family =
1857                             sti->sti_faddr_sa->sa_family;
1858                         if (sn_len != 0) {
1859                                 (void) strncpy(psi->si_faddr_sun_path,
1860                                     sti->sti_faddr_sa->sa_data,
1861                                     sizeof (psi->si_faddr_sun_path));
1862                                 sn_len = strlen(psi->si_faddr_sun_path);
1863                         }
1864                         psi->si_faddr_sun_path[sn_len] = 0;
1865                 }
1866 
1867                 mutex_exit(&so->so_lock);
1868 
1869                 (void) snprintf(psi->si_son_straddr,
1870                     sizeof (psi->si_son_straddr), "%p", (void *)so);
1871                 (void) snprintf(psi->si_lvn_straddr,
1872                     sizeof (psi->si_lvn_straddr), "%p",
1873                     (void *)sti->sti_ux_laddr.soua_vp);
1874                 (void) snprintf(psi->si_fvn_straddr,
1875                     sizeof (psi->si_fvn_straddr), "%p",
1876                     (void *)sti->sti_ux_faddr.soua_vp);
1877 
1878                 ns++;
1879                 psi++;
1880         }
1881 
1882         ksp->ks_ndata = ns;
1883         return (0);
1884 }
1885 
1886 ssize_t
1887 soreadfile(file_t *fp, uchar_t *buf, u_offset_t fileoff, int *err, size_t size)
1888 {
1889         struct uio auio;
1890         struct iovec aiov[MSG_MAXIOVLEN];
1891         register vnode_t *vp;
1892         int ioflag, rwflag;
1893         ssize_t cnt;
1894         int error = 0;
1895         int iovcnt = 0;
1896         short fflag;
1897 
1898         vp = fp->f_vnode;
1899         fflag = fp->f_flag;
1900 
1901         rwflag = 0;
1902         aiov[0].iov_base = (caddr_t)buf;
1903         aiov[0].iov_len = size;
1904         iovcnt = 1;
1905         cnt = (ssize_t)size;
1906         (void) VOP_RWLOCK(vp, rwflag, NULL);
1907 
1908         auio.uio_loffset = fileoff;
1909         auio.uio_iov = aiov;
1910         auio.uio_iovcnt = iovcnt;
1911         auio.uio_resid = cnt;
1912         auio.uio_segflg = UIO_SYSSPACE;
1913         auio.uio_llimit = MAXOFFSET_T;
1914         auio.uio_fmode = fflag;
1915         auio.uio_extflg = UIO_COPY_CACHED;
1916 
1917         ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
1918 
1919         /* If read sync is not asked for, filter sync flags */
1920         if ((ioflag & FRSYNC) == 0)
1921                 ioflag &= ~(FSYNC|FDSYNC);
1922         error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
1923         cnt -= auio.uio_resid;
1924 
1925         VOP_RWUNLOCK(vp, rwflag, NULL);
1926 
1927         if (error == EINTR && cnt != 0)
1928                 error = 0;
1929 out:
1930         if (error != 0) {
1931                 *err = error;
1932                 return (0);
1933         } else {
1934                 *err = 0;
1935                 return (cnt);
1936         }
1937 }
1938 
/*
 * Copy `size' bytes from `from' to `to'.
 *
 * When `fromkernel' is non-zero the bytes are moved with bcopy() and the
 * call returns 0.  Otherwise the copy is delegated to xcopyin() and its
 * return value is passed back unchanged.
 */
int
so_copyin(const void *from, void *to, size_t size, int fromkernel)
{
        if (!fromkernel)
                return (xcopyin(from, to, size));

        bcopy(from, to, size);
        return (0);
}
1948 
/*
 * Copy `size' bytes from `from' to `to'.
 *
 * When `tokernel' is non-zero the bytes are moved with bcopy() and the
 * call returns 0.  Otherwise the copy is delegated to xcopyout() and its
 * return value is passed back unchanged.
 */
int
so_copyout(const void *from, void *to, size_t size, int tokernel)
{
        if (!tokernel)
                return (xcopyout(from, to, size));

        bcopy(from, to, size);
        return (0);
}