1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */
  27 /*
  28  * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
  29  */
  30 
  31 #include <sys/types.h>
  32 #include <sys/t_lock.h>
  33 #include <sys/param.h>
  34 #include <sys/systm.h>
  35 #include <sys/buf.h>
  36 #include <sys/conf.h>
  37 #include <sys/cred.h>
  38 #include <sys/kmem.h>
  39 #include <sys/sysmacros.h>
  40 #include <sys/vfs.h>
  41 #include <sys/vnode.h>
  42 #include <sys/debug.h>
  43 #include <sys/errno.h>
  44 #include <sys/time.h>
  45 #include <sys/file.h>
  46 #include <sys/user.h>
  47 #include <sys/stream.h>
  48 #include <sys/strsubr.h>
  49 #include <sys/strsun.h>
  50 #include <sys/sunddi.h>
  51 #include <sys/esunddi.h>
  52 #include <sys/flock.h>
  53 #include <sys/modctl.h>
  54 #include <sys/cmn_err.h>
  55 #include <sys/vmsystm.h>
  56 #include <sys/policy.h>
  57 
  58 #include <sys/socket.h>
  59 #include <sys/socketvar.h>
  60 
  61 #include <sys/isa_defs.h>
  62 #include <sys/inttypes.h>
  63 #include <sys/systm.h>
  64 #include <sys/cpuvar.h>
  65 #include <sys/filio.h>
  66 #include <sys/sendfile.h>
  67 #include <sys/ddi.h>
  68 #include <vm/seg.h>
  69 #include <vm/seg_map.h>
  70 #include <vm/seg_kpm.h>
  71 
  72 #include <fs/sockfs/nl7c.h>
  73 #include <fs/sockfs/sockcommon.h>
  74 #include <fs/sockfs/sockfilter_impl.h>
  75 #include <fs/sockfs/socktpi.h>
  76 
/*
 * do_useracc gates whether a failing useracc() probe of a user buffer is
 * turned into EFAULT by the system calls in this file. In SOCK_TEST builds
 * it is a variable that can be changed at run time; otherwise it is the
 * constant 1, so a failing probe is always honoured.
 */
#ifdef SOCK_TEST
int do_useracc = 1;             /* Controlled by setting SO_DEBUG to 4 */
#else
#define do_useracc      1
#endif /* SOCK_TEST */

/* When non-zero, copyout_name() logs every truncated address copyout. */
extern int      xnet_truncate_print;

/* NL7C declarations; presumably the init hook and its deferral flag. */
extern void     nl7c_init(void);
extern int      sockfs_defer_nl7c_init;

/*
 * Note: DEF_IOV_MAX is defined and used as it is in "fs/vncalls.c"
 *       as there isn't a formal definition of IOV_MAX ???
 *
 * NOTE(review): the macro actually defined here is MSG_MAXIOVLEN, not
 * DEF_IOV_MAX. Presumably it bounds the number of iovec entries accepted
 * per message -- confirm against its users.
 */
#define MSG_MAXIOVLEN   16
  93 
  94 /*
  95  * Kernel component of socket creation.
  96  *
  97  * The socket library determines which version number to use.
  98  * First the library calls this with a NULL devpath. If this fails
  99  * to find a transport (using solookup) the library will look in /etc/netconfig
 100  * for the appropriate transport. If one is found it will pass in the
 101  * devpath for the kernel to use.
 102  */
 103 int
 104 so_socket(int family, int type_w_flags, int protocol, char *devpath,
 105     int version)
 106 {
 107         struct sonode *so;
 108         vnode_t *vp;
 109         struct file *fp;
 110         int fd;
 111         int error;
 112         int type;
 113 
 114         type = type_w_flags & SOCK_TYPE_MASK;
 115         type_w_flags &= ~SOCK_TYPE_MASK;
 116         if (type_w_flags & ~(SOCK_CLOEXEC|SOCK_NDELAY|SOCK_NONBLOCK))
 117                 return (set_errno(EINVAL));
 118 
 119         if (devpath != NULL) {
 120                 char *buf;
 121                 size_t kdevpathlen = 0;
 122 
 123                 buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 124                 if ((error = copyinstr(devpath, buf,
 125                     MAXPATHLEN, &kdevpathlen)) != 0) {
 126                         kmem_free(buf, MAXPATHLEN);
 127                         return (set_errno(error));
 128                 }
 129                 so = socket_create(family, type, protocol, buf, NULL,
 130                     SOCKET_SLEEP, version, CRED(), &error);
 131                 kmem_free(buf, MAXPATHLEN);
 132         } else {
 133                 so = socket_create(family, type, protocol, NULL, NULL,
 134                     SOCKET_SLEEP, version, CRED(), &error);
 135         }
 136         if (so == NULL)
 137                 return (set_errno(error));
 138 
 139         /* Allocate a file descriptor for the socket */
 140         vp = SOTOV(so);
 141         if (error = falloc(vp, FWRITE|FREAD, &fp, &fd)) {
 142                 (void) socket_close(so, 0, CRED());
 143                 socket_destroy(so);
 144                 return (set_errno(error));
 145         }
 146 
 147         /*
 148          * Now fill in the entries that falloc reserved
 149          */
 150         if (type_w_flags & SOCK_NDELAY) {
 151                 so->so_state |= SS_NDELAY;
 152                 fp->f_flag |= FNDELAY;
 153         }
 154         if (type_w_flags & SOCK_NONBLOCK) {
 155                 so->so_state |= SS_NONBLOCK;
 156                 fp->f_flag |= FNONBLOCK;
 157         }
 158         mutex_exit(&fp->f_tlock);
 159         setf(fd, fp);
 160         if ((type_w_flags & SOCK_CLOEXEC) != 0) {
 161                 f_setfd(fd, FD_CLOEXEC);
 162         }
 163 
 164         return (fd);
 165 }
 166 
 167 /*
 168  * Map from a file descriptor to a socket node.
 169  * Returns with the file descriptor held i.e. the caller has to
 170  * use releasef when done with the file descriptor.
 171  */
 172 struct sonode *
 173 getsonode(int sock, int *errorp, file_t **fpp)
 174 {
 175         file_t *fp;
 176         vnode_t *vp;
 177         struct sonode *so;
 178 
 179         if ((fp = getf(sock)) == NULL) {
 180                 *errorp = EBADF;
 181                 eprintline(*errorp);
 182                 return (NULL);
 183         }
 184         vp = fp->f_vnode;
 185         /* Check if it is a socket */
 186         if (vp->v_type != VSOCK) {
 187                 releasef(sock);
 188                 *errorp = ENOTSOCK;
 189                 eprintline(*errorp);
 190                 return (NULL);
 191         }
 192         /*
 193          * Use the stream head to find the real socket vnode.
 194          * This is needed when namefs sits above sockfs.
 195          */
 196         if (vp->v_stream) {
 197                 ASSERT(vp->v_stream->sd_vnode);
 198                 vp = vp->v_stream->sd_vnode;
 199 
 200                 so = VTOSO(vp);
 201                 if (so->so_version == SOV_STREAM) {
 202                         releasef(sock);
 203                         *errorp = ENOTSOCK;
 204                         eprintsoline(so, *errorp);
 205                         return (NULL);
 206                 }
 207         } else {
 208                 so = VTOSO(vp);
 209         }
 210         if (fpp)
 211                 *fpp = fp;
 212         return (so);
 213 }
 214 
 215 /*
 216  * Allocate and copyin a sockaddr.
 217  * Ensures NULL termination for AF_UNIX addresses by extending them
 218  * with one NULL byte if need be. Verifies that the length is not
 219  * excessive to prevent an application from consuming all of kernel
 220  * memory. Returns NULL when an error occurred.
 221  */
 222 static struct sockaddr *
 223 copyin_name(struct sonode *so, struct sockaddr *name, socklen_t *namelenp,
 224     int *errorp)
 225 {
 226         char    *faddr;
 227         size_t  namelen = (size_t)*namelenp;
 228 
 229         ASSERT(namelen != 0);
 230         if (namelen > SO_MAXARGSIZE) {
 231                 *errorp = EINVAL;
 232                 eprintsoline(so, *errorp);
 233                 return (NULL);
 234         }
 235 
 236         faddr = (char *)kmem_alloc(namelen, KM_SLEEP);
 237         if (copyin(name, faddr, namelen)) {
 238                 kmem_free(faddr, namelen);
 239                 *errorp = EFAULT;
 240                 eprintsoline(so, *errorp);
 241                 return (NULL);
 242         }
 243 
 244         /*
 245          * Add space for NULL termination if needed.
 246          * Do a quick check if the last byte is NUL.
 247          */
 248         if (so->so_family == AF_UNIX && faddr[namelen - 1] != '\0') {
 249                 /* Check if there is any NULL termination */
 250                 size_t  i;
 251                 int foundnull = 0;
 252 
 253                 for (i = sizeof (name->sa_family); i < namelen; i++) {
 254                         if (faddr[i] == '\0') {
 255                                 foundnull = 1;
 256                                 break;
 257                         }
 258                 }
 259                 if (!foundnull) {
 260                         /* Add extra byte for NUL padding */
 261                         char *nfaddr;
 262 
 263                         nfaddr = (char *)kmem_alloc(namelen + 1, KM_SLEEP);
 264                         bcopy(faddr, nfaddr, namelen);
 265                         kmem_free(faddr, namelen);
 266 
 267                         /* NUL terminate */
 268                         nfaddr[namelen] = '\0';
 269                         namelen++;
 270                         ASSERT((socklen_t)namelen == namelen);
 271                         *namelenp = (socklen_t)namelen;
 272                         faddr = nfaddr;
 273                 }
 274         }
 275         return ((struct sockaddr *)faddr);
 276 }
 277 
 278 /*
 279  * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
 280  */
 281 static int
 282 copyout_arg(void *uaddr, socklen_t ulen, void *ulenp, void *kaddr,
 283     socklen_t klen)
 284 {
 285         if (uaddr != NULL) {
 286                 if (ulen > klen)
 287                         ulen = klen;
 288 
 289                 if (ulen != 0) {
 290                         if (copyout(kaddr, uaddr, ulen))
 291                                 return (EFAULT);
 292                 }
 293         } else
 294                 ulen = 0;
 295 
 296         if (ulenp != NULL) {
 297                 if (copyout(&ulen, ulenp, sizeof (ulen)))
 298                         return (EFAULT);
 299         }
 300         return (0);
 301 }
 302 
 303 /*
 304  * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
 305  * If klen is greater than ulen it still uses the non-truncated
 306  * klen to update ulenp.
 307  */
 308 static int
 309 copyout_name(void *uaddr, socklen_t ulen, void *ulenp, void *kaddr,
 310     socklen_t klen)
 311 {
 312         if (uaddr != NULL) {
 313                 if (ulen >= klen)
 314                         ulen = klen;
 315                 else if (ulen != 0 && xnet_truncate_print) {
 316                         printf("sockfs: truncating copyout of address using "
 317                             "XNET semantics for pid = %d. Lengths %d, %d\n",
 318                             curproc->p_pid, klen, ulen);
 319                 }
 320 
 321                 if (ulen != 0) {
 322                         if (copyout(kaddr, uaddr, ulen))
 323                                 return (EFAULT);
 324                 } else
 325                         klen = 0;
 326         } else
 327                 klen = 0;
 328 
 329         if (ulenp != NULL) {
 330                 if (copyout(&klen, ulenp, sizeof (klen)))
 331                         return (EFAULT);
 332         }
 333         return (0);
 334 }
 335 
/*
 * Connect two already-created AF_UNIX sockets to each other.
 *
 * The socketpair() code in libsocket creates two sockets (using
 * the /etc/netconfig fallback if needed) before calling this routine
 * to connect the two sockets together.
 *
 * For a SOCK_DGRAM pair both sockets are bound and connected to each
 * other; the descriptors in sv[] are left unchanged and nothing is
 * copied back out.
 *
 * For any other socket type a listener is needed - in that case this
 * routine creates a new file descriptor as part of accepting the
 * connection and passes it out as sv[0]. The library socketpair() will
 * notice that sv[0] has changed and close the original descriptor.
 *
 * Note that this code could use the TPI feature of accepting the connection
 * on the listening endpoint. However, that would require significant changes
 * to soaccept.
 *
 * Returns 0 on success, otherwise the result of set_errno(). Sockets that
 * are not AF_UNIX yield EOPNOTSUPP.
 */
int
so_socketpair(int sv[2])
{
        int svs[2];
        struct sonode *so1, *so2;
        int error;
        int orig_flags;
        struct sockaddr_ux *name;
        size_t namelen;
        sotpi_info_t *sti1;
        sotpi_info_t *sti2;

        dprint(1, ("so_socketpair(%p)\n", (void *)sv));

        /* sv[] is written back later, so probe it for writability first. */
        error = useracc(sv, sizeof (svs), B_WRITE);
        if (error && do_useracc)
                return (set_errno(EFAULT));

        if (copyin(sv, svs, sizeof (svs)))
                return (set_errno(EFAULT));

        /*
         * Each successful getsonode() leaves a hold on the descriptor;
         * from here on every exit path must releasef() both of them.
         */
        if ((so1 = getsonode(svs[0], &error, NULL)) == NULL)
                return (set_errno(error));

        if ((so2 = getsonode(svs[1], &error, NULL)) == NULL) {
                releasef(svs[0]);
                return (set_errno(error));
        }

        if (so1->so_family != AF_UNIX || so2->so_family != AF_UNIX) {
                error = EOPNOTSUPP;
                goto done;
        }

        sti1 = SOTOTPI(so1);
        sti2 = SOTOTPI(so2);

        /*
         * The code below makes assumptions about the "sockfs" implementation.
         * So make sure that the correct implementation is really used.
         */
        ASSERT(so1->so_ops == &sotpi_sonodeops);
        ASSERT(so2->so_ops == &sotpi_sonodeops);

        if (so1->so_type == SOCK_DGRAM) {
                /*
                 * Bind both sockets and connect them with each other.
                 * Need to allocate name/namelen for soconnect.
                 */
                error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC, CRED());
                if (error) {
                        eprintsoline(so1, error);
                        goto done;
                }
                error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
                if (error) {
                        eprintsoline(so2, error);
                        goto done;
                }
                /* One scratch sockaddr_ux is reused for both connects. */
                namelen = sizeof (struct sockaddr_ux);
                name = kmem_alloc(namelen, KM_SLEEP);
                name->sou_family = AF_UNIX;
                name->sou_addr = sti2->sti_ux_laddr;
                error = socket_connect(so1,
                    (struct sockaddr *)name,
                    (socklen_t)namelen,
                    0, _SOCONNECT_NOXLATE, CRED());
                if (error) {
                        kmem_free(name, namelen);
                        eprintsoline(so1, error);
                        goto done;
                }
                name->sou_addr = sti1->sti_ux_laddr;
                error = socket_connect(so2,
                    (struct sockaddr *)name,
                    (socklen_t)namelen,
                    0, _SOCONNECT_NOXLATE, CRED());
                kmem_free(name, namelen);
                if (error) {
                        eprintsoline(so2, error);
                        goto done;
                }
                /* Success: drop the holds; sv[] is unchanged. */
                releasef(svs[0]);
                releasef(svs[1]);
        } else {
                /*
                 * Bind both sockets, with so1 being a listener.
                 * Connect so2 to so1 - nonblocking to avoid waiting for
                 * soaccept to complete.
                 * Accept a connection on so1. Pass out the new fd as sv[0].
                 * The library will detect the changed fd and close
                 * the original one.
                 */
                struct sonode *nso;
                struct vnode *nvp;
                struct file *nfp;
                int nfd;

                /*
                 * We could simply call socket_listen() here (which would do the
                 * binding automatically) if the code didn't rely on passing
                 * _SOBIND_NOXLATE to the TPI implementation of socket_bind().
                 */
                error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC|
                    _SOBIND_NOXLATE|_SOBIND_LISTEN|_SOBIND_SOCKETPAIR,
                    CRED());
                if (error) {
                        eprintsoline(so1, error);
                        goto done;
                }
                error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
                if (error) {
                        eprintsoline(so2, error);
                        goto done;
                }

                namelen = sizeof (struct sockaddr_ux);
                name = kmem_alloc(namelen, KM_SLEEP);
                name->sou_family = AF_UNIX;
                name->sou_addr = sti1->sti_ux_laddr;
                error = socket_connect(so2,
                    (struct sockaddr *)name,
                    (socklen_t)namelen,
                    FNONBLOCK, _SOCONNECT_NOXLATE, CRED());
                kmem_free(name, namelen);
                /* EINPROGRESS is expected from the non-blocking connect. */
                if (error) {
                        if (error != EINPROGRESS) {
                                eprintsoline(so2, error); goto done;
                        }
                }

                error = socket_accept(so1, 0, CRED(), &nso);
                if (error) {
                        eprintsoline(so1, error);
                        goto done;
                }

                /* wait for so2 being SS_CONNECTED ignoring signals */
                mutex_enter(&so2->so_lock);
                error = sowaitconnected(so2, 0, 1);
                mutex_exit(&so2->so_lock);
                if (error != 0) {
                        (void) socket_close(nso, 0, CRED());
                        socket_destroy(nso);
                        eprintsoline(so2, error);
                        goto done;
                }

                /* Give the accepted socket a file and descriptor. */
                nvp = SOTOV(nso);
                if (error = falloc(nvp, FWRITE|FREAD, &nfp, &nfd)) {
                        (void) socket_close(nso, 0, CRED());
                        socket_destroy(nso);
                        eprintsoline(nso, error);
                        goto done;
                }
                /*
                 * copy over FNONBLOCK and FNDELAY flags should they exist
                 */
                if (so1->so_state & SS_NONBLOCK)
                        nfp->f_flag |= FNONBLOCK;
                if (so1->so_state & SS_NDELAY)
                        nfp->f_flag |= FNDELAY;

                /*
                 * fill in the entries that falloc reserved
                 */
                mutex_exit(&nfp->f_tlock);
                setf(nfd, nfp);

                /*
                 * get the original flags before we release
                 */
                VERIFY(f_getfd_error(svs[0], &orig_flags) == 0);

                releasef(svs[0]);
                releasef(svs[1]);

                /*
                 * If FD_CLOEXEC was set on the filedescriptor we're
                 * swapping out, we should set it on the new one too.
                 */
                if (orig_flags & FD_CLOEXEC) {
                        f_setfd(nfd, FD_CLOEXEC);
                }

                /*
                 * The socketpair library routine will close the original
                 * svs[0] when this code passes out a different file
                 * descriptor.
                 */
                svs[0] = nfd;

                /*
                 * Both holds were already released above, so a copyout
                 * failure closes the new descriptor and returns directly
                 * instead of going through "done".
                 */
                if (copyout(svs, sv, sizeof (svs))) {
                        (void) closeandsetf(nfd, NULL);
                        eprintline(EFAULT);
                        return (set_errno(EFAULT));
                }
        }
        return (0);

        /* Common failure exit: drop both descriptor holds. */
done:
        releasef(svs[0]);
        releasef(svs[1]);
        return (set_errno(error));
}
 555 
 556 int
 557 bind(int sock, struct sockaddr *name, socklen_t namelen, int version)
 558 {
 559         struct sonode *so;
 560         int error;
 561 
 562         dprint(1, ("bind(%d, %p, %d)\n",
 563             sock, (void *)name, namelen));
 564 
 565         if ((so = getsonode(sock, &error, NULL)) == NULL)
 566                 return (set_errno(error));
 567 
 568         /* Allocate and copyin name */
 569         /*
 570          * X/Open test does not expect EFAULT with NULL name and non-zero
 571          * namelen.
 572          */
 573         if (name != NULL && namelen != 0) {
 574                 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
 575                 name = copyin_name(so, name, &namelen, &error);
 576                 if (name == NULL) {
 577                         releasef(sock);
 578                         return (set_errno(error));
 579                 }
 580         } else {
 581                 name = NULL;
 582                 namelen = 0;
 583         }
 584 
 585         switch (version) {
 586         default:
 587                 error = socket_bind(so, name, namelen, 0, CRED());
 588                 break;
 589         case SOV_XPG4_2:
 590                 error = socket_bind(so, name, namelen, _SOBIND_XPG4_2, CRED());
 591                 break;
 592         case SOV_SOCKBSD:
 593                 error = socket_bind(so, name, namelen, _SOBIND_SOCKBSD, CRED());
 594                 break;
 595         }
 596 done:
 597         releasef(sock);
 598         if (name != NULL)
 599                 kmem_free(name, (size_t)namelen);
 600 
 601         if (error)
 602                 return (set_errno(error));
 603         return (0);
 604 }
 605 
 606 /* ARGSUSED2 */
 607 int
 608 listen(int sock, int backlog, int version)
 609 {
 610         struct sonode *so;
 611         int error;
 612 
 613         dprint(1, ("listen(%d, %d)\n",
 614             sock, backlog));
 615 
 616         if ((so = getsonode(sock, &error, NULL)) == NULL)
 617                 return (set_errno(error));
 618 
 619         error = socket_listen(so, backlog, CRED());
 620 
 621         releasef(sock);
 622         if (error)
 623                 return (set_errno(error));
 624         return (0);
 625 }
 626 
/*
 * accept system call: accept a connection on listening socket `sock' and
 * return a new file descriptor for it.
 *
 * If `name' is non-NULL, *namelenp is read as the size of the user buffer
 * and, when non-zero, the peer address is copied out with copyout_name()
 * (which also updates *namelenp). `flags' may contain SOCK_CLOEXEC,
 * SOCK_NONBLOCK and SOCK_NDELAY; any other bit yields EINVAL. `version' is
 * unused.
 *
 * Returns the new descriptor on success, otherwise the result of
 * set_errno().
 */
/*ARGSUSED3*/
int
accept(int sock, struct sockaddr *name, socklen_t *namelenp, int version,
    int flags)
{
        struct sonode *so;
        file_t *fp;
        int error;
        socklen_t namelen;
        struct sonode *nso;
        struct vnode *nvp;
        struct file *nfp;
        int nfd;
        int ssflags;
        struct sockaddr *addrp;
        socklen_t addrlen;

        dprint(1, ("accept(%d, %p, %p)\n",
            sock, (void *)name, (void *)namelenp));

        if (flags & ~(SOCK_CLOEXEC|SOCK_NONBLOCK|SOCK_NDELAY)) {
                return (set_errno(EINVAL));
        }

        /* Translate SOCK_ flags to their SS_ variant */
        ssflags = 0;
        if (flags & SOCK_NONBLOCK)
                ssflags |= SS_NONBLOCK;
        if (flags & SOCK_NDELAY)
                ssflags |= SS_NDELAY;

        /* Holds the descriptor; every exit below must releasef(sock). */
        if ((so = getsonode(sock, &error, &fp)) == NULL)
                return (set_errno(error));

        /*
         * Validate the caller's address buffer up front. A zero length is
         * treated the same as not asking for the address at all.
         */
        if (name != NULL) {
                ASSERT(MUTEX_NOT_HELD(&so->so_lock));
                if (copyin(namelenp, &namelen, sizeof (namelen))) {
                        releasef(sock);
                        return (set_errno(EFAULT));
                }
                if (namelen != 0) {
                        error = useracc(name, (size_t)namelen, B_WRITE);
                        if (error && do_useracc) {
                                releasef(sock);
                                return (set_errno(EFAULT));
                        }
                } else
                        name = NULL;
        } else {
                namelen = 0;
        }

        /*
         * Allocate the user fd before socket_accept() in order to
         * catch EMFILE errors before calling socket_accept().
         */
        if ((nfd = ufalloc(0)) == -1) {
                eprintsoline(so, EMFILE);
                releasef(sock);
                return (set_errno(EMFILE));
        }
        error = socket_accept(so, fp->f_flag, CRED(), &nso);
        if (error) {
                /* Give the reserved descriptor slot back. */
                setf(nfd, NULL);
                releasef(sock);
                return (set_errno(error));
        }

        nvp = SOTOV(nso);

        ASSERT(MUTEX_NOT_HELD(&nso->so_lock));
        if (namelen != 0) {
                /* Scratch buffer sized for the listener's largest address. */
                addrlen = so->so_max_addr_len;
                addrp = (struct sockaddr *)kmem_alloc(addrlen, KM_SLEEP);

                if ((error = socket_getpeername(nso, (struct sockaddr *)addrp,
                    &addrlen, B_TRUE, CRED())) == 0) {
                        error = copyout_name(name, namelen, namelenp,
                            addrp, addrlen);
                } else {
                        /* A getpeername failure is reported as an abort. */
                        ASSERT(error == EINVAL || error == ENOTCONN);
                        error = ECONNABORTED;
                }
                /*
                 * Free with the allocation size; addrlen may have been
                 * updated by socket_getpeername().
                 */
                kmem_free(addrp, so->so_max_addr_len);
        }

        /*
         * error is 0 here unless the address handling above failed, in
         * which case the new socket and the reserved fd are torn down.
         */
        if (error) {
                setf(nfd, NULL);
                (void) socket_close(nso, 0, CRED());
                socket_destroy(nso);
                releasef(sock);
                return (set_errno(error));
        }
        /* Only a file_t is needed; the descriptor was reserved above. */
        if (error = falloc(NULL, FWRITE|FREAD, &nfp, NULL)) {
                setf(nfd, NULL);
                (void) socket_close(nso, 0, CRED());
                socket_destroy(nso);
                eprintsoline(so, error);
                releasef(sock);
                return (set_errno(error));
        }
        /*
         * fill in the entries that falloc reserved
         */
        nfp->f_vnode = nvp;
        mutex_exit(&nfp->f_tlock);
        setf(nfd, nfp);

        /*
         * Act on SOCK_CLOEXEC from flags
         */
        if (flags & SOCK_CLOEXEC) {
                f_setfd(nfd, FD_CLOEXEC);
        }

        /*
         * Copy FNDELAY and FNONBLOCK from listener to acceptor
         * and from ssflags
         */
        if ((ssflags | so->so_state) & (SS_NDELAY|SS_NONBLOCK)) {
                uint_t oflag = nfp->f_flag;
                int arg = 0;

                /* SS_NONBLOCK wins when both are set. */
                if ((ssflags | so->so_state) & SS_NONBLOCK)
                        arg |= FNONBLOCK;
                else if ((ssflags | so->so_state) & SS_NDELAY)
                        arg |= FNDELAY;

                /*
                 * This code is a simplification of the F_SETFL code in fcntl()
                 * Ignore any errors from VOP_SETFL.
                 */
                if ((error = VOP_SETFL(nvp, oflag, arg, nfp->f_cred, NULL))
                    != 0) {
                        eprintsoline(so, error);
                        error = 0;
                } else {
                        mutex_enter(&nfp->f_tlock);
                        /*
                         * Clear the FMASK bits other than FREAD/FWRITE
                         * (the mask is ~FMASK | FREAD | FWRITE), then set
                         * the chosen non-blocking flag.
                         */
                        nfp->f_flag &= ~FMASK | (FREAD|FWRITE);
                        nfp->f_flag |= arg;
                        mutex_exit(&nfp->f_tlock);
                }
        }
        releasef(sock);
        return (nfd);
}
 773 
 774 int
 775 connect(int sock, struct sockaddr *name, socklen_t namelen, int version)
 776 {
 777         struct sonode *so;
 778         file_t *fp;
 779         int error;
 780 
 781         dprint(1, ("connect(%d, %p, %d)\n",
 782             sock, (void *)name, namelen));
 783 
 784         if ((so = getsonode(sock, &error, &fp)) == NULL)
 785                 return (set_errno(error));
 786 
 787         /* Allocate and copyin name */
 788         if (namelen != 0) {
 789                 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
 790                 name = copyin_name(so, name, &namelen, &error);
 791                 if (name == NULL) {
 792                         releasef(sock);
 793                         return (set_errno(error));
 794                 }
 795         } else
 796                 name = NULL;
 797 
 798         error = socket_connect(so, name, namelen, fp->f_flag,
 799             (version != SOV_XPG4_2) ? 0 : _SOCONNECT_XPG4_2, CRED());
 800         releasef(sock);
 801         if (name)
 802                 kmem_free(name, (size_t)namelen);
 803         if (error)
 804                 return (set_errno(error));
 805         return (0);
 806 }
 807 
 808 /*ARGSUSED2*/
 809 int
 810 shutdown(int sock, int how, int version)
 811 {
 812         struct sonode *so;
 813         int error;
 814 
 815         dprint(1, ("shutdown(%d, %d)\n",
 816             sock, how));
 817 
 818         if ((so = getsonode(sock, &error, NULL)) == NULL)
 819                 return (set_errno(error));
 820 
 821         error = socket_shutdown(so, how, CRED());
 822 
 823         releasef(sock);
 824         if (error)
 825                 return (set_errno(error));
 826         return (0);
 827 }
 828 
/*
 * Common receive routine.
 *
 * Back end shared by recv(), recvfrom() and recvmsg().
 *
 *   sock        - descriptor of the socket to receive from
 *   msg         - message header; on entry msg_name/msg_control hold the
 *                 caller's (user) buffers and lengths, which are saved in
 *                 locals before socket_recvmsg() is called
 *   uiop        - uio describing where the received data goes
 *   flags       - MSG_* flags supplied by the caller
 *   namelenp    - handed to copyout_name() as the place to report the
 *                 resulting name length
 *   controllenp - handed to copyout_arg() as the place to report the
 *                 resulting control length
 *   flagsp      - if not NULL, user address that receives msg_flags
 *
 * On success returns the number of bytes transferred (the initial
 * uio_resid minus what is left); on failure returns set_errno(error).
 */
static ssize_t
recvit(int sock, struct nmsghdr *msg, struct uio *uiop, int flags,
    socklen_t *namelenp, socklen_t *controllenp, int *flagsp)
{
        struct sonode *so;
        file_t *fp;
        void *name;
        socklen_t namelen;
        void *control;
        socklen_t controllen;
        ssize_t len;
        int error;

        if ((so = getsonode(sock, &error, &fp)) == NULL)
                return (set_errno(error));

        /* Remember the requested size so the byte count can be computed. */
        len = uiop->uio_resid;
        uiop->uio_fmode = fp->f_flag;
        uiop->uio_extflg = UIO_COPY_CACHED;

        /*
         * Save the caller's name/control buffers and sizes. After
         * socket_recvmsg() the msg_name/msg_control fields are treated as
         * buffers to copy out from and then kmem_free() (see below).
         */
        name = msg->msg_name;
        namelen = msg->msg_namelen;
        control = msg->msg_control;
        controllen = msg->msg_controllen;

        /* Only this subset of the caller's flags is passed down. */
        msg->msg_flags = flags & (MSG_OOB | MSG_PEEK | MSG_WAITALL |
            MSG_DONTWAIT | MSG_XPG4_2);

        error = socket_recvmsg(so, msg, uiop, CRED());
        if (error) {
                releasef(sock);
                return (set_errno(error));
        }
        lwp_stat_update(LWP_STAT_MSGRCV, 1);
        releasef(sock);

        error = copyout_name(name, namelen, namelenp,
            msg->msg_name, msg->msg_namelen);
        if (error)
                goto err;

        if (flagsp != NULL) {
                /*
                 * Clear internal flag.
                 */
                msg->msg_flags &= ~MSG_XPG4_2;

                /*
                 * Determine MSG_CTRUNC. sorecvmsg sets MSG_CTRUNC only
                 * when controllen is zero and there is control data to
                 * copy out.
                 */
                if (controllen != 0 &&
                    (msg->msg_controllen > controllen || control == NULL)) {
                        dprint(1, ("recvit: CTRUNC %d %d %p\n",
                            msg->msg_controllen, controllen, control));

                        msg->msg_flags |= MSG_CTRUNC;
                }
                if (copyout(&msg->msg_flags, flagsp,
                    sizeof (msg->msg_flags))) {
                        error = EFAULT;
                        goto err;
                }
        }
        /*
         * Note: This MUST be done last. There can be no "goto err" after this
         * point since it could make so_closefds run twice on some part
         * of the file descriptor array.
         */
        if (controllen != 0) {
                if (!(flags & MSG_XPG4_2)) {
                        /*
                         * Good old msg_accrights can only return a multiple
                         * of 4 bytes.
                         */
                        controllen &= ~((int)sizeof (uint32_t) - 1);
                }
                error = copyout_arg(control, controllen, controllenp,
                    msg->msg_control, msg->msg_controllen);
                /*
                 * This "goto err" is taken before the so_closefds() call
                 * just below, so so_closefds() still runs only once on
                 * this path.
                 */
                if (error)
                        goto err;

                /*
                 * The control data did not fit (or there was no buffer at
                 * all): hand the part beyond controllen to so_closefds().
                 */
                if (msg->msg_controllen > controllen || control == NULL) {
                        if (control == NULL)
                                controllen = 0;
                        so_closefds(msg->msg_control, msg->msg_controllen,
                            !(flags & MSG_XPG4_2), controllen);
                }
        }
        /* Release the name and control buffers hung off the msghdr. */
        if (msg->msg_namelen != 0)
                kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
        if (msg->msg_controllen != 0)
                kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
        return (len - uiop->uio_resid);

err:
        /*
         * If we fail and the control part contains file descriptors
         * we have to close the fd's.
         */
        if (msg->msg_controllen != 0)
                so_closefds(msg->msg_control, msg->msg_controllen,
                    !(flags & MSG_XPG4_2), 0);
        if (msg->msg_namelen != 0)
                kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
        if (msg->msg_controllen != 0)
                kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
        return (set_errno(error));
}
 942 
 943 /*
 944  * Native system call
 945  */
 946 ssize_t
 947 recv(int sock, void *buffer, size_t len, int flags)
 948 {
 949         struct nmsghdr lmsg;
 950         struct uio auio;
 951         struct iovec aiov[1];
 952 
 953         dprint(1, ("recv(%d, %p, %ld, %d)\n",
 954             sock, buffer, len, flags));
 955 
 956         if ((ssize_t)len < 0) {
 957                 return (set_errno(EINVAL));
 958         }
 959 
 960         aiov[0].iov_base = buffer;
 961         aiov[0].iov_len = len;
 962         auio.uio_loffset = 0;
 963         auio.uio_iov = aiov;
 964         auio.uio_iovcnt = 1;
 965         auio.uio_resid = len;
 966         auio.uio_segflg = UIO_USERSPACE;
 967         auio.uio_limit = 0;
 968 
 969         lmsg.msg_namelen = 0;
 970         lmsg.msg_controllen = 0;
 971         lmsg.msg_flags = 0;
 972         return (recvit(sock, &lmsg, &auio, flags, NULL, NULL, NULL));
 973 }
 974 
 975 ssize_t
 976 recvfrom(int sock, void *buffer, size_t len, int flags, struct sockaddr *name,
 977     socklen_t *namelenp)
 978 {
 979         struct nmsghdr lmsg;
 980         struct uio auio;
 981         struct iovec aiov[1];
 982 
 983         dprint(1, ("recvfrom(%d, %p, %ld, %d, %p, %p)\n",
 984             sock, buffer, len, flags, (void *)name, (void *)namelenp));
 985 
 986         if ((ssize_t)len < 0) {
 987                 return (set_errno(EINVAL));
 988         }
 989 
 990         aiov[0].iov_base = buffer;
 991         aiov[0].iov_len = len;
 992         auio.uio_loffset = 0;
 993         auio.uio_iov = aiov;
 994         auio.uio_iovcnt = 1;
 995         auio.uio_resid = len;
 996         auio.uio_segflg = UIO_USERSPACE;
 997         auio.uio_limit = 0;
 998 
 999         lmsg.msg_name = (char *)name;
1000         if (namelenp != NULL) {
1001                 if (copyin(namelenp, &lmsg.msg_namelen,
1002                     sizeof (lmsg.msg_namelen)))
1003                         return (set_errno(EFAULT));
1004         } else {
1005                 lmsg.msg_namelen = 0;
1006         }
1007         lmsg.msg_controllen = 0;
1008         lmsg.msg_flags = 0;
1009 
1010         return (recvit(sock, &lmsg, &auio, flags, namelenp, NULL, NULL));
1011 }
1012 
/*
 * Uses the MSG_XPG4_2 flag to determine if the caller is using
 * struct omsghdr or struct nmsghdr.
 *
 * recvmsg(): copy in the caller's message header and iovec array,
 * validate them, and hand the request to recvit().
 *
 * Returns the value produced by recvit(), or set_errno() of:
 *   EFAULT   - the header, the iovec array or the control buffer check
 *              failed
 *   EMSGSIZE - msg_iovlen is <= 0 or larger than MSG_MAXIOVLEN
 *   EINVAL   - an iovec length, or the sum of the lengths, is negative
 */
ssize_t
recvmsg(int sock, struct nmsghdr *msg, int flags)
{
        STRUCT_DECL(nmsghdr, u_lmsg);
        STRUCT_HANDLE(nmsghdr, umsgptr);
        struct nmsghdr lmsg;
        struct uio auio;
        struct iovec aiov[MSG_MAXIOVLEN];
        int iovcnt;
        ssize_t len;
        int i;
        int *flagsp;
        model_t model;

        dprint(1, ("recvmsg(%d, %p, %d)\n",
            sock, (void *)msg, flags));

        /*
         * u_lmsg is a copy of the caller's header laid out for the
         * caller's data model; umsgptr is used below to form the user
         * addresses of individual header fields.
         */
        model = get_udatamodel();
        STRUCT_INIT(u_lmsg, model);
        STRUCT_SET_HANDLE(umsgptr, model, msg);

        if (flags & MSG_XPG4_2) {
                if (copyin(msg, STRUCT_BUF(u_lmsg), STRUCT_SIZE(u_lmsg)))
                        return (set_errno(EFAULT));
                /* Only nmsghdr callers get msg_flags written back. */
                flagsp = STRUCT_FADDR(umsgptr, msg_flags);
        } else {
                /*
                 * Assumes that nmsghdr and omsghdr are identically shaped
                 * except for the added msg_flags field.
                 */
                if (copyin(msg, STRUCT_BUF(u_lmsg),
                    SIZEOF_STRUCT(omsghdr, model)))
                        return (set_errno(EFAULT));
                STRUCT_FSET(u_lmsg, msg_flags, 0);
                flagsp = NULL;
        }

        /*
         * Code below us will kmem_alloc memory and hang it
         * off msg_control and msg_name fields. This forces
         * us to copy the structure to its native form.
         */
        lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name);
        lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen);
        lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov);
        lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen);
        lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control);
        lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen);
        lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags);

        iovcnt = lmsg.msg_iovlen;

        /* Bounds check before iovcnt is used to size the copyin below. */
        if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) {
                return (set_errno(EMSGSIZE));
        }

#ifdef _SYSCALL32_IMPL
        /*
         * 32-bit callers need to have their iovec expanded, while ensuring
         * that they can't move more than 2Gbytes of data in a single call.
         */
        if (model == DATAMODEL_ILP32) {
                struct iovec32 aiov32[MSG_MAXIOVLEN];
                ssize32_t count32;

                if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32,
                    iovcnt * sizeof (struct iovec32)))
                        return (set_errno(EFAULT));

                count32 = 0;
                for (i = 0; i < iovcnt; i++) {
                        ssize32_t iovlen32;

                        iovlen32 = aiov32[i].iov_len;
                        count32 += iovlen32;
                        if (iovlen32 < 0 || count32 < 0)
                                return (set_errno(EINVAL));
                        aiov[i].iov_len = iovlen32;
                        aiov[i].iov_base =
                            (caddr_t)(uintptr_t)aiov32[i].iov_base;
                }
        } else
#endif /* _SYSCALL32_IMPL */
        /*
         * Native callers: the iovec array is copied in as is. Note that
         * this "if" is the body of the "else" above when _SYSCALL32_IMPL
         * is defined.
         */
        if (copyin(lmsg.msg_iov, aiov, iovcnt * sizeof (struct iovec))) {
                return (set_errno(EFAULT));
        }
        /* Total the lengths, rejecting negative entries and overflow. */
        len = 0;
        for (i = 0; i < iovcnt; i++) {
                ssize_t iovlen = aiov[i].iov_len;
                len += iovlen;
                if (iovlen < 0 || len < 0) {
                        return (set_errno(EINVAL));
                }
        }
        auio.uio_loffset = 0;
        auio.uio_iov = aiov;
        auio.uio_iovcnt = iovcnt;
        auio.uio_resid = len;
        auio.uio_segflg = UIO_USERSPACE;
        auio.uio_limit = 0;

        /*
         * Check the caller's control buffer up front.
         * NOTE(review): as written this also returns EFAULT whenever
         * do_useracc is 0 and a control buffer was supplied - confirm
         * that this is the intended meaning of do_useracc.
         */
        if (lmsg.msg_control != NULL &&
            (do_useracc == 0 ||
            useracc(lmsg.msg_control, lmsg.msg_controllen,
            B_WRITE) != 0)) {
                return (set_errno(EFAULT));
        }

        return (recvit(sock, &lmsg, &auio, flags,
            STRUCT_FADDR(umsgptr, msg_namelen),
            STRUCT_FADDR(umsgptr, msg_controllen), flagsp));
}
1129 
1130 /*
1131  * Common send function.
1132  */
1133 static ssize_t
1134 sendit(int sock, struct nmsghdr *msg, struct uio *uiop, int flags)
1135 {
1136         struct sonode *so;
1137         file_t *fp;
1138         void *name;
1139         socklen_t namelen;
1140         void *control;
1141         socklen_t controllen;
1142         ssize_t len;
1143         int error;
1144 
1145         if ((so = getsonode(sock, &error, &fp)) == NULL)
1146                 return (set_errno(error));
1147 
1148         uiop->uio_fmode = fp->f_flag;
1149 
1150         if (so->so_family == AF_UNIX)
1151                 uiop->uio_extflg = UIO_COPY_CACHED;
1152         else
1153                 uiop->uio_extflg = UIO_COPY_DEFAULT;
1154 
1155         /* Allocate and copyin name and control */
1156         name = msg->msg_name;
1157         namelen = msg->msg_namelen;
1158         if (name != NULL && namelen != 0) {
1159                 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1160                 name = copyin_name(so,
1161                     (struct sockaddr *)name,
1162                     &namelen, &error);
1163                 if (name == NULL)
1164                         goto done3;
1165                 /* copyin_name null terminates addresses for AF_UNIX */
1166                 msg->msg_namelen = namelen;
1167                 msg->msg_name = name;
1168         } else {
1169                 msg->msg_name = name = NULL;
1170                 msg->msg_namelen = namelen = 0;
1171         }
1172 
1173         control = msg->msg_control;
1174         controllen = msg->msg_controllen;
1175         if ((control != NULL) && (controllen != 0)) {
1176                 /*
1177                  * Verify that the length is not excessive to prevent
1178                  * an application from consuming all of kernel memory.
1179                  */
1180                 if (controllen > SO_MAXARGSIZE) {
1181                         error = EINVAL;
1182                         goto done2;
1183                 }
1184                 control = kmem_alloc(controllen, KM_SLEEP);
1185 
1186                 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1187                 if (copyin(msg->msg_control, control, controllen)) {
1188                         error = EFAULT;
1189                         goto done1;
1190                 }
1191                 msg->msg_control = control;
1192         } else {
1193                 msg->msg_control = control = NULL;
1194                 msg->msg_controllen = controllen = 0;
1195         }
1196 
1197         len = uiop->uio_resid;
1198         msg->msg_flags = flags;
1199 
1200         error = socket_sendmsg(so, msg, uiop, CRED());
1201 done1:
1202         if (control != NULL)
1203                 kmem_free(control, controllen);
1204 done2:
1205         if (name != NULL)
1206                 kmem_free(name, namelen);
1207 done3:
1208         if (error != 0) {
1209                 releasef(sock);
1210                 return (set_errno(error));
1211         }
1212         lwp_stat_update(LWP_STAT_MSGSND, 1);
1213         releasef(sock);
1214         return (len - uiop->uio_resid);
1215 }
1216 
1217 /*
1218  * Native system call
1219  */
1220 ssize_t
1221 send(int sock, void *buffer, size_t len, int flags)
1222 {
1223         struct nmsghdr lmsg;
1224         struct uio auio;
1225         struct iovec aiov[1];
1226 
1227         dprint(1, ("send(%d, %p, %ld, %d)\n",
1228             sock, buffer, len, flags));
1229 
1230         if ((ssize_t)len < 0) {
1231                 return (set_errno(EINVAL));
1232         }
1233 
1234         aiov[0].iov_base = buffer;
1235         aiov[0].iov_len = len;
1236         auio.uio_loffset = 0;
1237         auio.uio_iov = aiov;
1238         auio.uio_iovcnt = 1;
1239         auio.uio_resid = len;
1240         auio.uio_segflg = UIO_USERSPACE;
1241         auio.uio_limit = 0;
1242 
1243         lmsg.msg_name = NULL;
1244         lmsg.msg_control = NULL;
1245         if (!(flags & MSG_XPG4_2)) {
1246                 /*
1247                  * In order to be compatible with the libsocket/sockmod
1248                  * implementation we set EOR for all send* calls.
1249                  */
1250                 flags |= MSG_EOR;
1251         }
1252         return (sendit(sock, &lmsg, &auio, flags));
1253 }
1254 
/*
 * Uses the MSG_XPG4_2 flag to determine if the caller is using
 * struct omsghdr or struct nmsghdr.
 *
 * sendmsg(): copy in the caller's message header and iovec array,
 * validate them, and hand the request to sendit().
 *
 * Returns the value produced by sendit(), or set_errno() of:
 *   EFAULT   - the header or the iovec array could not be copied in
 *   EMSGSIZE - msg_iovlen is negative or larger than MSG_MAXIOVLEN, or
 *              is zero for an XPG 4.2 caller
 *   EINVAL   - an iovec length, or the sum of the lengths, is negative
 */
ssize_t
sendmsg(int sock, struct nmsghdr *msg, int flags)
{
        struct nmsghdr lmsg;
        STRUCT_DECL(nmsghdr, u_lmsg);
        struct uio auio;
        struct iovec aiov[MSG_MAXIOVLEN];
        int iovcnt;
        ssize_t len;
        int i;
        model_t model;

        dprint(1, ("sendmsg(%d, %p, %d)\n", sock, (void *)msg, flags));

        /* u_lmsg is laid out for the caller's data model. */
        model = get_udatamodel();
        STRUCT_INIT(u_lmsg, model);

        if (flags & MSG_XPG4_2) {
                if (copyin(msg, (char *)STRUCT_BUF(u_lmsg),
                    STRUCT_SIZE(u_lmsg)))
                        return (set_errno(EFAULT));
        } else {
                /*
                 * Assumes that nmsghdr and omsghdr are identically shaped
                 * except for the added msg_flags field.
                 */
                if (copyin(msg, (char *)STRUCT_BUF(u_lmsg),
                    SIZEOF_STRUCT(omsghdr, model)))
                        return (set_errno(EFAULT));
                /*
                 * In order to be compatible with the libsocket/sockmod
                 * implementation we set EOR for all send* calls.
                 */
                flags |= MSG_EOR;
        }

        /*
         * Code below us will kmem_alloc memory and hang it
         * off msg_control and msg_name fields. This forces
         * us to copy the structure to its native form.
         */
        lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name);
        lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen);
        lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov);
        lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen);
        lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control);
        lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen);
        lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags);

        iovcnt = lmsg.msg_iovlen;

        /* Bounds check before iovcnt is used to size the copyin below. */
        if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) {
                /*
                 * Unless this is XPG 4.2 we allow iovcnt == 0 to
                 * be compatible with SunOS 4.X and 4.4BSD.
                 */
                if (iovcnt != 0 || (flags & MSG_XPG4_2))
                        return (set_errno(EMSGSIZE));
        }

#ifdef _SYSCALL32_IMPL
        /*
         * 32-bit callers need to have their iovec expanded, while ensuring
         * that they can't move more than 2Gbytes of data in a single call.
         */
        if (model == DATAMODEL_ILP32) {
                struct iovec32 aiov32[MSG_MAXIOVLEN];
                ssize32_t count32;

                /* iovcnt may legitimately be 0 here; skip the copyin. */
                if (iovcnt != 0 &&
                    copyin((struct iovec32 *)lmsg.msg_iov, aiov32,
                    iovcnt * sizeof (struct iovec32)))
                        return (set_errno(EFAULT));

                count32 = 0;
                for (i = 0; i < iovcnt; i++) {
                        ssize32_t iovlen32;

                        iovlen32 = aiov32[i].iov_len;
                        count32 += iovlen32;
                        if (iovlen32 < 0 || count32 < 0)
                                return (set_errno(EINVAL));
                        aiov[i].iov_len = iovlen32;
                        aiov[i].iov_base =
                            (caddr_t)(uintptr_t)aiov32[i].iov_base;
                }
        } else
#endif /* _SYSCALL32_IMPL */
        /*
         * Native callers: the iovec array is copied in as is. Note that
         * this "if" is the body of the "else" above when _SYSCALL32_IMPL
         * is defined.
         */
        if (iovcnt != 0 &&
            copyin(lmsg.msg_iov, aiov,
            (unsigned)iovcnt * sizeof (struct iovec))) {
                return (set_errno(EFAULT));
        }
        /* Total the lengths, rejecting negative entries and overflow. */
        len = 0;
        for (i = 0; i < iovcnt; i++) {
                ssize_t iovlen = aiov[i].iov_len;
                len += iovlen;
                if (iovlen < 0 || len < 0) {
                        return (set_errno(EINVAL));
                }
        }
        auio.uio_loffset = 0;
        auio.uio_iov = aiov;
        auio.uio_iovcnt = iovcnt;
        auio.uio_resid = len;
        auio.uio_segflg = UIO_USERSPACE;
        auio.uio_limit = 0;

        return (sendit(sock, &lmsg, &auio, flags));
}
1369 
1370 ssize_t
1371 sendto(int sock, void *buffer, size_t len, int flags,
1372     struct sockaddr *name, socklen_t namelen)
1373 {
1374         struct nmsghdr lmsg;
1375         struct uio auio;
1376         struct iovec aiov[1];
1377 
1378         dprint(1, ("sendto(%d, %p, %ld, %d, %p, %d)\n",
1379             sock, buffer, len, flags, (void *)name, namelen));
1380 
1381         if ((ssize_t)len < 0) {
1382                 return (set_errno(EINVAL));
1383         }
1384 
1385         aiov[0].iov_base = buffer;
1386         aiov[0].iov_len = len;
1387         auio.uio_loffset = 0;
1388         auio.uio_iov = aiov;
1389         auio.uio_iovcnt = 1;
1390         auio.uio_resid = len;
1391         auio.uio_segflg = UIO_USERSPACE;
1392         auio.uio_limit = 0;
1393 
1394         lmsg.msg_name = (char *)name;
1395         lmsg.msg_namelen = namelen;
1396         lmsg.msg_control = NULL;
1397         if (!(flags & MSG_XPG4_2)) {
1398                 /*
1399                  * In order to be compatible with the libsocket/sockmod
1400                  * implementation we set EOR for all send* calls.
1401                  */
1402                 flags |= MSG_EOR;
1403         }
1404         return (sendit(sock, &lmsg, &auio, flags));
1405 }
1406 
1407 /*ARGSUSED3*/
1408 int
1409 getpeername(int sock, struct sockaddr *name, socklen_t *namelenp, int version)
1410 {
1411         struct sonode *so;
1412         int error;
1413         socklen_t namelen;
1414         socklen_t sock_addrlen;
1415         struct sockaddr *sock_addrp;
1416 
1417         dprint(1, ("getpeername(%d, %p, %p)\n",
1418             sock, (void *)name, (void *)namelenp));
1419 
1420         if ((so = getsonode(sock, &error, NULL)) == NULL)
1421                 goto bad;
1422 
1423         ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1424         if (copyin(namelenp, &namelen, sizeof (namelen)) ||
1425             (name == NULL && namelen != 0)) {
1426                 error = EFAULT;
1427                 goto rel_out;
1428         }
1429         sock_addrlen = so->so_max_addr_len;
1430         sock_addrp = (struct sockaddr *)kmem_alloc(sock_addrlen, KM_SLEEP);
1431 
1432         if ((error = socket_getpeername(so, sock_addrp, &sock_addrlen,
1433             B_FALSE, CRED())) == 0) {
1434                 ASSERT(sock_addrlen <= so->so_max_addr_len);
1435                 error = copyout_name(name, namelen, namelenp,
1436                     (void *)sock_addrp, sock_addrlen);
1437         }
1438         kmem_free(sock_addrp, so->so_max_addr_len);
1439 rel_out:
1440         releasef(sock);
1441 bad:    return (error != 0 ? set_errno(error) : 0);
1442 }
1443 
1444 /*ARGSUSED3*/
1445 int
1446 getsockname(int sock, struct sockaddr *name, socklen_t *namelenp, int version)
1447 {
1448         struct sonode *so;
1449         int error;
1450         socklen_t namelen, sock_addrlen;
1451         struct sockaddr *sock_addrp;
1452 
1453         dprint(1, ("getsockname(%d, %p, %p)\n",
1454             sock, (void *)name, (void *)namelenp));
1455 
1456         if ((so = getsonode(sock, &error, NULL)) == NULL)
1457                 goto bad;
1458 
1459         ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1460         if (copyin(namelenp, &namelen, sizeof (namelen)) ||
1461             (name == NULL && namelen != 0)) {
1462                 error = EFAULT;
1463                 goto rel_out;
1464         }
1465 
1466         sock_addrlen = so->so_max_addr_len;
1467         sock_addrp = (struct sockaddr *)kmem_alloc(sock_addrlen, KM_SLEEP);
1468         if ((error = socket_getsockname(so, sock_addrp, &sock_addrlen,
1469             CRED())) == 0) {
1470                 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1471                 ASSERT(sock_addrlen <= so->so_max_addr_len);
1472                 error = copyout_name(name, namelen, namelenp,
1473                     (void *)sock_addrp, sock_addrlen);
1474         }
1475         kmem_free(sock_addrp, so->so_max_addr_len);
1476 rel_out:
1477         releasef(sock);
1478 bad:    return (error != 0 ? set_errno(error) : 0);
1479 }
1480 
1481 /*ARGSUSED5*/
1482 int
1483 getsockopt(int sock, int level, int option_name, void *option_value,
1484     socklen_t *option_lenp, int version)
1485 {
1486         struct sonode *so;
1487         socklen_t optlen, optlen_res;
1488         void *optval;
1489         int error;
1490 
1491         dprint(1, ("getsockopt(%d, %d, %d, %p, %p)\n",
1492             sock, level, option_name, option_value, (void *)option_lenp));
1493 
1494         if ((so = getsonode(sock, &error, NULL)) == NULL)
1495                 return (set_errno(error));
1496 
1497         ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1498         if (copyin(option_lenp, &optlen, sizeof (optlen))) {
1499                 releasef(sock);
1500                 return (set_errno(EFAULT));
1501         }
1502         /*
1503          * Verify that the length is not excessive to prevent
1504          * an application from consuming all of kernel memory.
1505          */
1506         if (optlen > SO_MAXARGSIZE) {
1507                 error = EINVAL;
1508                 releasef(sock);
1509                 return (set_errno(error));
1510         }
1511         optval = kmem_alloc(optlen, KM_SLEEP);
1512         optlen_res = optlen;
1513         error = socket_getsockopt(so, level, option_name, optval,
1514             &optlen_res, (version != SOV_XPG4_2) ? 0 : _SOGETSOCKOPT_XPG4_2,
1515             CRED());
1516         releasef(sock);
1517         if (error) {
1518                 kmem_free(optval, optlen);
1519                 return (set_errno(error));
1520         }
1521         error = copyout_arg(option_value, optlen, option_lenp,
1522             optval, optlen_res);
1523         kmem_free(optval, optlen);
1524         if (error)
1525                 return (set_errno(error));
1526         return (0);
1527 }
1528 
1529 /*ARGSUSED5*/
1530 int
1531 setsockopt(int sock, int level, int option_name, void *option_value,
1532     socklen_t option_len, int version)
1533 {
1534         struct sonode *so;
1535         intptr_t buffer[2];
1536         void *optval = NULL;
1537         int error;
1538 
1539         dprint(1, ("setsockopt(%d, %d, %d, %p, %d)\n",
1540             sock, level, option_name, option_value, option_len));
1541 
1542         if ((so = getsonode(sock, &error, NULL)) == NULL)
1543                 return (set_errno(error));
1544 
1545         if (option_value != NULL) {
1546                 if (option_len != 0) {
1547                         /*
1548                          * Verify that the length is not excessive to prevent
1549                          * an application from consuming all of kernel memory.
1550                          */
1551                         if (option_len > SO_MAXARGSIZE) {
1552                                 error = EINVAL;
1553                                 goto done2;
1554                         }
1555                         optval = option_len <= sizeof (buffer) ?
1556                             &buffer : kmem_alloc((size_t)option_len, KM_SLEEP);
1557                         ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1558                         if (copyin(option_value, optval, (size_t)option_len)) {
1559                                 error = EFAULT;
1560                                 goto done1;
1561                         }
1562                 }
1563         } else
1564                 option_len = 0;
1565 
1566         error = socket_setsockopt(so, level, option_name, optval,
1567             (t_uscalar_t)option_len, CRED());
1568 done1:
1569         if (optval != buffer)
1570                 kmem_free(optval, (size_t)option_len);
1571 done2:
1572         releasef(sock);
1573         if (error)
1574                 return (set_errno(error));
1575         return (0);
1576 }
1577 
1578 static int
1579 sockconf_add_sock(int family, int type, int protocol, char *name)
1580 {
1581         int error = 0;
1582         char *kdevpath = NULL;
1583         char *kmodule = NULL;
1584         char *buf = NULL;
1585         size_t pathlen = 0;
1586         struct sockparams *sp;
1587 
1588         if (name == NULL)
1589                 return (EINVAL);
1590         /*
1591          * Copyin the name.
1592          * This also makes it possible to check for too long pathnames.
1593          * Compress the space needed for the name before passing it
1594          * to soconfig - soconfig will store the string until
1595          * the configuration is removed.
1596          */
1597         buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1598         if ((error = copyinstr(name, buf, MAXPATHLEN, &pathlen)) != 0) {
1599                 kmem_free(buf, MAXPATHLEN);
1600                 return (error);
1601         }
1602         if (strncmp(buf, "/dev", strlen("/dev")) == 0) {
1603                 /* For device */
1604 
1605                 /*
1606                  * Special handling for NCA:
1607                  *
1608                  * DEV_NCA is never opened even if an application
1609                  * requests for AF_NCA. The device opened is instead a
1610                  * predefined AF_INET transport (NCA_INET_DEV).
1611                  *
1612                  * Prior to Volo (PSARC/2007/587) NCA would determine
1613                  * the device using a lookup, which worked then because
1614                  * all protocols were based on TPI. Since TPI is no
1615                  * longer the default, we have to explicitly state
1616                  * which device to use.
1617                  */
1618                 if (strcmp(buf, NCA_DEV) == 0) {
1619                         /* only support entry <28, 2, 0> */
1620                         if (family != AF_NCA || type != SOCK_STREAM ||
1621                             protocol != 0) {
1622                                 kmem_free(buf, MAXPATHLEN);
1623                                 return (EINVAL);
1624                         }
1625 
1626                         pathlen = strlen(NCA_INET_DEV) + 1;
1627                         kdevpath = kmem_alloc(pathlen, KM_SLEEP);
1628                         bcopy(NCA_INET_DEV, kdevpath, pathlen);
1629                         kdevpath[pathlen - 1] = '\0';
1630                 } else {
1631                         kdevpath = kmem_alloc(pathlen, KM_SLEEP);
1632                         bcopy(buf, kdevpath, pathlen);
1633                         kdevpath[pathlen - 1] = '\0';
1634                 }
1635         } else {
1636                 /* For socket module */
1637                 kmodule = kmem_alloc(pathlen, KM_SLEEP);
1638                 bcopy(buf, kmodule, pathlen);
1639                 kmodule[pathlen - 1] = '\0';
1640                 pathlen = 0;
1641         }
1642         kmem_free(buf, MAXPATHLEN);
1643 
1644         /* sockparams_create frees mod name and devpath upon failure */
1645         sp = sockparams_create(family, type, protocol, kmodule,
1646             kdevpath, pathlen, 0, KM_SLEEP, &error);
1647         if (sp != NULL) {
1648                 error = sockparams_add(sp);
1649                 if (error != 0)
1650                         sockparams_destroy(sp);
1651         }
1652 
1653         return (error);
1654 }
1655 
/*
 * Remove the <family, type, protocol> socket configuration entry.
 * Returns whatever sockparams_delete() reports.
 */
static int
sockconf_remove_sock(int family, int type, int protocol)
{
        int rv;

        rv = sockparams_delete(family, type, protocol);
        return (rv);
}
1661 
1662 static int
1663 sockconfig_remove_filter(const char *uname)
1664 {
1665         char kname[SOF_MAXNAMELEN];
1666         size_t len;
1667         int error;
1668         sof_entry_t *ent;
1669 
1670         if ((error = copyinstr(uname, kname, SOF_MAXNAMELEN, &len)) != 0)
1671                 return (error);
1672 
1673         ent = sof_entry_remove_by_name(kname);
1674         if (ent == NULL)
1675                 return (ENXIO);
1676 
1677         mutex_enter(&ent->sofe_lock);
1678         ASSERT(!(ent->sofe_flags & SOFEF_CONDEMED));
1679         if (ent->sofe_refcnt == 0) {
1680                 mutex_exit(&ent->sofe_lock);
1681                 sof_entry_free(ent);
1682         } else {
1683                 /* let the last socket free the filter */
1684                 ent->sofe_flags |= SOFEF_CONDEMED;
1685                 mutex_exit(&ent->sofe_lock);
1686         }
1687 
1688         return (0);
1689 }
1690 
1691 static int
1692 sockconfig_add_filter(const char *uname, void *ufilpropp)
1693 {
1694         struct sockconfig_filter_props filprop;
1695         sof_entry_t *ent;
1696         int error;
1697         size_t tuplesz, len;
1698         char hintbuf[SOF_MAXNAMELEN];
1699 
1700         ent = kmem_zalloc(sizeof (sof_entry_t), KM_SLEEP);
1701         mutex_init(&ent->sofe_lock, NULL, MUTEX_DEFAULT, NULL);
1702 
1703         if ((error = copyinstr(uname, ent->sofe_name, SOF_MAXNAMELEN,
1704             &len)) != 0) {
1705                 sof_entry_free(ent);
1706                 return (error);
1707         }
1708 
1709         if (get_udatamodel() == DATAMODEL_NATIVE) {
1710                 if (copyin(ufilpropp, &filprop, sizeof (filprop)) != 0) {
1711                         sof_entry_free(ent);
1712                         return (EFAULT);
1713                 }
1714         }
1715 #ifdef  _SYSCALL32_IMPL
1716         else {
1717                 struct sockconfig_filter_props32 filprop32;
1718 
1719                 if (copyin(ufilpropp, &filprop32, sizeof (filprop32)) != 0) {
1720                         sof_entry_free(ent);
1721                         return (EFAULT);
1722                 }
1723                 filprop.sfp_modname = (char *)(uintptr_t)filprop32.sfp_modname;
1724                 filprop.sfp_autoattach = filprop32.sfp_autoattach;
1725                 filprop.sfp_hint = filprop32.sfp_hint;
1726                 filprop.sfp_hintarg = (char *)(uintptr_t)filprop32.sfp_hintarg;
1727                 filprop.sfp_socktuple_cnt = filprop32.sfp_socktuple_cnt;
1728                 filprop.sfp_socktuple =
1729                     (sof_socktuple_t *)(uintptr_t)filprop32.sfp_socktuple;
1730         }
1731 #endif  /* _SYSCALL32_IMPL */
1732 
1733         if ((error = copyinstr(filprop.sfp_modname, ent->sofe_modname,
1734             sizeof (ent->sofe_modname), &len)) != 0) {
1735                 sof_entry_free(ent);
1736                 return (error);
1737         }
1738 
1739         /*
1740          * A filter must specify at least one socket tuple.
1741          */
1742         if (filprop.sfp_socktuple_cnt == 0 ||
1743             filprop.sfp_socktuple_cnt > SOF_MAXSOCKTUPLECNT) {
1744                 sof_entry_free(ent);
1745                 return (EINVAL);
1746         }
1747         ent->sofe_flags = filprop.sfp_autoattach ? SOFEF_AUTO : SOFEF_PROG;
1748         ent->sofe_hint = filprop.sfp_hint;
1749 
1750         /*
1751          * Verify the hint, and copy in the hint argument, if necessary.
1752          */
1753         switch (ent->sofe_hint) {
1754         case SOF_HINT_BEFORE:
1755         case SOF_HINT_AFTER:
1756                 if ((error = copyinstr(filprop.sfp_hintarg, hintbuf,
1757                     sizeof (hintbuf), &len)) != 0) {
1758                         sof_entry_free(ent);
1759                         return (error);
1760                 }
1761                 ent->sofe_hintarg = kmem_alloc(len, KM_SLEEP);
1762                 bcopy(hintbuf, ent->sofe_hintarg, len);
1763                 /* FALLTHRU */
1764         case SOF_HINT_TOP:
1765         case SOF_HINT_BOTTOM:
1766                 /* hints cannot be used with programmatic filters */
1767                 if (ent->sofe_flags & SOFEF_PROG) {
1768                         sof_entry_free(ent);
1769                         return (EINVAL);
1770                 }
1771                 break;
1772         case SOF_HINT_NONE:
1773                 break;
1774         default:
1775                 /* bad hint value */
1776                 sof_entry_free(ent);
1777                 return (EINVAL);
1778         }
1779 
1780         ent->sofe_socktuple_cnt = filprop.sfp_socktuple_cnt;
1781         tuplesz = sizeof (sof_socktuple_t) * ent->sofe_socktuple_cnt;
1782         ent->sofe_socktuple = kmem_alloc(tuplesz, KM_SLEEP);
1783 
1784         if (get_udatamodel() == DATAMODEL_NATIVE) {
1785                 if (copyin(filprop.sfp_socktuple, ent->sofe_socktuple,
1786                     tuplesz)) {
1787                         sof_entry_free(ent);
1788                         return (EFAULT);
1789                 }
1790         }
1791 #ifdef  _SYSCALL32_IMPL
1792         else {
1793                 int i;
1794                 caddr_t data = (caddr_t)filprop.sfp_socktuple;
1795                 sof_socktuple_t *tup = ent->sofe_socktuple;
1796                 sof_socktuple32_t tup32;
1797 
1798                 tup = ent->sofe_socktuple;
1799                 for (i = 0; i < ent->sofe_socktuple_cnt; i++, tup++) {
1800                         ASSERT(tup < ent->sofe_socktuple + tuplesz);
1801 
1802                         if (copyin(data, &tup32, sizeof (tup32)) != 0) {
1803                                 sof_entry_free(ent);
1804                                 return (EFAULT);
1805                         }
1806                         tup->sofst_family = tup32.sofst_family;
1807                         tup->sofst_type = tup32.sofst_type;
1808                         tup->sofst_protocol = tup32.sofst_protocol;
1809 
1810                         data += sizeof (tup32);
1811                 }
1812         }
1813 #endif  /* _SYSCALL32_IMPL */
1814 
1815         /* Sockets can start using the filter as soon as the filter is added */
1816         if ((error = sof_entry_add(ent)) != 0)
1817                 sof_entry_free(ent);
1818 
1819         return (error);
1820 }
1821 
1822 /*
1823  * Socket configuration system call. It is used to add and remove
1824  * socket types.
1825  */
1826 int
1827 sockconfig(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
1828 {
1829         int error = 0;
1830 
1831         if (secpolicy_net_config(CRED(), B_FALSE) != 0)
1832                 return (set_errno(EPERM));
1833 
1834         if (sockfs_defer_nl7c_init) {
1835                 nl7c_init();
1836                 sockfs_defer_nl7c_init = 0;
1837         }
1838 
1839         switch (cmd) {
1840         case SOCKCONFIG_ADD_SOCK:
1841                 error = sockconf_add_sock((int)(uintptr_t)arg1,
1842                     (int)(uintptr_t)arg2, (int)(uintptr_t)arg3, arg4);
1843                 break;
1844         case SOCKCONFIG_REMOVE_SOCK:
1845                 error = sockconf_remove_sock((int)(uintptr_t)arg1,
1846                     (int)(uintptr_t)arg2, (int)(uintptr_t)arg3);
1847                 break;
1848         case SOCKCONFIG_ADD_FILTER:
1849                 error = sockconfig_add_filter((const char *)arg1, arg2);
1850                 break;
1851         case SOCKCONFIG_REMOVE_FILTER:
1852                 error = sockconfig_remove_filter((const char *)arg1);
1853                 break;
1854         case SOCKCONFIG_GET_SOCKTABLE:
1855                 error = sockparams_copyout_socktable((int)(uintptr_t)arg1);
1856                 break;
1857         default:
1858 #ifdef  DEBUG
1859                 cmn_err(CE_NOTE, "sockconfig: unkonwn subcommand %d", cmd);
1860 #endif
1861                 error = EINVAL;
1862                 break;
1863         }
1864 
1865         if (error != 0) {
1866                 eprintline(error);
1867                 return (set_errno(error));
1868         }
1869         return (0);
1870 }
1871 
1872 
1873 /*
1874  * Sendfile is implemented through two schemes, direct I/O or by
1875  * caching in the filesystem page cache. We cache the input file by
1876  * default and use direct I/O only if sendfile_max_size is set
1877  * appropriately as explained below. Note that this logic is consistent
1878  * with other filesystems where caching is turned on by default
1879  * unless explicitly turned off by using the DIRECTIO ioctl.
1880  *
1881  * We choose a slightly different scheme here. One can turn off
1882  * caching by setting sendfile_max_size to 0. One can also enable
1883  * caching of files <= sendfile_max_size by setting sendfile_max_size
1884  * to an appropriate value. By default sendfile_max_size is set to the
1885  * maximum value so that all files are cached. In future, we may provide
1886  * better interfaces for caching the file.
1887  *
1888  * Sendfile through Direct I/O (Zero copy)
1889  * --------------------------------------
1890  *
1891  * As disks are normally slower than the network, we can't have a
1892  * single thread that reads the disk and writes to the network. We
1893  * need to have parallelism. This is done by having the sendfile
1894  * thread create another thread that reads from the filesystem
1895  * and queues it for network processing. In this scheme, the data
1896  * is never copied anywhere i.e it is zero copy unlike the other
1897  * scheme.
1898  *
1899  * We have a sendfile queue (snfq) where each sendfile
1900  * request (snf_req_t) is queued for processing by a thread. Number
1901  * of threads is dynamically allocated and they exit if they are idling
1902  * beyond a specified amount of time. When each request (snf_req_t) is
1903  * processed by a thread, it produces a number of mblk_t structures to
1904  * be consumed by the sendfile thread. snf_deque and snf_enque are
1905  * used for consuming and producing mblks. Size of the filesystem
1906  * read is determined by the tunable (sendfile_read_size). A single
1907  * mblk holds sendfile_read_size worth of data (except the last
1908  * read of the file) which is sent down as a whole to the network.
1909  * sendfile_read_size is set to 1 MB as this seems to be the optimal
1910  * value for the UFS filesystem backed by a striped storage array.
1911  *
1912  * Synchronisation between read (producer) and write (consumer) threads.
1913  * --------------------------------------------------------------------
1914  *
 * sr_lock protects sr_mp_head and sr_mp_tail. The lock is held while
 * adding and deleting items in this list. An error can happen at any
 * time during read or write. There could be unprocessed mblks in the
 * sr_mp_XXX list when a read or write error occurs. Whenever an error
 * is encountered, we need two things to happen:
 *
 * a) One of the threads needs to clean up the mblks.
 * b) When one thread encounters an error, the other should stop.
1923  *
1924  * For (a), we don't want to penalize the reader thread as it could do
1925  * some useful work processing other requests. For (b), the error can
1926  * be detected by examining sr_read_error or sr_write_error.
 * sr_lock protects sr_read_error and sr_write_error. If both reader and
 * writer encounter an error, we need to report the write error back to
 * the application as that's what would have happened if the operations
 * were done sequentially. With this in mind, the following should work:
1931  *
1932  *      - Check for errors before read or write.
1933  *      - If the reader encounters error, set the error in sr_read_error.
1934  *        Check sr_write_error, if it is set, send cv_signal as it is
1935  *        waiting for reader to complete. If it is not set, the writer
1936  *        is either running sinking data to the network or blocked
1937  *        because of flow control. For handling the latter case, we
1938  *        always send a signal. In any case, it will examine sr_read_error
1939  *        and return. sr_read_error is marked with SR_READ_DONE to tell
1940  *        the writer that the reader is done in all the cases.
1941  *      - If the writer encounters error, set the error in sr_write_error.
1942  *        The reader thread is either blocked because of flow control or
1943  *        running reading data from the disk. For the former, we need to
1944  *        wakeup the thread. Again to keep it simple, we always wake up
1945  *        the reader thread. Then, wait for the read thread to complete
1946  *        if it is not done yet. Cleanup and return.
1947  *
1948  * High and low water marks for the read thread.
1949  * --------------------------------------------
1950  *
1951  * If sendfile() is used to send data over a slow network, we need to
1952  * make sure that the read thread does not produce data at a faster
1953  * rate than the network. This can happen if the disk is faster than
1954  * the network. In such a case, we don't want to build a very large queue.
1955  * But we would still like to get all of the network throughput possible.
1956  * This implies that network should never block waiting for data.
 * As there are a lot of disk throughput/network throughput combinations
 * possible, it is difficult to come up with an accurate number.
 * A typical 10K RPM disk has a max seek latency of 17ms and rotational
 * latency of 3ms for reading a disk block. Thus, the total latency to
 * initiate a new read, transfer data from the disk and queue for
 * transmission would take about a max of 25ms. Today's max transfer rate
 * for network is 100MB/sec. If the thread is blocked because of flow
 * control, it would take 25ms to get new data ready for transmission.
1965  * We have to make sure that network is not idling, while we are initiating
1966  * new transfers. So, at 100MB/sec, to keep network busy we would need
1967  * 2.5MB of data. Rounding off, we keep the low water mark to be 3MB of data.
1968  * We need to pick a high water mark so that the woken up thread would
1969  * do considerable work before blocking again to prevent thrashing. Currently,
1970  * we pick this to be 10 times that of the low water mark.
1971  *
1972  * Sendfile with segmap caching (One copy from page cache to mblks).
1973  * ----------------------------------------------------------------
1974  *
1975  * We use the segmap cache for caching the file, if the size of file
1976  * is <= sendfile_max_size. In this case we don't use threads as VM
1977  * is reasonably fast enough to keep up with the network. If the underlying
1978  * transport allows, we call segmap_getmapflt() to map MAXBSIZE (8K) worth
1979  * of data into segmap space, and use the virtual address from segmap
1980  * directly through desballoc() to avoid copy. Once the transport is done
1981  * with the data, the mapping will be released through segmap_release()
1982  * called by the call-back routine.
1983  *
1984  * If zero-copy is not allowed by the transport, we simply call VOP_READ()
1985  * to copy the data from the filesystem into our temporary network buffer.
1986  *
1987  * To disable caching, set sendfile_max_size to 0.
1988  */
1989 
/*
 * Tunable: size in bytes of a single filesystem read done on behalf of
 * direct I/O sendfile (1 MB by default).
 */
uint_t sendfile_read_size = 1024 * 1024;
/*
 * Default low water mark (3MB) for the per-request mblk queue.  The
 * replacement list is parenthesized so that the macro expands as a
 * single value inside any surrounding expression.
 */
#define SENDFILE_REQ_LOWAT      (3 * 1024 * 1024)
/*
 * Default low/high water marks; create_thread() copies them into each
 * request's sr_lowat and sr_hiwat.  The high water mark is ten times
 * the low water mark.
 */
uint_t sendfile_req_lowat = SENDFILE_REQ_LOWAT;
uint_t sendfile_req_hiwat = 10 * SENDFILE_REQ_LOWAT;
/* Counters; ss_full_waits and ss_empty_waits are bumped by the queue code. */
struct sendfile_stats sf_stats;
/* The sendfile request queue, allocated by sendfile_init(). */
struct sendfile_queue *snfq;
/* Wait time an idle service thread passes to cv_reltimedwait(). */
clock_t snfq_timeout;
/* Caching threshold; sendfile_init() sets it to MAXOFFSET_T. */
off64_t sendfile_max_size;

/* Producer and consumer sides of a request's mblk queue. */
static void snf_enque(snf_req_t *, mblk_t *);
static mblk_t *snf_deque(snf_req_t *);
2001 
2002 void
2003 sendfile_init(void)
2004 {
2005         snfq = kmem_zalloc(sizeof (struct sendfile_queue), KM_SLEEP);
2006 
2007         mutex_init(&snfq->snfq_lock, NULL, MUTEX_DEFAULT, NULL);
2008         cv_init(&snfq->snfq_cv, NULL, CV_DEFAULT, NULL);
2009         snfq->snfq_max_threads = max_ncpus;
2010         snfq_timeout = SNFQ_TIMEOUT;
2011         /* Cache all files by default. */
2012         sendfile_max_size = MAXOFFSET_T;
2013 }
2014 
2015 /*
2016  * Queues a mblk_t for network processing.
2017  */
2018 static void
2019 snf_enque(snf_req_t *sr, mblk_t *mp)
2020 {
2021         mp->b_next = NULL;
2022         mutex_enter(&sr->sr_lock);
2023         if (sr->sr_mp_head == NULL) {
2024                 sr->sr_mp_head = sr->sr_mp_tail = mp;
2025                 cv_signal(&sr->sr_cv);
2026         } else {
2027                 sr->sr_mp_tail->b_next = mp;
2028                 sr->sr_mp_tail = mp;
2029         }
2030         sr->sr_qlen += MBLKL(mp);
2031         while ((sr->sr_qlen > sr->sr_hiwat) &&
2032             (sr->sr_write_error == 0)) {
2033                 sf_stats.ss_full_waits++;
2034                 cv_wait(&sr->sr_cv, &sr->sr_lock);
2035         }
2036         mutex_exit(&sr->sr_lock);
2037 }
2038 
2039 /*
2040  * De-queues a mblk_t for network processing.
2041  */
2042 static mblk_t *
2043 snf_deque(snf_req_t *sr)
2044 {
2045         mblk_t *mp;
2046 
2047         mutex_enter(&sr->sr_lock);
2048         /*
2049          * If we have encountered an error on read or read is
2050          * completed and no more mblks, return NULL.
2051          * We need to check for NULL sr_mp_head also as
2052          * the reads could have completed and there is
2053          * nothing more to come.
2054          */
2055         if (((sr->sr_read_error & ~SR_READ_DONE) != 0) ||
2056             ((sr->sr_read_error & SR_READ_DONE) &&
2057             sr->sr_mp_head == NULL)) {
2058                 mutex_exit(&sr->sr_lock);
2059                 return (NULL);
2060         }
2061         /*
2062          * To start with neither SR_READ_DONE is marked nor
2063          * the error is set. When we wake up from cv_wait,
2064          * following are the possibilities :
2065          *
2066          *      a) sr_read_error is zero and mblks are queued.
2067          *      b) sr_read_error is set to SR_READ_DONE
2068          *         and mblks are queued.
2069          *      c) sr_read_error is set to SR_READ_DONE
2070          *         and no mblks.
2071          *      d) sr_read_error is set to some error other
2072          *         than SR_READ_DONE.
2073          */
2074 
2075         while ((sr->sr_read_error == 0) && (sr->sr_mp_head == NULL)) {
2076                 sf_stats.ss_empty_waits++;
2077                 cv_wait(&sr->sr_cv, &sr->sr_lock);
2078         }
2079         /* Handle (a) and (b) first  - the normal case. */
2080         if (((sr->sr_read_error & ~SR_READ_DONE) == 0) &&
2081             (sr->sr_mp_head != NULL)) {
2082                 mp = sr->sr_mp_head;
2083                 sr->sr_mp_head = mp->b_next;
2084                 sr->sr_qlen -= MBLKL(mp);
2085                 if (sr->sr_qlen < sr->sr_lowat)
2086                         cv_signal(&sr->sr_cv);
2087                 mutex_exit(&sr->sr_lock);
2088                 mp->b_next = NULL;
2089                 return (mp);
2090         }
2091         /* Handle (c) and (d). */
2092         mutex_exit(&sr->sr_lock);
2093         return (NULL);
2094 }
2095 
2096 /*
2097  * Reads data from the filesystem and queues it for network processing.
2098  */
2099 void
2100 snf_async_read(snf_req_t *sr)
2101 {
2102         size_t iosize;
2103         u_offset_t fileoff;
2104         u_offset_t size;
2105         int ret_size;
2106         int error;
2107         file_t *fp;
2108         mblk_t *mp;
2109         struct vnode *vp;
2110         int extra = 0;
2111         int maxblk = 0;
2112         int wroff = 0;
2113         struct sonode *so;
2114 
2115         fp = sr->sr_fp;
2116         size = sr->sr_file_size;
2117         fileoff = sr->sr_file_off;
2118 
2119         /*
2120          * Ignore the error for filesystems that doesn't support DIRECTIO.
2121          */
2122         (void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_ON, 0,
2123             kcred, NULL, NULL);
2124 
2125         vp = sr->sr_vp;
2126         if (vp->v_type == VSOCK) {
2127                 stdata_t *stp;
2128 
2129                 /*
2130                  * Get the extra space to insert a header and a trailer.
2131                  */
2132                 so = VTOSO(vp);
2133                 stp = vp->v_stream;
2134                 if (stp == NULL) {
2135                         wroff = so->so_proto_props.sopp_wroff;
2136                         maxblk = so->so_proto_props.sopp_maxblk;
2137                         extra = wroff + so->so_proto_props.sopp_tail;
2138                 } else {
2139                         wroff = (int)(stp->sd_wroff);
2140                         maxblk = (int)(stp->sd_maxblk);
2141                         extra = wroff + (int)(stp->sd_tail);
2142                 }
2143         }
2144 
2145         while ((size != 0) && (sr->sr_write_error == 0)) {
2146 
2147                 iosize = (int)MIN(sr->sr_maxpsz, size);
2148 
2149                 /*
2150                  * Socket filters can limit the mblk size,
2151                  * so limit reads to maxblk if there are
2152                  * filters present.
2153                  */
2154                 if (vp->v_type == VSOCK &&
2155                     so->so_filter_active > 0 && maxblk != INFPSZ)
2156                         iosize = (int)MIN(iosize, maxblk);
2157 
2158                 if (is_system_labeled()) {
2159                         mp = allocb_cred(iosize + extra, CRED(),
2160                             curproc->p_pid);
2161                 } else {
2162                         mp = allocb(iosize + extra, BPRI_MED);
2163                 }
2164                 if (mp == NULL) {
2165                         error = EAGAIN;
2166                         break;
2167                 }
2168 
2169                 mp->b_rptr += wroff;
2170 
2171                 ret_size = soreadfile(fp, mp->b_rptr, fileoff, &error, iosize);
2172 
2173                 /* Error or Reached EOF ? */
2174                 if ((error != 0) || (ret_size == 0)) {
2175                         freeb(mp);
2176                         break;
2177                 }
2178                 mp->b_wptr = mp->b_rptr + ret_size;
2179 
2180                 snf_enque(sr, mp);
2181                 size -= ret_size;
2182                 fileoff += ret_size;
2183         }
2184         (void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_OFF, 0,
2185             kcred, NULL, NULL);
2186         mutex_enter(&sr->sr_lock);
2187         sr->sr_read_error = error;
2188         sr->sr_read_error |= SR_READ_DONE;
2189         cv_signal(&sr->sr_cv);
2190         mutex_exit(&sr->sr_lock);
2191 }
2192 
2193 void
2194 snf_async_thread(void)
2195 {
2196         snf_req_t *sr;
2197         callb_cpr_t cprinfo;
2198         clock_t time_left = 1;
2199 
2200         CALLB_CPR_INIT(&cprinfo, &snfq->snfq_lock, callb_generic_cpr, "snfq");
2201 
2202         mutex_enter(&snfq->snfq_lock);
2203         for (;;) {
2204                 /*
2205                  * If we didn't find a entry, then block until woken up
2206                  * again and then look through the queues again.
2207                  */
2208                 while ((sr = snfq->snfq_req_head) == NULL) {
2209                         CALLB_CPR_SAFE_BEGIN(&cprinfo);
2210                         if (time_left <= 0) {
2211                                 snfq->snfq_svc_threads--;
2212                                 CALLB_CPR_EXIT(&cprinfo);
2213                                 thread_exit();
2214                                 /* NOTREACHED */
2215                         }
2216                         snfq->snfq_idle_cnt++;
2217 
2218                         time_left = cv_reltimedwait(&snfq->snfq_cv,
2219                             &snfq->snfq_lock, snfq_timeout, TR_CLOCK_TICK);
2220                         snfq->snfq_idle_cnt--;
2221 
2222                         CALLB_CPR_SAFE_END(&cprinfo, &snfq->snfq_lock);
2223                 }
2224                 snfq->snfq_req_head = sr->sr_next;
2225                 snfq->snfq_req_cnt--;
2226                 mutex_exit(&snfq->snfq_lock);
2227                 snf_async_read(sr);
2228                 mutex_enter(&snfq->snfq_lock);
2229         }
2230 }
2231 
2232 
2233 snf_req_t *
2234 create_thread(int operation, struct vnode *vp, file_t *fp,
2235     u_offset_t fileoff, u_offset_t size)
2236 {
2237         snf_req_t *sr;
2238         stdata_t *stp;
2239 
2240         sr = (snf_req_t *)kmem_zalloc(sizeof (snf_req_t), KM_SLEEP);
2241 
2242         sr->sr_vp = vp;
2243         sr->sr_fp = fp;
2244         stp = vp->v_stream;
2245 
2246         /*
2247          * store sd_qn_maxpsz into sr_maxpsz while we have stream head.
2248          * stream might be closed before thread returns from snf_async_read.
2249          */
2250         if (stp != NULL && stp->sd_qn_maxpsz > 0) {
2251                 sr->sr_maxpsz = MIN(MAXBSIZE, stp->sd_qn_maxpsz);
2252         } else {
2253                 sr->sr_maxpsz = MAXBSIZE;
2254         }
2255 
2256         sr->sr_operation = operation;
2257         sr->sr_file_off = fileoff;
2258         sr->sr_file_size = size;
2259         sr->sr_hiwat = sendfile_req_hiwat;
2260         sr->sr_lowat = sendfile_req_lowat;
2261         mutex_init(&sr->sr_lock, NULL, MUTEX_DEFAULT, NULL);
2262         cv_init(&sr->sr_cv, NULL, CV_DEFAULT, NULL);
2263         /*
2264          * See whether we need another thread for servicing this
2265          * request. If there are already enough requests queued
2266          * for the threads, create one if not exceeding
2267          * snfq_max_threads.
2268          */
2269         mutex_enter(&snfq->snfq_lock);
2270         if (snfq->snfq_req_cnt >= snfq->snfq_idle_cnt &&
2271             snfq->snfq_svc_threads < snfq->snfq_max_threads) {
2272                 (void) thread_create(NULL, 0, &snf_async_thread, 0, 0, &p0,
2273                     TS_RUN, minclsyspri);
2274                 snfq->snfq_svc_threads++;
2275         }
2276         if (snfq->snfq_req_head == NULL) {
2277                 snfq->snfq_req_head = snfq->snfq_req_tail = sr;
2278                 cv_signal(&snfq->snfq_cv);
2279         } else {
2280                 snfq->snfq_req_tail->sr_next = sr;
2281                 snfq->snfq_req_tail = sr;
2282         }
2283         snfq->snfq_req_cnt++;
2284         mutex_exit(&snfq->snfq_lock);
2285         return (sr);
2286 }
2287 
2288 int
2289 snf_direct_io(file_t *fp, file_t *rfp, u_offset_t fileoff, u_offset_t size,
2290     ssize_t *count)
2291 {
2292         snf_req_t *sr;
2293         mblk_t *mp;
2294         int iosize;
2295         int error = 0;
2296         short fflag;
2297         struct vnode *vp;
2298         int ksize;
2299         struct nmsghdr msg;
2300 
2301         ksize = 0;
2302         *count = 0;
2303         bzero(&msg, sizeof (msg));
2304 
2305         vp = fp->f_vnode;
2306         fflag = fp->f_flag;
2307         if ((sr = create_thread(READ_OP, vp, rfp, fileoff, size)) == NULL)
2308                 return (EAGAIN);
2309 
2310         /*
2311          * We check for read error in snf_deque. It has to check
2312          * for successful READ_DONE and return NULL, and we might
2313          * as well make an additional check there.
2314          */
2315         while ((mp = snf_deque(sr)) != NULL) {
2316 
2317                 if (ISSIG(curthread, JUSTLOOKING)) {
2318                         freeb(mp);
2319                         error = EINTR;
2320                         break;
2321                 }
2322                 iosize = MBLKL(mp);
2323 
2324                 error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
2325 
2326                 if (error != 0) {
2327                         if (mp != NULL)
2328                                 freeb(mp);
2329                         break;
2330                 }
2331                 ksize += iosize;
2332         }
2333         *count = ksize;
2334 
2335         mutex_enter(&sr->sr_lock);
2336         sr->sr_write_error = error;
2337         /* Look at the big comments on why we cv_signal here. */
2338         cv_signal(&sr->sr_cv);
2339 
2340         /* Wait for the reader to complete always. */
2341         while (!(sr->sr_read_error & SR_READ_DONE)) {
2342                 cv_wait(&sr->sr_cv, &sr->sr_lock);
2343         }
2344         /* If there is no write error, check for read error. */
2345         if (error == 0)
2346                 error = (sr->sr_read_error & ~SR_READ_DONE);
2347 
2348         if (error != 0) {
2349                 mblk_t *next_mp;
2350 
2351                 mp = sr->sr_mp_head;
2352                 while (mp != NULL) {
2353                         next_mp = mp->b_next;
2354                         mp->b_next = NULL;
2355                         freeb(mp);
2356                         mp = next_mp;
2357                 }
2358         }
2359         mutex_exit(&sr->sr_lock);
2360         kmem_free(sr, sizeof (snf_req_t));
2361         return (error);
2362 }
2363 
/* Maximum no. of pages allocated by vpm for sendfile at a time */
#define SNF_VPMMAXPGS   (VPMMAXPGS/2)

/*
 * Maximum no. of elements in the list returned by vpm, including
 * NULL for the last entry
 */
#define SNF_MAXVMAPS    (SNF_VPMMAXPGS + 1)

/*
 * Bookkeeping for a set of vpm page mappings; it is passed to
 * snf_vmap_desbfree() as the free-routine argument.
 */
typedef struct {
        /* dropped by snf_vmap_desbfree(); at zero the pages are unmapped */
        unsigned int    snfv_ref;
        /* free routine; snf_segmap() points it at snf_vmap_desbfree() */
        frtn_t          snfv_frtn;
        /* vnode released with VN_RELE() on the final free */
        vnode_t         *snfv_vp;
        /* list filled by vpm_map_pages(), undone by vpm_unmap_pages() */
        struct vmap     snfv_vml[SNF_MAXVMAPS];
} snf_vmap_desbinfo;

/*
 * Bookkeeping for one segmap mapping; it is consumed by
 * snf_smap_desbfree().
 */
typedef struct {
        /* presumably set up like snfv_frtn; the setup is not visible here */
        frtn_t          snfi_frtn;
        /* mapping address, handed to segmap_release() */
        caddr_t         snfi_base;
        /* offset added to snfi_base to locate the softlocked range */
        uint_t          snfi_mapoff;
        /* length passed to segmap_fault(F_SOFTUNLOCK) */
        size_t          snfi_len;
        /* vnode released with VN_RELE() when the mapping is dropped */
        vnode_t         *snfi_vp;
} snf_smap_desbinfo;
2387 
2388 /*
2389  * The callback function used for vpm mapped mblks called when the last ref of
2390  * the mblk is dropped which normally occurs when TCP receives the ack. But it
2391  * can be the driver too due to lazy reclaim.
2392  */
2393 void
2394 snf_vmap_desbfree(snf_vmap_desbinfo *snfv)
2395 {
2396         ASSERT(snfv->snfv_ref != 0);
2397         if (atomic_dec_32_nv(&snfv->snfv_ref) == 0) {
2398                 vpm_unmap_pages(snfv->snfv_vml, S_READ);
2399                 VN_RELE(snfv->snfv_vp);
2400                 kmem_free(snfv, sizeof (snf_vmap_desbinfo));
2401         }
2402 }
2403 
2404 /*
2405  * The callback function used for segmap'ped mblks called when the last ref of
2406  * the mblk is dropped which normally occurs when TCP receives the ack. But it
2407  * can be the driver too due to lazy reclaim.
2408  */
2409 void
2410 snf_smap_desbfree(snf_smap_desbinfo *snfi)
2411 {
2412         if (! IS_KPM_ADDR(snfi->snfi_base)) {
2413                 /*
2414                  * We don't need to call segmap_fault(F_SOFTUNLOCK) for
2415                  * segmap_kpm as long as the latter never falls back to
2416                  * "use_segmap_range". (See segmap_getmapflt().)
2417                  *
2418                  * Using S_OTHER saves an redundant hat_setref() in
2419                  * segmap_unlock()
2420                  */
2421                 (void) segmap_fault(kas.a_hat, segkmap,
2422                     (caddr_t)(uintptr_t)(((uintptr_t)snfi->snfi_base +
2423                     snfi->snfi_mapoff) & PAGEMASK), snfi->snfi_len,
2424                     F_SOFTUNLOCK, S_OTHER);
2425         }
2426         (void) segmap_release(segkmap, snfi->snfi_base, SM_DONTNEED);
2427         VN_RELE(snfi->snfi_vp);
2428         kmem_free(snfi, sizeof (*snfi));
2429 }
2430 
2431 /*
 * Use segmap or vpm instead of bcopy to send down a desballoca'ed mblk.
 * When segmap is used, the mblk contains a segmap slot of no more
 * than MAXBSIZE.
2435  *
2436  * With vpm, a maximum of SNF_MAXVMAPS page-sized mappings can be obtained
2437  * in each iteration and sent by socket_sendmblk until an error occurs or
2438  * the requested size has been transferred. An mblk is esballoca'ed from
2439  * each mapped page and a chain of these mblk is sent to the transport layer.
2440  * vpm will be called to unmap the pages when all mblks have been freed by
2441  * free_func.
2442  *
2443  * At the end of the whole sendfile() operation, we wait till the data from
2444  * the last mblk is ack'ed by the transport before returning so that the
2445  * caller of sendfile() can safely modify the file content.
2446  *
2447  * The caller of this function should make sure that total_size does not exceed
2448  * the actual file size of fvp.
2449  */
2450 int
2451 snf_segmap(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t total_size,
2452     ssize_t *count, boolean_t nowait)
2453 {
2454         caddr_t base;
2455         int mapoff;
2456         vnode_t *vp;
2457         mblk_t *mp = NULL;
2458         int chain_size;
2459         int error;
2460         clock_t deadlk_wait;
2461         short fflag;
2462         int ksize;
2463         struct vattr va;
2464         boolean_t dowait = B_FALSE;
2465         struct nmsghdr msg;
2466 
2467         vp = fp->f_vnode;
2468         fflag = fp->f_flag;
2469         ksize = 0;
2470         bzero(&msg, sizeof (msg));
2471 
2472         for (;;) {
2473                 if (ISSIG(curthread, JUSTLOOKING)) {
2474                         error = EINTR;
2475                         break;
2476                 }
2477 
2478                 if (vpm_enable) {
2479                         snf_vmap_desbinfo *snfv;
2480                         mblk_t *nmp;
2481                         int mblk_size;
2482                         int maxsize;
2483                         int i;
2484 
2485                         mapoff = fileoff & PAGEOFFSET;
2486                         maxsize = MIN((SNF_VPMMAXPGS * PAGESIZE), total_size);
2487 
2488                         snfv = kmem_zalloc(sizeof (snf_vmap_desbinfo),
2489                             KM_SLEEP);
2490 
2491                         /*
2492                          * Get vpm mappings for maxsize with read access.
2493                          * If the pages aren't available yet, we get
2494                          * DEADLK, so wait and try again a little later using
2495                          * an increasing wait. We might be here a long time.
2496                          *
2497                          * If delay_sig returns EINTR, be sure to exit and
2498                          * pass it up to the caller.
2499                          */
2500                         deadlk_wait = 0;
2501                         while ((error = vpm_map_pages(fvp, fileoff,
2502                             (size_t)maxsize, (VPM_FETCHPAGE), snfv->snfv_vml,
2503                             SNF_MAXVMAPS, NULL, S_READ)) == EDEADLK) {
2504                                 deadlk_wait += (deadlk_wait < 5) ? 1 : 4;
2505                                 if ((error = delay_sig(deadlk_wait)) != 0) {
2506                                         break;
2507                                 }
2508                         }
2509                         if (error != 0) {
2510                                 kmem_free(snfv, sizeof (snf_vmap_desbinfo));
2511                                 error = (error == EINTR) ? EINTR : EIO;
2512                                 goto out;
2513                         }
2514                         snfv->snfv_frtn.free_func = snf_vmap_desbfree;
2515                         snfv->snfv_frtn.free_arg = (caddr_t)snfv;
2516 
2517                         /* Construct the mblk chain from the page mappings */
2518                         chain_size = 0;
2519                         for (i = 0; (snfv->snfv_vml[i].vs_addr != NULL) &&
2520                             total_size > 0; i++) {
2521                                 ASSERT(chain_size < maxsize);
2522                                 mblk_size = MIN(snfv->snfv_vml[i].vs_len -
2523                                     mapoff, total_size);
2524                                 nmp = esballoca(
2525                                     (uchar_t *)snfv->snfv_vml[i].vs_addr +
2526                                     mapoff, mblk_size, BPRI_HI,
2527                                     &snfv->snfv_frtn);
2528 
2529                                 /*
2530                                  * We return EAGAIN after unmapping the pages
2531                                  * if we cannot allocate the the head of the
2532                                  * chain. Otherwise, we continue sending the
2533                                  * mblks constructed so far.
2534                                  */
2535                                 if (nmp == NULL) {
2536                                         if (i == 0) {
2537                                                 vpm_unmap_pages(snfv->snfv_vml,
2538                                                     S_READ);
2539                                                 kmem_free(snfv,
2540                                                     sizeof (snf_vmap_desbinfo));
2541                                                 error = EAGAIN;
2542                                                 goto out;
2543                                         }
2544                                         break;
2545                                 }
2546                                 /* Mark this dblk with the zero-copy flag */
2547                                 nmp->b_datap->db_struioflag |= STRUIO_ZC;
2548                                 nmp->b_wptr += mblk_size;
2549                                 chain_size += mblk_size;
2550                                 fileoff += mblk_size;
2551                                 total_size -= mblk_size;
2552                                 snfv->snfv_ref++;
2553                                 mapoff = 0;
2554                                 if (i > 0)
2555                                         linkb(mp, nmp);
2556                                 else
2557                                         mp = nmp;
2558                         }
2559                         VN_HOLD(fvp);
2560                         snfv->snfv_vp = fvp;
2561                 } else {
2562                         /* vpm not supported. fallback to segmap */
2563                         snf_smap_desbinfo *snfi;
2564 
2565                         mapoff = fileoff & MAXBOFFSET;
2566                         chain_size = MAXBSIZE - mapoff;
2567                         if (chain_size > total_size)
2568                                 chain_size = total_size;
2569                         /*
2570                          * we don't forcefault because we'll call
2571                          * segmap_fault(F_SOFTLOCK) next.
2572                          *
2573                          * S_READ will get the ref bit set (by either
2574                          * segmap_getmapflt() or segmap_fault()) and page
2575                          * shared locked.
2576                          */
2577                         base = segmap_getmapflt(segkmap, fvp, fileoff,
2578                             chain_size, segmap_kpm ? SM_FAULT : 0, S_READ);
2579 
2580                         snfi = kmem_alloc(sizeof (*snfi), KM_SLEEP);
2581                         snfi->snfi_len = (size_t)roundup(mapoff+chain_size,
2582                             PAGESIZE)- (mapoff & PAGEMASK);
2583                         /*
2584                          * We must call segmap_fault() even for segmap_kpm
2585                          * because that's how error gets returned.
2586                          * (segmap_getmapflt() never fails but segmap_fault()
2587                          * does.)
2588                          *
2589                          * If the pages aren't available yet, we get
2590                          * DEADLK, so wait and try again a little later using
2591                          * an increasing wait. We might be here a long time.
2592                          *
2593                          * If delay_sig returns EINTR, be sure to exit and
2594                          * pass it up to the caller.
2595                          */
2596                         deadlk_wait = 0;
2597                         while ((error = FC_ERRNO(segmap_fault(kas.a_hat,
2598                             segkmap, (caddr_t)(uintptr_t)(((uintptr_t)base +
2599                             mapoff) & PAGEMASK), snfi->snfi_len, F_SOFTLOCK,
2600                             S_READ))) == EDEADLK) {
2601                                 deadlk_wait += (deadlk_wait < 5) ? 1 : 4;
2602                                 if ((error = delay_sig(deadlk_wait)) != 0) {
2603                                         break;
2604                                 }
2605                         }
2606                         if (error != 0) {
2607                                 (void) segmap_release(segkmap, base, 0);
2608                                 kmem_free(snfi, sizeof (*snfi));
2609                                 error = (error == EINTR) ? EINTR : EIO;
2610                                 goto out;
2611                         }
2612                         snfi->snfi_frtn.free_func = snf_smap_desbfree;
2613                         snfi->snfi_frtn.free_arg = (caddr_t)snfi;
2614                         snfi->snfi_base = base;
2615                         snfi->snfi_mapoff = mapoff;
2616                         mp = esballoca((uchar_t *)base + mapoff, chain_size,
2617                             BPRI_HI, &snfi->snfi_frtn);
2618 
2619                         if (mp == NULL) {
2620                                 (void) segmap_fault(kas.a_hat, segkmap,
2621                                     (caddr_t)(uintptr_t)(((uintptr_t)base +
2622                                     mapoff) & PAGEMASK), snfi->snfi_len,
2623                                     F_SOFTUNLOCK, S_OTHER);
2624                                 (void) segmap_release(segkmap, base, 0);
2625                                 kmem_free(snfi, sizeof (*snfi));
2626                                 freemsg(mp);
2627                                 error = EAGAIN;
2628                                 goto out;
2629                         }
2630                         VN_HOLD(fvp);
2631                         snfi->snfi_vp = fvp;
2632                         mp->b_wptr += chain_size;
2633 
2634                         /* Mark this dblk with the zero-copy flag */
2635                         mp->b_datap->db_struioflag |= STRUIO_ZC;
2636                         fileoff += chain_size;
2637                         total_size -= chain_size;
2638                 }
2639 
2640                 if (total_size == 0 && !nowait) {
2641                         ASSERT(!dowait);
2642                         dowait = B_TRUE;
2643                         mp->b_datap->db_struioflag |= STRUIO_ZCNOTIFY;
2644                 }
2645                 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2646                 error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
2647                 if (error != 0) {
2648                         /*
2649                          * mp contains the mblks that were not sent by
2650                          * socket_sendmblk. Use its size to update *count
2651                          */
2652                         *count = ksize + (chain_size - msgdsize(mp));
2653                         if (mp != NULL)
2654                                 freemsg(mp);
2655                         return (error);
2656                 }
2657                 ksize += chain_size;
2658                 if (total_size == 0)
2659                         goto done;
2660 
2661                 (void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2662                 va.va_mask = AT_SIZE;
2663                 error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
2664                 if (error)
2665                         break;
2666                 /* Read as much as possible. */
2667                 if (fileoff >= va.va_size)
2668                         break;
2669                 if (total_size + fileoff > va.va_size)
2670                         total_size = va.va_size - fileoff;
2671         }
2672 out:
2673         VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2674 done:
2675         *count = ksize;
2676         if (dowait) {
2677                 stdata_t *stp;
2678 
2679                 stp = vp->v_stream;
2680                 if (stp == NULL) {
2681                         struct sonode *so;
2682                         so = VTOSO(vp);
2683                         error = so_zcopy_wait(so);
2684                 } else {
2685                         mutex_enter(&stp->sd_lock);
2686                         while (!(stp->sd_flag & STZCNOTIFY)) {
2687                                 if (cv_wait_sig(&stp->sd_zcopy_wait,
2688                                     &stp->sd_lock) == 0) {
2689                                         error = EINTR;
2690                                         break;
2691                                 }
2692                         }
2693                         stp->sd_flag &= ~STZCNOTIFY;
2694                         mutex_exit(&stp->sd_lock);
2695                 }
2696         }
2697         return (error);
2698 }
2699 
2700 int
2701 snf_cache(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size,
2702     uint_t maxpsz, ssize_t *count)
2703 {
2704         struct vnode *vp;
2705         mblk_t *mp;
2706         int iosize;
2707         int extra = 0;
2708         int error;
2709         short fflag;
2710         int ksize;
2711         int ioflag;
2712         struct uio auio;
2713         struct iovec aiov;
2714         struct vattr va;
2715         int maxblk = 0;
2716         int wroff = 0;
2717         struct sonode *so;
2718         struct nmsghdr msg;
2719 
2720         vp = fp->f_vnode;
2721         if (vp->v_type == VSOCK) {
2722                 stdata_t *stp;
2723 
2724                 /*
2725                  * Get the extra space to insert a header and a trailer.
2726                  */
2727                 so = VTOSO(vp);
2728                 stp = vp->v_stream;
2729                 if (stp == NULL) {
2730                         wroff = so->so_proto_props.sopp_wroff;
2731                         maxblk = so->so_proto_props.sopp_maxblk;
2732                         extra = wroff + so->so_proto_props.sopp_tail;
2733                 } else {
2734                         wroff = (int)(stp->sd_wroff);
2735                         maxblk = (int)(stp->sd_maxblk);
2736                         extra = wroff + (int)(stp->sd_tail);
2737                 }
2738         }
2739         bzero(&msg, sizeof (msg));
2740         fflag = fp->f_flag;
2741         ksize = 0;
2742         auio.uio_iov = &aiov;
2743         auio.uio_iovcnt = 1;
2744         auio.uio_segflg = UIO_SYSSPACE;
2745         auio.uio_llimit = MAXOFFSET_T;
2746         auio.uio_fmode = fflag;
2747         auio.uio_extflg = UIO_COPY_CACHED;
2748         ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
2749         /* If read sync is not asked for, filter sync flags */
2750         if ((ioflag & FRSYNC) == 0)
2751                 ioflag &= ~(FSYNC|FDSYNC);
2752         for (;;) {
2753                 if (ISSIG(curthread, JUSTLOOKING)) {
2754                         error = EINTR;
2755                         break;
2756                 }
2757                 iosize = (int)MIN(maxpsz, size);
2758 
2759                 /*
2760                  * Socket filters can limit the mblk size,
2761                  * so limit reads to maxblk if there are
2762                  * filters present.
2763                  */
2764                 if (vp->v_type == VSOCK &&
2765                     so->so_filter_active > 0 && maxblk != INFPSZ)
2766                         iosize = (int)MIN(iosize, maxblk);
2767 
2768                 if (is_system_labeled()) {
2769                         mp = allocb_cred(iosize + extra, CRED(),
2770                             curproc->p_pid);
2771                 } else {
2772                         mp = allocb(iosize + extra, BPRI_MED);
2773                 }
2774                 if (mp == NULL) {
2775                         error = EAGAIN;
2776                         break;
2777                 }
2778 
2779                 mp->b_rptr += wroff;
2780 
2781                 aiov.iov_base = (caddr_t)mp->b_rptr;
2782                 aiov.iov_len = iosize;
2783                 auio.uio_loffset = fileoff;
2784                 auio.uio_resid = iosize;
2785 
2786                 error = VOP_READ(fvp, &auio, ioflag, fp->f_cred, NULL);
2787                 iosize -= auio.uio_resid;
2788 
2789                 if (error == EINTR && iosize != 0)
2790                         error = 0;
2791 
2792                 if (error != 0 || iosize == 0) {
2793                         freeb(mp);
2794                         break;
2795                 }
2796                 mp->b_wptr = mp->b_rptr + iosize;
2797 
2798                 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2799 
2800                 error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
2801 
2802                 if (error != 0) {
2803                         *count = ksize;
2804                         if (mp != NULL)
2805                                 freeb(mp);
2806                         return (error);
2807                 }
2808                 ksize += iosize;
2809                 size -= iosize;
2810                 if (size == 0)
2811                         goto done;
2812 
2813                 fileoff += iosize;
2814                 (void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2815                 va.va_mask = AT_SIZE;
2816                 error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
2817                 if (error)
2818                         break;
2819                 /* Read as much as possible. */
2820                 if (fileoff >= va.va_size)
2821                         size = 0;
2822                 else if (size + fileoff > va.va_size)
2823                         size = va.va_size - fileoff;
2824         }
2825         VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2826 done:
2827         *count = ksize;
2828         return (error);
2829 }
2830 
2831 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
2832 /*
2833  * Largefile support for 32 bit applications only.
2834  */
2835 int
2836 sosendfile64(file_t *fp, file_t *rfp, const struct ksendfilevec64 *sfv,
2837     ssize32_t *count32)
2838 {
2839         ssize32_t sfv_len;
2840         u_offset_t sfv_off, va_size;
2841         struct vnode *vp, *fvp, *realvp;
2842         struct vattr va;
2843         stdata_t *stp;
2844         ssize_t count = 0;
2845         int error = 0;
2846         boolean_t dozcopy = B_FALSE;
2847         uint_t maxpsz;
2848 
2849         sfv_len = (ssize32_t)sfv->sfv_len;
2850         if (sfv_len < 0) {
2851                 error = EINVAL;
2852                 goto out;
2853         }
2854 
2855         if (sfv_len == 0) goto out;
2856 
2857         sfv_off = (u_offset_t)sfv->sfv_off;
2858 
2859         /* Same checks as in pread */
2860         if (sfv_off > MAXOFFSET_T) {
2861                 error = EINVAL;
2862                 goto out;
2863         }
2864         if (sfv_off + sfv_len > MAXOFFSET_T)
2865                 sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);
2866 
2867         /*
2868          * There are no more checks on sfv_len. So, we cast it to
2869          * u_offset_t and share the snf_direct_io/snf_cache code between
2870          * 32 bit and 64 bit.
2871          *
2872          * TODO: should do nbl_need_check() like read()?
2873          */
2874         if (sfv_len > sendfile_max_size) {
2875                 sf_stats.ss_file_not_cached++;
2876                 error = snf_direct_io(fp, rfp, sfv_off, (u_offset_t)sfv_len,
2877                     &count);
2878                 goto out;
2879         }
2880         fvp = rfp->f_vnode;
2881         if (VOP_REALVP(fvp, &realvp, NULL) == 0)
2882                 fvp = realvp;
2883         /*
2884          * Grab the lock as a reader to prevent the file size
2885          * from changing underneath.
2886          */
2887         (void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2888         va.va_mask = AT_SIZE;
2889         error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
2890         va_size = va.va_size;
2891         if ((error != 0) || (va_size == 0) || (sfv_off >= va_size)) {
2892                 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2893                 goto out;
2894         }
2895         /* Read as much as possible. */
2896         if (sfv_off + sfv_len > va_size)
2897                 sfv_len = va_size - sfv_off;
2898 
2899         vp = fp->f_vnode;
2900         stp = vp->v_stream;
2901         /*
2902          * When the NOWAIT flag is not set, we enable zero-copy only if the
2903          * transfer size is large enough. This prevents performance loss
2904          * when the caller sends the file piece by piece.
2905          */
2906         if (sfv_len >= MAXBSIZE && (sfv_len >= (va_size >> 1) ||
2907             (sfv->sfv_flag & SFV_NOWAIT) || sfv_len >= 0x1000000) &&
2908             !vn_has_flocks(fvp) && !(fvp->v_flag & VNOMAP)) {
2909                 uint_t copyflag;
2910                 copyflag = stp != NULL ? stp->sd_copyflag :
2911                     VTOSO(vp)->so_proto_props.sopp_zcopyflag;
2912                 if ((copyflag & (STZCVMSAFE|STZCVMUNSAFE)) == 0) {
2913                         int on = 1;
2914 
2915                         if (socket_setsockopt(VTOSO(vp), SOL_SOCKET,
2916                             SO_SND_COPYAVOID, &on, sizeof (on), CRED()) == 0)
2917                                 dozcopy = B_TRUE;
2918                 } else {
2919                         dozcopy = copyflag & STZCVMSAFE;
2920                 }
2921         }
2922         if (dozcopy) {
2923                 sf_stats.ss_file_segmap++;
2924                 error = snf_segmap(fp, fvp, sfv_off, (u_offset_t)sfv_len,
2925                     &count, ((sfv->sfv_flag & SFV_NOWAIT) != 0));
2926         } else {
2927                 if (vp->v_type == VSOCK && stp == NULL) {
2928                         sonode_t *so = VTOSO(vp);
2929                         maxpsz = so->so_proto_props.sopp_maxpsz;
2930                 } else if (stp != NULL) {
2931                         maxpsz = stp->sd_qn_maxpsz;
2932                 } else {
2933                         maxpsz = maxphys;
2934                 }
2935 
2936                 if (maxpsz == INFPSZ)
2937                         maxpsz = maxphys;
2938                 else
2939                         maxpsz = roundup(maxpsz, MAXBSIZE);
2940                 sf_stats.ss_file_cached++;
2941                 error = snf_cache(fp, fvp, sfv_off, (u_offset_t)sfv_len,
2942                     maxpsz, &count);
2943         }
2944 out:
2945         releasef(sfv->sfv_fd);
2946         *count32 = (ssize32_t)count;
2947         return (error);
2948 }
2949 #endif
2950 
2951 #ifdef _SYSCALL32_IMPL
2952 /*
2953  * recv32(), recvfrom32(), send32(), sendto32(): intentionally return a
2954  * ssize_t rather than ssize32_t; see the comments above read32 for details.
2955  */
2956 
2957 ssize_t
2958 recv32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags)
2959 {
2960         return (recv(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags));
2961 }
2962 
2963 ssize_t
2964 recvfrom32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags,
2965     caddr32_t name, caddr32_t namelenp)
2966 {
2967         return (recvfrom(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags,
2968             (void *)(uintptr_t)name, (void *)(uintptr_t)namelenp));
2969 }
2970 
2971 ssize_t
2972 send32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags)
2973 {
2974         return (send(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags));
2975 }
2976 
2977 ssize_t
2978 sendto32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags,
2979     caddr32_t name, socklen_t namelen)
2980 {
2981         return (sendto(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags,
2982             (void *)(uintptr_t)name, namelen));
2983 }
2984 #endif  /* _SYSCALL32_IMPL */
2985 
2986 /*
2987  * Function wrappers (mostly around the sonode switch) for
2988  * backward compatibility.
2989  */
2990 
/*
 * Compatibility wrapper: socket_accept() with CRED() as the credential.
 * The new socket is returned through nsop.
 */
int
soaccept(struct sonode *so, int fflag, struct sonode **nsop)
{
        int rv;

        rv = socket_accept(so, fflag, CRED(), nsop);
        return (rv);
}
2996 
2997 int
2998 sobind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
2999     int backlog, int flags)
3000 {
3001         int     error;
3002 
3003         error = socket_bind(so, name, namelen, flags, CRED());
3004         if (error == 0 && backlog != 0)
3005                 return (socket_listen(so, backlog, CRED()));
3006 
3007         return (error);
3008 }
3009 
/*
 * Compatibility wrapper: socket_listen() with CRED() as the credential.
 */
int
solisten(struct sonode *so, int backlog)
{
        int rv;

        rv = socket_listen(so, backlog, CRED());
        return (rv);
}
3015 
3016 int
3017 soconnect(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3018     int fflag, int flags)
3019 {
3020         return (socket_connect(so, name, namelen, fflag, flags, CRED()));
3021 }
3022 
/*
 * Compatibility wrapper: socket_recvmsg() with CRED() as the credential.
 */
int
sorecvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
{
        int rv;

        rv = socket_recvmsg(so, msg, uiop, CRED());
        return (rv);
}
3028 
/*
 * Compatibility wrapper: socket_sendmsg() with CRED() as the credential.
 */
int
sosendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
{
        int rv;

        rv = socket_sendmsg(so, msg, uiop, CRED());
        return (rv);
}
3034 
/*
 * Compatibility wrapper: socket_shutdown() with CRED() as the credential.
 */
int
soshutdown(struct sonode *so, int how)
{
        int rv;

        rv = socket_shutdown(so, how, CRED());
        return (rv);
}
3040 
3041 int
3042 sogetsockopt(struct sonode *so, int level, int option_name, void *optval,
3043     socklen_t *optlenp, int flags)
3044 {
3045         return (socket_getsockopt(so, level, option_name, optval, optlenp,
3046             flags, CRED()));
3047 }
3048 
3049 int
3050 sosetsockopt(struct sonode *so, int level, int option_name, const void *optval,
3051     t_uscalar_t optlen)
3052 {
3053         return (socket_setsockopt(so, level, option_name, optval, optlen,
3054             CRED()));
3055 }
3056 
3057 /*
3058  * Because this is backward compatibility interface it only needs to be
3059  * able to handle the creation of TPI sockfs sockets.
3060  */
3061 struct sonode *
3062 socreate(struct sockparams *sp, int family, int type, int protocol, int version,
3063     int *errorp)
3064 {
3065         struct sonode *so;
3066 
3067         ASSERT(sp != NULL);
3068 
3069         so = sp->sp_smod_info->smod_sock_create_func(sp, family, type, protocol,
3070             version, SOCKET_SLEEP, errorp, CRED());
3071         if (so == NULL) {
3072                 SOCKPARAMS_DEC_REF(sp);
3073         } else {
3074                 if ((*errorp = SOP_INIT(so, NULL, CRED(), SOCKET_SLEEP)) == 0) {
3075                         /* Cannot fail, only bumps so_count */
3076                         (void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, CRED(), NULL);
3077                 } else {
3078                         socket_destroy(so);
3079                         so = NULL;
3080                 }
3081         }
3082         return (so);
3083 }