illumos-gate New usr/src/uts/common/fs/sockfs/socksyscalls.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */
  27 /*
  28  * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
  29  */
  30 
  31 #include <sys/types.h>
  32 #include <sys/t_lock.h>
  33 #include <sys/param.h>
  34 #include <sys/systm.h>
  35 #include <sys/buf.h>
  36 #include <sys/conf.h>
  37 #include <sys/cred.h>
  38 #include <sys/kmem.h>
  39 #include <sys/sysmacros.h>
  40 #include <sys/vfs.h>
  41 #include <sys/vnode.h>
  42 #include <sys/debug.h>
  43 #include <sys/errno.h>
  44 #include <sys/time.h>
  45 #include <sys/file.h>
  46 #include <sys/user.h>
  47 #include <sys/stream.h>
  48 #include <sys/strsubr.h>
  49 #include <sys/strsun.h>
  50 #include <sys/sunddi.h>
  51 #include <sys/esunddi.h>
  52 #include <sys/flock.h>
  53 #include <sys/modctl.h>
  54 #include <sys/cmn_err.h>
  55 #include <sys/vmsystm.h>
  56 #include <sys/policy.h>
  57 
  58 #include <sys/socket.h>
  59 #include <sys/socketvar.h>
  60 
  61 #include <sys/isa_defs.h>
  62 #include <sys/inttypes.h>
  63 #include <sys/systm.h>
  64 #include <sys/cpuvar.h>
  65 #include <sys/filio.h>
  66 #include <sys/sendfile.h>
  67 #include <sys/ddi.h>
  68 #include <vm/seg.h>
  69 #include <vm/seg_map.h>
  70 #include <vm/seg_kpm.h>
  71 
  72 #include <fs/sockfs/nl7c.h>
  73 #include <fs/sockfs/sockcommon.h>
  74 #include <fs/sockfs/sockfilter_impl.h>
  75 #include <fs/sockfs/socktpi.h>
  76 
  77 #ifdef SOCK_TEST
  78 int do_useracc = 1;             /* Controlled by setting SO_DEBUG to 4 */
  79 #else
  80 #define do_useracc      1
  81 #endif /* SOCK_TEST */
  82 
  83 extern int      xnet_truncate_print;
  84 
  85 extern void     nl7c_init(void);
  86 extern int      sockfs_defer_nl7c_init;
  87 
/*
 * Note: MSG_MAXIOVLEN is defined here in the same way that DEF_IOV_MAX is
 *       defined and used in "fs/vncalls.c", as there isn't a formal
 *       definition of IOV_MAX.
 */
  92 #define MSG_MAXIOVLEN   16
  93 
  94 /*
  95  * Kernel component of socket creation.
  96  *
  97  * The socket library determines which version number to use.
  98  * First the library calls this with a NULL devpath. If this fails
  99  * to find a transport (using solookup) the library will look in /etc/netconfig
 100  * for the appropriate transport. If one is found it will pass in the
 101  * devpath for the kernel to use.
 102  */
 103 int
 104 so_socket(int family, int type_w_flags, int protocol, char *devpath,
 105     int version)
 106 {
 107         struct sonode *so;
 108         vnode_t *vp;
 109         struct file *fp;
 110         int fd;
 111         int error;
 112         int type;
 113 
 114         type = type_w_flags & SOCK_TYPE_MASK;
 115         type_w_flags &= ~SOCK_TYPE_MASK;
 116         if (type_w_flags & ~(SOCK_CLOEXEC|SOCK_NDELAY|SOCK_NONBLOCK))
 117                 return (set_errno(EINVAL));
 118 
 119         if (devpath != NULL) {
 120                 char *buf;
 121                 size_t kdevpathlen = 0;
 122 
 123                 buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 124                 if ((error = copyinstr(devpath, buf,
 125                     MAXPATHLEN, &kdevpathlen)) != 0) {
 126                         kmem_free(buf, MAXPATHLEN);
 127                         return (set_errno(error));
 128                 }
 129                 so = socket_create(family, type, protocol, buf, NULL,
 130                     SOCKET_SLEEP, version, CRED(), &error);
 131                 kmem_free(buf, MAXPATHLEN);
 132         } else {
 133                 so = socket_create(family, type, protocol, NULL, NULL,
 134                     SOCKET_SLEEP, version, CRED(), &error);
 135         }
 136         if (so == NULL)
 137                 return (set_errno(error));
 138 
 139         /* Allocate a file descriptor for the socket */
 140         vp = SOTOV(so);
 141         if (error = falloc(vp, FWRITE|FREAD, &fp, &fd)) {
 142                 (void) socket_close(so, 0, CRED());
 143                 socket_destroy(so);
 144                 return (set_errno(error));
 145         }
 146 
 147         /*
 148          * Now fill in the entries that falloc reserved
 149          */
 150         if (type_w_flags & SOCK_NDELAY) {
 151                 so->so_state |= SS_NDELAY;
 152                 fp->f_flag |= FNDELAY;
 153         }
 154         if (type_w_flags & SOCK_NONBLOCK) {
 155                 so->so_state |= SS_NONBLOCK;
 156                 fp->f_flag |= FNONBLOCK;
 157         }
 158         mutex_exit(&fp->f_tlock);
 159         setf(fd, fp);
 160         if ((type_w_flags & SOCK_CLOEXEC) != 0) {
 161                 f_setfd(fd, FD_CLOEXEC);
 162         }
 163 
 164         return (fd);
 165 }
 166 
 167 /*
 168  * Map from a file descriptor to a socket node.
 169  * Returns with the file descriptor held i.e. the caller has to
 170  * use releasef when done with the file descriptor.
 171  */
 172 struct sonode *
 173 getsonode(int sock, int *errorp, file_t **fpp)
 174 {
 175         file_t *fp;
 176         vnode_t *vp;
 177         struct sonode *so;
 178 
 179         if ((fp = getf(sock)) == NULL) {
 180                 *errorp = EBADF;
 181                 eprintline(*errorp);
 182                 return (NULL);
 183         }
 184         vp = fp->f_vnode;
 185         /* Check if it is a socket */
 186         if (vp->v_type != VSOCK) {
 187                 releasef(sock);
 188                 *errorp = ENOTSOCK;
 189                 eprintline(*errorp);
 190                 return (NULL);
 191         }
 192         /*
 193          * Use the stream head to find the real socket vnode.
 194          * This is needed when namefs sits above sockfs.
 195          */
 196         if (vp->v_stream) {
 197                 ASSERT(vp->v_stream->sd_vnode);
 198                 vp = vp->v_stream->sd_vnode;
 199 
 200                 so = VTOSO(vp);
 201                 if (so->so_version == SOV_STREAM) {
 202                         releasef(sock);
 203                         *errorp = ENOTSOCK;
 204                         eprintsoline(so, *errorp);
 205                         return (NULL);
 206                 }
 207         } else {
 208                 so = VTOSO(vp);
 209         }
 210         if (fpp)
 211                 *fpp = fp;
 212         return (so);
 213 }
 214 
 215 /*
 216  * Allocate and copyin a sockaddr.
 217  * Ensures NULL termination for AF_UNIX addresses by extending them
 218  * with one NULL byte if need be. Verifies that the length is not
 219  * excessive to prevent an application from consuming all of kernel
 220  * memory. Returns NULL when an error occurred.
 221  */
 222 static struct sockaddr *
 223 copyin_name(struct sonode *so, struct sockaddr *name, socklen_t *namelenp,
 224             int *errorp)
 225 {
 226         char    *faddr;
 227         size_t  namelen = (size_t)*namelenp;
 228 
 229         ASSERT(namelen != 0);
 230         if (namelen > SO_MAXARGSIZE) {
 231                 *errorp = EINVAL;
 232                 eprintsoline(so, *errorp);
 233                 return (NULL);
 234         }
 235 
 236         faddr = (char *)kmem_alloc(namelen, KM_SLEEP);
 237         if (copyin(name, faddr, namelen)) {
 238                 kmem_free(faddr, namelen);
 239                 *errorp = EFAULT;
 240                 eprintsoline(so, *errorp);
 241                 return (NULL);
 242         }
 243 
 244         /*
 245          * Add space for NULL termination if needed.
 246          * Do a quick check if the last byte is NUL.
 247          */
 248         if (so->so_family == AF_UNIX && faddr[namelen - 1] != '\0') {
 249                 /* Check if there is any NULL termination */
 250                 size_t  i;
 251                 int foundnull = 0;
 252 
 253                 for (i = sizeof (name->sa_family); i < namelen; i++) {
 254                         if (faddr[i] == '\0') {
 255                                 foundnull = 1;
 256                                 break;
 257                         }
 258                 }
 259                 if (!foundnull) {
 260                         /* Add extra byte for NUL padding */
 261                         char *nfaddr;
 262 
 263                         nfaddr = (char *)kmem_alloc(namelen + 1, KM_SLEEP);
 264                         bcopy(faddr, nfaddr, namelen);
 265                         kmem_free(faddr, namelen);
 266 
 267                         /* NUL terminate */
 268                         nfaddr[namelen] = '\0';
 269                         namelen++;
 270                         ASSERT((socklen_t)namelen == namelen);
 271                         *namelenp = (socklen_t)namelen;
 272                         faddr = nfaddr;
 273                 }
 274         }
 275         return ((struct sockaddr *)faddr);
 276 }
 277 
 278 /*
 279  * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
 280  */
 281 static int
 282 copyout_arg(void *uaddr, socklen_t ulen, void *ulenp,
 283                 void *kaddr, socklen_t klen)
 284 {
 285         if (uaddr != NULL) {
 286                 if (ulen > klen)
 287                         ulen = klen;
 288 
 289                 if (ulen != 0) {
 290                         if (copyout(kaddr, uaddr, ulen))
 291                                 return (EFAULT);
 292                 }
 293         } else
 294                 ulen = 0;
 295 
 296         if (ulenp != NULL) {
 297                 if (copyout(&ulen, ulenp, sizeof (ulen)))
 298                         return (EFAULT);
 299         }
 300         return (0);
 301 }
 302 
 303 /*
 304  * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
 305  * If klen is greater than ulen it still uses the non-truncated
 306  * klen to update ulenp.
 307  */
 308 static int
 309 copyout_name(void *uaddr, socklen_t ulen, void *ulenp,
 310                 void *kaddr, socklen_t klen)
 311 {
 312         if (uaddr != NULL) {
 313                 if (ulen >= klen)
 314                         ulen = klen;
 315                 else if (ulen != 0 && xnet_truncate_print) {
 316                         printf("sockfs: truncating copyout of address using "
 317                             "XNET semantics for pid = %d. Lengths %d, %d\n",
 318                             curproc->p_pid, klen, ulen);
 319                 }
 320 
 321                 if (ulen != 0) {
 322                         if (copyout(kaddr, uaddr, ulen))
 323                                 return (EFAULT);
 324                 } else
 325                         klen = 0;
 326         } else
 327                 klen = 0;
 328 
 329         if (ulenp != NULL) {
 330                 if (copyout(&klen, ulenp, sizeof (klen)))
 331                         return (EFAULT);
 332         }
 333         return (0);
 334 }
 335 
 336 /*
 337  * The socketpair() code in libsocket creates two sockets (using
 338  * the /etc/netconfig fallback if needed) before calling this routine
 339  * to connect the two sockets together.
 340  *
 341  * For a SOCK_STREAM socketpair a listener is needed - in that case this
 342  * routine will create a new file descriptor as part of accepting the
 343  * connection. The library socketpair() will check if svs[2] has changed
 344  * in which case it will close the changed fd.
 345  *
 346  * Note that this code could use the TPI feature of accepting the connection
 347  * on the listening endpoint. However, that would require significant changes
 348  * to soaccept.
 349  */
 350 int
 351 so_socketpair(int sv[2])
 352 {
 353         int svs[2];
 354         struct sonode *so1, *so2;
 355         int error;
 356         int orig_flags;
 357         struct sockaddr_ux *name;
 358         size_t namelen;
 359         sotpi_info_t *sti1;
 360         sotpi_info_t *sti2;
 361 
 362         dprint(1, ("so_socketpair(%p)\n", (void *)sv));
 363 
 364         error = useracc(sv, sizeof (svs), B_WRITE);
 365         if (error && do_useracc)
 366                 return (set_errno(EFAULT));
 367 
 368         if (copyin(sv, svs, sizeof (svs)))
 369                 return (set_errno(EFAULT));
 370 
 371         if ((so1 = getsonode(svs[0], &error, NULL)) == NULL)
 372                 return (set_errno(error));
 373 
 374         if ((so2 = getsonode(svs[1], &error, NULL)) == NULL) {
 375                 releasef(svs[0]);
 376                 return (set_errno(error));
 377         }
 378 
 379         if (so1->so_family != AF_UNIX || so2->so_family != AF_UNIX) {
 380                 error = EOPNOTSUPP;
 381                 goto done;
 382         }
 383 
 384         sti1 = SOTOTPI(so1);
 385         sti2 = SOTOTPI(so2);
 386 
 387         /*
 388          * The code below makes assumptions about the "sockfs" implementation.
 389          * So make sure that the correct implementation is really used.
 390          */
 391         ASSERT(so1->so_ops == &sotpi_sonodeops);
 392         ASSERT(so2->so_ops == &sotpi_sonodeops);
 393 
 394         if (so1->so_type == SOCK_DGRAM) {
 395                 /*
 396                  * Bind both sockets and connect them with each other.
 397                  * Need to allocate name/namelen for soconnect.
 398                  */
 399                 error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC, CRED());
 400                 if (error) {
 401                         eprintsoline(so1, error);
 402                         goto done;
 403                 }
 404                 error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
 405                 if (error) {
 406                         eprintsoline(so2, error);
 407                         goto done;
 408                 }
 409                 namelen = sizeof (struct sockaddr_ux);
 410                 name = kmem_alloc(namelen, KM_SLEEP);
 411                 name->sou_family = AF_UNIX;
 412                 name->sou_addr = sti2->sti_ux_laddr;
 413                 error = socket_connect(so1,
 414                     (struct sockaddr *)name,
 415                     (socklen_t)namelen,
 416                     0, _SOCONNECT_NOXLATE, CRED());
 417                 if (error) {
 418                         kmem_free(name, namelen);
 419                         eprintsoline(so1, error);
 420                         goto done;
 421                 }
 422                 name->sou_addr = sti1->sti_ux_laddr;
 423                 error = socket_connect(so2,
 424                     (struct sockaddr *)name,
 425                     (socklen_t)namelen,
 426                     0, _SOCONNECT_NOXLATE, CRED());
 427                 kmem_free(name, namelen);
 428                 if (error) {
 429                         eprintsoline(so2, error);
 430                         goto done;
 431                 }
 432                 releasef(svs[0]);
 433                 releasef(svs[1]);
 434         } else {
 435                 /*
 436                  * Bind both sockets, with so1 being a listener.
 437                  * Connect so2 to so1 - nonblocking to avoid waiting for
 438                  * soaccept to complete.
 439                  * Accept a connection on so1. Pass out the new fd as sv[0].
 440                  * The library will detect the changed fd and close
 441                  * the original one.
 442                  */
 443                 struct sonode *nso;
 444                 struct vnode *nvp;
 445                 struct file *nfp;
 446                 int nfd;
 447 
 448                 /*
 449                  * We could simply call socket_listen() here (which would do the
 450                  * binding automatically) if the code didn't rely on passing
 451                  * _SOBIND_NOXLATE to the TPI implementation of socket_bind().
 452                  */
 453                 error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC|
 454                     _SOBIND_NOXLATE|_SOBIND_LISTEN|_SOBIND_SOCKETPAIR,
 455                     CRED());
 456                 if (error) {
 457                         eprintsoline(so1, error);
 458                         goto done;
 459                 }
 460                 error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
 461                 if (error) {
 462                         eprintsoline(so2, error);
 463                         goto done;
 464                 }
 465 
 466                 namelen = sizeof (struct sockaddr_ux);
 467                 name = kmem_alloc(namelen, KM_SLEEP);
 468                 name->sou_family = AF_UNIX;
 469                 name->sou_addr = sti1->sti_ux_laddr;
 470                 error = socket_connect(so2,
 471                     (struct sockaddr *)name,
 472                     (socklen_t)namelen,
 473                     FNONBLOCK, _SOCONNECT_NOXLATE, CRED());
 474                 kmem_free(name, namelen);
 475                 if (error) {
 476                         if (error != EINPROGRESS) {
 477                                 eprintsoline(so2, error); goto done;
 478                         }
 479                 }
 480 
 481                 error = socket_accept(so1, 0, CRED(), &nso);
 482                 if (error) {
 483                         eprintsoline(so1, error);
 484                         goto done;
 485                 }
 486 
 487                 /* wait for so2 being SS_CONNECTED ignoring signals */
 488                 mutex_enter(&so2->so_lock);
 489                 error = sowaitconnected(so2, 0, 1);
 490                 mutex_exit(&so2->so_lock);
 491                 if (error != 0) {
 492                         (void) socket_close(nso, 0, CRED());
 493                         socket_destroy(nso);
 494                         eprintsoline(so2, error);
 495                         goto done;
 496                 }
 497 
 498                 nvp = SOTOV(nso);
 499                 if (error = falloc(nvp, FWRITE|FREAD, &nfp, &nfd)) {
 500                         (void) socket_close(nso, 0, CRED());
 501                         socket_destroy(nso);
 502                         eprintsoline(nso, error);
 503                         goto done;
 504                 }
 505                 /*
 506                  * copy over FNONBLOCK and FNDELAY flags should they exist
 507                  */
 508                 if (so1->so_state & SS_NONBLOCK)
 509                         nfp->f_flag |= FNONBLOCK;
 510                 if (so1->so_state & SS_NDELAY)
 511                         nfp->f_flag |= FNDELAY;
 512 
 513                 /*
 514                  * fill in the entries that falloc reserved
 515                  */
 516                 mutex_exit(&nfp->f_tlock);
 517                 setf(nfd, nfp);
 518 
 519                 /*
 520                  * get the original flags before we release
 521                  */
 522                 VERIFY(f_getfd_error(svs[0], &orig_flags) == 0);
 523 
 524                 releasef(svs[0]);
 525                 releasef(svs[1]);
 526 
 527                 /*
 528                  * If FD_CLOEXEC was set on the filedescriptor we're
 529                  * swapping out, we should set it on the new one too.
 530                  */
 531                 if (orig_flags & FD_CLOEXEC) {
 532                         f_setfd(nfd, FD_CLOEXEC);
 533                 }
 534 
 535                 /*
 536                  * The socketpair library routine will close the original
 537                  * svs[0] when this code passes out a different file
 538                  * descriptor.
 539                  */
 540                 svs[0] = nfd;
 541 
 542                 if (copyout(svs, sv, sizeof (svs))) {
 543                         (void) closeandsetf(nfd, NULL);
 544                         eprintline(EFAULT);
 545                         return (set_errno(EFAULT));
 546                 }
 547         }
 548         return (0);
 549 
 550 done:
 551         releasef(svs[0]);
 552         releasef(svs[1]);
 553         return (set_errno(error));
 554 }
 555 
 556 int
 557 bind(int sock, struct sockaddr *name, socklen_t namelen, int version)
 558 {
 559         struct sonode *so;
 560         int error;
 561 
 562         dprint(1, ("bind(%d, %p, %d)\n",
 563             sock, (void *)name, namelen));
 564 
 565         if ((so = getsonode(sock, &error, NULL)) == NULL)
 566                 return (set_errno(error));
 567 
 568         /* Allocate and copyin name */
 569         /*
 570          * X/Open test does not expect EFAULT with NULL name and non-zero
 571          * namelen.
 572          */
 573         if (name != NULL && namelen != 0) {
 574                 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
 575                 name = copyin_name(so, name, &namelen, &error);
 576                 if (name == NULL) {
 577                         releasef(sock);
 578                         return (set_errno(error));
 579                 }
 580         } else {
 581                 name = NULL;
 582                 namelen = 0;
 583         }
 584 
 585         switch (version) {
 586         default:
 587                 error = socket_bind(so, name, namelen, 0, CRED());
 588                 break;
 589         case SOV_XPG4_2:
 590                 error = socket_bind(so, name, namelen, _SOBIND_XPG4_2, CRED());
 591                 break;
 592         case SOV_SOCKBSD:
 593                 error = socket_bind(so, name, namelen, _SOBIND_SOCKBSD, CRED());
 594                 break;
 595         }
 596 done:
 597         releasef(sock);
 598         if (name != NULL)
 599                 kmem_free(name, (size_t)namelen);
 600 
 601         if (error)
 602                 return (set_errno(error));
 603         return (0);
 604 }
 605 
 606 /* ARGSUSED2 */
 607 int
 608 listen(int sock, int backlog, int version)
 609 {
 610         struct sonode *so;
 611         int error;
 612 
 613         dprint(1, ("listen(%d, %d)\n",
 614             sock, backlog));
 615 
 616         if ((so = getsonode(sock, &error, NULL)) == NULL)
 617                 return (set_errno(error));
 618 
 619         error = socket_listen(so, backlog, CRED());
 620 
 621         releasef(sock);
 622         if (error)
 623                 return (set_errno(error));
 624         return (0);
 625 }
 626 
 627 /*ARGSUSED3*/
 628 int
 629 accept(int sock, struct sockaddr *name, socklen_t *namelenp, int version,
 630     int flags)
 631 {
 632         struct sonode *so;
 633         file_t *fp;
 634         int error;
 635         socklen_t namelen;
 636         struct sonode *nso;
 637         struct vnode *nvp;
 638         struct file *nfp;
 639         int nfd;
 640         int ssflags;
 641         struct sockaddr *addrp;
 642         socklen_t addrlen;
 643 
 644         dprint(1, ("accept(%d, %p, %p)\n",
 645             sock, (void *)name, (void *)namelenp));
 646 
 647         if (flags & ~(SOCK_CLOEXEC|SOCK_NONBLOCK|SOCK_NDELAY)) {
 648                 return (set_errno(EINVAL));
 649         }
 650 
 651         /* Translate SOCK_ flags to their SS_ variant */
 652         ssflags = 0;
 653         if (flags & SOCK_NONBLOCK)
 654                 ssflags |= SS_NONBLOCK;
 655         if (flags & SOCK_NDELAY)
 656                 ssflags |= SS_NDELAY;
 657 
 658         if ((so = getsonode(sock, &error, &fp)) == NULL)
 659                 return (set_errno(error));
 660 
 661         if (name != NULL) {
 662                 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
 663                 if (copyin(namelenp, &namelen, sizeof (namelen))) {
 664                         releasef(sock);
 665                         return (set_errno(EFAULT));
 666                 }
 667                 if (namelen != 0) {
 668                         error = useracc(name, (size_t)namelen, B_WRITE);
 669                         if (error && do_useracc) {
 670                                 releasef(sock);
 671                                 return (set_errno(EFAULT));
 672                         }
 673                 } else
 674                         name = NULL;
 675         } else {
 676                 namelen = 0;
 677         }
 678 
 679         /*
 680          * Allocate the user fd before socket_accept() in order to
 681          * catch EMFILE errors before calling socket_accept().
 682          */
 683         if ((nfd = ufalloc(0)) == -1) {
 684                 eprintsoline(so, EMFILE);
 685                 releasef(sock);
 686                 return (set_errno(EMFILE));
 687         }
 688         error = socket_accept(so, fp->f_flag, CRED(), &nso);
 689         if (error) {
 690                 setf(nfd, NULL);
 691                 releasef(sock);
 692                 return (set_errno(error));
 693         }
 694 
 695         nvp = SOTOV(nso);
 696 
 697         ASSERT(MUTEX_NOT_HELD(&nso->so_lock));
 698         if (namelen != 0) {
 699                 addrlen = so->so_max_addr_len;
 700                 addrp = (struct sockaddr *)kmem_alloc(addrlen, KM_SLEEP);
 701 
 702                 if ((error = socket_getpeername(nso, (struct sockaddr *)addrp,
 703                     &addrlen, B_TRUE, CRED())) == 0) {
 704                         error = copyout_name(name, namelen, namelenp,
 705                             addrp, addrlen);
 706                 } else {
 707                         ASSERT(error == EINVAL || error == ENOTCONN);
 708                         error = ECONNABORTED;
 709                 }
 710                 kmem_free(addrp, so->so_max_addr_len);
 711         }
 712 
 713         if (error) {
 714                 setf(nfd, NULL);
 715                 (void) socket_close(nso, 0, CRED());
 716                 socket_destroy(nso);
 717                 releasef(sock);
 718                 return (set_errno(error));
 719         }
 720         if (error = falloc(NULL, FWRITE|FREAD, &nfp, NULL)) {
 721                 setf(nfd, NULL);
 722                 (void) socket_close(nso, 0, CRED());
 723                 socket_destroy(nso);
 724                 eprintsoline(so, error);
 725                 releasef(sock);
 726                 return (set_errno(error));
 727         }
 728         /*
 729          * fill in the entries that falloc reserved
 730          */
 731         nfp->f_vnode = nvp;
 732         mutex_exit(&nfp->f_tlock);
 733         setf(nfd, nfp);
 734 
 735         /*
 736          * Act on SOCK_CLOEXEC from flags
 737          */
 738         if (flags & SOCK_CLOEXEC) {
 739                 f_setfd(nfd, FD_CLOEXEC);
 740         }
 741 
 742         /*
 743          * Copy FNDELAY and FNONBLOCK from listener to acceptor
 744          * and from ssflags
 745          */
 746         if ((ssflags | so->so_state) & (SS_NDELAY|SS_NONBLOCK)) {
 747                 uint_t oflag = nfp->f_flag;
 748                 int arg = 0;
 749 
 750                 if ((ssflags | so->so_state) & SS_NONBLOCK)
 751                         arg |= FNONBLOCK;
 752                 else if ((ssflags | so->so_state) & SS_NDELAY)
 753                         arg |= FNDELAY;
 754 
 755                 /*
 756                  * This code is a simplification of the F_SETFL code in fcntl()
 757                  * Ignore any errors from VOP_SETFL.
 758                  */
 759                 if ((error = VOP_SETFL(nvp, oflag, arg, nfp->f_cred, NULL))
 760                     != 0) {
 761                         eprintsoline(so, error);
 762                         error = 0;
 763                 } else {
 764                         mutex_enter(&nfp->f_tlock);
 765                         nfp->f_flag &= ~FMASK | (FREAD|FWRITE);
 766                         nfp->f_flag |= arg;
 767                         mutex_exit(&nfp->f_tlock);
 768                 }
 769         }
 770         releasef(sock);
 771         return (nfd);
 772 }
 773 
 774 int
 775 connect(int sock, struct sockaddr *name, socklen_t namelen, int version)
 776 {
 777         struct sonode *so;
 778         file_t *fp;
 779         int error;
 780 
 781         dprint(1, ("connect(%d, %p, %d)\n",
 782             sock, (void *)name, namelen));
 783 
 784         if ((so = getsonode(sock, &error, &fp)) == NULL)
 785                 return (set_errno(error));
 786 
 787         /* Allocate and copyin name */
 788         if (namelen != 0) {
 789                 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
 790                 name = copyin_name(so, name, &namelen, &error);
 791                 if (name == NULL) {
 792                         releasef(sock);
 793                         return (set_errno(error));
 794                 }
 795         } else
 796                 name = NULL;
 797 
 798         error = socket_connect(so, name, namelen, fp->f_flag,
 799             (version != SOV_XPG4_2) ? 0 : _SOCONNECT_XPG4_2, CRED());
 800         releasef(sock);
 801         if (name)
 802                 kmem_free(name, (size_t)namelen);
 803         if (error)
 804                 return (set_errno(error));
 805         return (0);
 806 }
 807 
 808 /*ARGSUSED2*/
 809 int
 810 shutdown(int sock, int how, int version)
 811 {
 812         struct sonode *so;
 813         int error;
 814 
 815         dprint(1, ("shutdown(%d, %d)\n",
 816             sock, how));
 817 
 818         if ((so = getsonode(sock, &error, NULL)) == NULL)
 819                 return (set_errno(error));
 820 
 821         error = socket_shutdown(so, how, CRED());
 822 
 823         releasef(sock);
 824         if (error)
 825                 return (set_errno(error));
 826         return (0);
 827 }
 828 
 829 /*
 830  * Common receive routine.
 831  */
 832 static ssize_t
 833 recvit(int sock,
 834         struct nmsghdr *msg,
 835         struct uio *uiop,
 836         int flags,
 837         socklen_t *namelenp,
 838         socklen_t *controllenp,
 839         int *flagsp)
 840 {
 841         struct sonode *so;
 842         file_t *fp;
 843         void *name;
 844         socklen_t namelen;
 845         void *control;
 846         socklen_t controllen;
 847         ssize_t len;
 848         int error;
 849 
 850         if ((so = getsonode(sock, &error, &fp)) == NULL)
 851                 return (set_errno(error));
 852 
 853         len = uiop->uio_resid;
 854         uiop->uio_fmode = fp->f_flag;
 855         uiop->uio_extflg = UIO_COPY_CACHED;
 856 
 857         name = msg->msg_name;
 858         namelen = msg->msg_namelen;
 859         control = msg->msg_control;
 860         controllen = msg->msg_controllen;
 861 
 862         msg->msg_flags = flags & (MSG_OOB | MSG_PEEK | MSG_WAITALL |
 863             MSG_DONTWAIT | MSG_XPG4_2);
 864 
 865         error = socket_recvmsg(so, msg, uiop, CRED());
 866         if (error) {
 867                 releasef(sock);
 868                 return (set_errno(error));
 869         }
 870         lwp_stat_update(LWP_STAT_MSGRCV, 1);
 871         releasef(sock);
 872 
 873         error = copyout_name(name, namelen, namelenp,
 874             msg->msg_name, msg->msg_namelen);
 875         if (error)
 876                 goto err;
 877 
 878         if (flagsp != NULL) {
 879                 /*
 880                  * Clear internal flag.
 881                  */
 882                 msg->msg_flags &= ~MSG_XPG4_2;
 883 
 884                 /*
 885                  * Determine MSG_CTRUNC. sorecvmsg sets MSG_CTRUNC only
 886                  * when controllen is zero and there is control data to
 887                  * copy out.
 888                  */
 889                 if (controllen != 0 &&
 890                     (msg->msg_controllen > controllen || control == NULL)) {
 891                         dprint(1, ("recvit: CTRUNC %d %d %p\n",
 892                             msg->msg_controllen, controllen, control));
 893 
 894                         msg->msg_flags |= MSG_CTRUNC;
 895                 }
 896                 if (copyout(&msg->msg_flags, flagsp,
 897                     sizeof (msg->msg_flags))) {
 898                         error = EFAULT;
 899                         goto err;
 900                 }
 901         }
 902         /*
 903          * Note: This MUST be done last. There can be no "goto err" after this
 904          * point since it could make so_closefds run twice on some part
 905          * of the file descriptor array.
 906          */
 907         if (controllen != 0) {
 908                 if (!(flags & MSG_XPG4_2)) {
 909                         /*
 910                          * Good old msg_accrights can only return a multiple
 911                          * of 4 bytes.
 912                          */
 913                         controllen &= ~((int)sizeof (uint32_t) - 1);
 914                 }
 915                 error = copyout_arg(control, controllen, controllenp,
 916                     msg->msg_control, msg->msg_controllen);
 917                 if (error)
 918                         goto err;
 919 
 920                 if (msg->msg_controllen > controllen || control == NULL) {
 921                         if (control == NULL)
 922                                 controllen = 0;
 923                         so_closefds(msg->msg_control, msg->msg_controllen,
 924                             !(flags & MSG_XPG4_2), controllen);
 925                 }
 926         }
 927         if (msg->msg_namelen != 0)
 928                 kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
 929         if (msg->msg_controllen != 0)
 930                 kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
 931         return (len - uiop->uio_resid);
 932 
 933 err:
 934         /*
 935          * If we fail and the control part contains file descriptors
 936          * we have to close the fd's.
 937          */
 938         if (msg->msg_controllen != 0)
 939                 so_closefds(msg->msg_control, msg->msg_controllen,
 940                     !(flags & MSG_XPG4_2), 0);
 941         if (msg->msg_namelen != 0)
 942                 kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
 943         if (msg->msg_controllen != 0)
 944                 kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
 945         return (set_errno(error));
 946 }
 947 
 948 /*
 949  * Native system call
 950  */
 951 ssize_t
 952 recv(int sock, void *buffer, size_t len, int flags)
 953 {
 954         struct nmsghdr lmsg;
 955         struct uio auio;
 956         struct iovec aiov[1];
 957 
 958         dprint(1, ("recv(%d, %p, %ld, %d)\n",
 959             sock, buffer, len, flags));
 960 
 961         if ((ssize_t)len < 0) {
 962                 return (set_errno(EINVAL));
 963         }
 964 
 965         aiov[0].iov_base = buffer;
 966         aiov[0].iov_len = len;
 967         auio.uio_loffset = 0;
 968         auio.uio_iov = aiov;
 969         auio.uio_iovcnt = 1;
 970         auio.uio_resid = len;
 971         auio.uio_segflg = UIO_USERSPACE;
 972         auio.uio_limit = 0;
 973 
 974         lmsg.msg_namelen = 0;
 975         lmsg.msg_controllen = 0;
 976         lmsg.msg_flags = 0;
 977         return (recvit(sock, &lmsg, &auio, flags, NULL, NULL, NULL));
 978 }
 979 
 980 ssize_t
 981 recvfrom(int sock, void *buffer, size_t len, int flags,
 982         struct sockaddr *name, socklen_t *namelenp)
 983 {
 984         struct nmsghdr lmsg;
 985         struct uio auio;
 986         struct iovec aiov[1];
 987 
 988         dprint(1, ("recvfrom(%d, %p, %ld, %d, %p, %p)\n",
 989             sock, buffer, len, flags, (void *)name, (void *)namelenp));
 990 
 991         if ((ssize_t)len < 0) {
 992                 return (set_errno(EINVAL));
 993         }
 994 
 995         aiov[0].iov_base = buffer;
 996         aiov[0].iov_len = len;
 997         auio.uio_loffset = 0;
 998         auio.uio_iov = aiov;
 999         auio.uio_iovcnt = 1;
1000         auio.uio_resid = len;
1001         auio.uio_segflg = UIO_USERSPACE;
1002         auio.uio_limit = 0;
1003 
1004         lmsg.msg_name = (char *)name;
1005         if (namelenp != NULL) {
1006                 if (copyin(namelenp, &lmsg.msg_namelen,
1007                     sizeof (lmsg.msg_namelen)))
1008                         return (set_errno(EFAULT));
1009         } else {
1010                 lmsg.msg_namelen = 0;
1011         }
1012         lmsg.msg_controllen = 0;
1013         lmsg.msg_flags = 0;
1014 
1015         return (recvit(sock, &lmsg, &auio, flags, namelenp, NULL, NULL));
1016 }
1017 
1018 /*
1019  * Uses the MSG_XPG4_2 flag to determine if the caller is using
1020  * struct omsghdr or struct nmsghdr.
1021  */
1022 ssize_t
1023 recvmsg(int sock, struct nmsghdr *msg, int flags)
1024 {
1025         STRUCT_DECL(nmsghdr, u_lmsg);
1026         STRUCT_HANDLE(nmsghdr, umsgptr);
1027         struct nmsghdr lmsg;
1028         struct uio auio;
1029         struct iovec aiov[MSG_MAXIOVLEN];
1030         int iovcnt;
1031         ssize_t len;
1032         int i;
1033         int *flagsp;
1034         model_t model;
1035 
1036         dprint(1, ("recvmsg(%d, %p, %d)\n",
1037             sock, (void *)msg, flags));
1038 
1039         model = get_udatamodel();
1040         STRUCT_INIT(u_lmsg, model);
1041         STRUCT_SET_HANDLE(umsgptr, model, msg);
1042 
1043         if (flags & MSG_XPG4_2) {
1044                 if (copyin(msg, STRUCT_BUF(u_lmsg), STRUCT_SIZE(u_lmsg)))
1045                         return (set_errno(EFAULT));
1046                 flagsp = STRUCT_FADDR(umsgptr, msg_flags);
1047         } else {
1048                 /*
1049                  * Assumes that nmsghdr and omsghdr are identically shaped
1050                  * except for the added msg_flags field.
1051                  */
1052                 if (copyin(msg, STRUCT_BUF(u_lmsg),
1053                     SIZEOF_STRUCT(omsghdr, model)))
1054                         return (set_errno(EFAULT));
1055                 STRUCT_FSET(u_lmsg, msg_flags, 0);
1056                 flagsp = NULL;
1057         }
1058 
1059         /*
1060          * Code below us will kmem_alloc memory and hang it
1061          * off msg_control and msg_name fields. This forces
1062          * us to copy the structure to its native form.
1063          */
1064         lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name);
1065         lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen);
1066         lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov);
1067         lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen);
1068         lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control);
1069         lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen);
1070         lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags);
1071 
1072         iovcnt = lmsg.msg_iovlen;
1073 
1074         if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) {
1075                 return (set_errno(EMSGSIZE));
1076         }
1077 
1078 #ifdef _SYSCALL32_IMPL
1079         /*
1080          * 32-bit callers need to have their iovec expanded, while ensuring
1081          * that they can't move more than 2Gbytes of data in a single call.
1082          */
1083         if (model == DATAMODEL_ILP32) {
1084                 struct iovec32 aiov32[MSG_MAXIOVLEN];
1085                 ssize32_t count32;
1086 
1087                 if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32,
1088                     iovcnt * sizeof (struct iovec32)))
1089                         return (set_errno(EFAULT));
1090 
1091                 count32 = 0;
1092                 for (i = 0; i < iovcnt; i++) {
1093                         ssize32_t iovlen32;
1094 
1095                         iovlen32 = aiov32[i].iov_len;
1096                         count32 += iovlen32;
1097                         if (iovlen32 < 0 || count32 < 0)
1098                                 return (set_errno(EINVAL));
1099                         aiov[i].iov_len = iovlen32;
1100                         aiov[i].iov_base =
1101                             (caddr_t)(uintptr_t)aiov32[i].iov_base;
1102                 }
1103         } else
1104 #endif /* _SYSCALL32_IMPL */
1105         if (copyin(lmsg.msg_iov, aiov, iovcnt * sizeof (struct iovec))) {
1106                 return (set_errno(EFAULT));
1107         }
1108         len = 0;
1109         for (i = 0; i < iovcnt; i++) {
1110                 ssize_t iovlen = aiov[i].iov_len;
1111                 len += iovlen;
1112                 if (iovlen < 0 || len < 0) {
1113                         return (set_errno(EINVAL));
1114                 }
1115         }
1116         auio.uio_loffset = 0;
1117         auio.uio_iov = aiov;
1118         auio.uio_iovcnt = iovcnt;
1119         auio.uio_resid = len;
1120         auio.uio_segflg = UIO_USERSPACE;
1121         auio.uio_limit = 0;
1122 
1123         if (lmsg.msg_control != NULL &&
1124             (do_useracc == 0 ||
1125             useracc(lmsg.msg_control, lmsg.msg_controllen,
1126             B_WRITE) != 0)) {
1127                 return (set_errno(EFAULT));
1128         }
1129 
1130         return (recvit(sock, &lmsg, &auio, flags,
1131             STRUCT_FADDR(umsgptr, msg_namelen),
1132             STRUCT_FADDR(umsgptr, msg_controllen), flagsp));
1133 }
1134 
1135 /*
1136  * Common send function.
1137  */
1138 static ssize_t
1139 sendit(int sock, struct nmsghdr *msg, struct uio *uiop, int flags)
1140 {
1141         struct sonode *so;
1142         file_t *fp;
1143         void *name;
1144         socklen_t namelen;
1145         void *control;
1146         socklen_t controllen;
1147         ssize_t len;
1148         int error;
1149 
1150         if ((so = getsonode(sock, &error, &fp)) == NULL)
1151                 return (set_errno(error));
1152 
1153         uiop->uio_fmode = fp->f_flag;
1154 
1155         if (so->so_family == AF_UNIX)
1156                 uiop->uio_extflg = UIO_COPY_CACHED;
1157         else
1158                 uiop->uio_extflg = UIO_COPY_DEFAULT;
1159 
1160         /* Allocate and copyin name and control */
1161         name = msg->msg_name;
1162         namelen = msg->msg_namelen;
1163         if (name != NULL && namelen != 0) {
1164                 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1165                 name = copyin_name(so,
1166                     (struct sockaddr *)name,
1167                     &namelen, &error);
1168                 if (name == NULL)
1169                         goto done3;
1170                 /* copyin_name null terminates addresses for AF_UNIX */
1171                 msg->msg_namelen = namelen;
1172                 msg->msg_name = name;
1173         } else {
1174                 msg->msg_name = name = NULL;
1175                 msg->msg_namelen = namelen = 0;
1176         }
1177 
1178         control = msg->msg_control;
1179         controllen = msg->msg_controllen;
1180         if ((control != NULL) && (controllen != 0)) {
1181                 /*
1182                  * Verify that the length is not excessive to prevent
1183                  * an application from consuming all of kernel memory.
1184                  */
1185                 if (controllen > SO_MAXARGSIZE) {
1186                         error = EINVAL;
1187                         goto done2;
1188                 }
1189                 control = kmem_alloc(controllen, KM_SLEEP);
1190 
1191                 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1192                 if (copyin(msg->msg_control, control, controllen)) {
1193                         error = EFAULT;
1194                         goto done1;
1195                 }
1196                 msg->msg_control = control;
1197         } else {
1198                 msg->msg_control = control = NULL;
1199                 msg->msg_controllen = controllen = 0;
1200         }
1201 
1202         len = uiop->uio_resid;
1203         msg->msg_flags = flags;
1204 
1205         error = socket_sendmsg(so, msg, uiop, CRED());
1206 done1:
1207         if (control != NULL)
1208                 kmem_free(control, controllen);
1209 done2:
1210         if (name != NULL)
1211                 kmem_free(name, namelen);
1212 done3:
1213         if (error != 0) {
1214                 releasef(sock);
1215                 return (set_errno(error));
1216         }
1217         lwp_stat_update(LWP_STAT_MSGSND, 1);
1218         releasef(sock);
1219         return (len - uiop->uio_resid);
1220 }
1221 
1222 /*
1223  * Native system call
1224  */
1225 ssize_t
1226 send(int sock, void *buffer, size_t len, int flags)
1227 {
1228         struct nmsghdr lmsg;
1229         struct uio auio;
1230         struct iovec aiov[1];
1231 
1232         dprint(1, ("send(%d, %p, %ld, %d)\n",
1233             sock, buffer, len, flags));
1234 
1235         if ((ssize_t)len < 0) {
1236                 return (set_errno(EINVAL));
1237         }
1238 
1239         aiov[0].iov_base = buffer;
1240         aiov[0].iov_len = len;
1241         auio.uio_loffset = 0;
1242         auio.uio_iov = aiov;
1243         auio.uio_iovcnt = 1;
1244         auio.uio_resid = len;
1245         auio.uio_segflg = UIO_USERSPACE;
1246         auio.uio_limit = 0;
1247 
1248         lmsg.msg_name = NULL;
1249         lmsg.msg_control = NULL;
1250         if (!(flags & MSG_XPG4_2)) {
1251                 /*
1252                  * In order to be compatible with the libsocket/sockmod
1253                  * implementation we set EOR for all send* calls.
1254                  */
1255                 flags |= MSG_EOR;
1256         }
1257         return (sendit(sock, &lmsg, &auio, flags));
1258 }
1259 
1260 /*
1261  * Uses the MSG_XPG4_2 flag to determine if the caller is using
1262  * struct omsghdr or struct nmsghdr.
1263  */
1264 ssize_t
1265 sendmsg(int sock, struct nmsghdr *msg, int flags)
1266 {
1267         struct nmsghdr lmsg;
1268         STRUCT_DECL(nmsghdr, u_lmsg);
1269         struct uio auio;
1270         struct iovec aiov[MSG_MAXIOVLEN];
1271         int iovcnt;
1272         ssize_t len;
1273         int i;
1274         model_t model;
1275 
1276         dprint(1, ("sendmsg(%d, %p, %d)\n", sock, (void *)msg, flags));
1277 
1278         model = get_udatamodel();
1279         STRUCT_INIT(u_lmsg, model);
1280 
1281         if (flags & MSG_XPG4_2) {
1282                 if (copyin(msg, (char *)STRUCT_BUF(u_lmsg),
1283                     STRUCT_SIZE(u_lmsg)))
1284                         return (set_errno(EFAULT));
1285         } else {
1286                 /*
1287                  * Assumes that nmsghdr and omsghdr are identically shaped
1288                  * except for the added msg_flags field.
1289                  */
1290                 if (copyin(msg, (char *)STRUCT_BUF(u_lmsg),
1291                     SIZEOF_STRUCT(omsghdr, model)))
1292                         return (set_errno(EFAULT));
1293                 /*
1294                  * In order to be compatible with the libsocket/sockmod
1295                  * implementation we set EOR for all send* calls.
1296                  */
1297                 flags |= MSG_EOR;
1298         }
1299 
1300         /*
1301          * Code below us will kmem_alloc memory and hang it
1302          * off msg_control and msg_name fields. This forces
1303          * us to copy the structure to its native form.
1304          */
1305         lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name);
1306         lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen);
1307         lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov);
1308         lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen);
1309         lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control);
1310         lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen);
1311         lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags);
1312 
1313         iovcnt = lmsg.msg_iovlen;
1314 
1315         if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) {
1316                 /*
1317                  * Unless this is XPG 4.2 we allow iovcnt == 0 to
1318                  * be compatible with SunOS 4.X and 4.4BSD.
1319                  */
1320                 if (iovcnt != 0 || (flags & MSG_XPG4_2))
1321                         return (set_errno(EMSGSIZE));
1322         }
1323 
1324 #ifdef _SYSCALL32_IMPL
1325         /*
1326          * 32-bit callers need to have their iovec expanded, while ensuring
1327          * that they can't move more than 2Gbytes of data in a single call.
1328          */
1329         if (model == DATAMODEL_ILP32) {
1330                 struct iovec32 aiov32[MSG_MAXIOVLEN];
1331                 ssize32_t count32;
1332 
1333                 if (iovcnt != 0 &&
1334                     copyin((struct iovec32 *)lmsg.msg_iov, aiov32,
1335                     iovcnt * sizeof (struct iovec32)))
1336                         return (set_errno(EFAULT));
1337 
1338                 count32 = 0;
1339                 for (i = 0; i < iovcnt; i++) {
1340                         ssize32_t iovlen32;
1341 
1342                         iovlen32 = aiov32[i].iov_len;
1343                         count32 += iovlen32;
1344                         if (iovlen32 < 0 || count32 < 0)
1345                                 return (set_errno(EINVAL));
1346                         aiov[i].iov_len = iovlen32;
1347                         aiov[i].iov_base =
1348                             (caddr_t)(uintptr_t)aiov32[i].iov_base;
1349                 }
1350         } else
1351 #endif /* _SYSCALL32_IMPL */
1352         if (iovcnt != 0 &&
1353             copyin(lmsg.msg_iov, aiov,
1354             (unsigned)iovcnt * sizeof (struct iovec))) {
1355                 return (set_errno(EFAULT));
1356         }
1357         len = 0;
1358         for (i = 0; i < iovcnt; i++) {
1359                 ssize_t iovlen = aiov[i].iov_len;
1360                 len += iovlen;
1361                 if (iovlen < 0 || len < 0) {
1362                         return (set_errno(EINVAL));
1363                 }
1364         }
1365         auio.uio_loffset = 0;
1366         auio.uio_iov = aiov;
1367         auio.uio_iovcnt = iovcnt;
1368         auio.uio_resid = len;
1369         auio.uio_segflg = UIO_USERSPACE;
1370         auio.uio_limit = 0;
1371 
1372         return (sendit(sock, &lmsg, &auio, flags));
1373 }
1374 
1375 ssize_t
1376 sendto(int sock, void *buffer, size_t len, int flags,
1377     struct sockaddr *name, socklen_t namelen)
1378 {
1379         struct nmsghdr lmsg;
1380         struct uio auio;
1381         struct iovec aiov[1];
1382 
1383         dprint(1, ("sendto(%d, %p, %ld, %d, %p, %d)\n",
1384             sock, buffer, len, flags, (void *)name, namelen));
1385 
1386         if ((ssize_t)len < 0) {
1387                 return (set_errno(EINVAL));
1388         }
1389 
1390         aiov[0].iov_base = buffer;
1391         aiov[0].iov_len = len;
1392         auio.uio_loffset = 0;
1393         auio.uio_iov = aiov;
1394         auio.uio_iovcnt = 1;
1395         auio.uio_resid = len;
1396         auio.uio_segflg = UIO_USERSPACE;
1397         auio.uio_limit = 0;
1398 
1399         lmsg.msg_name = (char *)name;
1400         lmsg.msg_namelen = namelen;
1401         lmsg.msg_control = NULL;
1402         if (!(flags & MSG_XPG4_2)) {
1403                 /*
1404                  * In order to be compatible with the libsocket/sockmod
1405                  * implementation we set EOR for all send* calls.
1406                  */
1407                 flags |= MSG_EOR;
1408         }
1409         return (sendit(sock, &lmsg, &auio, flags));
1410 }
1411 
1412 /*ARGSUSED3*/
1413 int
1414 getpeername(int sock, struct sockaddr *name, socklen_t *namelenp, int version)
1415 {
1416         struct sonode *so;
1417         int error;
1418         socklen_t namelen;
1419         socklen_t sock_addrlen;
1420         struct sockaddr *sock_addrp;
1421 
1422         dprint(1, ("getpeername(%d, %p, %p)\n",
1423             sock, (void *)name, (void *)namelenp));
1424 
1425         if ((so = getsonode(sock, &error, NULL)) == NULL)
1426                 goto bad;
1427 
1428         ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1429         if (copyin(namelenp, &namelen, sizeof (namelen)) ||
1430             (name == NULL && namelen != 0)) {
1431                 error = EFAULT;
1432                 goto rel_out;
1433         }
1434         sock_addrlen = so->so_max_addr_len;
1435         sock_addrp = (struct sockaddr *)kmem_alloc(sock_addrlen, KM_SLEEP);
1436 
1437         if ((error = socket_getpeername(so, sock_addrp, &sock_addrlen,
1438             B_FALSE, CRED())) == 0) {
1439                 ASSERT(sock_addrlen <= so->so_max_addr_len);
1440                 error = copyout_name(name, namelen, namelenp,
1441                     (void *)sock_addrp, sock_addrlen);
1442         }
1443         kmem_free(sock_addrp, so->so_max_addr_len);
1444 rel_out:
1445         releasef(sock);
1446 bad:    return (error != 0 ? set_errno(error) : 0);
1447 }
1448 
1449 /*ARGSUSED3*/
1450 int
1451 getsockname(int sock, struct sockaddr *name,
1452                 socklen_t *namelenp, int version)
1453 {
1454         struct sonode *so;
1455         int error;
1456         socklen_t namelen, sock_addrlen;
1457         struct sockaddr *sock_addrp;
1458 
1459         dprint(1, ("getsockname(%d, %p, %p)\n",
1460             sock, (void *)name, (void *)namelenp));
1461 
1462         if ((so = getsonode(sock, &error, NULL)) == NULL)
1463                 goto bad;
1464 
1465         ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1466         if (copyin(namelenp, &namelen, sizeof (namelen)) ||
1467             (name == NULL && namelen != 0)) {
1468                 error = EFAULT;
1469                 goto rel_out;
1470         }
1471 
1472         sock_addrlen = so->so_max_addr_len;
1473         sock_addrp = (struct sockaddr *)kmem_alloc(sock_addrlen, KM_SLEEP);
1474         if ((error = socket_getsockname(so, sock_addrp, &sock_addrlen,
1475             CRED())) == 0) {
1476                 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1477                 ASSERT(sock_addrlen <= so->so_max_addr_len);
1478                 error = copyout_name(name, namelen, namelenp,
1479                     (void *)sock_addrp, sock_addrlen);
1480         }
1481         kmem_free(sock_addrp, so->so_max_addr_len);
1482 rel_out:
1483         releasef(sock);
1484 bad:    return (error != 0 ? set_errno(error) : 0);
1485 }
1486 
1487 /*ARGSUSED5*/
1488 int
1489 getsockopt(int sock,
1490         int level,
1491         int option_name,
1492         void *option_value,
1493         socklen_t *option_lenp,
1494         int version)
1495 {
1496         struct sonode *so;
1497         socklen_t optlen, optlen_res;
1498         void *optval;
1499         int error;
1500 
1501         dprint(1, ("getsockopt(%d, %d, %d, %p, %p)\n",
1502             sock, level, option_name, option_value, (void *)option_lenp));
1503 
1504         if ((so = getsonode(sock, &error, NULL)) == NULL)
1505                 return (set_errno(error));
1506 
1507         ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1508         if (copyin(option_lenp, &optlen, sizeof (optlen))) {
1509                 releasef(sock);
1510                 return (set_errno(EFAULT));
1511         }
1512         /*
1513          * Verify that the length is not excessive to prevent
1514          * an application from consuming all of kernel memory.
1515          */
1516         if (optlen > SO_MAXARGSIZE) {
1517                 error = EINVAL;
1518                 releasef(sock);
1519                 return (set_errno(error));
1520         }
1521         optval = kmem_alloc(optlen, KM_SLEEP);
1522         optlen_res = optlen;
1523         error = socket_getsockopt(so, level, option_name, optval,
1524             &optlen_res, (version != SOV_XPG4_2) ? 0 : _SOGETSOCKOPT_XPG4_2,
1525             CRED());
1526         releasef(sock);
1527         if (error) {
1528                 kmem_free(optval, optlen);
1529                 return (set_errno(error));
1530         }
1531         error = copyout_arg(option_value, optlen, option_lenp,
1532             optval, optlen_res);
1533         kmem_free(optval, optlen);
1534         if (error)
1535                 return (set_errno(error));
1536         return (0);
1537 }
1538 
1539 /*ARGSUSED5*/
1540 int
1541 setsockopt(int sock,
1542         int level,
1543         int option_name,
1544         void *option_value,
1545         socklen_t option_len,
1546         int version)
1547 {
1548         struct sonode *so;
1549         intptr_t buffer[2];
1550         void *optval = NULL;
1551         int error;
1552 
1553         dprint(1, ("setsockopt(%d, %d, %d, %p, %d)\n",
1554             sock, level, option_name, option_value, option_len));
1555 
1556         if ((so = getsonode(sock, &error, NULL)) == NULL)
1557                 return (set_errno(error));
1558 
1559         if (option_value != NULL) {
1560                 if (option_len != 0) {
1561                         /*
1562                          * Verify that the length is not excessive to prevent
1563                          * an application from consuming all of kernel memory.
1564                          */
1565                         if (option_len > SO_MAXARGSIZE) {
1566                                 error = EINVAL;
1567                                 goto done2;
1568                         }
1569                         optval = option_len <= sizeof (buffer) ?
1570                             &buffer : kmem_alloc((size_t)option_len, KM_SLEEP);
1571                         ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1572                         if (copyin(option_value, optval, (size_t)option_len)) {
1573                                 error = EFAULT;
1574                                 goto done1;
1575                         }
1576                 }
1577         } else
1578                 option_len = 0;
1579 
1580         error = socket_setsockopt(so, level, option_name, optval,
1581             (t_uscalar_t)option_len, CRED());
1582 done1:
1583         if (optval != buffer)
1584                 kmem_free(optval, (size_t)option_len);
1585 done2:
1586         releasef(sock);
1587         if (error)
1588                 return (set_errno(error));
1589         return (0);
1590 }
1591 
1592 static int
1593 sockconf_add_sock(int family, int type, int protocol, char *name)
1594 {
1595         int error = 0;
1596         char *kdevpath = NULL;
1597         char *kmodule = NULL;
1598         char *buf = NULL;
1599         size_t pathlen = 0;
1600         struct sockparams *sp;
1601 
1602         if (name == NULL)
1603                 return (EINVAL);
1604         /*
1605          * Copyin the name.
1606          * This also makes it possible to check for too long pathnames.
1607          * Compress the space needed for the name before passing it
1608          * to soconfig - soconfig will store the string until
1609          * the configuration is removed.
1610          */
1611         buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1612         if ((error = copyinstr(name, buf, MAXPATHLEN, &pathlen)) != 0) {
1613                 kmem_free(buf, MAXPATHLEN);
1614                 return (error);
1615         }
1616         if (strncmp(buf, "/dev", strlen("/dev")) == 0) {
1617                 /* For device */
1618 
1619                 /*
1620                  * Special handling for NCA:
1621                  *
1622                  * DEV_NCA is never opened even if an application
1623                  * requests for AF_NCA. The device opened is instead a
1624                  * predefined AF_INET transport (NCA_INET_DEV).
1625                  *
1626                  * Prior to Volo (PSARC/2007/587) NCA would determine
1627                  * the device using a lookup, which worked then because
1628                  * all protocols were based on TPI. Since TPI is no
1629                  * longer the default, we have to explicitly state
1630                  * which device to use.
1631                  */
1632                 if (strcmp(buf, NCA_DEV) == 0) {
1633                         /* only support entry <28, 2, 0> */
1634                         if (family != AF_NCA || type != SOCK_STREAM ||
1635                             protocol != 0) {
1636                                 kmem_free(buf, MAXPATHLEN);
1637                                 return (EINVAL);
1638                         }
1639 
1640                         pathlen = strlen(NCA_INET_DEV) + 1;
1641                         kdevpath = kmem_alloc(pathlen, KM_SLEEP);
1642                         bcopy(NCA_INET_DEV, kdevpath, pathlen);
1643                         kdevpath[pathlen - 1] = '\0';
1644                 } else {
1645                         kdevpath = kmem_alloc(pathlen, KM_SLEEP);
1646                         bcopy(buf, kdevpath, pathlen);
1647                         kdevpath[pathlen - 1] = '\0';
1648                 }
1649         } else {
1650                 /* For socket module */
1651                 kmodule = kmem_alloc(pathlen, KM_SLEEP);
1652                 bcopy(buf, kmodule, pathlen);
1653                 kmodule[pathlen - 1] = '\0';
1654                 pathlen = 0;
1655         }
1656         kmem_free(buf, MAXPATHLEN);
1657 
1658         /* sockparams_create frees mod name and devpath upon failure */
1659         sp = sockparams_create(family, type, protocol, kmodule,
1660             kdevpath, pathlen, 0, KM_SLEEP, &error);
1661         if (sp != NULL) {
1662                 error = sockparams_add(sp);
1663                 if (error != 0)
1664                         sockparams_destroy(sp);
1665         }
1666 
1667         return (error);
1668 }
1669 
/*
 * Remove the socket configuration entry for <family, type, protocol>.
 * Returns whatever sockparams_delete() reports.
 */
static int
sockconf_remove_sock(int family, int type, int protocol)
{
	int error;

	error = sockparams_delete(family, type, protocol);
	return (error);
}
1675 
1676 static int
1677 sockconfig_remove_filter(const char *uname)
1678 {
1679         char kname[SOF_MAXNAMELEN];
1680         size_t len;
1681         int error;
1682         sof_entry_t *ent;
1683 
1684         if ((error = copyinstr(uname, kname, SOF_MAXNAMELEN, &len)) != 0)
1685                 return (error);
1686 
1687         ent = sof_entry_remove_by_name(kname);
1688         if (ent == NULL)
1689                 return (ENXIO);
1690 
1691         mutex_enter(&ent->sofe_lock);
1692         ASSERT(!(ent->sofe_flags & SOFEF_CONDEMED));
1693         if (ent->sofe_refcnt == 0) {
1694                 mutex_exit(&ent->sofe_lock);
1695                 sof_entry_free(ent);
1696         } else {
1697                 /* let the last socket free the filter */
1698                 ent->sofe_flags |= SOFEF_CONDEMED;
1699                 mutex_exit(&ent->sofe_lock);
1700         }
1701 
1702         return (0);
1703 }
1704 
1705 static int
1706 sockconfig_add_filter(const char *uname, void *ufilpropp)
1707 {
1708         struct sockconfig_filter_props filprop;
1709         sof_entry_t *ent;
1710         int error;
1711         size_t tuplesz, len;
1712         char hintbuf[SOF_MAXNAMELEN];
1713 
1714         ent = kmem_zalloc(sizeof (sof_entry_t), KM_SLEEP);
1715         mutex_init(&ent->sofe_lock, NULL, MUTEX_DEFAULT, NULL);
1716 
1717         if ((error = copyinstr(uname, ent->sofe_name, SOF_MAXNAMELEN,
1718             &len)) != 0) {
1719                 sof_entry_free(ent);
1720                 return (error);
1721         }
1722 
1723         if (get_udatamodel() == DATAMODEL_NATIVE) {
1724                 if (copyin(ufilpropp, &filprop, sizeof (filprop)) != 0) {
1725                         sof_entry_free(ent);
1726                         return (EFAULT);
1727                 }
1728         }
1729 #ifdef  _SYSCALL32_IMPL
1730         else {
1731                 struct sockconfig_filter_props32 filprop32;
1732 
1733                 if (copyin(ufilpropp, &filprop32, sizeof (filprop32)) != 0) {
1734                         sof_entry_free(ent);
1735                         return (EFAULT);
1736                 }
1737                 filprop.sfp_modname = (char *)(uintptr_t)filprop32.sfp_modname;
1738                 filprop.sfp_autoattach = filprop32.sfp_autoattach;
1739                 filprop.sfp_hint = filprop32.sfp_hint;
1740                 filprop.sfp_hintarg = (char *)(uintptr_t)filprop32.sfp_hintarg;
1741                 filprop.sfp_socktuple_cnt = filprop32.sfp_socktuple_cnt;
1742                 filprop.sfp_socktuple =
1743                     (sof_socktuple_t *)(uintptr_t)filprop32.sfp_socktuple;
1744         }
1745 #endif  /* _SYSCALL32_IMPL */
1746 
1747         if ((error = copyinstr(filprop.sfp_modname, ent->sofe_modname,
1748             sizeof (ent->sofe_modname), &len)) != 0) {
1749                 sof_entry_free(ent);
1750                 return (error);
1751         }
1752 
1753         /*
1754          * A filter must specify at least one socket tuple.
1755          */
1756         if (filprop.sfp_socktuple_cnt == 0 ||
1757             filprop.sfp_socktuple_cnt > SOF_MAXSOCKTUPLECNT) {
1758                 sof_entry_free(ent);
1759                 return (EINVAL);
1760         }
1761         ent->sofe_flags = filprop.sfp_autoattach ? SOFEF_AUTO : SOFEF_PROG;
1762         ent->sofe_hint = filprop.sfp_hint;
1763 
1764         /*
1765          * Verify the hint, and copy in the hint argument, if necessary.
1766          */
1767         switch (ent->sofe_hint) {
1768         case SOF_HINT_BEFORE:
1769         case SOF_HINT_AFTER:
1770                 if ((error = copyinstr(filprop.sfp_hintarg, hintbuf,
1771                     sizeof (hintbuf), &len)) != 0) {
1772                         sof_entry_free(ent);
1773                         return (error);
1774                 }
1775                 ent->sofe_hintarg = kmem_alloc(len, KM_SLEEP);
1776                 bcopy(hintbuf, ent->sofe_hintarg, len);
1777                 /* FALLTHRU */
1778         case SOF_HINT_TOP:
1779         case SOF_HINT_BOTTOM:
1780                 /* hints cannot be used with programmatic filters */
1781                 if (ent->sofe_flags & SOFEF_PROG) {
1782                         sof_entry_free(ent);
1783                         return (EINVAL);
1784                 }
1785                 break;
1786         case SOF_HINT_NONE:
1787                 break;
1788         default:
1789                 /* bad hint value */
1790                 sof_entry_free(ent);
1791                 return (EINVAL);
1792         }
1793 
1794         ent->sofe_socktuple_cnt = filprop.sfp_socktuple_cnt;
1795         tuplesz = sizeof (sof_socktuple_t) * ent->sofe_socktuple_cnt;
1796         ent->sofe_socktuple = kmem_alloc(tuplesz, KM_SLEEP);
1797 
1798         if (get_udatamodel() == DATAMODEL_NATIVE) {
1799                 if (copyin(filprop.sfp_socktuple, ent->sofe_socktuple,
1800                     tuplesz)) {
1801                         sof_entry_free(ent);
1802                         return (EFAULT);
1803                 }
1804         }
1805 #ifdef  _SYSCALL32_IMPL
1806         else {
1807                 int i;
1808                 caddr_t data = (caddr_t)filprop.sfp_socktuple;
1809                 sof_socktuple_t *tup = ent->sofe_socktuple;
1810                 sof_socktuple32_t tup32;
1811 
1812                 tup = ent->sofe_socktuple;
1813                 for (i = 0; i < ent->sofe_socktuple_cnt; i++, tup++) {
1814                         ASSERT(tup < ent->sofe_socktuple + tuplesz);
1815 
1816                         if (copyin(data, &tup32, sizeof (tup32)) != 0) {
1817                                 sof_entry_free(ent);
1818                                 return (EFAULT);
1819                         }
1820                         tup->sofst_family = tup32.sofst_family;
1821                         tup->sofst_type = tup32.sofst_type;
1822                         tup->sofst_protocol = tup32.sofst_protocol;
1823 
1824                         data += sizeof (tup32);
1825                 }
1826         }
1827 #endif  /* _SYSCALL32_IMPL */
1828 
1829         /* Sockets can start using the filter as soon as the filter is added */
1830         if ((error = sof_entry_add(ent)) != 0)
1831                 sof_entry_free(ent);
1832 
1833         return (error);
1834 }
1835 
1836 /*
1837  * Socket configuration system call. It is used to add and remove
1838  * socket types.
1839  */
1840 int
1841 sockconfig(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
1842 {
1843         int error = 0;
1844 
1845         if (secpolicy_net_config(CRED(), B_FALSE) != 0)
1846                 return (set_errno(EPERM));
1847 
1848         if (sockfs_defer_nl7c_init) {
1849                 nl7c_init();
1850                 sockfs_defer_nl7c_init = 0;
1851         }
1852 
1853         switch (cmd) {
1854         case SOCKCONFIG_ADD_SOCK:
1855                 error = sockconf_add_sock((int)(uintptr_t)arg1,
1856                     (int)(uintptr_t)arg2, (int)(uintptr_t)arg3, arg4);
1857                 break;
1858         case SOCKCONFIG_REMOVE_SOCK:
1859                 error = sockconf_remove_sock((int)(uintptr_t)arg1,
1860                     (int)(uintptr_t)arg2, (int)(uintptr_t)arg3);
1861                 break;
1862         case SOCKCONFIG_ADD_FILTER:
1863                 error = sockconfig_add_filter((const char *)arg1, arg2);
1864                 break;
1865         case SOCKCONFIG_REMOVE_FILTER:
1866                 error = sockconfig_remove_filter((const char *)arg1);
1867                 break;
1868         case SOCKCONFIG_GET_SOCKTABLE:
1869                 error = sockparams_copyout_socktable((int)(uintptr_t)arg1);
1870                 break;
1871         default:
1872 #ifdef  DEBUG
1873                 cmn_err(CE_NOTE, "sockconfig: unkonwn subcommand %d", cmd);
1874 #endif
1875                 error = EINVAL;
1876                 break;
1877         }
1878 
1879         if (error != 0) {
1880                 eprintline(error);
1881                 return (set_errno(error));
1882         }
1883         return (0);
1884 }
1885 
1886 
1887 /*
1888  * Sendfile is implemented through two schemes, direct I/O or by
1889  * caching in the filesystem page cache. We cache the input file by
1890  * default and use direct I/O only if sendfile_max_size is set
1891  * appropriately as explained below. Note that this logic is consistent
1892  * with other filesystems where caching is turned on by default
1893  * unless explicitly turned off by using the DIRECTIO ioctl.
1894  *
1895  * We choose a slightly different scheme here. One can turn off
1896  * caching by setting sendfile_max_size to 0. One can also enable
1897  * caching of files <= sendfile_max_size by setting sendfile_max_size
1898  * to an appropriate value. By default sendfile_max_size is set to the
1899  * maximum value so that all files are cached. In future, we may provide
1900  * better interfaces for caching the file.
1901  *
1902  * Sendfile through Direct I/O (Zero copy)
1903  * --------------------------------------
1904  *
1905  * As disks are normally slower than the network, we can't have a
1906  * single thread that reads the disk and writes to the network. We
1907  * need to have parallelism. This is done by having the sendfile
1908  * thread create another thread that reads from the filesystem
1909  * and queues it for network processing. In this scheme, the data
 * is never copied anywhere, i.e., it is zero copy, unlike the other
 * scheme.
1912  *
1913  * We have a sendfile queue (snfq) where each sendfile
1914  * request (snf_req_t) is queued for processing by a thread. Number
1915  * of threads is dynamically allocated and they exit if they are idling
1916  * beyond a specified amount of time. When each request (snf_req_t) is
1917  * processed by a thread, it produces a number of mblk_t structures to
1918  * be consumed by the sendfile thread. snf_deque and snf_enque are
1919  * used for consuming and producing mblks. Size of the filesystem
1920  * read is determined by the tunable (sendfile_read_size). A single
1921  * mblk holds sendfile_read_size worth of data (except the last
1922  * read of the file) which is sent down as a whole to the network.
1923  * sendfile_read_size is set to 1 MB as this seems to be the optimal
1924  * value for the UFS filesystem backed by a striped storage array.
1925  *
1926  * Synchronisation between read (producer) and write (consumer) threads.
1927  * --------------------------------------------------------------------
1928  *
 * sr_lock protects sr_mp_head and sr_mp_tail. The lock is held while
 * adding and deleting items in this list. An error can happen at any time
 * during read or write. There could be unprocessed mblks in the
 * sr_mp_XXX list when a read or write error occurs. Whenever an error
 * is encountered, we need two things to happen:
1934  *
 * a) One of the threads needs to clean up the mblks.
1936  * b) When one thread encounters an error, the other should stop.
1937  *
1938  * For (a), we don't want to penalize the reader thread as it could do
1939  * some useful work processing other requests. For (b), the error can
1940  * be detected by examining sr_read_error or sr_write_error.
 * sr_lock protects sr_read_error and sr_write_error. If both the reader and
 * the writer encounter an error, we need to report the write error back to
1943  * the application as that's what would have happened if the operations
1944  * were done sequentially. With this in mind, following should work :
1945  *
1946  *      - Check for errors before read or write.
1947  *      - If the reader encounters error, set the error in sr_read_error.
1948  *        Check sr_write_error, if it is set, send cv_signal as it is
1949  *        waiting for reader to complete. If it is not set, the writer
1950  *        is either running sinking data to the network or blocked
1951  *        because of flow control. For handling the latter case, we
1952  *        always send a signal. In any case, it will examine sr_read_error
1953  *        and return. sr_read_error is marked with SR_READ_DONE to tell
1954  *        the writer that the reader is done in all the cases.
1955  *      - If the writer encounters error, set the error in sr_write_error.
1956  *        The reader thread is either blocked because of flow control or
1957  *        running reading data from the disk. For the former, we need to
1958  *        wakeup the thread. Again to keep it simple, we always wake up
1959  *        the reader thread. Then, wait for the read thread to complete
1960  *        if it is not done yet. Cleanup and return.
1961  *
1962  * High and low water marks for the read thread.
1963  * --------------------------------------------
1964  *
1965  * If sendfile() is used to send data over a slow network, we need to
1966  * make sure that the read thread does not produce data at a faster
1967  * rate than the network. This can happen if the disk is faster than
1968  * the network. In such a case, we don't want to build a very large queue.
1969  * But we would still like to get all of the network throughput possible.
1970  * This implies that network should never block waiting for data.
1971  * As there are lot of disk throughput/network throughput combinations
1972  * possible, it is difficult to come up with an accurate number.
1973  * A typical 10K RPM disk has a max seek latency 17ms and rotational
1974  * latency of 3ms for reading a disk block. Thus, the total latency to
1975  * initiate a new read, transfer data from the disk and queue for
 * transmission would take about a max of 25ms. Today's max transfer rate
1977  * for network is 100MB/sec. If the thread is blocked because of flow
1978  * control, it would take 25ms to get new data ready for transmission.
1979  * We have to make sure that network is not idling, while we are initiating
1980  * new transfers. So, at 100MB/sec, to keep network busy we would need
1981  * 2.5MB of data. Rounding off, we keep the low water mark to be 3MB of data.
1982  * We need to pick a high water mark so that the woken up thread would
1983  * do considerable work before blocking again to prevent thrashing. Currently,
1984  * we pick this to be 10 times that of the low water mark.
1985  *
1986  * Sendfile with segmap caching (One copy from page cache to mblks).
1987  * ----------------------------------------------------------------
1988  *
1989  * We use the segmap cache for caching the file, if the size of file
1990  * is <= sendfile_max_size. In this case we don't use threads as VM
1991  * is reasonably fast enough to keep up with the network. If the underlying
1992  * transport allows, we call segmap_getmapflt() to map MAXBSIZE (8K) worth
1993  * of data into segmap space, and use the virtual address from segmap
1994  * directly through desballoc() to avoid copy. Once the transport is done
1995  * with the data, the mapping will be released through segmap_release()
1996  * called by the call-back routine.
1997  *
1998  * If zero-copy is not allowed by the transport, we simply call VOP_READ()
1999  * to copy the data from the filesystem into our temporary network buffer.
2000  *
2001  * To disable caching, set sendfile_max_size to 0.
2002  */
2003 
2004 uint_t sendfile_read_size = 1024 * 1024;
2005 #define SENDFILE_REQ_LOWAT      3 * 1024 * 1024
2006 uint_t sendfile_req_lowat = SENDFILE_REQ_LOWAT;
2007 uint_t sendfile_req_hiwat = 10 * SENDFILE_REQ_LOWAT;
2008 struct sendfile_stats sf_stats;
2009 struct sendfile_queue *snfq;
2010 clock_t snfq_timeout;
2011 off64_t sendfile_max_size;
2012 
2013 static void snf_enque(snf_req_t *, mblk_t *);
2014 static mblk_t *snf_deque(snf_req_t *);
2015 
2016 void
2017 sendfile_init(void)
2018 {
2019         snfq = kmem_zalloc(sizeof (struct sendfile_queue), KM_SLEEP);
2020 
2021         mutex_init(&snfq->snfq_lock, NULL, MUTEX_DEFAULT, NULL);
2022         cv_init(&snfq->snfq_cv, NULL, CV_DEFAULT, NULL);
2023         snfq->snfq_max_threads = max_ncpus;
2024         snfq_timeout = SNFQ_TIMEOUT;
2025         /* Cache all files by default. */
2026         sendfile_max_size = MAXOFFSET_T;
2027 }
2028 
2029 /*
2030  * Queues a mblk_t for network processing.
2031  */
2032 static void
2033 snf_enque(snf_req_t *sr, mblk_t *mp)
2034 {
2035         mp->b_next = NULL;
2036         mutex_enter(&sr->sr_lock);
2037         if (sr->sr_mp_head == NULL) {
2038                 sr->sr_mp_head = sr->sr_mp_tail = mp;
2039                 cv_signal(&sr->sr_cv);
2040         } else {
2041                 sr->sr_mp_tail->b_next = mp;
2042                 sr->sr_mp_tail = mp;
2043         }
2044         sr->sr_qlen += MBLKL(mp);
2045         while ((sr->sr_qlen > sr->sr_hiwat) &&
2046             (sr->sr_write_error == 0)) {
2047                 sf_stats.ss_full_waits++;
2048                 cv_wait(&sr->sr_cv, &sr->sr_lock);
2049         }
2050         mutex_exit(&sr->sr_lock);
2051 }
2052 
2053 /*
2054  * De-queues a mblk_t for network processing.
2055  */
2056 static mblk_t *
2057 snf_deque(snf_req_t *sr)
2058 {
2059         mblk_t *mp;
2060 
2061         mutex_enter(&sr->sr_lock);
2062         /*
2063          * If we have encountered an error on read or read is
2064          * completed and no more mblks, return NULL.
2065          * We need to check for NULL sr_mp_head also as
2066          * the reads could have completed and there is
2067          * nothing more to come.
2068          */
2069         if (((sr->sr_read_error & ~SR_READ_DONE) != 0) ||
2070             ((sr->sr_read_error & SR_READ_DONE) &&
2071             sr->sr_mp_head == NULL)) {
2072                 mutex_exit(&sr->sr_lock);
2073                 return (NULL);
2074         }
2075         /*
2076          * To start with neither SR_READ_DONE is marked nor
2077          * the error is set. When we wake up from cv_wait,
2078          * following are the possibilities :
2079          *
2080          *      a) sr_read_error is zero and mblks are queued.
2081          *      b) sr_read_error is set to SR_READ_DONE
2082          *         and mblks are queued.
2083          *      c) sr_read_error is set to SR_READ_DONE
2084          *         and no mblks.
2085          *      d) sr_read_error is set to some error other
2086          *         than SR_READ_DONE.
2087          */
2088 
2089         while ((sr->sr_read_error == 0) && (sr->sr_mp_head == NULL)) {
2090                 sf_stats.ss_empty_waits++;
2091                 cv_wait(&sr->sr_cv, &sr->sr_lock);
2092         }
2093         /* Handle (a) and (b) first  - the normal case. */
2094         if (((sr->sr_read_error & ~SR_READ_DONE) == 0) &&
2095             (sr->sr_mp_head != NULL)) {
2096                 mp = sr->sr_mp_head;
2097                 sr->sr_mp_head = mp->b_next;
2098                 sr->sr_qlen -= MBLKL(mp);
2099                 if (sr->sr_qlen < sr->sr_lowat)
2100                         cv_signal(&sr->sr_cv);
2101                 mutex_exit(&sr->sr_lock);
2102                 mp->b_next = NULL;
2103                 return (mp);
2104         }
2105         /* Handle (c) and (d). */
2106         mutex_exit(&sr->sr_lock);
2107         return (NULL);
2108 }
2109 
2110 /*
2111  * Reads data from the filesystem and queues it for network processing.
2112  */
2113 void
2114 snf_async_read(snf_req_t *sr)
2115 {
2116         size_t iosize;
2117         u_offset_t fileoff;
2118         u_offset_t size;
2119         int ret_size;
2120         int error;
2121         file_t *fp;
2122         mblk_t *mp;
2123         struct vnode *vp;
2124         int extra = 0;
2125         int maxblk = 0;
2126         int wroff = 0;
2127         struct sonode *so;
2128 
2129         fp = sr->sr_fp;
2130         size = sr->sr_file_size;
2131         fileoff = sr->sr_file_off;
2132 
2133         /*
2134          * Ignore the error for filesystems that doesn't support DIRECTIO.
2135          */
2136         (void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_ON, 0,
2137             kcred, NULL, NULL);
2138 
2139         vp = sr->sr_vp;
2140         if (vp->v_type == VSOCK) {
2141                 stdata_t *stp;
2142 
2143                 /*
2144                  * Get the extra space to insert a header and a trailer.
2145                  */
2146                 so = VTOSO(vp);
2147                 stp = vp->v_stream;
2148                 if (stp == NULL) {
2149                         wroff = so->so_proto_props.sopp_wroff;
2150                         maxblk = so->so_proto_props.sopp_maxblk;
2151                         extra = wroff + so->so_proto_props.sopp_tail;
2152                 } else {
2153                         wroff = (int)(stp->sd_wroff);
2154                         maxblk = (int)(stp->sd_maxblk);
2155                         extra = wroff + (int)(stp->sd_tail);
2156                 }
2157         }
2158 
2159         while ((size != 0) && (sr->sr_write_error == 0)) {
2160 
2161                 iosize = (int)MIN(sr->sr_maxpsz, size);
2162 
2163                 /*
2164                  * Socket filters can limit the mblk size,
2165                  * so limit reads to maxblk if there are
2166                  * filters present.
2167                  */
2168                 if (vp->v_type == VSOCK &&
2169                     so->so_filter_active > 0 && maxblk != INFPSZ)
2170                         iosize = (int)MIN(iosize, maxblk);
2171 
2172                 if (is_system_labeled()) {
2173                         mp = allocb_cred(iosize + extra, CRED(),
2174                             curproc->p_pid);
2175                 } else {
2176                         mp = allocb(iosize + extra, BPRI_MED);
2177                 }
2178                 if (mp == NULL) {
2179                         error = EAGAIN;
2180                         break;
2181                 }
2182 
2183                 mp->b_rptr += wroff;
2184 
2185                 ret_size = soreadfile(fp, mp->b_rptr, fileoff, &error, iosize);
2186 
2187                 /* Error or Reached EOF ? */
2188                 if ((error != 0) || (ret_size == 0)) {
2189                         freeb(mp);
2190                         break;
2191                 }
2192                 mp->b_wptr = mp->b_rptr + ret_size;
2193 
2194                 snf_enque(sr, mp);
2195                 size -= ret_size;
2196                 fileoff += ret_size;
2197         }
2198         (void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_OFF, 0,
2199             kcred, NULL, NULL);
2200         mutex_enter(&sr->sr_lock);
2201         sr->sr_read_error = error;
2202         sr->sr_read_error |= SR_READ_DONE;
2203         cv_signal(&sr->sr_cv);
2204         mutex_exit(&sr->sr_lock);
2205 }
2206 
2207 void
2208 snf_async_thread(void)
2209 {
2210         snf_req_t *sr;
2211         callb_cpr_t cprinfo;
2212         clock_t time_left = 1;
2213 
2214         CALLB_CPR_INIT(&cprinfo, &snfq->snfq_lock, callb_generic_cpr, "snfq");
2215 
2216         mutex_enter(&snfq->snfq_lock);
2217         for (;;) {
2218                 /*
2219                  * If we didn't find a entry, then block until woken up
2220                  * again and then look through the queues again.
2221                  */
2222                 while ((sr = snfq->snfq_req_head) == NULL) {
2223                         CALLB_CPR_SAFE_BEGIN(&cprinfo);
2224                         if (time_left <= 0) {
2225                                 snfq->snfq_svc_threads--;
2226                                 CALLB_CPR_EXIT(&cprinfo);
2227                                 thread_exit();
2228                                 /* NOTREACHED */
2229                         }
2230                         snfq->snfq_idle_cnt++;
2231 
2232                         time_left = cv_reltimedwait(&snfq->snfq_cv,
2233                             &snfq->snfq_lock, snfq_timeout, TR_CLOCK_TICK);
2234                         snfq->snfq_idle_cnt--;
2235 
2236                         CALLB_CPR_SAFE_END(&cprinfo, &snfq->snfq_lock);
2237                 }
2238                 snfq->snfq_req_head = sr->sr_next;
2239                 snfq->snfq_req_cnt--;
2240                 mutex_exit(&snfq->snfq_lock);
2241                 snf_async_read(sr);
2242                 mutex_enter(&snfq->snfq_lock);
2243         }
2244 }
2245 
2246 
2247 snf_req_t *
2248 create_thread(int operation, struct vnode *vp, file_t *fp,
2249     u_offset_t fileoff, u_offset_t size)
2250 {
2251         snf_req_t *sr;
2252         stdata_t *stp;
2253 
2254         sr = (snf_req_t *)kmem_zalloc(sizeof (snf_req_t), KM_SLEEP);
2255 
2256         sr->sr_vp = vp;
2257         sr->sr_fp = fp;
2258         stp = vp->v_stream;
2259 
2260         /*
2261          * store sd_qn_maxpsz into sr_maxpsz while we have stream head.
2262          * stream might be closed before thread returns from snf_async_read.
2263          */
2264         if (stp != NULL && stp->sd_qn_maxpsz > 0) {
2265                 sr->sr_maxpsz = MIN(MAXBSIZE, stp->sd_qn_maxpsz);
2266         } else {
2267                 sr->sr_maxpsz = MAXBSIZE;
2268         }
2269 
2270         sr->sr_operation = operation;
2271         sr->sr_file_off = fileoff;
2272         sr->sr_file_size = size;
2273         sr->sr_hiwat = sendfile_req_hiwat;
2274         sr->sr_lowat = sendfile_req_lowat;
2275         mutex_init(&sr->sr_lock, NULL, MUTEX_DEFAULT, NULL);
2276         cv_init(&sr->sr_cv, NULL, CV_DEFAULT, NULL);
2277         /*
2278          * See whether we need another thread for servicing this
2279          * request. If there are already enough requests queued
2280          * for the threads, create one if not exceeding
2281          * snfq_max_threads.
2282          */
2283         mutex_enter(&snfq->snfq_lock);
2284         if (snfq->snfq_req_cnt >= snfq->snfq_idle_cnt &&
2285             snfq->snfq_svc_threads < snfq->snfq_max_threads) {
2286                 (void) thread_create(NULL, 0, &snf_async_thread, 0, 0, &p0,
2287                     TS_RUN, minclsyspri);
2288                 snfq->snfq_svc_threads++;
2289         }
2290         if (snfq->snfq_req_head == NULL) {
2291                 snfq->snfq_req_head = snfq->snfq_req_tail = sr;
2292                 cv_signal(&snfq->snfq_cv);
2293         } else {
2294                 snfq->snfq_req_tail->sr_next = sr;
2295                 snfq->snfq_req_tail = sr;
2296         }
2297         snfq->snfq_req_cnt++;
2298         mutex_exit(&snfq->snfq_lock);
2299         return (sr);
2300 }
2301 
2302 int
2303 snf_direct_io(file_t *fp, file_t *rfp, u_offset_t fileoff, u_offset_t size,
2304     ssize_t *count)
2305 {
2306         snf_req_t *sr;
2307         mblk_t *mp;
2308         int iosize;
2309         int error = 0;
2310         short fflag;
2311         struct vnode *vp;
2312         int ksize;
2313         struct nmsghdr msg;
2314 
2315         ksize = 0;
2316         *count = 0;
2317         bzero(&msg, sizeof (msg));
2318 
2319         vp = fp->f_vnode;
2320         fflag = fp->f_flag;
2321         if ((sr = create_thread(READ_OP, vp, rfp, fileoff, size)) == NULL)
2322                 return (EAGAIN);
2323 
2324         /*
2325          * We check for read error in snf_deque. It has to check
2326          * for successful READ_DONE and return NULL, and we might
2327          * as well make an additional check there.
2328          */
2329         while ((mp = snf_deque(sr)) != NULL) {
2330 
2331                 if (ISSIG(curthread, JUSTLOOKING)) {
2332                         freeb(mp);
2333                         error = EINTR;
2334                         break;
2335                 }
2336                 iosize = MBLKL(mp);
2337 
2338                 error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
2339 
2340                 if (error != 0) {
2341                         if (mp != NULL)
2342                                 freeb(mp);
2343                         break;
2344                 }
2345                 ksize += iosize;
2346         }
2347         *count = ksize;
2348 
2349         mutex_enter(&sr->sr_lock);
2350         sr->sr_write_error = error;
2351         /* Look at the big comments on why we cv_signal here. */
2352         cv_signal(&sr->sr_cv);
2353 
2354         /* Wait for the reader to complete always. */
2355         while (!(sr->sr_read_error & SR_READ_DONE)) {
2356                 cv_wait(&sr->sr_cv, &sr->sr_lock);
2357         }
2358         /* If there is no write error, check for read error. */
2359         if (error == 0)
2360                 error = (sr->sr_read_error & ~SR_READ_DONE);
2361 
2362         if (error != 0) {
2363                 mblk_t *next_mp;
2364 
2365                 mp = sr->sr_mp_head;
2366                 while (mp != NULL) {
2367                         next_mp = mp->b_next;
2368                         mp->b_next = NULL;
2369                         freeb(mp);
2370                         mp = next_mp;
2371                 }
2372         }
2373         mutex_exit(&sr->sr_lock);
2374         kmem_free(sr, sizeof (snf_req_t));
2375         return (error);
2376 }
2377 
2378 /* Maximum no.of pages allocated by vpm for sendfile at a time */
2379 #define SNF_VPMMAXPGS   (VPMMAXPGS/2)
2380 
2381 /*
2382  * Maximum no.of elements in the list returned by vpm, including
2383  * NULL for the last entry
2384  */
2385 #define SNF_MAXVMAPS    (SNF_VPMMAXPGS + 1)
2386 
2387 typedef struct {
2388         unsigned int    snfv_ref;
2389         frtn_t          snfv_frtn;
2390         vnode_t         *snfv_vp;
2391         struct vmap     snfv_vml[SNF_MAXVMAPS];
2392 } snf_vmap_desbinfo;
2393 
2394 typedef struct {
2395         frtn_t          snfi_frtn;
2396         caddr_t         snfi_base;
2397         uint_t          snfi_mapoff;
2398         size_t          snfi_len;
2399         vnode_t         *snfi_vp;
2400 } snf_smap_desbinfo;
2401 
2402 /*
2403  * The callback function used for vpm mapped mblks called when the last ref of
2404  * the mblk is dropped which normally occurs when TCP receives the ack. But it
2405  * can be the driver too due to lazy reclaim.
2406  */
2407 void
2408 snf_vmap_desbfree(snf_vmap_desbinfo *snfv)
2409 {
2410         ASSERT(snfv->snfv_ref != 0);
2411         if (atomic_dec_32_nv(&snfv->snfv_ref) == 0) {
2412                 vpm_unmap_pages(snfv->snfv_vml, S_READ);
2413                 VN_RELE(snfv->snfv_vp);
2414                 kmem_free(snfv, sizeof (snf_vmap_desbinfo));
2415         }
2416 }
2417 
2418 /*
2419  * The callback function used for segmap'ped mblks called when the last ref of
2420  * the mblk is dropped which normally occurs when TCP receives the ack. But it
2421  * can be the driver too due to lazy reclaim.
2422  */
2423 void
2424 snf_smap_desbfree(snf_smap_desbinfo *snfi)
2425 {
2426         if (! IS_KPM_ADDR(snfi->snfi_base)) {
2427                 /*
2428                  * We don't need to call segmap_fault(F_SOFTUNLOCK) for
2429                  * segmap_kpm as long as the latter never falls back to
2430                  * "use_segmap_range". (See segmap_getmapflt().)
2431                  *
2432                  * Using S_OTHER saves an redundant hat_setref() in
2433                  * segmap_unlock()
2434                  */
2435                 (void) segmap_fault(kas.a_hat, segkmap,
2436                     (caddr_t)(uintptr_t)(((uintptr_t)snfi->snfi_base +
2437                     snfi->snfi_mapoff) & PAGEMASK), snfi->snfi_len,
2438                     F_SOFTUNLOCK, S_OTHER);
2439         }
2440         (void) segmap_release(segkmap, snfi->snfi_base, SM_DONTNEED);
2441         VN_RELE(snfi->snfi_vp);
2442         kmem_free(snfi, sizeof (*snfi));
2443 }
2444 
2445 /*
2446  * Use segmap or vpm instead of bcopy to send down a desballoca'ed, mblk.
2447  * When segmap is used, the mblk contains a segmap slot of no more
2448  * than MAXBSIZE.
2449  *
2450  * With vpm, a maximum of SNF_MAXVMAPS page-sized mappings can be obtained
2451  * in each iteration and sent by socket_sendmblk until an error occurs or
2452  * the requested size has been transferred. An mblk is esballoca'ed from
2453  * each mapped page and a chain of these mblk is sent to the transport layer.
2454  * vpm will be called to unmap the pages when all mblks have been freed by
2455  * free_func.
2456  *
2457  * At the end of the whole sendfile() operation, we wait till the data from
2458  * the last mblk is ack'ed by the transport before returning so that the
2459  * caller of sendfile() can safely modify the file content.
2460  */
2461 int
2462 snf_segmap(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t total_size,
2463     ssize_t *count, boolean_t nowait)
2464 {
2465         caddr_t base;
2466         int mapoff;
2467         vnode_t *vp;
2468         mblk_t *mp = NULL;
2469         int chain_size;
2470         int error;
2471         clock_t deadlk_wait;
2472         short fflag;
2473         int ksize;
2474         struct vattr va;
2475         boolean_t dowait = B_FALSE;
2476         struct nmsghdr msg;
2477 
2478         vp = fp->f_vnode;
2479         fflag = fp->f_flag;
2480         ksize = 0;
2481         bzero(&msg, sizeof (msg));
2482 
2483         for (;;) {
2484                 if (ISSIG(curthread, JUSTLOOKING)) {
2485                         error = EINTR;
2486                         break;
2487                 }
2488 
2489                 if (vpm_enable) {
2490                         snf_vmap_desbinfo *snfv;
2491                         mblk_t *nmp;
2492                         int mblk_size;
2493                         int maxsize;
2494                         int i;
2495 
2496                         mapoff = fileoff & PAGEOFFSET;
2497                         maxsize = MIN((SNF_VPMMAXPGS * PAGESIZE), total_size);
2498 
2499                         snfv = kmem_zalloc(sizeof (snf_vmap_desbinfo),
2500                             KM_SLEEP);
2501 
2502                         /*
2503                          * Get vpm mappings for maxsize with read access.
2504                          * If the pages aren't available yet, we get
2505                          * DEADLK, so wait and try again a little later using
2506                          * an increasing wait. We might be here a long time.
2507                          *
2508                          * If delay_sig returns EINTR, be sure to exit and
2509                          * pass it up to the caller.
2510                          */
2511                         deadlk_wait = 0;
2512                         while ((error = vpm_map_pages(fvp, fileoff,
2513                             (size_t)maxsize, (VPM_FETCHPAGE), snfv->snfv_vml,
2514                             SNF_MAXVMAPS, NULL, S_READ)) == EDEADLK) {
2515                                 deadlk_wait += (deadlk_wait < 5) ? 1 : 4;
2516                                 if ((error = delay_sig(deadlk_wait)) != 0) {
2517                                         break;
2518                                 }
2519                         }
2520                         if (error != 0) {
2521                                 kmem_free(snfv, sizeof (snf_vmap_desbinfo));
2522                                 error = (error == EINTR) ? EINTR : EIO;
2523                                 goto out;
2524                         }
2525                         snfv->snfv_frtn.free_func = snf_vmap_desbfree;
2526                         snfv->snfv_frtn.free_arg = (caddr_t)snfv;
2527 
2528                         /* Construct the mblk chain from the page mappings */
2529                         chain_size = 0;
2530                         for (i = 0; (snfv->snfv_vml[i].vs_addr != NULL) &&
2531                             total_size > 0; i++) {
2532                                 ASSERT(chain_size < maxsize);
2533                                 mblk_size = MIN(snfv->snfv_vml[i].vs_len -
2534                                     mapoff, total_size);
2535                                 nmp = esballoca(
2536                                     (uchar_t *)snfv->snfv_vml[i].vs_addr +
2537                                     mapoff, mblk_size, BPRI_HI,
2538                                     &snfv->snfv_frtn);
2539 
2540                                 /*
2541                                  * We return EAGAIN after unmapping the pages
2542                                  * if we cannot allocate the the head of the
2543                                  * chain. Otherwise, we continue sending the
2544                                  * mblks constructed so far.
2545                                  */
2546                                 if (nmp == NULL) {
2547                                         if (i == 0) {
2548                                                 vpm_unmap_pages(snfv->snfv_vml,
2549                                                     S_READ);
2550                                                 kmem_free(snfv,
2551                                                     sizeof (snf_vmap_desbinfo));
2552                                                 error = EAGAIN;
2553                                                 goto out;
2554                                         }
2555                                         break;
2556                                 }
2557                                 /* Mark this dblk with the zero-copy flag */
2558                                 nmp->b_datap->db_struioflag |= STRUIO_ZC;
2559                                 nmp->b_wptr += mblk_size;
2560                                 chain_size += mblk_size;
2561                                 fileoff += mblk_size;
2562                                 total_size -= mblk_size;
2563                                 snfv->snfv_ref++;
2564                                 mapoff = 0;
2565                                 if (i > 0)
2566                                         linkb(mp, nmp);
2567                                 else
2568                                         mp = nmp;
2569                         }
2570                         VN_HOLD(fvp);
2571                         snfv->snfv_vp = fvp;
2572                 } else {
2573                         /* vpm not supported. fallback to segmap */
2574                         snf_smap_desbinfo *snfi;
2575 
2576                         mapoff = fileoff & MAXBOFFSET;
2577                         chain_size = MAXBSIZE - mapoff;
2578                         if (chain_size > total_size)
2579                                 chain_size = total_size;
2580                         /*
2581                          * we don't forcefault because we'll call
2582                          * segmap_fault(F_SOFTLOCK) next.
2583                          *
2584                          * S_READ will get the ref bit set (by either
2585                          * segmap_getmapflt() or segmap_fault()) and page
2586                          * shared locked.
2587                          */
2588                         base = segmap_getmapflt(segkmap, fvp, fileoff,
2589                             chain_size, segmap_kpm ? SM_FAULT : 0, S_READ);
2590 
2591                         snfi = kmem_alloc(sizeof (*snfi), KM_SLEEP);
2592                         snfi->snfi_len = (size_t)roundup(mapoff+chain_size,
2593                             PAGESIZE)- (mapoff & PAGEMASK);
2594                         /*
2595                          * We must call segmap_fault() even for segmap_kpm
2596                          * because that's how error gets returned.
2597                          * (segmap_getmapflt() never fails but segmap_fault()
2598                          * does.)
2599                          *
2600                          * If the pages aren't available yet, we get
2601                          * DEADLK, so wait and try again a little later using
2602                          * an increasing wait. We might be here a long time.
2603                          *
2604                          * If delay_sig returns EINTR, be sure to exit and
2605                          * pass it up to the caller.
2606                          */
2607                         deadlk_wait = 0;
2608                         while ((error = FC_ERRNO(segmap_fault(kas.a_hat,
2609                             segkmap, (caddr_t)(uintptr_t)(((uintptr_t)base +
2610                             mapoff) & PAGEMASK), snfi->snfi_len, F_SOFTLOCK,
2611                             S_READ))) == EDEADLK) {
2612                                 deadlk_wait += (deadlk_wait < 5) ? 1 : 4;
2613                                 if ((error = delay_sig(deadlk_wait)) != 0) {
2614                                         break;
2615                                 }
2616                         }
2617                         if (error != 0) {
2618                                 (void) segmap_release(segkmap, base, 0);
2619                                 kmem_free(snfi, sizeof (*snfi));
2620                                 error = (error == EINTR) ? EINTR : EIO;
2621                                 goto out;
2622                         }
2623                         snfi->snfi_frtn.free_func = snf_smap_desbfree;
2624                         snfi->snfi_frtn.free_arg = (caddr_t)snfi;
2625                         snfi->snfi_base = base;
2626                         snfi->snfi_mapoff = mapoff;
2627                         mp = esballoca((uchar_t *)base + mapoff, chain_size,
2628                             BPRI_HI, &snfi->snfi_frtn);
2629 
2630                         if (mp == NULL) {
2631                                 (void) segmap_fault(kas.a_hat, segkmap,
2632                                     (caddr_t)(uintptr_t)(((uintptr_t)base +
2633                                     mapoff) & PAGEMASK), snfi->snfi_len,
2634                                     F_SOFTUNLOCK, S_OTHER);
2635                                 (void) segmap_release(segkmap, base, 0);
2636                                 kmem_free(snfi, sizeof (*snfi));
2637                                 freemsg(mp);
2638                                 error = EAGAIN;
2639                                 goto out;
2640                         }
2641                         VN_HOLD(fvp);
2642                         snfi->snfi_vp = fvp;
2643                         mp->b_wptr += chain_size;
2644 
2645                         /* Mark this dblk with the zero-copy flag */
2646                         mp->b_datap->db_struioflag |= STRUIO_ZC;
2647                         fileoff += chain_size;
2648                         total_size -= chain_size;
2649                 }
2650 
2651                 if (total_size == 0 && !nowait) {
2652                         ASSERT(!dowait);
2653                         dowait = B_TRUE;
2654                         mp->b_datap->db_struioflag |= STRUIO_ZCNOTIFY;
2655                 }
2656                 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2657                 error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
2658                 if (error != 0) {
2659                         /*
2660                          * mp contains the mblks that were not sent by
2661                          * socket_sendmblk. Use its size to update *count
2662                          */
2663                         *count = ksize + (chain_size - msgdsize(mp));
2664                         if (mp != NULL)
2665                                 freemsg(mp);
2666                         return (error);
2667                 }
2668                 ksize += chain_size;
2669                 if (total_size == 0)
2670                         goto done;
2671 
2672                 (void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2673                 va.va_mask = AT_SIZE;
2674                 error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
2675                 if (error)
2676                         break;
2677                 /* Read as much as possible. */
2678                 if (fileoff >= va.va_size)
2679                         break;
2680                 if (total_size + fileoff > va.va_size)
2681                         total_size = va.va_size - fileoff;
2682         }
2683 out:
2684         VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2685 done:
2686         *count = ksize;
2687         if (dowait) {
2688                 stdata_t *stp;
2689 
2690                 stp = vp->v_stream;
2691                 if (stp == NULL) {
2692                         struct sonode *so;
2693                         so = VTOSO(vp);
2694                         error = so_zcopy_wait(so);
2695                 } else {
2696                         mutex_enter(&stp->sd_lock);
2697                         while (!(stp->sd_flag & STZCNOTIFY)) {
2698                                 if (cv_wait_sig(&stp->sd_zcopy_wait,
2699                                     &stp->sd_lock) == 0) {
2700                                         error = EINTR;
2701                                         break;
2702                                 }
2703                         }
2704                         stp->sd_flag &= ~STZCNOTIFY;
2705                         mutex_exit(&stp->sd_lock);
2706                 }
2707         }
2708         return (error);
2709 }
2710 
2711 int
2712 snf_cache(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size,
2713     uint_t maxpsz, ssize_t *count)
2714 {
2715         struct vnode *vp;
2716         mblk_t *mp;
2717         int iosize;
2718         int extra = 0;
2719         int error;
2720         short fflag;
2721         int ksize;
2722         int ioflag;
2723         struct uio auio;
2724         struct iovec aiov;
2725         struct vattr va;
2726         int maxblk = 0;
2727         int wroff = 0;
2728         struct sonode *so;
2729         struct nmsghdr msg;
2730 
2731         vp = fp->f_vnode;
2732         if (vp->v_type == VSOCK) {
2733                 stdata_t *stp;
2734 
2735                 /*
2736                  * Get the extra space to insert a header and a trailer.
2737                  */
2738                 so = VTOSO(vp);
2739                 stp = vp->v_stream;
2740                 if (stp == NULL) {
2741                         wroff = so->so_proto_props.sopp_wroff;
2742                         maxblk = so->so_proto_props.sopp_maxblk;
2743                         extra = wroff + so->so_proto_props.sopp_tail;
2744                 } else {
2745                         wroff = (int)(stp->sd_wroff);
2746                         maxblk = (int)(stp->sd_maxblk);
2747                         extra = wroff + (int)(stp->sd_tail);
2748                 }
2749         }
2750         bzero(&msg, sizeof (msg));
2751         fflag = fp->f_flag;
2752         ksize = 0;
2753         auio.uio_iov = &aiov;
2754         auio.uio_iovcnt = 1;
2755         auio.uio_segflg = UIO_SYSSPACE;
2756         auio.uio_llimit = MAXOFFSET_T;
2757         auio.uio_fmode = fflag;
2758         auio.uio_extflg = UIO_COPY_CACHED;
2759         ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
2760         /* If read sync is not asked for, filter sync flags */
2761         if ((ioflag & FRSYNC) == 0)
2762                 ioflag &= ~(FSYNC|FDSYNC);
2763         for (;;) {
2764                 if (ISSIG(curthread, JUSTLOOKING)) {
2765                         error = EINTR;
2766                         break;
2767                 }
2768                 iosize = (int)MIN(maxpsz, size);
2769 
2770                 /*
2771                  * Socket filters can limit the mblk size,
2772                  * so limit reads to maxblk if there are
2773                  * filters present.
2774                  */
2775                 if (vp->v_type == VSOCK &&
2776                     so->so_filter_active > 0 && maxblk != INFPSZ)
2777                         iosize = (int)MIN(iosize, maxblk);
2778 
2779                 if (is_system_labeled()) {
2780                         mp = allocb_cred(iosize + extra, CRED(),
2781                             curproc->p_pid);
2782                 } else {
2783                         mp = allocb(iosize + extra, BPRI_MED);
2784                 }
2785                 if (mp == NULL) {
2786                         error = EAGAIN;
2787                         break;
2788                 }
2789 
2790                 mp->b_rptr += wroff;
2791 
2792                 aiov.iov_base = (caddr_t)mp->b_rptr;
2793                 aiov.iov_len = iosize;
2794                 auio.uio_loffset = fileoff;
2795                 auio.uio_resid = iosize;
2796 
2797                 error = VOP_READ(fvp, &auio, ioflag, fp->f_cred, NULL);
2798                 iosize -= auio.uio_resid;
2799 
2800                 if (error == EINTR && iosize != 0)
2801                         error = 0;
2802 
2803                 if (error != 0 || iosize == 0) {
2804                         freeb(mp);
2805                         break;
2806                 }
2807                 mp->b_wptr = mp->b_rptr + iosize;
2808 
2809                 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2810 
2811                 error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
2812 
2813                 if (error != 0) {
2814                         *count = ksize;
2815                         if (mp != NULL)
2816                                 freeb(mp);
2817                         return (error);
2818                 }
2819                 ksize += iosize;
2820                 size -= iosize;
2821                 if (size == 0)
2822                         goto done;
2823 
2824                 fileoff += iosize;
2825                 (void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2826                 va.va_mask = AT_SIZE;
2827                 error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
2828                 if (error)
2829                         break;
2830                 /* Read as much as possible. */
2831                 if (fileoff >= va.va_size)
2832                         size = 0;
2833                 else if (size + fileoff > va.va_size)
2834                         size = va.va_size - fileoff;
2835         }
2836         VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2837 done:
2838         *count = ksize;
2839         return (error);
2840 }
2841 
2842 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
2843 /*
2844  * Largefile support for 32 bit applications only.
2845  */
2846 int
2847 sosendfile64(file_t *fp, file_t *rfp, const struct ksendfilevec64 *sfv,
2848     ssize32_t *count32)
2849 {
2850         ssize32_t sfv_len;
2851         u_offset_t sfv_off, va_size;
2852         struct vnode *vp, *fvp, *realvp;
2853         struct vattr va;
2854         stdata_t *stp;
2855         ssize_t count = 0;
2856         int error = 0;
2857         boolean_t dozcopy = B_FALSE;
2858         uint_t maxpsz;
2859 
2860         sfv_len = (ssize32_t)sfv->sfv_len;
2861         if (sfv_len < 0) {
2862                 error = EINVAL;
2863                 goto out;
2864         }
2865 
2866         if (sfv_len == 0) goto out;
2867 
2868         sfv_off = (u_offset_t)sfv->sfv_off;
2869 
2870         /* Same checks as in pread */
2871         if (sfv_off > MAXOFFSET_T) {
2872                 error = EINVAL;
2873                 goto out;
2874         }
2875         if (sfv_off + sfv_len > MAXOFFSET_T)
2876                 sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);
2877 
2878         /*
2879          * There are no more checks on sfv_len. So, we cast it to
2880          * u_offset_t and share the snf_direct_io/snf_cache code between
2881          * 32 bit and 64 bit.
2882          *
2883          * TODO: should do nbl_need_check() like read()?
2884          */
2885         if (sfv_len > sendfile_max_size) {
2886                 sf_stats.ss_file_not_cached++;
2887                 error = snf_direct_io(fp, rfp, sfv_off, (u_offset_t)sfv_len,
2888                     &count);
2889                 goto out;
2890         }
2891         fvp = rfp->f_vnode;
2892         if (VOP_REALVP(fvp, &realvp, NULL) == 0)
2893                 fvp = realvp;
2894         /*
2895          * Grab the lock as a reader to prevent the file size
2896          * from changing underneath.
2897          */
2898         (void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2899         va.va_mask = AT_SIZE;
2900         error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
2901         va_size = va.va_size;
2902         if ((error != 0) || (va_size == 0) || (sfv_off >= va_size)) {
2903                 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2904                 goto out;
2905         }
2906         /* Read as much as possible. */
2907         if (sfv_off + sfv_len > va_size)
2908                 sfv_len = va_size - sfv_off;
2909 
2910         vp = fp->f_vnode;
2911         stp = vp->v_stream;
2912         /*
2913          * When the NOWAIT flag is not set, we enable zero-copy only if the
2914          * transfer size is large enough. This prevents performance loss
2915          * when the caller sends the file piece by piece.
2916          */
2917         if (sfv_len >= MAXBSIZE && (sfv_len >= (va_size >> 1) ||
2918             (sfv->sfv_flag & SFV_NOWAIT) || sfv_len >= 0x1000000) &&
2919             !vn_has_flocks(fvp) && !(fvp->v_flag & VNOMAP)) {
2920                 uint_t copyflag;
2921                 copyflag = stp != NULL ? stp->sd_copyflag :
2922                     VTOSO(vp)->so_proto_props.sopp_zcopyflag;
2923                 if ((copyflag & (STZCVMSAFE|STZCVMUNSAFE)) == 0) {
2924                         int on = 1;
2925 
2926                         if (socket_setsockopt(VTOSO(vp), SOL_SOCKET,
2927                             SO_SND_COPYAVOID, &on, sizeof (on), CRED()) == 0)
2928                                 dozcopy = B_TRUE;
2929                 } else {
2930                         dozcopy = copyflag & STZCVMSAFE;
2931                 }
2932         }
2933         if (dozcopy) {
2934                 sf_stats.ss_file_segmap++;
2935                 error = snf_segmap(fp, fvp, sfv_off, (u_offset_t)sfv_len,
2936                     &count, ((sfv->sfv_flag & SFV_NOWAIT) != 0));
2937         } else {
2938                 if (vp->v_type == VSOCK && stp == NULL) {
2939                         sonode_t *so = VTOSO(vp);
2940                         maxpsz = so->so_proto_props.sopp_maxpsz;
2941                 } else if (stp != NULL) {
2942                         maxpsz = stp->sd_qn_maxpsz;
2943                 } else {
2944                         maxpsz = maxphys;
2945                 }
2946 
2947                 if (maxpsz == INFPSZ)
2948                         maxpsz = maxphys;
2949                 else
2950                         maxpsz = roundup(maxpsz, MAXBSIZE);
2951                 sf_stats.ss_file_cached++;
2952                 error = snf_cache(fp, fvp, sfv_off, (u_offset_t)sfv_len,
2953                     maxpsz, &count);
2954         }
2955 out:
2956         releasef(sfv->sfv_fd);
2957         *count32 = (ssize32_t)count;
2958         return (error);
2959 }
2960 #endif
2961 
2962 #ifdef _SYSCALL32_IMPL
2963 /*
2964  * recv32(), recvfrom32(), send32(), sendto32(): intentionally return a
2965  * ssize_t rather than ssize32_t; see the comments above read32 for details.
2966  */
2967 
2968 ssize_t
2969 recv32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags)
2970 {
2971         return (recv(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags));
2972 }
2973 
2974 ssize_t
2975 recvfrom32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags,
2976         caddr32_t name, caddr32_t namelenp)
2977 {
2978         return (recvfrom(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags,
2979             (void *)(uintptr_t)name, (void *)(uintptr_t)namelenp));
2980 }
2981 
2982 ssize_t
2983 send32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags)
2984 {
2985         return (send(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags));
2986 }
2987 
2988 ssize_t
2989 sendto32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags,
2990         caddr32_t name, socklen_t namelen)
2991 {
2992         return (sendto(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags,
2993             (void *)(uintptr_t)name, namelen));
2994 }
2995 #endif  /* _SYSCALL32_IMPL */
2996 
2997 /*
2998  * Function wrappers (mostly around the sonode switch) for
2999  * backward compatibility.
3000  */
3001 
/*
 * Compatibility wrapper: forwards to socket_accept(), passing CRED() as the
 * credential argument, and returns its result.
 */
int
soaccept(struct sonode *so, int fflag, struct sonode **nsop)
{
        int error = socket_accept(so, fflag, CRED(), nsop);

        return (error);
}
3007 
3008 int
3009 sobind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3010     int backlog, int flags)
3011 {
3012         int     error;
3013 
3014         error = socket_bind(so, name, namelen, flags, CRED());
3015         if (error == 0 && backlog != 0)
3016                 return (socket_listen(so, backlog, CRED()));
3017 
3018         return (error);
3019 }
3020 
/*
 * Compatibility wrapper: forwards to socket_listen(), passing CRED() as the
 * credential argument, and returns its result.
 */
int
solisten(struct sonode *so, int backlog)
{
        int error = socket_listen(so, backlog, CRED());

        return (error);
}
3026 
3027 int
3028 soconnect(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3029     int fflag, int flags)
3030 {
3031         return (socket_connect(so, name, namelen, fflag, flags, CRED()));
3032 }
3033 
/*
 * Compatibility wrapper: forwards to socket_recvmsg(), passing CRED() as
 * the credential argument, and returns its result.
 */
int
sorecvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
{
        int error = socket_recvmsg(so, msg, uiop, CRED());

        return (error);
}
3039 
/*
 * Compatibility wrapper: forwards to socket_sendmsg(), passing CRED() as
 * the credential argument, and returns its result.
 */
int
sosendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
{
        int error = socket_sendmsg(so, msg, uiop, CRED());

        return (error);
}
3045 
/*
 * Compatibility wrapper: forwards to socket_shutdown(), passing CRED() as
 * the credential argument, and returns its result.
 */
int
soshutdown(struct sonode *so, int how)
{
        int error = socket_shutdown(so, how, CRED());

        return (error);
}
3051 
3052 int
3053 sogetsockopt(struct sonode *so, int level, int option_name, void *optval,
3054     socklen_t *optlenp, int flags)
3055 {
3056         return (socket_getsockopt(so, level, option_name, optval, optlenp,
3057             flags, CRED()));
3058 }
3059 
3060 int
3061 sosetsockopt(struct sonode *so, int level, int option_name, const void *optval,
3062     t_uscalar_t optlen)
3063 {
3064         return (socket_setsockopt(so, level, option_name, optval, optlen,
3065             CRED()));
3066 }
3067 
3068 /*
3069  * Because this is backward compatibility interface it only needs to be
3070  * able to handle the creation of TPI sockfs sockets.
3071  */
3072 struct sonode *
3073 socreate(struct sockparams *sp, int family, int type, int protocol, int version,
3074     int *errorp)
3075 {
3076         struct sonode *so;
3077 
3078         ASSERT(sp != NULL);
3079 
3080         so = sp->sp_smod_info->smod_sock_create_func(sp, family, type, protocol,
3081             version, SOCKET_SLEEP, errorp, CRED());
3082         if (so == NULL) {
3083                 SOCKPARAMS_DEC_REF(sp);
3084         } else {
3085                 if ((*errorp = SOP_INIT(so, NULL, CRED(), SOCKET_SLEEP)) == 0) {
3086                         /* Cannot fail, only bumps so_count */
3087                         (void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, CRED(), NULL);
3088                 } else {
3089                         socket_destroy(so);
3090                         so = NULL;
3091                 }
3092         }
3093         return (so);
3094 }