1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */
  27 
  28 #include <sys/types.h>
  29 #include <sys/t_lock.h>
  30 #include <sys/param.h>
  31 #include <sys/systm.h>
  32 #include <sys/buf.h>
  33 #include <sys/conf.h>
  34 #include <sys/cred.h>
  35 #include <sys/kmem.h>
  36 #include <sys/sysmacros.h>
  37 #include <sys/vfs.h>
  38 #include <sys/vnode.h>
  39 #include <sys/debug.h>
  40 #include <sys/errno.h>
  41 #include <sys/time.h>
  42 #include <sys/file.h>
  43 #include <sys/user.h>
  44 #include <sys/stream.h>
  45 #include <sys/strsubr.h>
  46 #include <sys/strsun.h>
  47 #include <sys/sunddi.h>
  48 #include <sys/esunddi.h>
  49 #include <sys/flock.h>
  50 #include <sys/modctl.h>
  51 #include <sys/cmn_err.h>
  52 #include <sys/vmsystm.h>
  53 #include <sys/policy.h>
  54 
  55 #include <sys/socket.h>
  56 #include <sys/socketvar.h>
  57 
  58 #include <sys/isa_defs.h>
  59 #include <sys/inttypes.h>
  60 #include <sys/systm.h>
  61 #include <sys/cpuvar.h>
  62 #include <sys/filio.h>
  63 #include <sys/sendfile.h>
  64 #include <sys/ddi.h>
  65 #include <vm/seg.h>
  66 #include <vm/seg_map.h>
  67 #include <vm/seg_kpm.h>
  68 
  69 #include <fs/sockfs/nl7c.h>
  70 #include <fs/sockfs/sockcommon.h>
  71 #include <fs/sockfs/sockfilter_impl.h>
  72 #include <fs/sockfs/socktpi.h>
  73 
/*
 * do_useracc gates the useracc() pre-checks performed in this file.
 * It is a writable variable only when built with SOCK_TEST; otherwise
 * it is the compile-time constant 1.
 */
#ifdef SOCK_TEST
int do_useracc = 1;             /* Controlled by setting SO_DEBUG to 4 */
#else
#define do_useracc      1
#endif /* SOCK_TEST */

/* Consulted by copyout_name() before it logs a truncated address copyout. */
extern int      xnet_truncate_print;

extern void     nl7c_init(void);
extern int      sockfs_defer_nl7c_init;

/*
 * Note: There is no formal definition of IOV_MAX, so a local limit is
 *       defined here in the same way DEF_IOV_MAX is defined and used
 *       in "fs/vncalls.c".
 * NOTE(review): this comment originally named DEF_IOV_MAX, but the macro
 *       defined below is MSG_MAXIOVLEN - confirm which name is intended.
 */
#define MSG_MAXIOVLEN   16
  90 
  91 /*
  92  * Kernel component of socket creation.
  93  *
  94  * The socket library determines which version number to use.
  95  * First the library calls this with a NULL devpath. If this fails
  96  * to find a transport (using solookup) the library will look in /etc/netconfig
  97  * for the appropriate transport. If one is found it will pass in the
  98  * devpath for the kernel to use.
  99  */
 100 int
 101 so_socket(int family, int type_w_flags, int protocol, char *devpath,
 102     int version)
 103 {
 104         struct sonode *so;
 105         vnode_t *vp;
 106         struct file *fp;
 107         int fd;
 108         int error;
 109         int type;
 110 
 111         type = type_w_flags & SOCK_TYPE_MASK;
 112         if (devpath != NULL) {
 113                 char *buf;
 114                 size_t kdevpathlen = 0;
 115 
 116                 buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 117                 if ((error = copyinstr(devpath, buf,
 118                     MAXPATHLEN, &kdevpathlen)) != 0) {
 119                         kmem_free(buf, MAXPATHLEN);
 120                         return (set_errno(error));
 121                 }
 122                 so = socket_create(family, type, protocol, buf, NULL,
 123                     SOCKET_SLEEP, version, CRED(), &error);
 124                 kmem_free(buf, MAXPATHLEN);
 125         } else {
 126                 so = socket_create(family, type, protocol, NULL, NULL,
 127                     SOCKET_SLEEP, version, CRED(), &error);
 128         }
 129         if (so == NULL)
 130                 return (set_errno(error));
 131 
 132         /* Allocate a file descriptor for the socket */
 133         vp = SOTOV(so);
 134         if (error = falloc(vp, FWRITE|FREAD, &fp, &fd)) {
 135                 (void) socket_close(so, 0, CRED());
 136                 socket_destroy(so);
 137                 return (set_errno(error));
 138         }
 139 
 140         /*
 141          * Now fill in the entries that falloc reserved
 142          */
 143         mutex_exit(&fp->f_tlock);
 144         setf(fd, fp);
 145         if ((type_w_flags & SOCK_CLOEXEC) != 0) {
 146                 f_setfd(fd, FD_CLOEXEC);
 147         }
 148 
 149         return (fd);
 150 }
 151 
 152 /*
 153  * Map from a file descriptor to a socket node.
 154  * Returns with the file descriptor held i.e. the caller has to
 155  * use releasef when done with the file descriptor.
 156  */
 157 struct sonode *
 158 getsonode(int sock, int *errorp, file_t **fpp)
 159 {
 160         file_t *fp;
 161         vnode_t *vp;
 162         struct sonode *so;
 163 
 164         if ((fp = getf(sock)) == NULL) {
 165                 *errorp = EBADF;
 166                 eprintline(*errorp);
 167                 return (NULL);
 168         }
 169         vp = fp->f_vnode;
 170         /* Check if it is a socket */
 171         if (vp->v_type != VSOCK) {
 172                 releasef(sock);
 173                 *errorp = ENOTSOCK;
 174                 eprintline(*errorp);
 175                 return (NULL);
 176         }
 177         /*
 178          * Use the stream head to find the real socket vnode.
 179          * This is needed when namefs sits above sockfs.
 180          */
 181         if (vp->v_stream) {
 182                 ASSERT(vp->v_stream->sd_vnode);
 183                 vp = vp->v_stream->sd_vnode;
 184 
 185                 so = VTOSO(vp);
 186                 if (so->so_version == SOV_STREAM) {
 187                         releasef(sock);
 188                         *errorp = ENOTSOCK;
 189                         eprintsoline(so, *errorp);
 190                         return (NULL);
 191                 }
 192         } else {
 193                 so = VTOSO(vp);
 194         }
 195         if (fpp)
 196                 *fpp = fp;
 197         return (so);
 198 }
 199 
 200 /*
 201  * Allocate and copyin a sockaddr.
 202  * Ensures NULL termination for AF_UNIX addresses by extending them
 203  * with one NULL byte if need be. Verifies that the length is not
 204  * excessive to prevent an application from consuming all of kernel
 205  * memory. Returns NULL when an error occurred.
 206  */
 207 static struct sockaddr *
 208 copyin_name(struct sonode *so, struct sockaddr *name, socklen_t *namelenp,
 209             int *errorp)
 210 {
 211         char    *faddr;
 212         size_t  namelen = (size_t)*namelenp;
 213 
 214         ASSERT(namelen != 0);
 215         if (namelen > SO_MAXARGSIZE) {
 216                 *errorp = EINVAL;
 217                 eprintsoline(so, *errorp);
 218                 return (NULL);
 219         }
 220 
 221         faddr = (char *)kmem_alloc(namelen, KM_SLEEP);
 222         if (copyin(name, faddr, namelen)) {
 223                 kmem_free(faddr, namelen);
 224                 *errorp = EFAULT;
 225                 eprintsoline(so, *errorp);
 226                 return (NULL);
 227         }
 228 
 229         /*
 230          * Add space for NULL termination if needed.
 231          * Do a quick check if the last byte is NUL.
 232          */
 233         if (so->so_family == AF_UNIX && faddr[namelen - 1] != '\0') {
 234                 /* Check if there is any NULL termination */
 235                 size_t  i;
 236                 int foundnull = 0;
 237 
 238                 for (i = sizeof (name->sa_family); i < namelen; i++) {
 239                         if (faddr[i] == '\0') {
 240                                 foundnull = 1;
 241                                 break;
 242                         }
 243                 }
 244                 if (!foundnull) {
 245                         /* Add extra byte for NUL padding */
 246                         char *nfaddr;
 247 
 248                         nfaddr = (char *)kmem_alloc(namelen + 1, KM_SLEEP);
 249                         bcopy(faddr, nfaddr, namelen);
 250                         kmem_free(faddr, namelen);
 251 
 252                         /* NUL terminate */
 253                         nfaddr[namelen] = '\0';
 254                         namelen++;
 255                         ASSERT((socklen_t)namelen == namelen);
 256                         *namelenp = (socklen_t)namelen;
 257                         faddr = nfaddr;
 258                 }
 259         }
 260         return ((struct sockaddr *)faddr);
 261 }
 262 
 263 /*
 264  * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
 265  */
 266 static int
 267 copyout_arg(void *uaddr, socklen_t ulen, void *ulenp,
 268                 void *kaddr, socklen_t klen)
 269 {
 270         if (uaddr != NULL) {
 271                 if (ulen > klen)
 272                         ulen = klen;
 273 
 274                 if (ulen != 0) {
 275                         if (copyout(kaddr, uaddr, ulen))
 276                                 return (EFAULT);
 277                 }
 278         } else
 279                 ulen = 0;
 280 
 281         if (ulenp != NULL) {
 282                 if (copyout(&ulen, ulenp, sizeof (ulen)))
 283                         return (EFAULT);
 284         }
 285         return (0);
 286 }
 287 
 288 /*
 289  * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
 290  * If klen is greater than ulen it still uses the non-truncated
 291  * klen to update ulenp.
 292  */
 293 static int
 294 copyout_name(void *uaddr, socklen_t ulen, void *ulenp,
 295                 void *kaddr, socklen_t klen)
 296 {
 297         if (uaddr != NULL) {
 298                 if (ulen >= klen)
 299                         ulen = klen;
 300                 else if (ulen != 0 && xnet_truncate_print) {
 301                         printf("sockfs: truncating copyout of address using "
 302                             "XNET semantics for pid = %d. Lengths %d, %d\n",
 303                             curproc->p_pid, klen, ulen);
 304                 }
 305 
 306                 if (ulen != 0) {
 307                         if (copyout(kaddr, uaddr, ulen))
 308                                 return (EFAULT);
 309                 } else
 310                         klen = 0;
 311         } else
 312                 klen = 0;
 313 
 314         if (ulenp != NULL) {
 315                 if (copyout(&klen, ulenp, sizeof (klen)))
 316                         return (EFAULT);
 317         }
 318         return (0);
 319 }
 320 
 321 /*
 322  * The socketpair() code in libsocket creates two sockets (using
 323  * the /etc/netconfig fallback if needed) before calling this routine
 324  * to connect the two sockets together.
 325  *
 326  * For a SOCK_STREAM socketpair a listener is needed - in that case this
 327  * routine will create a new file descriptor as part of accepting the
 328  * connection. The library socketpair() will check if svs[2] has changed
 329  * in which case it will close the changed fd.
 330  *
 331  * Note that this code could use the TPI feature of accepting the connection
 332  * on the listening endpoint. However, that would require significant changes
 333  * to soaccept.
 334  */
 335 int
 336 so_socketpair(int sv[2])
 337 {
 338         int svs[2];
 339         struct sonode *so1, *so2;
 340         int error;
 341         int orig_flags;
 342         struct sockaddr_ux *name;
 343         size_t namelen;
 344         sotpi_info_t *sti1;
 345         sotpi_info_t *sti2;
 346 
 347         dprint(1, ("so_socketpair(%p)\n", (void *)sv));
 348 
 349         error = useracc(sv, sizeof (svs), B_WRITE);
 350         if (error && do_useracc)
 351                 return (set_errno(EFAULT));
 352 
 353         if (copyin(sv, svs, sizeof (svs)))
 354                 return (set_errno(EFAULT));
 355 
 356         if ((so1 = getsonode(svs[0], &error, NULL)) == NULL)
 357                 return (set_errno(error));
 358 
 359         if ((so2 = getsonode(svs[1], &error, NULL)) == NULL) {
 360                 releasef(svs[0]);
 361                 return (set_errno(error));
 362         }
 363 
 364         if (so1->so_family != AF_UNIX || so2->so_family != AF_UNIX) {
 365                 error = EOPNOTSUPP;
 366                 goto done;
 367         }
 368 
 369         sti1 = SOTOTPI(so1);
 370         sti2 = SOTOTPI(so2);
 371 
 372         /*
 373          * The code below makes assumptions about the "sockfs" implementation.
 374          * So make sure that the correct implementation is really used.
 375          */
 376         ASSERT(so1->so_ops == &sotpi_sonodeops);
 377         ASSERT(so2->so_ops == &sotpi_sonodeops);
 378 
 379         if (so1->so_type == SOCK_DGRAM) {
 380                 /*
 381                  * Bind both sockets and connect them with each other.
 382                  * Need to allocate name/namelen for soconnect.
 383                  */
 384                 error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC, CRED());
 385                 if (error) {
 386                         eprintsoline(so1, error);
 387                         goto done;
 388                 }
 389                 error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
 390                 if (error) {
 391                         eprintsoline(so2, error);
 392                         goto done;
 393                 }
 394                 namelen = sizeof (struct sockaddr_ux);
 395                 name = kmem_alloc(namelen, KM_SLEEP);
 396                 name->sou_family = AF_UNIX;
 397                 name->sou_addr = sti2->sti_ux_laddr;
 398                 error = socket_connect(so1,
 399                     (struct sockaddr *)name,
 400                     (socklen_t)namelen,
 401                     0, _SOCONNECT_NOXLATE, CRED());
 402                 if (error) {
 403                         kmem_free(name, namelen);
 404                         eprintsoline(so1, error);
 405                         goto done;
 406                 }
 407                 name->sou_addr = sti1->sti_ux_laddr;
 408                 error = socket_connect(so2,
 409                     (struct sockaddr *)name,
 410                     (socklen_t)namelen,
 411                     0, _SOCONNECT_NOXLATE, CRED());
 412                 kmem_free(name, namelen);
 413                 if (error) {
 414                         eprintsoline(so2, error);
 415                         goto done;
 416                 }
 417                 releasef(svs[0]);
 418                 releasef(svs[1]);
 419         } else {
 420                 /*
 421                  * Bind both sockets, with so1 being a listener.
 422                  * Connect so2 to so1 - nonblocking to avoid waiting for
 423                  * soaccept to complete.
 424                  * Accept a connection on so1. Pass out the new fd as sv[0].
 425                  * The library will detect the changed fd and close
 426                  * the original one.
 427                  */
 428                 struct sonode *nso;
 429                 struct vnode *nvp;
 430                 struct file *nfp;
 431                 int nfd;
 432 
 433                 /*
 434                  * We could simply call socket_listen() here (which would do the
 435                  * binding automatically) if the code didn't rely on passing
 436                  * _SOBIND_NOXLATE to the TPI implementation of socket_bind().
 437                  */
 438                 error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC|
 439                     _SOBIND_NOXLATE|_SOBIND_LISTEN|_SOBIND_SOCKETPAIR,
 440                     CRED());
 441                 if (error) {
 442                         eprintsoline(so1, error);
 443                         goto done;
 444                 }
 445                 error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
 446                 if (error) {
 447                         eprintsoline(so2, error);
 448                         goto done;
 449                 }
 450 
 451                 namelen = sizeof (struct sockaddr_ux);
 452                 name = kmem_alloc(namelen, KM_SLEEP);
 453                 name->sou_family = AF_UNIX;
 454                 name->sou_addr = sti1->sti_ux_laddr;
 455                 error = socket_connect(so2,
 456                     (struct sockaddr *)name,
 457                     (socklen_t)namelen,
 458                     FNONBLOCK, _SOCONNECT_NOXLATE, CRED());
 459                 kmem_free(name, namelen);
 460                 if (error) {
 461                         if (error != EINPROGRESS) {
 462                                 eprintsoline(so2, error); goto done;
 463                         }
 464                 }
 465 
 466                 error = socket_accept(so1, 0, CRED(), &nso);
 467                 if (error) {
 468                         eprintsoline(so1, error);
 469                         goto done;
 470                 }
 471 
 472                 /* wait for so2 being SS_CONNECTED ignoring signals */
 473                 mutex_enter(&so2->so_lock);
 474                 error = sowaitconnected(so2, 0, 1);
 475                 mutex_exit(&so2->so_lock);
 476                 if (error != 0) {
 477                         (void) socket_close(nso, 0, CRED());
 478                         socket_destroy(nso);
 479                         eprintsoline(so2, error);
 480                         goto done;
 481                 }
 482 
 483                 nvp = SOTOV(nso);
 484                 if (error = falloc(nvp, FWRITE|FREAD, &nfp, &nfd)) {
 485                         (void) socket_close(nso, 0, CRED());
 486                         socket_destroy(nso);
 487                         eprintsoline(nso, error);
 488                         goto done;
 489                 }
 490                 /*
 491                  * fill in the entries that falloc reserved
 492                  */
 493                 mutex_exit(&nfp->f_tlock);
 494                 setf(nfd, nfp);
 495 
 496                 releasef(svs[0]);
 497                 releasef(svs[1]);
 498 
 499                 /*
 500                  * If FD_CLOEXEC was set on the filedescriptor we're
 501                  * swapping out, we should set it on the new one too.
 502                  */
 503                 VERIFY(f_getfd_error(svs[0], &orig_flags) == 0);
 504                 if (orig_flags & FD_CLOEXEC) {
 505                         f_setfd(nfd, FD_CLOEXEC);
 506                 }
 507 
 508                 /*
 509                  * The socketpair library routine will close the original
 510                  * svs[0] when this code passes out a different file
 511                  * descriptor.
 512                  */
 513                 svs[0] = nfd;
 514 
 515                 if (copyout(svs, sv, sizeof (svs))) {
 516                         (void) closeandsetf(nfd, NULL);
 517                         eprintline(EFAULT);
 518                         return (set_errno(EFAULT));
 519                 }
 520         }
 521         return (0);
 522 
 523 done:
 524         releasef(svs[0]);
 525         releasef(svs[1]);
 526         return (set_errno(error));
 527 }
 528 
 529 int
 530 bind(int sock, struct sockaddr *name, socklen_t namelen, int version)
 531 {
 532         struct sonode *so;
 533         int error;
 534 
 535         dprint(1, ("bind(%d, %p, %d)\n",
 536             sock, (void *)name, namelen));
 537 
 538         if ((so = getsonode(sock, &error, NULL)) == NULL)
 539                 return (set_errno(error));
 540 
 541         /* Allocate and copyin name */
 542         /*
 543          * X/Open test does not expect EFAULT with NULL name and non-zero
 544          * namelen.
 545          */
 546         if (name != NULL && namelen != 0) {
 547                 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
 548                 name = copyin_name(so, name, &namelen, &error);
 549                 if (name == NULL) {
 550                         releasef(sock);
 551                         return (set_errno(error));
 552                 }
 553         } else {
 554                 name = NULL;
 555                 namelen = 0;
 556         }
 557 
 558         switch (version) {
 559         default:
 560                 error = socket_bind(so, name, namelen, 0, CRED());
 561                 break;
 562         case SOV_XPG4_2:
 563                 error = socket_bind(so, name, namelen, _SOBIND_XPG4_2, CRED());
 564                 break;
 565         case SOV_SOCKBSD:
 566                 error = socket_bind(so, name, namelen, _SOBIND_SOCKBSD, CRED());
 567                 break;
 568         }
 569 done:
 570         releasef(sock);
 571         if (name != NULL)
 572                 kmem_free(name, (size_t)namelen);
 573 
 574         if (error)
 575                 return (set_errno(error));
 576         return (0);
 577 }
 578 
 579 /* ARGSUSED2 */
 580 int
 581 listen(int sock, int backlog, int version)
 582 {
 583         struct sonode *so;
 584         int error;
 585 
 586         dprint(1, ("listen(%d, %d)\n",
 587             sock, backlog));
 588 
 589         if ((so = getsonode(sock, &error, NULL)) == NULL)
 590                 return (set_errno(error));
 591 
 592         error = socket_listen(so, backlog, CRED());
 593 
 594         releasef(sock);
 595         if (error)
 596                 return (set_errno(error));
 597         return (0);
 598 }
 599 
 600 /*ARGSUSED3*/
 601 int
 602 accept(int sock, struct sockaddr *name, socklen_t *namelenp, int version,
 603     int flags)
 604 {
 605         struct sonode *so;
 606         file_t *fp;
 607         int error;
 608         socklen_t namelen;
 609         struct sonode *nso;
 610         struct vnode *nvp;
 611         struct file *nfp;
 612         int nfd;
 613         int ssflags;
 614         struct sockaddr *addrp;
 615         socklen_t addrlen;
 616 
 617         dprint(1, ("accept(%d, %p, %p)\n",
 618             sock, (void *)name, (void *)namelenp));
 619 
 620         if (flags & ~(SOCK_CLOEXEC|SOCK_NONBLOCK|SOCK_NDELAY)) {
 621                 return (set_errno(EINVAL));
 622         }
 623 
 624         /* Translate SOCK_ flags to their SS_ variant */
 625         ssflags = 0;
 626         if (flags & SOCK_NONBLOCK)
 627                 ssflags |= SS_NONBLOCK;
 628         if (flags & SOCK_NDELAY)
 629                 ssflags |= SS_NDELAY;
 630 
 631         if ((so = getsonode(sock, &error, &fp)) == NULL)
 632                 return (set_errno(error));
 633 
 634         if (name != NULL) {
 635                 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
 636                 if (copyin(namelenp, &namelen, sizeof (namelen))) {
 637                         releasef(sock);
 638                         return (set_errno(EFAULT));
 639                 }
 640                 if (namelen != 0) {
 641                         error = useracc(name, (size_t)namelen, B_WRITE);
 642                         if (error && do_useracc) {
 643                                 releasef(sock);
 644                                 return (set_errno(EFAULT));
 645                         }
 646                 } else
 647                         name = NULL;
 648         } else {
 649                 namelen = 0;
 650         }
 651 
 652         /*
 653          * Allocate the user fd before socket_accept() in order to
 654          * catch EMFILE errors before calling socket_accept().
 655          */
 656         if ((nfd = ufalloc(0)) == -1) {
 657                 eprintsoline(so, EMFILE);
 658                 releasef(sock);
 659                 return (set_errno(EMFILE));
 660         }
 661         error = socket_accept(so, fp->f_flag, CRED(), &nso);
 662         if (error) {
 663                 setf(nfd, NULL);
 664                 releasef(sock);
 665                 return (set_errno(error));
 666         }
 667 
 668         nvp = SOTOV(nso);
 669 
 670         ASSERT(MUTEX_NOT_HELD(&nso->so_lock));
 671         if (namelen != 0) {
 672                 addrlen = so->so_max_addr_len;
 673                 addrp = (struct sockaddr *)kmem_alloc(addrlen, KM_SLEEP);
 674 
 675                 if ((error = socket_getpeername(nso, (struct sockaddr *)addrp,
 676                     &addrlen, B_TRUE, CRED())) == 0) {
 677                         error = copyout_name(name, namelen, namelenp,
 678                             addrp, addrlen);
 679                 } else {
 680                         ASSERT(error == EINVAL || error == ENOTCONN);
 681                         error = ECONNABORTED;
 682                 }
 683                 kmem_free(addrp, so->so_max_addr_len);
 684         }
 685 
 686         if (error) {
 687                 setf(nfd, NULL);
 688                 (void) socket_close(nso, 0, CRED());
 689                 socket_destroy(nso);
 690                 releasef(sock);
 691                 return (set_errno(error));
 692         }
 693         if (error = falloc(NULL, FWRITE|FREAD, &nfp, NULL)) {
 694                 setf(nfd, NULL);
 695                 (void) socket_close(nso, 0, CRED());
 696                 socket_destroy(nso);
 697                 eprintsoline(so, error);
 698                 releasef(sock);
 699                 return (set_errno(error));
 700         }
 701         /*
 702          * fill in the entries that falloc reserved
 703          */
 704         nfp->f_vnode = nvp;
 705         mutex_exit(&nfp->f_tlock);
 706         setf(nfd, nfp);
 707 
 708         /*
 709          * Act on SOCK_CLOEXEC from flags
 710          */
 711         if (flags & SOCK_CLOEXEC) {
 712                 f_setfd(nfd, FD_CLOEXEC);
 713         }
 714 
 715         /*
 716          * Copy FNDELAY and FNONBLOCK from listener to acceptor
 717          * and from ssflags
 718          */
 719         if ((ssflags | so->so_state) & (SS_NDELAY|SS_NONBLOCK)) {
 720                 uint_t oflag = nfp->f_flag;
 721                 int arg = 0;
 722 
 723                 if ((ssflags | so->so_state) & SS_NONBLOCK)
 724                         arg |= FNONBLOCK;
 725                 else if ((ssflags | so->so_state) & SS_NDELAY)
 726                         arg |= FNDELAY;
 727 
 728                 /*
 729                  * This code is a simplification of the F_SETFL code in fcntl()
 730                  * Ignore any errors from VOP_SETFL.
 731                  */
 732                 if ((error = VOP_SETFL(nvp, oflag, arg, nfp->f_cred, NULL))
 733                     != 0) {
 734                         eprintsoline(so, error);
 735                         error = 0;
 736                 } else {
 737                         mutex_enter(&nfp->f_tlock);
 738                         nfp->f_flag &= ~FMASK | (FREAD|FWRITE);
 739                         nfp->f_flag |= arg;
 740                         mutex_exit(&nfp->f_tlock);
 741                 }
 742         }
 743         releasef(sock);
 744         return (nfd);
 745 }
 746 
 747 int
 748 connect(int sock, struct sockaddr *name, socklen_t namelen, int version)
 749 {
 750         struct sonode *so;
 751         file_t *fp;
 752         int error;
 753 
 754         dprint(1, ("connect(%d, %p, %d)\n",
 755             sock, (void *)name, namelen));
 756 
 757         if ((so = getsonode(sock, &error, &fp)) == NULL)
 758                 return (set_errno(error));
 759 
 760         /* Allocate and copyin name */
 761         if (namelen != 0) {
 762                 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
 763                 name = copyin_name(so, name, &namelen, &error);
 764                 if (name == NULL) {
 765                         releasef(sock);
 766                         return (set_errno(error));
 767                 }
 768         } else
 769                 name = NULL;
 770 
 771         error = socket_connect(so, name, namelen, fp->f_flag,
 772             (version != SOV_XPG4_2) ? 0 : _SOCONNECT_XPG4_2, CRED());
 773         releasef(sock);
 774         if (name)
 775                 kmem_free(name, (size_t)namelen);
 776         if (error)
 777                 return (set_errno(error));
 778         return (0);
 779 }
 780 
 781 /*ARGSUSED2*/
 782 int
 783 shutdown(int sock, int how, int version)
 784 {
 785         struct sonode *so;
 786         int error;
 787 
 788         dprint(1, ("shutdown(%d, %d)\n",
 789             sock, how));
 790 
 791         if ((so = getsonode(sock, &error, NULL)) == NULL)
 792                 return (set_errno(error));
 793 
 794         error = socket_shutdown(so, how, CRED());
 795 
 796         releasef(sock);
 797         if (error)
 798                 return (set_errno(error));
 799         return (0);
 800 }
 801 
 802 /*
 803  * Common receive routine.
 804  */
 805 static ssize_t
 806 recvit(int sock,
 807         struct nmsghdr *msg,
 808         struct uio *uiop,
 809         int flags,
 810         socklen_t *namelenp,
 811         socklen_t *controllenp,
 812         int *flagsp)
 813 {
 814         struct sonode *so;
 815         file_t *fp;
 816         void *name;
 817         socklen_t namelen;
 818         void *control;
 819         socklen_t controllen;
 820         ssize_t len;
 821         int error;
 822 
 823         if ((so = getsonode(sock, &error, &fp)) == NULL)
 824                 return (set_errno(error));
 825 
 826         len = uiop->uio_resid;
 827         uiop->uio_fmode = fp->f_flag;
 828         uiop->uio_extflg = UIO_COPY_CACHED;
 829 
 830         name = msg->msg_name;
 831         namelen = msg->msg_namelen;
 832         control = msg->msg_control;
 833         controllen = msg->msg_controllen;
 834 
 835         msg->msg_flags = flags & (MSG_OOB | MSG_PEEK | MSG_WAITALL |
 836             MSG_DONTWAIT | MSG_XPG4_2);
 837 
 838         error = socket_recvmsg(so, msg, uiop, CRED());
 839         if (error) {
 840                 releasef(sock);
 841                 return (set_errno(error));
 842         }
 843         lwp_stat_update(LWP_STAT_MSGRCV, 1);
 844         releasef(sock);
 845 
 846         error = copyout_name(name, namelen, namelenp,
 847             msg->msg_name, msg->msg_namelen);
 848         if (error)
 849                 goto err;
 850 
 851         if (flagsp != NULL) {
 852                 /*
 853                  * Clear internal flag.
 854                  */
 855                 msg->msg_flags &= ~MSG_XPG4_2;
 856 
 857                 /*
 858                  * Determine MSG_CTRUNC. sorecvmsg sets MSG_CTRUNC only
 859                  * when controllen is zero and there is control data to
 860                  * copy out.
 861                  */
 862                 if (controllen != 0 &&
 863                     (msg->msg_controllen > controllen || control == NULL)) {
 864                         dprint(1, ("recvit: CTRUNC %d %d %p\n",
 865                             msg->msg_controllen, controllen, control));
 866 
 867                         msg->msg_flags |= MSG_CTRUNC;
 868                 }
 869                 if (copyout(&msg->msg_flags, flagsp,
 870                     sizeof (msg->msg_flags))) {
 871                         error = EFAULT;
 872                         goto err;
 873                 }
 874         }
 875         /*
 876          * Note: This MUST be done last. There can be no "goto err" after this
 877          * point since it could make so_closefds run twice on some part
 878          * of the file descriptor array.
 879          */
 880         if (controllen != 0) {
 881                 if (!(flags & MSG_XPG4_2)) {
 882                         /*
 883                          * Good old msg_accrights can only return a multiple
 884                          * of 4 bytes.
 885                          */
 886                         controllen &= ~((int)sizeof (uint32_t) - 1);
 887                 }
 888                 error = copyout_arg(control, controllen, controllenp,
 889                     msg->msg_control, msg->msg_controllen);
 890                 if (error)
 891                         goto err;
 892 
 893                 if (msg->msg_controllen > controllen || control == NULL) {
 894                         if (control == NULL)
 895                                 controllen = 0;
 896                         so_closefds(msg->msg_control, msg->msg_controllen,
 897                             !(flags & MSG_XPG4_2), controllen);
 898                 }
 899         }
 900         if (msg->msg_namelen != 0)
 901                 kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
 902         if (msg->msg_controllen != 0)
 903                 kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
 904         return (len - uiop->uio_resid);
 905 
 906 err:
 907         /*
 908          * If we fail and the control part contains file descriptors
 909          * we have to close the fd's.
 910          */
 911         if (msg->msg_controllen != 0)
 912                 so_closefds(msg->msg_control, msg->msg_controllen,
 913                     !(flags & MSG_XPG4_2), 0);
 914         if (msg->msg_namelen != 0)
 915                 kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
 916         if (msg->msg_controllen != 0)
 917                 kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
 918         return (set_errno(error));
 919 }
 920 
 921 /*
 922  * Native system call
 923  */
 924 ssize_t
 925 recv(int sock, void *buffer, size_t len, int flags)
 926 {
 927         struct nmsghdr lmsg;
 928         struct uio auio;
 929         struct iovec aiov[1];
 930 
 931         dprint(1, ("recv(%d, %p, %ld, %d)\n",
 932             sock, buffer, len, flags));
 933 
 934         if ((ssize_t)len < 0) {
 935                 return (set_errno(EINVAL));
 936         }
 937 
 938         aiov[0].iov_base = buffer;
 939         aiov[0].iov_len = len;
 940         auio.uio_loffset = 0;
 941         auio.uio_iov = aiov;
 942         auio.uio_iovcnt = 1;
 943         auio.uio_resid = len;
 944         auio.uio_segflg = UIO_USERSPACE;
 945         auio.uio_limit = 0;
 946 
 947         lmsg.msg_namelen = 0;
 948         lmsg.msg_controllen = 0;
 949         lmsg.msg_flags = 0;
 950         return (recvit(sock, &lmsg, &auio, flags, NULL, NULL, NULL));
 951 }
 952 
 953 ssize_t
 954 recvfrom(int sock, void *buffer, size_t len, int flags,
 955         struct sockaddr *name, socklen_t *namelenp)
 956 {
 957         struct nmsghdr lmsg;
 958         struct uio auio;
 959         struct iovec aiov[1];
 960 
 961         dprint(1, ("recvfrom(%d, %p, %ld, %d, %p, %p)\n",
 962             sock, buffer, len, flags, (void *)name, (void *)namelenp));
 963 
 964         if ((ssize_t)len < 0) {
 965                 return (set_errno(EINVAL));
 966         }
 967 
 968         aiov[0].iov_base = buffer;
 969         aiov[0].iov_len = len;
 970         auio.uio_loffset = 0;
 971         auio.uio_iov = aiov;
 972         auio.uio_iovcnt = 1;
 973         auio.uio_resid = len;
 974         auio.uio_segflg = UIO_USERSPACE;
 975         auio.uio_limit = 0;
 976 
 977         lmsg.msg_name = (char *)name;
 978         if (namelenp != NULL) {
 979                 if (copyin(namelenp, &lmsg.msg_namelen,
 980                     sizeof (lmsg.msg_namelen)))
 981                         return (set_errno(EFAULT));
 982         } else {
 983                 lmsg.msg_namelen = 0;
 984         }
 985         lmsg.msg_controllen = 0;
 986         lmsg.msg_flags = 0;
 987 
 988         return (recvit(sock, &lmsg, &auio, flags, namelenp, NULL, NULL));
 989 }
 990 
 991 /*
 992  * Uses the MSG_XPG4_2 flag to determine if the caller is using
 993  * struct omsghdr or struct nmsghdr.
 994  */
 995 ssize_t
 996 recvmsg(int sock, struct nmsghdr *msg, int flags)
 997 {
 998         STRUCT_DECL(nmsghdr, u_lmsg);
 999         STRUCT_HANDLE(nmsghdr, umsgptr);
1000         struct nmsghdr lmsg;
1001         struct uio auio;
1002         struct iovec aiov[MSG_MAXIOVLEN];
1003         int iovcnt;
1004         ssize_t len;
1005         int i;
1006         int *flagsp;
1007         model_t model;
1008 
1009         dprint(1, ("recvmsg(%d, %p, %d)\n",
1010             sock, (void *)msg, flags));
1011 
1012         model = get_udatamodel();
1013         STRUCT_INIT(u_lmsg, model);
1014         STRUCT_SET_HANDLE(umsgptr, model, msg);
1015 
1016         if (flags & MSG_XPG4_2) {
1017                 if (copyin(msg, STRUCT_BUF(u_lmsg), STRUCT_SIZE(u_lmsg)))
1018                         return (set_errno(EFAULT));
1019                 flagsp = STRUCT_FADDR(umsgptr, msg_flags);
1020         } else {
1021                 /*
1022                  * Assumes that nmsghdr and omsghdr are identically shaped
1023                  * except for the added msg_flags field.
1024                  */
1025                 if (copyin(msg, STRUCT_BUF(u_lmsg),
1026                     SIZEOF_STRUCT(omsghdr, model)))
1027                         return (set_errno(EFAULT));
1028                 STRUCT_FSET(u_lmsg, msg_flags, 0);
1029                 flagsp = NULL;
1030         }
1031 
1032         /*
1033          * Code below us will kmem_alloc memory and hang it
1034          * off msg_control and msg_name fields. This forces
1035          * us to copy the structure to its native form.
1036          */
1037         lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name);
1038         lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen);
1039         lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov);
1040         lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen);
1041         lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control);
1042         lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen);
1043         lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags);
1044 
1045         iovcnt = lmsg.msg_iovlen;
1046 
1047         if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) {
1048                 return (set_errno(EMSGSIZE));
1049         }
1050 
1051 #ifdef _SYSCALL32_IMPL
1052         /*
1053          * 32-bit callers need to have their iovec expanded, while ensuring
1054          * that they can't move more than 2Gbytes of data in a single call.
1055          */
1056         if (model == DATAMODEL_ILP32) {
1057                 struct iovec32 aiov32[MSG_MAXIOVLEN];
1058                 ssize32_t count32;
1059 
1060                 if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32,
1061                     iovcnt * sizeof (struct iovec32)))
1062                         return (set_errno(EFAULT));
1063 
1064                 count32 = 0;
1065                 for (i = 0; i < iovcnt; i++) {
1066                         ssize32_t iovlen32;
1067 
1068                         iovlen32 = aiov32[i].iov_len;
1069                         count32 += iovlen32;
1070                         if (iovlen32 < 0 || count32 < 0)
1071                                 return (set_errno(EINVAL));
1072                         aiov[i].iov_len = iovlen32;
1073                         aiov[i].iov_base =
1074                             (caddr_t)(uintptr_t)aiov32[i].iov_base;
1075                 }
1076         } else
1077 #endif /* _SYSCALL32_IMPL */
1078         if (copyin(lmsg.msg_iov, aiov, iovcnt * sizeof (struct iovec))) {
1079                 return (set_errno(EFAULT));
1080         }
1081         len = 0;
1082         for (i = 0; i < iovcnt; i++) {
1083                 ssize_t iovlen = aiov[i].iov_len;
1084                 len += iovlen;
1085                 if (iovlen < 0 || len < 0) {
1086                         return (set_errno(EINVAL));
1087                 }
1088         }
1089         auio.uio_loffset = 0;
1090         auio.uio_iov = aiov;
1091         auio.uio_iovcnt = iovcnt;
1092         auio.uio_resid = len;
1093         auio.uio_segflg = UIO_USERSPACE;
1094         auio.uio_limit = 0;
1095 
1096         if (lmsg.msg_control != NULL &&
1097             (do_useracc == 0 ||
1098             useracc(lmsg.msg_control, lmsg.msg_controllen,
1099             B_WRITE) != 0)) {
1100                 return (set_errno(EFAULT));
1101         }
1102 
1103         return (recvit(sock, &lmsg, &auio, flags,
1104             STRUCT_FADDR(umsgptr, msg_namelen),
1105             STRUCT_FADDR(umsgptr, msg_controllen), flagsp));
1106 }
1107 
1108 /*
1109  * Common send function.
1110  */
1111 static ssize_t
1112 sendit(int sock, struct nmsghdr *msg, struct uio *uiop, int flags)
1113 {
1114         struct sonode *so;
1115         file_t *fp;
1116         void *name;
1117         socklen_t namelen;
1118         void *control;
1119         socklen_t controllen;
1120         ssize_t len;
1121         int error;
1122 
1123         if ((so = getsonode(sock, &error, &fp)) == NULL)
1124                 return (set_errno(error));
1125 
1126         uiop->uio_fmode = fp->f_flag;
1127 
1128         if (so->so_family == AF_UNIX)
1129                 uiop->uio_extflg = UIO_COPY_CACHED;
1130         else
1131                 uiop->uio_extflg = UIO_COPY_DEFAULT;
1132 
1133         /* Allocate and copyin name and control */
1134         name = msg->msg_name;
1135         namelen = msg->msg_namelen;
1136         if (name != NULL && namelen != 0) {
1137                 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1138                 name = copyin_name(so,
1139                     (struct sockaddr *)name,
1140                     &namelen, &error);
1141                 if (name == NULL)
1142                         goto done3;
1143                 /* copyin_name null terminates addresses for AF_UNIX */
1144                 msg->msg_namelen = namelen;
1145                 msg->msg_name = name;
1146         } else {
1147                 msg->msg_name = name = NULL;
1148                 msg->msg_namelen = namelen = 0;
1149         }
1150 
1151         control = msg->msg_control;
1152         controllen = msg->msg_controllen;
1153         if ((control != NULL) && (controllen != 0)) {
1154                 /*
1155                  * Verify that the length is not excessive to prevent
1156                  * an application from consuming all of kernel memory.
1157                  */
1158                 if (controllen > SO_MAXARGSIZE) {
1159                         error = EINVAL;
1160                         goto done2;
1161                 }
1162                 control = kmem_alloc(controllen, KM_SLEEP);
1163 
1164                 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1165                 if (copyin(msg->msg_control, control, controllen)) {
1166                         error = EFAULT;
1167                         goto done1;
1168                 }
1169                 msg->msg_control = control;
1170         } else {
1171                 msg->msg_control = control = NULL;
1172                 msg->msg_controllen = controllen = 0;
1173         }
1174 
1175         len = uiop->uio_resid;
1176         msg->msg_flags = flags;
1177 
1178         error = socket_sendmsg(so, msg, uiop, CRED());
1179 done1:
1180         if (control != NULL)
1181                 kmem_free(control, controllen);
1182 done2:
1183         if (name != NULL)
1184                 kmem_free(name, namelen);
1185 done3:
1186         if (error != 0) {
1187                 releasef(sock);
1188                 return (set_errno(error));
1189         }
1190         lwp_stat_update(LWP_STAT_MSGSND, 1);
1191         releasef(sock);
1192         return (len - uiop->uio_resid);
1193 }
1194 
1195 /*
1196  * Native system call
1197  */
1198 ssize_t
1199 send(int sock, void *buffer, size_t len, int flags)
1200 {
1201         struct nmsghdr lmsg;
1202         struct uio auio;
1203         struct iovec aiov[1];
1204 
1205         dprint(1, ("send(%d, %p, %ld, %d)\n",
1206             sock, buffer, len, flags));
1207 
1208         if ((ssize_t)len < 0) {
1209                 return (set_errno(EINVAL));
1210         }
1211 
1212         aiov[0].iov_base = buffer;
1213         aiov[0].iov_len = len;
1214         auio.uio_loffset = 0;
1215         auio.uio_iov = aiov;
1216         auio.uio_iovcnt = 1;
1217         auio.uio_resid = len;
1218         auio.uio_segflg = UIO_USERSPACE;
1219         auio.uio_limit = 0;
1220 
1221         lmsg.msg_name = NULL;
1222         lmsg.msg_control = NULL;
1223         if (!(flags & MSG_XPG4_2)) {
1224                 /*
1225                  * In order to be compatible with the libsocket/sockmod
1226                  * implementation we set EOR for all send* calls.
1227                  */
1228                 flags |= MSG_EOR;
1229         }
1230         return (sendit(sock, &lmsg, &auio, flags));
1231 }
1232 
1233 /*
1234  * Uses the MSG_XPG4_2 flag to determine if the caller is using
1235  * struct omsghdr or struct nmsghdr.
1236  */
1237 ssize_t
1238 sendmsg(int sock, struct nmsghdr *msg, int flags)
1239 {
1240         struct nmsghdr lmsg;
1241         STRUCT_DECL(nmsghdr, u_lmsg);
1242         struct uio auio;
1243         struct iovec aiov[MSG_MAXIOVLEN];
1244         int iovcnt;
1245         ssize_t len;
1246         int i;
1247         model_t model;
1248 
1249         dprint(1, ("sendmsg(%d, %p, %d)\n", sock, (void *)msg, flags));
1250 
1251         model = get_udatamodel();
1252         STRUCT_INIT(u_lmsg, model);
1253 
1254         if (flags & MSG_XPG4_2) {
1255                 if (copyin(msg, (char *)STRUCT_BUF(u_lmsg),
1256                     STRUCT_SIZE(u_lmsg)))
1257                         return (set_errno(EFAULT));
1258         } else {
1259                 /*
1260                  * Assumes that nmsghdr and omsghdr are identically shaped
1261                  * except for the added msg_flags field.
1262                  */
1263                 if (copyin(msg, (char *)STRUCT_BUF(u_lmsg),
1264                     SIZEOF_STRUCT(omsghdr, model)))
1265                         return (set_errno(EFAULT));
1266                 /*
1267                  * In order to be compatible with the libsocket/sockmod
1268                  * implementation we set EOR for all send* calls.
1269                  */
1270                 flags |= MSG_EOR;
1271         }
1272 
1273         /*
1274          * Code below us will kmem_alloc memory and hang it
1275          * off msg_control and msg_name fields. This forces
1276          * us to copy the structure to its native form.
1277          */
1278         lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name);
1279         lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen);
1280         lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov);
1281         lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen);
1282         lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control);
1283         lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen);
1284         lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags);
1285 
1286         iovcnt = lmsg.msg_iovlen;
1287 
1288         if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) {
1289                 /*
1290                  * Unless this is XPG 4.2 we allow iovcnt == 0 to
1291                  * be compatible with SunOS 4.X and 4.4BSD.
1292                  */
1293                 if (iovcnt != 0 || (flags & MSG_XPG4_2))
1294                         return (set_errno(EMSGSIZE));
1295         }
1296 
1297 #ifdef _SYSCALL32_IMPL
1298         /*
1299          * 32-bit callers need to have their iovec expanded, while ensuring
1300          * that they can't move more than 2Gbytes of data in a single call.
1301          */
1302         if (model == DATAMODEL_ILP32) {
1303                 struct iovec32 aiov32[MSG_MAXIOVLEN];
1304                 ssize32_t count32;
1305 
1306                 if (iovcnt != 0 &&
1307                     copyin((struct iovec32 *)lmsg.msg_iov, aiov32,
1308                     iovcnt * sizeof (struct iovec32)))
1309                         return (set_errno(EFAULT));
1310 
1311                 count32 = 0;
1312                 for (i = 0; i < iovcnt; i++) {
1313                         ssize32_t iovlen32;
1314 
1315                         iovlen32 = aiov32[i].iov_len;
1316                         count32 += iovlen32;
1317                         if (iovlen32 < 0 || count32 < 0)
1318                                 return (set_errno(EINVAL));
1319                         aiov[i].iov_len = iovlen32;
1320                         aiov[i].iov_base =
1321                             (caddr_t)(uintptr_t)aiov32[i].iov_base;
1322                 }
1323         } else
1324 #endif /* _SYSCALL32_IMPL */
1325         if (iovcnt != 0 &&
1326             copyin(lmsg.msg_iov, aiov,
1327             (unsigned)iovcnt * sizeof (struct iovec))) {
1328                 return (set_errno(EFAULT));
1329         }
1330         len = 0;
1331         for (i = 0; i < iovcnt; i++) {
1332                 ssize_t iovlen = aiov[i].iov_len;
1333                 len += iovlen;
1334                 if (iovlen < 0 || len < 0) {
1335                         return (set_errno(EINVAL));
1336                 }
1337         }
1338         auio.uio_loffset = 0;
1339         auio.uio_iov = aiov;
1340         auio.uio_iovcnt = iovcnt;
1341         auio.uio_resid = len;
1342         auio.uio_segflg = UIO_USERSPACE;
1343         auio.uio_limit = 0;
1344 
1345         return (sendit(sock, &lmsg, &auio, flags));
1346 }
1347 
1348 ssize_t
1349 sendto(int sock, void *buffer, size_t len, int flags,
1350     struct sockaddr *name, socklen_t namelen)
1351 {
1352         struct nmsghdr lmsg;
1353         struct uio auio;
1354         struct iovec aiov[1];
1355 
1356         dprint(1, ("sendto(%d, %p, %ld, %d, %p, %d)\n",
1357             sock, buffer, len, flags, (void *)name, namelen));
1358 
1359         if ((ssize_t)len < 0) {
1360                 return (set_errno(EINVAL));
1361         }
1362 
1363         aiov[0].iov_base = buffer;
1364         aiov[0].iov_len = len;
1365         auio.uio_loffset = 0;
1366         auio.uio_iov = aiov;
1367         auio.uio_iovcnt = 1;
1368         auio.uio_resid = len;
1369         auio.uio_segflg = UIO_USERSPACE;
1370         auio.uio_limit = 0;
1371 
1372         lmsg.msg_name = (char *)name;
1373         lmsg.msg_namelen = namelen;
1374         lmsg.msg_control = NULL;
1375         if (!(flags & MSG_XPG4_2)) {
1376                 /*
1377                  * In order to be compatible with the libsocket/sockmod
1378                  * implementation we set EOR for all send* calls.
1379                  */
1380                 flags |= MSG_EOR;
1381         }
1382         return (sendit(sock, &lmsg, &auio, flags));
1383 }
1384 
1385 /*ARGSUSED3*/
1386 int
1387 getpeername(int sock, struct sockaddr *name, socklen_t *namelenp, int version)
1388 {
1389         struct sonode *so;
1390         int error;
1391         socklen_t namelen;
1392         socklen_t sock_addrlen;
1393         struct sockaddr *sock_addrp;
1394 
1395         dprint(1, ("getpeername(%d, %p, %p)\n",
1396             sock, (void *)name, (void *)namelenp));
1397 
1398         if ((so = getsonode(sock, &error, NULL)) == NULL)
1399                 goto bad;
1400 
1401         ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1402         if (copyin(namelenp, &namelen, sizeof (namelen)) ||
1403             (name == NULL && namelen != 0)) {
1404                 error = EFAULT;
1405                 goto rel_out;
1406         }
1407         sock_addrlen = so->so_max_addr_len;
1408         sock_addrp = (struct sockaddr *)kmem_alloc(sock_addrlen, KM_SLEEP);
1409 
1410         if ((error = socket_getpeername(so, sock_addrp, &sock_addrlen,
1411             B_FALSE, CRED())) == 0) {
1412                 ASSERT(sock_addrlen <= so->so_max_addr_len);
1413                 error = copyout_name(name, namelen, namelenp,
1414                     (void *)sock_addrp, sock_addrlen);
1415         }
1416         kmem_free(sock_addrp, so->so_max_addr_len);
1417 rel_out:
1418         releasef(sock);
1419 bad:    return (error != 0 ? set_errno(error) : 0);
1420 }
1421 
1422 /*ARGSUSED3*/
1423 int
1424 getsockname(int sock, struct sockaddr *name,
1425                 socklen_t *namelenp, int version)
1426 {
1427         struct sonode *so;
1428         int error;
1429         socklen_t namelen, sock_addrlen;
1430         struct sockaddr *sock_addrp;
1431 
1432         dprint(1, ("getsockname(%d, %p, %p)\n",
1433             sock, (void *)name, (void *)namelenp));
1434 
1435         if ((so = getsonode(sock, &error, NULL)) == NULL)
1436                 goto bad;
1437 
1438         ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1439         if (copyin(namelenp, &namelen, sizeof (namelen)) ||
1440             (name == NULL && namelen != 0)) {
1441                 error = EFAULT;
1442                 goto rel_out;
1443         }
1444 
1445         sock_addrlen = so->so_max_addr_len;
1446         sock_addrp = (struct sockaddr *)kmem_alloc(sock_addrlen, KM_SLEEP);
1447         if ((error = socket_getsockname(so, sock_addrp, &sock_addrlen,
1448             CRED())) == 0) {
1449                 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1450                 ASSERT(sock_addrlen <= so->so_max_addr_len);
1451                 error = copyout_name(name, namelen, namelenp,
1452                     (void *)sock_addrp, sock_addrlen);
1453         }
1454         kmem_free(sock_addrp, so->so_max_addr_len);
1455 rel_out:
1456         releasef(sock);
1457 bad:    return (error != 0 ? set_errno(error) : 0);
1458 }
1459 
1460 /*ARGSUSED5*/
1461 int
1462 getsockopt(int sock,
1463         int level,
1464         int option_name,
1465         void *option_value,
1466         socklen_t *option_lenp,
1467         int version)
1468 {
1469         struct sonode *so;
1470         socklen_t optlen, optlen_res;
1471         void *optval;
1472         int error;
1473 
1474         dprint(1, ("getsockopt(%d, %d, %d, %p, %p)\n",
1475             sock, level, option_name, option_value, (void *)option_lenp));
1476 
1477         if ((so = getsonode(sock, &error, NULL)) == NULL)
1478                 return (set_errno(error));
1479 
1480         ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1481         if (copyin(option_lenp, &optlen, sizeof (optlen))) {
1482                 releasef(sock);
1483                 return (set_errno(EFAULT));
1484         }
1485         /*
1486          * Verify that the length is not excessive to prevent
1487          * an application from consuming all of kernel memory.
1488          */
1489         if (optlen > SO_MAXARGSIZE) {
1490                 error = EINVAL;
1491                 releasef(sock);
1492                 return (set_errno(error));
1493         }
1494         optval = kmem_alloc(optlen, KM_SLEEP);
1495         optlen_res = optlen;
1496         error = socket_getsockopt(so, level, option_name, optval,
1497             &optlen_res, (version != SOV_XPG4_2) ? 0 : _SOGETSOCKOPT_XPG4_2,
1498             CRED());
1499         releasef(sock);
1500         if (error) {
1501                 kmem_free(optval, optlen);
1502                 return (set_errno(error));
1503         }
1504         error = copyout_arg(option_value, optlen, option_lenp,
1505             optval, optlen_res);
1506         kmem_free(optval, optlen);
1507         if (error)
1508                 return (set_errno(error));
1509         return (0);
1510 }
1511 
1512 /*ARGSUSED5*/
1513 int
1514 setsockopt(int sock,
1515         int level,
1516         int option_name,
1517         void *option_value,
1518         socklen_t option_len,
1519         int version)
1520 {
1521         struct sonode *so;
1522         intptr_t buffer[2];
1523         void *optval = NULL;
1524         int error;
1525 
1526         dprint(1, ("setsockopt(%d, %d, %d, %p, %d)\n",
1527             sock, level, option_name, option_value, option_len));
1528 
1529         if ((so = getsonode(sock, &error, NULL)) == NULL)
1530                 return (set_errno(error));
1531 
1532         if (option_value != NULL) {
1533                 if (option_len != 0) {
1534                         /*
1535                          * Verify that the length is not excessive to prevent
1536                          * an application from consuming all of kernel memory.
1537                          */
1538                         if (option_len > SO_MAXARGSIZE) {
1539                                 error = EINVAL;
1540                                 goto done2;
1541                         }
1542                         optval = option_len <= sizeof (buffer) ?
1543                             &buffer : kmem_alloc((size_t)option_len, KM_SLEEP);
1544                         ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1545                         if (copyin(option_value, optval, (size_t)option_len)) {
1546                                 error = EFAULT;
1547                                 goto done1;
1548                         }
1549                 }
1550         } else
1551                 option_len = 0;
1552 
1553         error = socket_setsockopt(so, level, option_name, optval,
1554             (t_uscalar_t)option_len, CRED());
1555 done1:
1556         if (optval != buffer)
1557                 kmem_free(optval, (size_t)option_len);
1558 done2:
1559         releasef(sock);
1560         if (error)
1561                 return (set_errno(error));
1562         return (0);
1563 }
1564 
1565 static int
1566 sockconf_add_sock(int family, int type, int protocol, char *name)
1567 {
1568         int error = 0;
1569         char *kdevpath = NULL;
1570         char *kmodule = NULL;
1571         char *buf = NULL;
1572         size_t pathlen = 0;
1573         struct sockparams *sp;
1574 
1575         if (name == NULL)
1576                 return (EINVAL);
1577         /*
1578          * Copyin the name.
1579          * This also makes it possible to check for too long pathnames.
1580          * Compress the space needed for the name before passing it
1581          * to soconfig - soconfig will store the string until
1582          * the configuration is removed.
1583          */
1584         buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1585         if ((error = copyinstr(name, buf, MAXPATHLEN, &pathlen)) != 0) {
1586                 kmem_free(buf, MAXPATHLEN);
1587                 return (error);
1588         }
1589         if (strncmp(buf, "/dev", strlen("/dev")) == 0) {
1590                 /* For device */
1591 
1592                 /*
1593                  * Special handling for NCA:
1594                  *
1595                  * DEV_NCA is never opened even if an application
1596                  * requests for AF_NCA. The device opened is instead a
1597                  * predefined AF_INET transport (NCA_INET_DEV).
1598                  *
1599                  * Prior to Volo (PSARC/2007/587) NCA would determine
1600                  * the device using a lookup, which worked then because
1601                  * all protocols were based on TPI. Since TPI is no
1602                  * longer the default, we have to explicitly state
1603                  * which device to use.
1604                  */
1605                 if (strcmp(buf, NCA_DEV) == 0) {
1606                         /* only support entry <28, 2, 0> */
1607                         if (family != AF_NCA || type != SOCK_STREAM ||
1608                             protocol != 0) {
1609                                 kmem_free(buf, MAXPATHLEN);
1610                                 return (EINVAL);
1611                         }
1612 
1613                         pathlen = strlen(NCA_INET_DEV) + 1;
1614                         kdevpath = kmem_alloc(pathlen, KM_SLEEP);
1615                         bcopy(NCA_INET_DEV, kdevpath, pathlen);
1616                         kdevpath[pathlen - 1] = '\0';
1617                 } else {
1618                         kdevpath = kmem_alloc(pathlen, KM_SLEEP);
1619                         bcopy(buf, kdevpath, pathlen);
1620                         kdevpath[pathlen - 1] = '\0';
1621                 }
1622         } else {
1623                 /* For socket module */
1624                 kmodule = kmem_alloc(pathlen, KM_SLEEP);
1625                 bcopy(buf, kmodule, pathlen);
1626                 kmodule[pathlen - 1] = '\0';
1627                 pathlen = 0;
1628         }
1629         kmem_free(buf, MAXPATHLEN);
1630 
1631         /* sockparams_create frees mod name and devpath upon failure */
1632         sp = sockparams_create(family, type, protocol, kmodule,
1633             kdevpath, pathlen, 0, KM_SLEEP, &error);
1634         if (sp != NULL) {
1635                 error = sockparams_add(sp);
1636                 if (error != 0)
1637                         sockparams_destroy(sp);
1638         }
1639 
1640         return (error);
1641 }
1642 
/*
 * Remove the socket configuration entry for <family, type, protocol>.
 * Returns whatever sockparams_delete() reports.
 */
static int
sockconf_remove_sock(int family, int type, int protocol)
{
	int error;

	error = sockparams_delete(family, type, protocol);
	return (error);
}
1648 
1649 static int
1650 sockconfig_remove_filter(const char *uname)
1651 {
1652         char kname[SOF_MAXNAMELEN];
1653         size_t len;
1654         int error;
1655         sof_entry_t *ent;
1656 
1657         if ((error = copyinstr(uname, kname, SOF_MAXNAMELEN, &len)) != 0)
1658                 return (error);
1659 
1660         ent = sof_entry_remove_by_name(kname);
1661         if (ent == NULL)
1662                 return (ENXIO);
1663 
1664         mutex_enter(&ent->sofe_lock);
1665         ASSERT(!(ent->sofe_flags & SOFEF_CONDEMED));
1666         if (ent->sofe_refcnt == 0) {
1667                 mutex_exit(&ent->sofe_lock);
1668                 sof_entry_free(ent);
1669         } else {
1670                 /* let the last socket free the filter */
1671                 ent->sofe_flags |= SOFEF_CONDEMED;
1672                 mutex_exit(&ent->sofe_lock);
1673         }
1674 
1675         return (0);
1676 }
1677 
1678 static int
1679 sockconfig_add_filter(const char *uname, void *ufilpropp)
1680 {
1681         struct sockconfig_filter_props filprop;
1682         sof_entry_t *ent;
1683         int error;
1684         size_t tuplesz, len;
1685         char hintbuf[SOF_MAXNAMELEN];
1686 
1687         ent = kmem_zalloc(sizeof (sof_entry_t), KM_SLEEP);
1688         mutex_init(&ent->sofe_lock, NULL, MUTEX_DEFAULT, NULL);
1689 
1690         if ((error = copyinstr(uname, ent->sofe_name, SOF_MAXNAMELEN,
1691             &len)) != 0) {
1692                 sof_entry_free(ent);
1693                 return (error);
1694         }
1695 
1696         if (get_udatamodel() == DATAMODEL_NATIVE) {
1697                 if (copyin(ufilpropp, &filprop, sizeof (filprop)) != 0) {
1698                         sof_entry_free(ent);
1699                         return (EFAULT);
1700                 }
1701         }
1702 #ifdef  _SYSCALL32_IMPL
1703         else {
1704                 struct sockconfig_filter_props32 filprop32;
1705 
1706                 if (copyin(ufilpropp, &filprop32, sizeof (filprop32)) != 0) {
1707                         sof_entry_free(ent);
1708                         return (EFAULT);
1709                 }
1710                 filprop.sfp_modname = (char *)(uintptr_t)filprop32.sfp_modname;
1711                 filprop.sfp_autoattach = filprop32.sfp_autoattach;
1712                 filprop.sfp_hint = filprop32.sfp_hint;
1713                 filprop.sfp_hintarg = (char *)(uintptr_t)filprop32.sfp_hintarg;
1714                 filprop.sfp_socktuple_cnt = filprop32.sfp_socktuple_cnt;
1715                 filprop.sfp_socktuple =
1716                     (sof_socktuple_t *)(uintptr_t)filprop32.sfp_socktuple;
1717         }
1718 #endif  /* _SYSCALL32_IMPL */
1719 
1720         if ((error = copyinstr(filprop.sfp_modname, ent->sofe_modname,
1721             sizeof (ent->sofe_modname), &len)) != 0) {
1722                 sof_entry_free(ent);
1723                 return (error);
1724         }
1725 
1726         /*
1727          * A filter must specify at least one socket tuple.
1728          */
1729         if (filprop.sfp_socktuple_cnt == 0 ||
1730             filprop.sfp_socktuple_cnt > SOF_MAXSOCKTUPLECNT) {
1731                 sof_entry_free(ent);
1732                 return (EINVAL);
1733         }
1734         ent->sofe_flags = filprop.sfp_autoattach ? SOFEF_AUTO : SOFEF_PROG;
1735         ent->sofe_hint = filprop.sfp_hint;
1736 
1737         /*
1738          * Verify the hint, and copy in the hint argument, if necessary.
1739          */
1740         switch (ent->sofe_hint) {
1741         case SOF_HINT_BEFORE:
1742         case SOF_HINT_AFTER:
1743                 if ((error = copyinstr(filprop.sfp_hintarg, hintbuf,
1744                     sizeof (hintbuf), &len)) != 0) {
1745                         sof_entry_free(ent);
1746                         return (error);
1747                 }
1748                 ent->sofe_hintarg = kmem_alloc(len, KM_SLEEP);
1749                 bcopy(hintbuf, ent->sofe_hintarg, len);
1750                 /* FALLTHRU */
1751         case SOF_HINT_TOP:
1752         case SOF_HINT_BOTTOM:
1753                 /* hints cannot be used with programmatic filters */
1754                 if (ent->sofe_flags & SOFEF_PROG) {
1755                         sof_entry_free(ent);
1756                         return (EINVAL);
1757                 }
1758                 break;
1759         case SOF_HINT_NONE:
1760                 break;
1761         default:
1762                 /* bad hint value */
1763                 sof_entry_free(ent);
1764                 return (EINVAL);
1765         }
1766 
1767         ent->sofe_socktuple_cnt = filprop.sfp_socktuple_cnt;
1768         tuplesz = sizeof (sof_socktuple_t) * ent->sofe_socktuple_cnt;
1769         ent->sofe_socktuple = kmem_alloc(tuplesz, KM_SLEEP);
1770 
1771         if (get_udatamodel() == DATAMODEL_NATIVE) {
1772                 if (copyin(filprop.sfp_socktuple, ent->sofe_socktuple,
1773                     tuplesz)) {
1774                         sof_entry_free(ent);
1775                         return (EFAULT);
1776                 }
1777         }
1778 #ifdef  _SYSCALL32_IMPL
1779         else {
1780                 int i;
1781                 caddr_t data = (caddr_t)filprop.sfp_socktuple;
1782                 sof_socktuple_t *tup = ent->sofe_socktuple;
1783                 sof_socktuple32_t tup32;
1784 
1785                 tup = ent->sofe_socktuple;
1786                 for (i = 0; i < ent->sofe_socktuple_cnt; i++, tup++) {
1787                         ASSERT(tup < ent->sofe_socktuple + tuplesz);
1788 
1789                         if (copyin(data, &tup32, sizeof (tup32)) != 0) {
1790                                 sof_entry_free(ent);
1791                                 return (EFAULT);
1792                         }
1793                         tup->sofst_family = tup32.sofst_family;
1794                         tup->sofst_type = tup32.sofst_type;
1795                         tup->sofst_protocol = tup32.sofst_protocol;
1796 
1797                         data += sizeof (tup32);
1798                 }
1799         }
1800 #endif  /* _SYSCALL32_IMPL */
1801 
1802         /* Sockets can start using the filter as soon as the filter is added */
1803         if ((error = sof_entry_add(ent)) != 0)
1804                 sof_entry_free(ent);
1805 
1806         return (error);
1807 }
1808 
1809 /*
1810  * Socket configuration system call. It is used to add and remove
1811  * socket types.
1812  */
1813 int
1814 sockconfig(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
1815 {
1816         int error = 0;
1817 
1818         if (secpolicy_net_config(CRED(), B_FALSE) != 0)
1819                 return (set_errno(EPERM));
1820 
1821         if (sockfs_defer_nl7c_init) {
1822                 nl7c_init();
1823                 sockfs_defer_nl7c_init = 0;
1824         }
1825 
1826         switch (cmd) {
1827         case SOCKCONFIG_ADD_SOCK:
1828                 error = sockconf_add_sock((int)(uintptr_t)arg1,
1829                     (int)(uintptr_t)arg2, (int)(uintptr_t)arg3, arg4);
1830                 break;
1831         case SOCKCONFIG_REMOVE_SOCK:
1832                 error = sockconf_remove_sock((int)(uintptr_t)arg1,
1833                     (int)(uintptr_t)arg2, (int)(uintptr_t)arg3);
1834                 break;
1835         case SOCKCONFIG_ADD_FILTER:
1836                 error = sockconfig_add_filter((const char *)arg1, arg2);
1837                 break;
1838         case SOCKCONFIG_REMOVE_FILTER:
1839                 error = sockconfig_remove_filter((const char *)arg1);
1840                 break;
1841         default:
1842 #ifdef  DEBUG
1843                 cmn_err(CE_NOTE, "sockconfig: unkonwn subcommand %d", cmd);
1844 #endif
1845                 error = EINVAL;
1846                 break;
1847         }
1848 
1849         if (error != 0) {
1850                 eprintline(error);
1851                 return (set_errno(error));
1852         }
1853         return (0);
1854 }
1855 
1856 
1857 /*
1858  * Sendfile is implemented through two schemes, direct I/O or by
1859  * caching in the filesystem page cache. We cache the input file by
1860  * default and use direct I/O only if sendfile_max_size is set
1861  * appropriately as explained below. Note that this logic is consistent
1862  * with other filesystems where caching is turned on by default
1863  * unless explicitly turned off by using the DIRECTIO ioctl.
1864  *
1865  * We choose a slightly different scheme here. One can turn off
1866  * caching by setting sendfile_max_size to 0. One can also enable
1867  * caching of files <= sendfile_max_size by setting sendfile_max_size
1868  * to an appropriate value. By default sendfile_max_size is set to the
1869  * maximum value so that all files are cached. In future, we may provide
1870  * better interfaces for caching the file.
1871  *
1872  * Sendfile through Direct I/O (Zero copy)
1873  * --------------------------------------
1874  *
1875  * As disks are normally slower than the network, we can't have a
1876  * single thread that reads the disk and writes to the network. We
1877  * need to have parallelism. This is done by having the sendfile
1878  * thread create another thread that reads from the filesystem
1879  * and queues it for network processing. In this scheme, the data
1880  * is never copied anywhere i.e it is zero copy unlike the other
1881  * scheme.
1882  *
1883  * We have a sendfile queue (snfq) where each sendfile
1884  * request (snf_req_t) is queued for processing by a thread. Number
1885  * of threads is dynamically allocated and they exit if they are idling
1886  * beyond a specified amount of time. When each request (snf_req_t) is
1887  * processed by a thread, it produces a number of mblk_t structures to
1888  * be consumed by the sendfile thread. snf_deque and snf_enque are
1889  * used for consuming and producing mblks. Size of the filesystem
1890  * read is determined by the tunable (sendfile_read_size). A single
1891  * mblk holds sendfile_read_size worth of data (except the last
1892  * read of the file) which is sent down as a whole to the network.
1893  * sendfile_read_size is set to 1 MB as this seems to be the optimal
1894  * value for the UFS filesystem backed by a striped storage array.
1895  *
1896  * Synchronisation between read (producer) and write (consumer) threads.
1897  * --------------------------------------------------------------------
1898  *
1899  * sr_lock protects sr_mp_head and sr_mp_tail. The lock is held while
1900  * adding and deleting items in this list. An error can happen at any
1901  * time during read or write. There could be unprocessed mblks in the
1902  * sr_mp_XXX list when a read or write error occurs. Whenever an error
1903  * is encountered, we need two things to happen:
1904  *
1905  * a) One of the threads needs to clean up the mblks.
1906  * b) When one thread encounters an error, the other should stop.
1907  *
1908  * For (a), we don't want to penalize the reader thread as it could do
1909  * some useful work processing other requests. For (b), the error can
1910  * be detected by examining sr_read_error or sr_write_error.
1911  * sr_lock protects sr_read_error and sr_write_error. If both the reader
1912  * and the writer encounter an error, we report the write error back to
1913  * the application, as that is what would have happened had the operations
1914  * been done sequentially. With this in mind, the following should work:
1915  *
1916  *      - Check for errors before read or write.
1917  *      - If the reader encounters error, set the error in sr_read_error.
1918  *        Check sr_write_error, if it is set, send cv_signal as it is
1919  *        waiting for reader to complete. If it is not set, the writer
1920  *        is either running sinking data to the network or blocked
1921  *        because of flow control. For handling the latter case, we
1922  *        always send a signal. In any case, it will examine sr_read_error
1923  *        and return. sr_read_error is marked with SR_READ_DONE to tell
1924  *        the writer that the reader is done in all the cases.
1925  *      - If the writer encounters error, set the error in sr_write_error.
1926  *        The reader thread is either blocked because of flow control or
1927  *        running reading data from the disk. For the former, we need to
1928  *        wakeup the thread. Again to keep it simple, we always wake up
1929  *        the reader thread. Then, wait for the read thread to complete
1930  *        if it is not done yet. Cleanup and return.
1931  *
1932  * High and low water marks for the read thread.
1933  * --------------------------------------------
1934  *
1935  * If sendfile() is used to send data over a slow network, we need to
1936  * make sure that the read thread does not produce data at a faster
1937  * rate than the network. This can happen if the disk is faster than
1938  * the network. In such a case, we don't want to build a very large queue.
1939  * But we would still like to get all of the network throughput possible.
1940  * This implies that network should never block waiting for data.
1941  * As there are a lot of disk throughput/network throughput combinations
1942  * possible, it is difficult to come up with an accurate number.
1943  * A typical 10K RPM disk has a max seek latency of 17ms and a rotational
1944  * latency of 3ms for reading a disk block. Thus, the total latency to
1945  * initiate a new read, transfer data from the disk and queue for
1946  * transmission would take about a max of 25ms. Today's max transfer rate
1947  * for network is 100MB/sec. If the thread is blocked because of flow
1948  * control, it would take 25ms to get new data ready for transmission.
1949  * We have to make sure that network is not idling, while we are initiating
1950  * new transfers. So, at 100MB/sec, to keep network busy we would need
1951  * 2.5MB of data. Rounding off, we keep the low water mark to be 3MB of data.
1952  * We need to pick a high water mark so that the woken up thread would
1953  * do considerable work before blocking again to prevent thrashing. Currently,
1954  * we pick this to be 10 times that of the low water mark.
1955  *
1956  * Sendfile with segmap caching (One copy from page cache to mblks).
1957  * ----------------------------------------------------------------
1958  *
1959  * We use the segmap cache for caching the file, if the size of file
1960  * is <= sendfile_max_size. In this case we don't use threads as VM
1961  * is reasonably fast enough to keep up with the network. If the underlying
1962  * transport allows, we call segmap_getmapflt() to map MAXBSIZE (8K) worth
1963  * of data into segmap space, and use the virtual address from segmap
1964  * directly through desballoc() to avoid copy. Once the transport is done
1965  * with the data, the mapping will be released through segmap_release()
1966  * called by the call-back routine.
1967  *
1968  * If zero-copy is not allowed by the transport, we simply call VOP_READ()
1969  * to copy the data from the filesystem into our temporary network buffer.
1970  *
1971  * To disable caching, set sendfile_max_size to 0.
1972  */
1973 
1974 uint_t sendfile_read_size = 1024 * 1024;
1975 #define SENDFILE_REQ_LOWAT      3 * 1024 * 1024
1976 uint_t sendfile_req_lowat = SENDFILE_REQ_LOWAT;
1977 uint_t sendfile_req_hiwat = 10 * SENDFILE_REQ_LOWAT;
1978 struct sendfile_stats sf_stats;
1979 struct sendfile_queue *snfq;
1980 clock_t snfq_timeout;
1981 off64_t sendfile_max_size;
1982 
1983 static void snf_enque(snf_req_t *, mblk_t *);
1984 static mblk_t *snf_deque(snf_req_t *);
1985 
1986 void
1987 sendfile_init(void)
1988 {
1989         snfq = kmem_zalloc(sizeof (struct sendfile_queue), KM_SLEEP);
1990 
1991         mutex_init(&snfq->snfq_lock, NULL, MUTEX_DEFAULT, NULL);
1992         cv_init(&snfq->snfq_cv, NULL, CV_DEFAULT, NULL);
1993         snfq->snfq_max_threads = max_ncpus;
1994         snfq_timeout = SNFQ_TIMEOUT;
1995         /* Cache all files by default. */
1996         sendfile_max_size = MAXOFFSET_T;
1997 }
1998 
1999 /*
2000  * Queues a mblk_t for network processing.
2001  */
2002 static void
2003 snf_enque(snf_req_t *sr, mblk_t *mp)
2004 {
2005         mp->b_next = NULL;
2006         mutex_enter(&sr->sr_lock);
2007         if (sr->sr_mp_head == NULL) {
2008                 sr->sr_mp_head = sr->sr_mp_tail = mp;
2009                 cv_signal(&sr->sr_cv);
2010         } else {
2011                 sr->sr_mp_tail->b_next = mp;
2012                 sr->sr_mp_tail = mp;
2013         }
2014         sr->sr_qlen += MBLKL(mp);
2015         while ((sr->sr_qlen > sr->sr_hiwat) &&
2016             (sr->sr_write_error == 0)) {
2017                 sf_stats.ss_full_waits++;
2018                 cv_wait(&sr->sr_cv, &sr->sr_lock);
2019         }
2020         mutex_exit(&sr->sr_lock);
2021 }
2022 
2023 /*
2024  * De-queues a mblk_t for network processing.
2025  */
2026 static mblk_t *
2027 snf_deque(snf_req_t *sr)
2028 {
2029         mblk_t *mp;
2030 
2031         mutex_enter(&sr->sr_lock);
2032         /*
2033          * If we have encountered an error on read or read is
2034          * completed and no more mblks, return NULL.
2035          * We need to check for NULL sr_mp_head also as
2036          * the reads could have completed and there is
2037          * nothing more to come.
2038          */
2039         if (((sr->sr_read_error & ~SR_READ_DONE) != 0) ||
2040             ((sr->sr_read_error & SR_READ_DONE) &&
2041             sr->sr_mp_head == NULL)) {
2042                 mutex_exit(&sr->sr_lock);
2043                 return (NULL);
2044         }
2045         /*
2046          * To start with neither SR_READ_DONE is marked nor
2047          * the error is set. When we wake up from cv_wait,
2048          * following are the possibilities :
2049          *
2050          *      a) sr_read_error is zero and mblks are queued.
2051          *      b) sr_read_error is set to SR_READ_DONE
2052          *         and mblks are queued.
2053          *      c) sr_read_error is set to SR_READ_DONE
2054          *         and no mblks.
2055          *      d) sr_read_error is set to some error other
2056          *         than SR_READ_DONE.
2057          */
2058 
2059         while ((sr->sr_read_error == 0) && (sr->sr_mp_head == NULL)) {
2060                 sf_stats.ss_empty_waits++;
2061                 cv_wait(&sr->sr_cv, &sr->sr_lock);
2062         }
2063         /* Handle (a) and (b) first  - the normal case. */
2064         if (((sr->sr_read_error & ~SR_READ_DONE) == 0) &&
2065             (sr->sr_mp_head != NULL)) {
2066                 mp = sr->sr_mp_head;
2067                 sr->sr_mp_head = mp->b_next;
2068                 sr->sr_qlen -= MBLKL(mp);
2069                 if (sr->sr_qlen < sr->sr_lowat)
2070                         cv_signal(&sr->sr_cv);
2071                 mutex_exit(&sr->sr_lock);
2072                 mp->b_next = NULL;
2073                 return (mp);
2074         }
2075         /* Handle (c) and (d). */
2076         mutex_exit(&sr->sr_lock);
2077         return (NULL);
2078 }
2079 
2080 /*
2081  * Reads data from the filesystem and queues it for network processing.
2082  */
2083 void
2084 snf_async_read(snf_req_t *sr)
2085 {
2086         size_t iosize;
2087         u_offset_t fileoff;
2088         u_offset_t size;
2089         int ret_size;
2090         int error;
2091         file_t *fp;
2092         mblk_t *mp;
2093         struct vnode *vp;
2094         int extra = 0;
2095         int maxblk = 0;
2096         int wroff = 0;
2097         struct sonode *so;
2098 
2099         fp = sr->sr_fp;
2100         size = sr->sr_file_size;
2101         fileoff = sr->sr_file_off;
2102 
2103         /*
2104          * Ignore the error for filesystems that doesn't support DIRECTIO.
2105          */
2106         (void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_ON, 0,
2107             kcred, NULL, NULL);
2108 
2109         vp = sr->sr_vp;
2110         if (vp->v_type == VSOCK) {
2111                 stdata_t *stp;
2112 
2113                 /*
2114                  * Get the extra space to insert a header and a trailer.
2115                  */
2116                 so = VTOSO(vp);
2117                 stp = vp->v_stream;
2118                 if (stp == NULL) {
2119                         wroff = so->so_proto_props.sopp_wroff;
2120                         maxblk = so->so_proto_props.sopp_maxblk;
2121                         extra = wroff + so->so_proto_props.sopp_tail;
2122                 } else {
2123                         wroff = (int)(stp->sd_wroff);
2124                         maxblk = (int)(stp->sd_maxblk);
2125                         extra = wroff + (int)(stp->sd_tail);
2126                 }
2127         }
2128 
2129         while ((size != 0) && (sr->sr_write_error == 0)) {
2130 
2131                 iosize = (int)MIN(sr->sr_maxpsz, size);
2132 
2133                 /*
2134                  * Socket filters can limit the mblk size,
2135                  * so limit reads to maxblk if there are
2136                  * filters present.
2137                  */
2138                 if (vp->v_type == VSOCK &&
2139                     so->so_filter_active > 0 && maxblk != INFPSZ)
2140                         iosize = (int)MIN(iosize, maxblk);
2141 
2142                 if (is_system_labeled()) {
2143                         mp = allocb_cred(iosize + extra, CRED(),
2144                             curproc->p_pid);
2145                 } else {
2146                         mp = allocb(iosize + extra, BPRI_MED);
2147                 }
2148                 if (mp == NULL) {
2149                         error = EAGAIN;
2150                         break;
2151                 }
2152 
2153                 mp->b_rptr += wroff;
2154 
2155                 ret_size = soreadfile(fp, mp->b_rptr, fileoff, &error, iosize);
2156 
2157                 /* Error or Reached EOF ? */
2158                 if ((error != 0) || (ret_size == 0)) {
2159                         freeb(mp);
2160                         break;
2161                 }
2162                 mp->b_wptr = mp->b_rptr + ret_size;
2163 
2164                 snf_enque(sr, mp);
2165                 size -= ret_size;
2166                 fileoff += ret_size;
2167         }
2168         (void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_OFF, 0,
2169             kcred, NULL, NULL);
2170         mutex_enter(&sr->sr_lock);
2171         sr->sr_read_error = error;
2172         sr->sr_read_error |= SR_READ_DONE;
2173         cv_signal(&sr->sr_cv);
2174         mutex_exit(&sr->sr_lock);
2175 }
2176 
2177 void
2178 snf_async_thread(void)
2179 {
2180         snf_req_t *sr;
2181         callb_cpr_t cprinfo;
2182         clock_t time_left = 1;
2183 
2184         CALLB_CPR_INIT(&cprinfo, &snfq->snfq_lock, callb_generic_cpr, "snfq");
2185 
2186         mutex_enter(&snfq->snfq_lock);
2187         for (;;) {
2188                 /*
2189                  * If we didn't find a entry, then block until woken up
2190                  * again and then look through the queues again.
2191                  */
2192                 while ((sr = snfq->snfq_req_head) == NULL) {
2193                         CALLB_CPR_SAFE_BEGIN(&cprinfo);
2194                         if (time_left <= 0) {
2195                                 snfq->snfq_svc_threads--;
2196                                 CALLB_CPR_EXIT(&cprinfo);
2197                                 thread_exit();
2198                                 /* NOTREACHED */
2199                         }
2200                         snfq->snfq_idle_cnt++;
2201 
2202                         time_left = cv_reltimedwait(&snfq->snfq_cv,
2203                             &snfq->snfq_lock, snfq_timeout, TR_CLOCK_TICK);
2204                         snfq->snfq_idle_cnt--;
2205 
2206                         CALLB_CPR_SAFE_END(&cprinfo, &snfq->snfq_lock);
2207                 }
2208                 snfq->snfq_req_head = sr->sr_next;
2209                 snfq->snfq_req_cnt--;
2210                 mutex_exit(&snfq->snfq_lock);
2211                 snf_async_read(sr);
2212                 mutex_enter(&snfq->snfq_lock);
2213         }
2214 }
2215 
2216 
2217 snf_req_t *
2218 create_thread(int operation, struct vnode *vp, file_t *fp,
2219     u_offset_t fileoff, u_offset_t size)
2220 {
2221         snf_req_t *sr;
2222         stdata_t *stp;
2223 
2224         sr = (snf_req_t *)kmem_zalloc(sizeof (snf_req_t), KM_SLEEP);
2225 
2226         sr->sr_vp = vp;
2227         sr->sr_fp = fp;
2228         stp = vp->v_stream;
2229 
2230         /*
2231          * store sd_qn_maxpsz into sr_maxpsz while we have stream head.
2232          * stream might be closed before thread returns from snf_async_read.
2233          */
2234         if (stp != NULL && stp->sd_qn_maxpsz > 0) {
2235                 sr->sr_maxpsz = MIN(MAXBSIZE, stp->sd_qn_maxpsz);
2236         } else {
2237                 sr->sr_maxpsz = MAXBSIZE;
2238         }
2239 
2240         sr->sr_operation = operation;
2241         sr->sr_file_off = fileoff;
2242         sr->sr_file_size = size;
2243         sr->sr_hiwat = sendfile_req_hiwat;
2244         sr->sr_lowat = sendfile_req_lowat;
2245         mutex_init(&sr->sr_lock, NULL, MUTEX_DEFAULT, NULL);
2246         cv_init(&sr->sr_cv, NULL, CV_DEFAULT, NULL);
2247         /*
2248          * See whether we need another thread for servicing this
2249          * request. If there are already enough requests queued
2250          * for the threads, create one if not exceeding
2251          * snfq_max_threads.
2252          */
2253         mutex_enter(&snfq->snfq_lock);
2254         if (snfq->snfq_req_cnt >= snfq->snfq_idle_cnt &&
2255             snfq->snfq_svc_threads < snfq->snfq_max_threads) {
2256                 (void) thread_create(NULL, 0, &snf_async_thread, 0, 0, &p0,
2257                     TS_RUN, minclsyspri);
2258                 snfq->snfq_svc_threads++;
2259         }
2260         if (snfq->snfq_req_head == NULL) {
2261                 snfq->snfq_req_head = snfq->snfq_req_tail = sr;
2262                 cv_signal(&snfq->snfq_cv);
2263         } else {
2264                 snfq->snfq_req_tail->sr_next = sr;
2265                 snfq->snfq_req_tail = sr;
2266         }
2267         snfq->snfq_req_cnt++;
2268         mutex_exit(&snfq->snfq_lock);
2269         return (sr);
2270 }
2271 
2272 int
2273 snf_direct_io(file_t *fp, file_t *rfp, u_offset_t fileoff, u_offset_t size,
2274     ssize_t *count)
2275 {
2276         snf_req_t *sr;
2277         mblk_t *mp;
2278         int iosize;
2279         int error = 0;
2280         short fflag;
2281         struct vnode *vp;
2282         int ksize;
2283         struct nmsghdr msg;
2284 
2285         ksize = 0;
2286         *count = 0;
2287         bzero(&msg, sizeof (msg));
2288 
2289         vp = fp->f_vnode;
2290         fflag = fp->f_flag;
2291         if ((sr = create_thread(READ_OP, vp, rfp, fileoff, size)) == NULL)
2292                 return (EAGAIN);
2293 
2294         /*
2295          * We check for read error in snf_deque. It has to check
2296          * for successful READ_DONE and return NULL, and we might
2297          * as well make an additional check there.
2298          */
2299         while ((mp = snf_deque(sr)) != NULL) {
2300 
2301                 if (ISSIG(curthread, JUSTLOOKING)) {
2302                         freeb(mp);
2303                         error = EINTR;
2304                         break;
2305                 }
2306                 iosize = MBLKL(mp);
2307 
2308                 error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
2309 
2310                 if (error != 0) {
2311                         if (mp != NULL)
2312                                 freeb(mp);
2313                         break;
2314                 }
2315                 ksize += iosize;
2316         }
2317         *count = ksize;
2318 
2319         mutex_enter(&sr->sr_lock);
2320         sr->sr_write_error = error;
2321         /* Look at the big comments on why we cv_signal here. */
2322         cv_signal(&sr->sr_cv);
2323 
2324         /* Wait for the reader to complete always. */
2325         while (!(sr->sr_read_error & SR_READ_DONE)) {
2326                 cv_wait(&sr->sr_cv, &sr->sr_lock);
2327         }
2328         /* If there is no write error, check for read error. */
2329         if (error == 0)
2330                 error = (sr->sr_read_error & ~SR_READ_DONE);
2331 
2332         if (error != 0) {
2333                 mblk_t *next_mp;
2334 
2335                 mp = sr->sr_mp_head;
2336                 while (mp != NULL) {
2337                         next_mp = mp->b_next;
2338                         mp->b_next = NULL;
2339                         freeb(mp);
2340                         mp = next_mp;
2341                 }
2342         }
2343         mutex_exit(&sr->sr_lock);
2344         kmem_free(sr, sizeof (snf_req_t));
2345         return (error);
2346 }
2347 
2348 /* Maximum no.of pages allocated by vpm for sendfile at a time */
2349 #define SNF_VPMMAXPGS   (VPMMAXPGS/2)
2350 
2351 /*
2352  * Maximum no.of elements in the list returned by vpm, including
2353  * NULL for the last entry
2354  */
2355 #define SNF_MAXVMAPS    (SNF_VPMMAXPGS + 1)
2356 
2357 typedef struct {
2358         unsigned int    snfv_ref;
2359         frtn_t          snfv_frtn;
2360         vnode_t         *snfv_vp;
2361         struct vmap     snfv_vml[SNF_MAXVMAPS];
2362 } snf_vmap_desbinfo;
2363 
2364 typedef struct {
2365         frtn_t          snfi_frtn;
2366         caddr_t         snfi_base;
2367         uint_t          snfi_mapoff;
2368         size_t          snfi_len;
2369         vnode_t         *snfi_vp;
2370 } snf_smap_desbinfo;
2371 
2372 /*
2373  * The callback function used for vpm mapped mblks called when the last ref of
2374  * the mblk is dropped which normally occurs when TCP receives the ack. But it
2375  * can be the driver too due to lazy reclaim.
2376  */
2377 void
2378 snf_vmap_desbfree(snf_vmap_desbinfo *snfv)
2379 {
2380         ASSERT(snfv->snfv_ref != 0);
2381         if (atomic_add_32_nv(&snfv->snfv_ref, -1) == 0) {
2382                 vpm_unmap_pages(snfv->snfv_vml, S_READ);
2383                 VN_RELE(snfv->snfv_vp);
2384                 kmem_free(snfv, sizeof (snf_vmap_desbinfo));
2385         }
2386 }
2387 
2388 /*
2389  * The callback function used for segmap'ped mblks called when the last ref of
2390  * the mblk is dropped which normally occurs when TCP receives the ack. But it
2391  * can be the driver too due to lazy reclaim.
2392  */
2393 void
2394 snf_smap_desbfree(snf_smap_desbinfo *snfi)
2395 {
2396         if (! IS_KPM_ADDR(snfi->snfi_base)) {
2397                 /*
2398                  * We don't need to call segmap_fault(F_SOFTUNLOCK) for
2399                  * segmap_kpm as long as the latter never falls back to
2400                  * "use_segmap_range". (See segmap_getmapflt().)
2401                  *
2402                  * Using S_OTHER saves an redundant hat_setref() in
2403                  * segmap_unlock()
2404                  */
2405                 (void) segmap_fault(kas.a_hat, segkmap,
2406                     (caddr_t)(uintptr_t)(((uintptr_t)snfi->snfi_base +
2407                     snfi->snfi_mapoff) & PAGEMASK), snfi->snfi_len,
2408                     F_SOFTUNLOCK, S_OTHER);
2409         }
2410         (void) segmap_release(segkmap, snfi->snfi_base, SM_DONTNEED);
2411         VN_RELE(snfi->snfi_vp);
2412         kmem_free(snfi, sizeof (*snfi));
2413 }
2414 
2415 /*
2416  * Use segmap or vpm instead of bcopy to send down a desballoca'ed, mblk.
2417  * When segmap is used, the mblk contains a segmap slot of no more
2418  * than MAXBSIZE.
2419  *
2420  * With vpm, a maximum of SNF_MAXVMAPS page-sized mappings can be obtained
2421  * in each iteration and sent by socket_sendmblk until an error occurs or
2422  * the requested size has been transferred. An mblk is esballoca'ed from
2423  * each mapped page and a chain of these mblk is sent to the transport layer.
2424  * vpm will be called to unmap the pages when all mblks have been freed by
2425  * free_func.
2426  *
2427  * At the end of the whole sendfile() operation, we wait till the data from
2428  * the last mblk is ack'ed by the transport before returning so that the
2429  * caller of sendfile() can safely modify the file content.
2430  */
2431 int
2432 snf_segmap(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t total_size,
2433     ssize_t *count, boolean_t nowait)
2434 {
2435         caddr_t base;
2436         int mapoff;
2437         vnode_t *vp;
2438         mblk_t *mp = NULL;
2439         int chain_size;
2440         int error;
2441         clock_t deadlk_wait;
2442         short fflag;
2443         int ksize;
2444         struct vattr va;
2445         boolean_t dowait = B_FALSE;
2446         struct nmsghdr msg;
2447 
2448         vp = fp->f_vnode;
2449         fflag = fp->f_flag;
2450         ksize = 0;
2451         bzero(&msg, sizeof (msg));
2452 
2453         for (;;) {
2454                 if (ISSIG(curthread, JUSTLOOKING)) {
2455                         error = EINTR;
2456                         break;
2457                 }
2458 
2459                 if (vpm_enable) {
2460                         snf_vmap_desbinfo *snfv;
2461                         mblk_t *nmp;
2462                         int mblk_size;
2463                         int maxsize;
2464                         int i;
2465 
2466                         mapoff = fileoff & PAGEOFFSET;
2467                         maxsize = MIN((SNF_VPMMAXPGS * PAGESIZE), total_size);
2468 
2469                         snfv = kmem_zalloc(sizeof (snf_vmap_desbinfo),
2470                             KM_SLEEP);
2471 
2472                         /*
2473                          * Get vpm mappings for maxsize with read access.
2474                          * If the pages aren't available yet, we get
2475                          * DEADLK, so wait and try again a little later using
2476                          * an increasing wait. We might be here a long time.
2477                          *
2478                          * If delay_sig returns EINTR, be sure to exit and
2479                          * pass it up to the caller.
2480                          */
2481                         deadlk_wait = 0;
2482                         while ((error = vpm_map_pages(fvp, fileoff,
2483                             (size_t)maxsize, (VPM_FETCHPAGE), snfv->snfv_vml,
2484                             SNF_MAXVMAPS, NULL, S_READ)) == EDEADLK) {
2485                                 deadlk_wait += (deadlk_wait < 5) ? 1 : 4;
2486                                 if ((error = delay_sig(deadlk_wait)) != 0) {
2487                                         break;
2488                                 }
2489                         }
2490                         if (error != 0) {
2491                                 kmem_free(snfv, sizeof (snf_vmap_desbinfo));
2492                                 error = (error == EINTR) ? EINTR : EIO;
2493                                 goto out;
2494                         }
2495                         snfv->snfv_frtn.free_func = snf_vmap_desbfree;
2496                         snfv->snfv_frtn.free_arg = (caddr_t)snfv;
2497 
2498                         /* Construct the mblk chain from the page mappings */
2499                         chain_size = 0;
2500                         for (i = 0; (snfv->snfv_vml[i].vs_addr != NULL) &&
2501                             total_size > 0; i++) {
2502                                 ASSERT(chain_size < maxsize);
2503                                 mblk_size = MIN(snfv->snfv_vml[i].vs_len -
2504                                     mapoff, total_size);
2505                                 nmp = esballoca(
2506                                     (uchar_t *)snfv->snfv_vml[i].vs_addr +
2507                                     mapoff, mblk_size, BPRI_HI,
2508                                     &snfv->snfv_frtn);
2509 
2510                                 /*
2511                                  * We return EAGAIN after unmapping the pages
2512                                  * if we cannot allocate the the head of the
2513                                  * chain. Otherwise, we continue sending the
2514                                  * mblks constructed so far.
2515                                  */
2516                                 if (nmp == NULL) {
2517                                         if (i == 0) {
2518                                                 vpm_unmap_pages(snfv->snfv_vml,
2519                                                     S_READ);
2520                                                 kmem_free(snfv,
2521                                                     sizeof (snf_vmap_desbinfo));
2522                                                 error = EAGAIN;
2523                                                 goto out;
2524                                         }
2525                                         break;
2526                                 }
2527                                 /* Mark this dblk with the zero-copy flag */
2528                                 nmp->b_datap->db_struioflag |= STRUIO_ZC;
2529                                 nmp->b_wptr += mblk_size;
2530                                 chain_size += mblk_size;
2531                                 fileoff += mblk_size;
2532                                 total_size -= mblk_size;
2533                                 snfv->snfv_ref++;
2534                                 mapoff = 0;
2535                                 if (i > 0)
2536                                         linkb(mp, nmp);
2537                                 else
2538                                         mp = nmp;
2539                         }
2540                         VN_HOLD(fvp);
2541                         snfv->snfv_vp = fvp;
2542                 } else {
2543                         /* vpm not supported. fallback to segmap */
2544                         snf_smap_desbinfo *snfi;
2545 
2546                         mapoff = fileoff & MAXBOFFSET;
2547                         chain_size = MAXBSIZE - mapoff;
2548                         if (chain_size > total_size)
2549                                 chain_size = total_size;
2550                         /*
2551                          * we don't forcefault because we'll call
2552                          * segmap_fault(F_SOFTLOCK) next.
2553                          *
2554                          * S_READ will get the ref bit set (by either
2555                          * segmap_getmapflt() or segmap_fault()) and page
2556                          * shared locked.
2557                          */
2558                         base = segmap_getmapflt(segkmap, fvp, fileoff,
2559                             chain_size, segmap_kpm ? SM_FAULT : 0, S_READ);
2560 
2561                         snfi = kmem_alloc(sizeof (*snfi), KM_SLEEP);
2562                         snfi->snfi_len = (size_t)roundup(mapoff+chain_size,
2563                             PAGESIZE)- (mapoff & PAGEMASK);
2564                         /*
2565                          * We must call segmap_fault() even for segmap_kpm
2566                          * because that's how error gets returned.
2567                          * (segmap_getmapflt() never fails but segmap_fault()
2568                          * does.)
2569                          *
2570                          * If the pages aren't available yet, we get
2571                          * DEADLK, so wait and try again a little later using
2572                          * an increasing wait. We might be here a long time.
2573                          *
2574                          * If delay_sig returns EINTR, be sure to exit and
2575                          * pass it up to the caller.
2576                          */
2577                         deadlk_wait = 0;
2578                         while ((error = FC_ERRNO(segmap_fault(kas.a_hat,
2579                             segkmap, (caddr_t)(uintptr_t)(((uintptr_t)base +
2580                             mapoff) & PAGEMASK), snfi->snfi_len, F_SOFTLOCK,
2581                             S_READ))) == EDEADLK) {
2582                                 deadlk_wait += (deadlk_wait < 5) ? 1 : 4;
2583                                 if ((error = delay_sig(deadlk_wait)) != 0) {
2584                                         break;
2585                                 }
2586                         }
2587                         if (error != 0) {
2588                                 (void) segmap_release(segkmap, base, 0);
2589                                 kmem_free(snfi, sizeof (*snfi));
2590                                 error = (error == EINTR) ? EINTR : EIO;
2591                                 goto out;
2592                         }
2593                         snfi->snfi_frtn.free_func = snf_smap_desbfree;
2594                         snfi->snfi_frtn.free_arg = (caddr_t)snfi;
2595                         snfi->snfi_base = base;
2596                         snfi->snfi_mapoff = mapoff;
2597                         mp = esballoca((uchar_t *)base + mapoff, chain_size,
2598                             BPRI_HI, &snfi->snfi_frtn);
2599 
2600                         if (mp == NULL) {
2601                                 (void) segmap_fault(kas.a_hat, segkmap,
2602                                     (caddr_t)(uintptr_t)(((uintptr_t)base +
2603                                     mapoff) & PAGEMASK), snfi->snfi_len,
2604                                     F_SOFTUNLOCK, S_OTHER);
2605                                 (void) segmap_release(segkmap, base, 0);
2606                                 kmem_free(snfi, sizeof (*snfi));
2607                                 freemsg(mp);
2608                                 error = EAGAIN;
2609                                 goto out;
2610                         }
2611                         VN_HOLD(fvp);
2612                         snfi->snfi_vp = fvp;
2613                         mp->b_wptr += chain_size;
2614 
2615                         /* Mark this dblk with the zero-copy flag */
2616                         mp->b_datap->db_struioflag |= STRUIO_ZC;
2617                         fileoff += chain_size;
2618                         total_size -= chain_size;
2619                 }
2620 
2621                 if (total_size == 0 && !nowait) {
2622                         ASSERT(!dowait);
2623                         dowait = B_TRUE;
2624                         mp->b_datap->db_struioflag |= STRUIO_ZCNOTIFY;
2625                 }
2626                 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2627                 error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
2628                 if (error != 0) {
2629                         /*
2630                          * mp contains the mblks that were not sent by
2631                          * socket_sendmblk. Use its size to update *count
2632                          */
2633                         *count = ksize + (chain_size - msgdsize(mp));
2634                         if (mp != NULL)
2635                                 freemsg(mp);
2636                         return (error);
2637                 }
2638                 ksize += chain_size;
2639                 if (total_size == 0)
2640                         goto done;
2641 
2642                 (void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2643                 va.va_mask = AT_SIZE;
2644                 error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
2645                 if (error)
2646                         break;
2647                 /* Read as much as possible. */
2648                 if (fileoff >= va.va_size)
2649                         break;
2650                 if (total_size + fileoff > va.va_size)
2651                         total_size = va.va_size - fileoff;
2652         }
2653 out:
2654         VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2655 done:
2656         *count = ksize;
2657         if (dowait) {
2658                 stdata_t *stp;
2659 
2660                 stp = vp->v_stream;
2661                 if (stp == NULL) {
2662                         struct sonode *so;
2663                         so = VTOSO(vp);
2664                         error = so_zcopy_wait(so);
2665                 } else {
2666                         mutex_enter(&stp->sd_lock);
2667                         while (!(stp->sd_flag & STZCNOTIFY)) {
2668                                 if (cv_wait_sig(&stp->sd_zcopy_wait,
2669                                     &stp->sd_lock) == 0) {
2670                                         error = EINTR;
2671                                         break;
2672                                 }
2673                         }
2674                         stp->sd_flag &= ~STZCNOTIFY;
2675                         mutex_exit(&stp->sd_lock);
2676                 }
2677         }
2678         return (error);
2679 }
2680 
2681 int
2682 snf_cache(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size,
2683     uint_t maxpsz, ssize_t *count)
2684 {
2685         struct vnode *vp;
2686         mblk_t *mp;
2687         int iosize;
2688         int extra = 0;
2689         int error;
2690         short fflag;
2691         int ksize;
2692         int ioflag;
2693         struct uio auio;
2694         struct iovec aiov;
2695         struct vattr va;
2696         int maxblk = 0;
2697         int wroff = 0;
2698         struct sonode *so;
2699         struct nmsghdr msg;
2700 
2701         vp = fp->f_vnode;
2702         if (vp->v_type == VSOCK) {
2703                 stdata_t *stp;
2704 
2705                 /*
2706                  * Get the extra space to insert a header and a trailer.
2707                  */
2708                 so = VTOSO(vp);
2709                 stp = vp->v_stream;
2710                 if (stp == NULL) {
2711                         wroff = so->so_proto_props.sopp_wroff;
2712                         maxblk = so->so_proto_props.sopp_maxblk;
2713                         extra = wroff + so->so_proto_props.sopp_tail;
2714                 } else {
2715                         wroff = (int)(stp->sd_wroff);
2716                         maxblk = (int)(stp->sd_maxblk);
2717                         extra = wroff + (int)(stp->sd_tail);
2718                 }
2719         }
2720         bzero(&msg, sizeof (msg));
2721         fflag = fp->f_flag;
2722         ksize = 0;
2723         auio.uio_iov = &aiov;
2724         auio.uio_iovcnt = 1;
2725         auio.uio_segflg = UIO_SYSSPACE;
2726         auio.uio_llimit = MAXOFFSET_T;
2727         auio.uio_fmode = fflag;
2728         auio.uio_extflg = UIO_COPY_CACHED;
2729         ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
2730         /* If read sync is not asked for, filter sync flags */
2731         if ((ioflag & FRSYNC) == 0)
2732                 ioflag &= ~(FSYNC|FDSYNC);
2733         for (;;) {
2734                 if (ISSIG(curthread, JUSTLOOKING)) {
2735                         error = EINTR;
2736                         break;
2737                 }
2738                 iosize = (int)MIN(maxpsz, size);
2739 
2740                 /*
2741                  * Socket filters can limit the mblk size,
2742                  * so limit reads to maxblk if there are
2743                  * filters present.
2744                  */
2745                 if (vp->v_type == VSOCK &&
2746                     so->so_filter_active > 0 && maxblk != INFPSZ)
2747                         iosize = (int)MIN(iosize, maxblk);
2748 
2749                 if (is_system_labeled()) {
2750                         mp = allocb_cred(iosize + extra, CRED(),
2751                             curproc->p_pid);
2752                 } else {
2753                         mp = allocb(iosize + extra, BPRI_MED);
2754                 }
2755                 if (mp == NULL) {
2756                         error = EAGAIN;
2757                         break;
2758                 }
2759 
2760                 mp->b_rptr += wroff;
2761 
2762                 aiov.iov_base = (caddr_t)mp->b_rptr;
2763                 aiov.iov_len = iosize;
2764                 auio.uio_loffset = fileoff;
2765                 auio.uio_resid = iosize;
2766 
2767                 error = VOP_READ(fvp, &auio, ioflag, fp->f_cred, NULL);
2768                 iosize -= auio.uio_resid;
2769 
2770                 if (error == EINTR && iosize != 0)
2771                         error = 0;
2772 
2773                 if (error != 0 || iosize == 0) {
2774                         freeb(mp);
2775                         break;
2776                 }
2777                 mp->b_wptr = mp->b_rptr + iosize;
2778 
2779                 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2780 
2781                 error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
2782 
2783                 if (error != 0) {
2784                         *count = ksize;
2785                         if (mp != NULL)
2786                                 freeb(mp);
2787                         return (error);
2788                 }
2789                 ksize += iosize;
2790                 size -= iosize;
2791                 if (size == 0)
2792                         goto done;
2793 
2794                 fileoff += iosize;
2795                 (void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2796                 va.va_mask = AT_SIZE;
2797                 error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
2798                 if (error)
2799                         break;
2800                 /* Read as much as possible. */
2801                 if (fileoff >= va.va_size)
2802                         size = 0;
2803                 else if (size + fileoff > va.va_size)
2804                         size = va.va_size - fileoff;
2805         }
2806         VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2807 done:
2808         *count = ksize;
2809         return (error);
2810 }
2811 
2812 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
2813 /*
2814  * Largefile support for 32 bit applications only.
2815  */
2816 int
2817 sosendfile64(file_t *fp, file_t *rfp, const struct ksendfilevec64 *sfv,
2818     ssize32_t *count32)
2819 {
2820         ssize32_t sfv_len;
2821         u_offset_t sfv_off, va_size;
2822         struct vnode *vp, *fvp, *realvp;
2823         struct vattr va;
2824         stdata_t *stp;
2825         ssize_t count = 0;
2826         int error = 0;
2827         boolean_t dozcopy = B_FALSE;
2828         uint_t maxpsz;
2829 
2830         sfv_len = (ssize32_t)sfv->sfv_len;
2831         if (sfv_len < 0) {
2832                 error = EINVAL;
2833                 goto out;
2834         }
2835 
2836         if (sfv_len == 0) goto out;
2837 
2838         sfv_off = (u_offset_t)sfv->sfv_off;
2839 
2840         /* Same checks as in pread */
2841         if (sfv_off > MAXOFFSET_T) {
2842                 error = EINVAL;
2843                 goto out;
2844         }
2845         if (sfv_off + sfv_len > MAXOFFSET_T)
2846                 sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);
2847 
2848         /*
2849          * There are no more checks on sfv_len. So, we cast it to
2850          * u_offset_t and share the snf_direct_io/snf_cache code between
2851          * 32 bit and 64 bit.
2852          *
2853          * TODO: should do nbl_need_check() like read()?
2854          */
2855         if (sfv_len > sendfile_max_size) {
2856                 sf_stats.ss_file_not_cached++;
2857                 error = snf_direct_io(fp, rfp, sfv_off, (u_offset_t)sfv_len,
2858                     &count);
2859                 goto out;
2860         }
2861         fvp = rfp->f_vnode;
2862         if (VOP_REALVP(fvp, &realvp, NULL) == 0)
2863                 fvp = realvp;
2864         /*
2865          * Grab the lock as a reader to prevent the file size
2866          * from changing underneath.
2867          */
2868         (void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2869         va.va_mask = AT_SIZE;
2870         error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
2871         va_size = va.va_size;
2872         if ((error != 0) || (va_size == 0) || (sfv_off >= va_size)) {
2873                 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2874                 goto out;
2875         }
2876         /* Read as much as possible. */
2877         if (sfv_off + sfv_len > va_size)
2878                 sfv_len = va_size - sfv_off;
2879 
2880         vp = fp->f_vnode;
2881         stp = vp->v_stream;
2882         /*
2883          * When the NOWAIT flag is not set, we enable zero-copy only if the
2884          * transfer size is large enough. This prevents performance loss
2885          * when the caller sends the file piece by piece.
2886          */
2887         if (sfv_len >= MAXBSIZE && (sfv_len >= (va_size >> 1) ||
2888             (sfv->sfv_flag & SFV_NOWAIT) || sfv_len >= 0x1000000) &&
2889             !vn_has_flocks(fvp) && !(fvp->v_flag & VNOMAP)) {
2890                 uint_t copyflag;
2891                 copyflag = stp != NULL ? stp->sd_copyflag :
2892                     VTOSO(vp)->so_proto_props.sopp_zcopyflag;
2893                 if ((copyflag & (STZCVMSAFE|STZCVMUNSAFE)) == 0) {
2894                         int on = 1;
2895 
2896                         if (socket_setsockopt(VTOSO(vp), SOL_SOCKET,
2897                             SO_SND_COPYAVOID, &on, sizeof (on), CRED()) == 0)
2898                                 dozcopy = B_TRUE;
2899                 } else {
2900                         dozcopy = copyflag & STZCVMSAFE;
2901                 }
2902         }
2903         if (dozcopy) {
2904                 sf_stats.ss_file_segmap++;
2905                 error = snf_segmap(fp, fvp, sfv_off, (u_offset_t)sfv_len,
2906                     &count, ((sfv->sfv_flag & SFV_NOWAIT) != 0));
2907         } else {
2908                 if (vp->v_type == VSOCK && stp == NULL) {
2909                         sonode_t *so = VTOSO(vp);
2910                         maxpsz = so->so_proto_props.sopp_maxpsz;
2911                 } else if (stp != NULL) {
2912                         maxpsz = stp->sd_qn_maxpsz;
2913                 } else {
2914                         maxpsz = maxphys;
2915                 }
2916 
2917                 if (maxpsz == INFPSZ)
2918                         maxpsz = maxphys;
2919                 else
2920                         maxpsz = roundup(maxpsz, MAXBSIZE);
2921                 sf_stats.ss_file_cached++;
2922                 error = snf_cache(fp, fvp, sfv_off, (u_offset_t)sfv_len,
2923                     maxpsz, &count);
2924         }
2925 out:
2926         releasef(sfv->sfv_fd);
2927         *count32 = (ssize32_t)count;
2928         return (error);
2929 }
2930 #endif
2931 
2932 #ifdef _SYSCALL32_IMPL
2933 /*
2934  * recv32(), recvfrom32(), send32(), sendto32(): intentionally return a
2935  * ssize_t rather than ssize32_t; see the comments above read32 for details.
2936  */
2937 
2938 ssize_t
2939 recv32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags)
2940 {
2941         return (recv(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags));
2942 }
2943 
2944 ssize_t
2945 recvfrom32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags,
2946         caddr32_t name, caddr32_t namelenp)
2947 {
2948         return (recvfrom(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags,
2949             (void *)(uintptr_t)name, (void *)(uintptr_t)namelenp));
2950 }
2951 
2952 ssize_t
2953 send32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags)
2954 {
2955         return (send(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags));
2956 }
2957 
2958 ssize_t
2959 sendto32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags,
2960         caddr32_t name, socklen_t namelen)
2961 {
2962         return (sendto(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags,
2963             (void *)(uintptr_t)name, namelen));
2964 }
2965 #endif  /* _SYSCALL32_IMPL */
2966 
2967 /*
2968  * Function wrappers (mostly around the sonode switch) for
2969  * backward compatibility.
2970  */
2971 
/*
 * Compatibility wrapper: calls socket_accept() on `so' with CRED() and
 * returns its result; the new sonode is stored through `nsop'.
 */
int
soaccept(struct sonode *so, int fflag, struct sonode **nsop)
{
        int error;

        error = socket_accept(so, fflag, CRED(), nsop);
        return (error);
}
2977 
2978 int
2979 sobind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
2980     int backlog, int flags)
2981 {
2982         int     error;
2983 
2984         error = socket_bind(so, name, namelen, flags, CRED());
2985         if (error == 0 && backlog != 0)
2986                 return (socket_listen(so, backlog, CRED()));
2987 
2988         return (error);
2989 }
2990 
/*
 * Compatibility wrapper: calls socket_listen() on `so' with CRED() and
 * returns its result.
 */
int
solisten(struct sonode *so, int backlog)
{
        int error;

        error = socket_listen(so, backlog, CRED());
        return (error);
}
2996 
2997 int
2998 soconnect(struct sonode *so, struct sockaddr *name, socklen_t namelen,
2999     int fflag, int flags)
3000 {
3001         return (socket_connect(so, name, namelen, fflag, flags, CRED()));
3002 }
3003 
/*
 * Compatibility wrapper: calls socket_recvmsg() on `so' with CRED() and
 * returns its result.
 */
int
sorecvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
{
        int error;

        error = socket_recvmsg(so, msg, uiop, CRED());
        return (error);
}
3009 
/*
 * Compatibility wrapper: calls socket_sendmsg() on `so' with CRED() and
 * returns its result.
 */
int
sosendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
{
        int error;

        error = socket_sendmsg(so, msg, uiop, CRED());
        return (error);
}
3015 
/*
 * Compatibility wrapper: calls socket_shutdown() on `so' with CRED() and
 * returns its result.
 */
int
soshutdown(struct sonode *so, int how)
{
        int error;

        error = socket_shutdown(so, how, CRED());
        return (error);
}
3021 
3022 int
3023 sogetsockopt(struct sonode *so, int level, int option_name, void *optval,
3024     socklen_t *optlenp, int flags)
3025 {
3026         return (socket_getsockopt(so, level, option_name, optval, optlenp,
3027             flags, CRED()));
3028 }
3029 
3030 int
3031 sosetsockopt(struct sonode *so, int level, int option_name, const void *optval,
3032     t_uscalar_t optlen)
3033 {
3034         return (socket_setsockopt(so, level, option_name, optval, optlen,
3035             CRED()));
3036 }
3037 
3038 /*
3039  * Because this is backward compatibility interface it only needs to be
3040  * able to handle the creation of TPI sockfs sockets.
3041  */
3042 struct sonode *
3043 socreate(struct sockparams *sp, int family, int type, int protocol, int version,
3044     int *errorp)
3045 {
3046         struct sonode *so;
3047 
3048         ASSERT(sp != NULL);
3049 
3050         so = sp->sp_smod_info->smod_sock_create_func(sp, family, type, protocol,
3051             version, SOCKET_SLEEP, errorp, CRED());
3052         if (so == NULL) {
3053                 SOCKPARAMS_DEC_REF(sp);
3054         } else {
3055                 if ((*errorp = SOP_INIT(so, NULL, CRED(), SOCKET_SLEEP)) == 0) {
3056                         /* Cannot fail, only bumps so_count */
3057                         (void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, CRED(), NULL);
3058                 } else {
3059                         socket_destroy(so);
3060                         so = NULL;
3061                 }
3062         }
3063         return (so);
3064 }