5880 Increase IOV_MAX to at least 1024
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
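
Summary of the change: sockfs's private MSG_MAXIOVLEN cap of 16 iovec entries is deleted, and recvmsg()/sendmsg() are instead bounded by IOV_MAX from the newly included <sys/limits.h>. Because IOV_MAX is now large (the synopsis calls for at least 1024), the fixed on-stack iovec arrays become a hybrid: a small stack buffer of IOV_MAX_STACK entries with a kmem_alloc() fallback for larger counts, which must then be freed on every exit path. A condensed sketch of the pattern, assembled from the hunks below (the surrounding control flow is elided):

	struct iovec buf[IOV_MAX_STACK], *aiov = buf;
	ssize_t iovsize = 0;

	if (iovcnt <= 0 || iovcnt > IOV_MAX)
		return (set_errno(EMSGSIZE));

	/* Large iovec arrays move to the kernel heap instead of the stack. */
	if (iovcnt > IOV_MAX_STACK) {
		iovsize = iovcnt * sizeof (struct iovec);
		aiov = kmem_alloc(iovsize, KM_SLEEP);
	}

	/* ... copyin(), build the uio, call recvit()/sendit() ... */

	/* Every return path must release the heap copy. */
	if (iovsize != 0)
		kmem_free(aiov, iovsize);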
--- old/usr/src/uts/common/fs/sockfs/socksyscalls.c
+++ new/usr/src/uts/common/fs/sockfs/socksyscalls.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
24 - */
25 -
26 -/* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */
27 -/*
24 + * Copyright 2015, Joyent, Inc. All rights reserved.
25 + * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
28 26 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
29 27 */
30 28
31 29 #include <sys/types.h>
32 30 #include <sys/t_lock.h>
33 31 #include <sys/param.h>
34 32 #include <sys/systm.h>
35 33 #include <sys/buf.h>
36 34 #include <sys/conf.h>
37 35 #include <sys/cred.h>
38 36 #include <sys/kmem.h>
39 37 #include <sys/sysmacros.h>
40 38 #include <sys/vfs.h>
41 39 #include <sys/vnode.h>
42 40 #include <sys/debug.h>
43 41 #include <sys/errno.h>
44 42 #include <sys/time.h>
45 43 #include <sys/file.h>
46 44 #include <sys/user.h>
47 45 #include <sys/stream.h>
48 46 #include <sys/strsubr.h>
49 47 #include <sys/strsun.h>
50 48 #include <sys/sunddi.h>
51 49 #include <sys/esunddi.h>
52 50 #include <sys/flock.h>
53 51 #include <sys/modctl.h>
54 52 #include <sys/cmn_err.h>
55 53 #include <sys/vmsystm.h>
56 54 #include <sys/policy.h>
55 +#include <sys/limits.h>
57 56
58 57 #include <sys/socket.h>
59 58 #include <sys/socketvar.h>
60 59
61 60 #include <sys/isa_defs.h>
62 61 #include <sys/inttypes.h>
63 62 #include <sys/systm.h>
64 63 #include <sys/cpuvar.h>
65 64 #include <sys/filio.h>
66 65 #include <sys/sendfile.h>
67 66 #include <sys/ddi.h>
68 67 #include <vm/seg.h>
69 68 #include <vm/seg_map.h>
70 69 #include <vm/seg_kpm.h>
71 70
72 71 #include <fs/sockfs/nl7c.h>
73 72 #include <fs/sockfs/sockcommon.h>
74 73 #include <fs/sockfs/sockfilter_impl.h>
75 74 #include <fs/sockfs/socktpi.h>
76 75
77 76 #ifdef SOCK_TEST
78 77 int do_useracc = 1; /* Controlled by setting SO_DEBUG to 4 */
79 78 #else
80 79 #define do_useracc 1
81 80 #endif /* SOCK_TEST */
82 81
83 82 extern int xnet_truncate_print;
84 83
85 84 extern void nl7c_init(void);
86 85 extern int sockfs_defer_nl7c_init;
87 86
88 87 /*
89 - * Note: DEF_IOV_MAX is defined and used as it is in "fs/vncalls.c"
90 - * as there isn't a formal definition of IOV_MAX ???
91 - */
92 -#define MSG_MAXIOVLEN 16
93 -
94 -/*
95 88 * Kernel component of socket creation.
96 89 *
97 90 * The socket library determines which version number to use.
98 91 * First the library calls this with a NULL devpath. If this fails
99 92 * to find a transport (using solookup) the library will look in /etc/netconfig
100 93 * for the appropriate transport. If one is found it will pass in the
101 94 * devpath for the kernel to use.
102 95 */
103 96 int
104 97 so_socket(int family, int type_w_flags, int protocol, char *devpath,
105 98 int version)
106 99 {
107 100 struct sonode *so;
108 101 vnode_t *vp;
109 102 struct file *fp;
110 103 int fd;
111 104 int error;
112 105 int type;
113 106
114 107 type = type_w_flags & SOCK_TYPE_MASK;
115 108 type_w_flags &= ~SOCK_TYPE_MASK;
116 109 if (type_w_flags & ~(SOCK_CLOEXEC|SOCK_NDELAY|SOCK_NONBLOCK))
117 110 return (set_errno(EINVAL));
118 111
119 112 if (devpath != NULL) {
120 113 char *buf;
121 114 size_t kdevpathlen = 0;
122 115
123 116 buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
124 117 if ((error = copyinstr(devpath, buf,
125 118 MAXPATHLEN, &kdevpathlen)) != 0) {
126 119 kmem_free(buf, MAXPATHLEN);
127 120 return (set_errno(error));
128 121 }
129 122 so = socket_create(family, type, protocol, buf, NULL,
130 123 SOCKET_SLEEP, version, CRED(), &error);
131 124 kmem_free(buf, MAXPATHLEN);
132 125 } else {
133 126 so = socket_create(family, type, protocol, NULL, NULL,
134 127 SOCKET_SLEEP, version, CRED(), &error);
135 128 }
136 129 if (so == NULL)
137 130 return (set_errno(error));
138 131
139 132 /* Allocate a file descriptor for the socket */
140 133 vp = SOTOV(so);
141 134 if (error = falloc(vp, FWRITE|FREAD, &fp, &fd)) {
142 135 (void) socket_close(so, 0, CRED());
143 136 socket_destroy(so);
144 137 return (set_errno(error));
145 138 }
146 139
147 140 /*
148 141 * Now fill in the entries that falloc reserved
149 142 */
150 143 if (type_w_flags & SOCK_NDELAY) {
151 144 so->so_state |= SS_NDELAY;
152 145 fp->f_flag |= FNDELAY;
153 146 }
154 147 if (type_w_flags & SOCK_NONBLOCK) {
155 148 so->so_state |= SS_NONBLOCK;
156 149 fp->f_flag |= FNONBLOCK;
157 150 }
158 151 mutex_exit(&fp->f_tlock);
159 152 setf(fd, fp);
160 153 if ((type_w_flags & SOCK_CLOEXEC) != 0) {
161 154 f_setfd(fd, FD_CLOEXEC);
162 155 }
163 156
164 157 return (fd);
165 158 }
166 159
167 160 /*
168 161 * Map from a file descriptor to a socket node.
169 162 * Returns with the file descriptor held i.e. the caller has to
170 163 * use releasef when done with the file descriptor.
171 164 */
172 165 struct sonode *
173 166 getsonode(int sock, int *errorp, file_t **fpp)
174 167 {
175 168 file_t *fp;
176 169 vnode_t *vp;
177 170 struct sonode *so;
178 171
179 172 if ((fp = getf(sock)) == NULL) {
180 173 *errorp = EBADF;
181 174 eprintline(*errorp);
182 175 return (NULL);
183 176 }
184 177 vp = fp->f_vnode;
185 178 /* Check if it is a socket */
186 179 if (vp->v_type != VSOCK) {
187 180 releasef(sock);
188 181 *errorp = ENOTSOCK;
189 182 eprintline(*errorp);
190 183 return (NULL);
191 184 }
192 185 /*
193 186 * Use the stream head to find the real socket vnode.
194 187 * This is needed when namefs sits above sockfs.
195 188 */
196 189 if (vp->v_stream) {
197 190 ASSERT(vp->v_stream->sd_vnode);
198 191 vp = vp->v_stream->sd_vnode;
199 192
200 193 so = VTOSO(vp);
201 194 if (so->so_version == SOV_STREAM) {
202 195 releasef(sock);
203 196 *errorp = ENOTSOCK;
204 197 eprintsoline(so, *errorp);
205 198 return (NULL);
206 199 }
207 200 } else {
208 201 so = VTOSO(vp);
209 202 }
210 203 if (fpp)
211 204 *fpp = fp;
212 205 return (so);
213 206 }
214 207
215 208 /*
216 209 * Allocate and copyin a sockaddr.
217 210 * Ensures NULL termination for AF_UNIX addresses by extending them
218 211 * with one NULL byte if need be. Verifies that the length is not
219 212 * excessive to prevent an application from consuming all of kernel
220 213 * memory. Returns NULL when an error occurred.
221 214 */
222 215 static struct sockaddr *
223 216 copyin_name(struct sonode *so, struct sockaddr *name, socklen_t *namelenp,
224 217 int *errorp)
225 218 {
226 219 char *faddr;
227 220 size_t namelen = (size_t)*namelenp;
228 221
229 222 ASSERT(namelen != 0);
230 223 if (namelen > SO_MAXARGSIZE) {
231 224 *errorp = EINVAL;
232 225 eprintsoline(so, *errorp);
233 226 return (NULL);
234 227 }
235 228
236 229 faddr = (char *)kmem_alloc(namelen, KM_SLEEP);
237 230 if (copyin(name, faddr, namelen)) {
238 231 kmem_free(faddr, namelen);
239 232 *errorp = EFAULT;
240 233 eprintsoline(so, *errorp);
241 234 return (NULL);
242 235 }
243 236
244 237 /*
245 238 * Add space for NULL termination if needed.
246 239 * Do a quick check if the last byte is NUL.
247 240 */
248 241 if (so->so_family == AF_UNIX && faddr[namelen - 1] != '\0') {
249 242 /* Check if there is any NULL termination */
250 243 size_t i;
251 244 int foundnull = 0;
252 245
253 246 for (i = sizeof (name->sa_family); i < namelen; i++) {
254 247 if (faddr[i] == '\0') {
255 248 foundnull = 1;
256 249 break;
257 250 }
258 251 }
259 252 if (!foundnull) {
260 253 /* Add extra byte for NUL padding */
261 254 char *nfaddr;
262 255
263 256 nfaddr = (char *)kmem_alloc(namelen + 1, KM_SLEEP);
264 257 bcopy(faddr, nfaddr, namelen);
265 258 kmem_free(faddr, namelen);
266 259
267 260 /* NUL terminate */
268 261 nfaddr[namelen] = '\0';
269 262 namelen++;
270 263 ASSERT((socklen_t)namelen == namelen);
271 264 *namelenp = (socklen_t)namelen;
272 265 faddr = nfaddr;
273 266 }
274 267 }
275 268 return ((struct sockaddr *)faddr);
276 269 }
277 270
278 271 /*
279 272 * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
280 273 */
281 274 static int
282 275 copyout_arg(void *uaddr, socklen_t ulen, void *ulenp, void *kaddr,
283 276 socklen_t klen)
284 277 {
285 278 if (uaddr != NULL) {
286 279 if (ulen > klen)
287 280 ulen = klen;
288 281
289 282 if (ulen != 0) {
290 283 if (copyout(kaddr, uaddr, ulen))
291 284 return (EFAULT);
292 285 }
293 286 } else
294 287 ulen = 0;
295 288
296 289 if (ulenp != NULL) {
297 290 if (copyout(&ulen, ulenp, sizeof (ulen)))
298 291 return (EFAULT);
299 292 }
300 293 return (0);
301 294 }
302 295
303 296 /*
304 297 * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
305 298 * If klen is greater than ulen it still uses the non-truncated
306 299 * klen to update ulenp.
307 300 */
308 301 static int
309 302 copyout_name(void *uaddr, socklen_t ulen, void *ulenp, void *kaddr,
310 303 socklen_t klen)
311 304 {
312 305 if (uaddr != NULL) {
313 306 if (ulen >= klen)
314 307 ulen = klen;
315 308 else if (ulen != 0 && xnet_truncate_print) {
316 309 printf("sockfs: truncating copyout of address using "
317 310 "XNET semantics for pid = %d. Lengths %d, %d\n",
318 311 curproc->p_pid, klen, ulen);
319 312 }
320 313
321 314 if (ulen != 0) {
322 315 if (copyout(kaddr, uaddr, ulen))
323 316 return (EFAULT);
324 317 } else
325 318 klen = 0;
326 319 } else
327 320 klen = 0;
328 321
329 322 if (ulenp != NULL) {
330 323 if (copyout(&klen, ulenp, sizeof (klen)))
331 324 return (EFAULT);
332 325 }
333 326 return (0);
334 327 }
335 328
336 329 /*
337 330 * The socketpair() code in libsocket creates two sockets (using
338 331 * the /etc/netconfig fallback if needed) before calling this routine
339 332 * to connect the two sockets together.
340 333 *
341 334 * For a SOCK_STREAM socketpair a listener is needed - in that case this
342 335 * routine will create a new file descriptor as part of accepting the
343 336 * connection. The library socketpair() will check if svs[2] has changed
344 337 * in which case it will close the changed fd.
345 338 *
346 339 * Note that this code could use the TPI feature of accepting the connection
347 340 * on the listening endpoint. However, that would require significant changes
348 341 * to soaccept.
349 342 */
350 343 int
351 344 so_socketpair(int sv[2])
352 345 {
353 346 int svs[2];
354 347 struct sonode *so1, *so2;
355 348 int error;
356 349 int orig_flags;
357 350 struct sockaddr_ux *name;
358 351 size_t namelen;
359 352 sotpi_info_t *sti1;
360 353 sotpi_info_t *sti2;
361 354
362 355 dprint(1, ("so_socketpair(%p)\n", (void *)sv));
363 356
364 357 error = useracc(sv, sizeof (svs), B_WRITE);
365 358 if (error && do_useracc)
366 359 return (set_errno(EFAULT));
367 360
368 361 if (copyin(sv, svs, sizeof (svs)))
369 362 return (set_errno(EFAULT));
370 363
371 364 if ((so1 = getsonode(svs[0], &error, NULL)) == NULL)
372 365 return (set_errno(error));
373 366
374 367 if ((so2 = getsonode(svs[1], &error, NULL)) == NULL) {
375 368 releasef(svs[0]);
376 369 return (set_errno(error));
377 370 }
378 371
379 372 if (so1->so_family != AF_UNIX || so2->so_family != AF_UNIX) {
380 373 error = EOPNOTSUPP;
381 374 goto done;
382 375 }
383 376
384 377 sti1 = SOTOTPI(so1);
385 378 sti2 = SOTOTPI(so2);
386 379
387 380 /*
388 381 * The code below makes assumptions about the "sockfs" implementation.
389 382 * So make sure that the correct implementation is really used.
390 383 */
391 384 ASSERT(so1->so_ops == &sotpi_sonodeops);
392 385 ASSERT(so2->so_ops == &sotpi_sonodeops);
393 386
394 387 if (so1->so_type == SOCK_DGRAM) {
395 388 /*
396 389 * Bind both sockets and connect them with each other.
397 390 * Need to allocate name/namelen for soconnect.
398 391 */
399 392 error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC, CRED());
400 393 if (error) {
401 394 eprintsoline(so1, error);
402 395 goto done;
403 396 }
404 397 error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
405 398 if (error) {
406 399 eprintsoline(so2, error);
407 400 goto done;
408 401 }
409 402 namelen = sizeof (struct sockaddr_ux);
410 403 name = kmem_alloc(namelen, KM_SLEEP);
411 404 name->sou_family = AF_UNIX;
412 405 name->sou_addr = sti2->sti_ux_laddr;
413 406 error = socket_connect(so1,
414 407 (struct sockaddr *)name,
415 408 (socklen_t)namelen,
416 409 0, _SOCONNECT_NOXLATE, CRED());
417 410 if (error) {
418 411 kmem_free(name, namelen);
419 412 eprintsoline(so1, error);
420 413 goto done;
421 414 }
422 415 name->sou_addr = sti1->sti_ux_laddr;
423 416 error = socket_connect(so2,
424 417 (struct sockaddr *)name,
425 418 (socklen_t)namelen,
426 419 0, _SOCONNECT_NOXLATE, CRED());
427 420 kmem_free(name, namelen);
428 421 if (error) {
429 422 eprintsoline(so2, error);
430 423 goto done;
431 424 }
432 425 releasef(svs[0]);
433 426 releasef(svs[1]);
434 427 } else {
435 428 /*
436 429 * Bind both sockets, with so1 being a listener.
437 430 * Connect so2 to so1 - nonblocking to avoid waiting for
438 431 * soaccept to complete.
439 432 * Accept a connection on so1. Pass out the new fd as sv[0].
440 433 * The library will detect the changed fd and close
441 434 * the original one.
442 435 */
443 436 struct sonode *nso;
444 437 struct vnode *nvp;
445 438 struct file *nfp;
446 439 int nfd;
447 440
448 441 /*
449 442 * We could simply call socket_listen() here (which would do the
450 443 * binding automatically) if the code didn't rely on passing
451 444 * _SOBIND_NOXLATE to the TPI implementation of socket_bind().
452 445 */
453 446 error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC|
454 447 _SOBIND_NOXLATE|_SOBIND_LISTEN|_SOBIND_SOCKETPAIR,
455 448 CRED());
456 449 if (error) {
457 450 eprintsoline(so1, error);
458 451 goto done;
459 452 }
460 453 error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
461 454 if (error) {
462 455 eprintsoline(so2, error);
463 456 goto done;
464 457 }
465 458
466 459 namelen = sizeof (struct sockaddr_ux);
467 460 name = kmem_alloc(namelen, KM_SLEEP);
468 461 name->sou_family = AF_UNIX;
469 462 name->sou_addr = sti1->sti_ux_laddr;
470 463 error = socket_connect(so2,
471 464 (struct sockaddr *)name,
472 465 (socklen_t)namelen,
473 466 FNONBLOCK, _SOCONNECT_NOXLATE, CRED());
474 467 kmem_free(name, namelen);
475 468 if (error) {
476 469 if (error != EINPROGRESS) {
477 470 eprintsoline(so2, error); goto done;
478 471 }
479 472 }
480 473
481 474 error = socket_accept(so1, 0, CRED(), &nso);
482 475 if (error) {
483 476 eprintsoline(so1, error);
484 477 goto done;
485 478 }
486 479
487 480 /* wait for so2 being SS_CONNECTED ignoring signals */
488 481 mutex_enter(&so2->so_lock);
489 482 error = sowaitconnected(so2, 0, 1);
490 483 mutex_exit(&so2->so_lock);
491 484 if (error != 0) {
492 485 (void) socket_close(nso, 0, CRED());
493 486 socket_destroy(nso);
494 487 eprintsoline(so2, error);
495 488 goto done;
496 489 }
497 490
498 491 nvp = SOTOV(nso);
499 492 if (error = falloc(nvp, FWRITE|FREAD, &nfp, &nfd)) {
500 493 (void) socket_close(nso, 0, CRED());
501 494 socket_destroy(nso);
502 495 eprintsoline(nso, error);
503 496 goto done;
504 497 }
505 498 /*
506 499 * copy over FNONBLOCK and FNDELAY flags should they exist
507 500 */
508 501 if (so1->so_state & SS_NONBLOCK)
509 502 nfp->f_flag |= FNONBLOCK;
510 503 if (so1->so_state & SS_NDELAY)
511 504 nfp->f_flag |= FNDELAY;
512 505
513 506 /*
514 507 * fill in the entries that falloc reserved
515 508 */
516 509 mutex_exit(&nfp->f_tlock);
517 510 setf(nfd, nfp);
518 511
519 512 /*
520 513 * get the original flags before we release
521 514 */
522 515 VERIFY(f_getfd_error(svs[0], &orig_flags) == 0);
523 516
524 517 releasef(svs[0]);
525 518 releasef(svs[1]);
526 519
527 520 /*
528 521 * If FD_CLOEXEC was set on the filedescriptor we're
529 522 * swapping out, we should set it on the new one too.
530 523 */
531 524 if (orig_flags & FD_CLOEXEC) {
532 525 f_setfd(nfd, FD_CLOEXEC);
533 526 }
534 527
535 528 /*
536 529 * The socketpair library routine will close the original
537 530 * svs[0] when this code passes out a different file
538 531 * descriptor.
539 532 */
540 533 svs[0] = nfd;
541 534
542 535 if (copyout(svs, sv, sizeof (svs))) {
543 536 (void) closeandsetf(nfd, NULL);
544 537 eprintline(EFAULT);
545 538 return (set_errno(EFAULT));
546 539 }
547 540 }
548 541 return (0);
549 542
550 543 done:
551 544 releasef(svs[0]);
552 545 releasef(svs[1]);
553 546 return (set_errno(error));
554 547 }
555 548
556 549 int
557 550 bind(int sock, struct sockaddr *name, socklen_t namelen, int version)
558 551 {
559 552 struct sonode *so;
560 553 int error;
561 554
562 555 dprint(1, ("bind(%d, %p, %d)\n",
563 556 sock, (void *)name, namelen));
564 557
565 558 if ((so = getsonode(sock, &error, NULL)) == NULL)
566 559 return (set_errno(error));
567 560
568 561 /* Allocate and copyin name */
569 562 /*
570 563 * X/Open test does not expect EFAULT with NULL name and non-zero
571 564 * namelen.
572 565 */
573 566 if (name != NULL && namelen != 0) {
574 567 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
575 568 name = copyin_name(so, name, &namelen, &error);
576 569 if (name == NULL) {
577 570 releasef(sock);
578 571 return (set_errno(error));
579 572 }
580 573 } else {
581 574 name = NULL;
582 575 namelen = 0;
583 576 }
584 577
585 578 switch (version) {
586 579 default:
587 580 error = socket_bind(so, name, namelen, 0, CRED());
588 581 break;
589 582 case SOV_XPG4_2:
590 583 error = socket_bind(so, name, namelen, _SOBIND_XPG4_2, CRED());
591 584 break;
592 585 case SOV_SOCKBSD:
593 586 error = socket_bind(so, name, namelen, _SOBIND_SOCKBSD, CRED());
594 587 break;
595 588 }
596 589 done:
597 590 releasef(sock);
598 591 if (name != NULL)
599 592 kmem_free(name, (size_t)namelen);
600 593
601 594 if (error)
602 595 return (set_errno(error));
603 596 return (0);
604 597 }
605 598
606 599 /* ARGSUSED2 */
607 600 int
608 601 listen(int sock, int backlog, int version)
609 602 {
610 603 struct sonode *so;
611 604 int error;
612 605
613 606 dprint(1, ("listen(%d, %d)\n",
614 607 sock, backlog));
615 608
616 609 if ((so = getsonode(sock, &error, NULL)) == NULL)
617 610 return (set_errno(error));
618 611
619 612 error = socket_listen(so, backlog, CRED());
620 613
621 614 releasef(sock);
622 615 if (error)
623 616 return (set_errno(error));
624 617 return (0);
625 618 }
626 619
627 620 /*ARGSUSED3*/
628 621 int
629 622 accept(int sock, struct sockaddr *name, socklen_t *namelenp, int version,
630 623 int flags)
631 624 {
632 625 struct sonode *so;
633 626 file_t *fp;
634 627 int error;
635 628 socklen_t namelen;
636 629 struct sonode *nso;
637 630 struct vnode *nvp;
638 631 struct file *nfp;
639 632 int nfd;
640 633 int ssflags;
641 634 struct sockaddr *addrp;
642 635 socklen_t addrlen;
643 636
644 637 dprint(1, ("accept(%d, %p, %p)\n",
645 638 sock, (void *)name, (void *)namelenp));
646 639
647 640 if (flags & ~(SOCK_CLOEXEC|SOCK_NONBLOCK|SOCK_NDELAY)) {
648 641 return (set_errno(EINVAL));
649 642 }
650 643
651 644 /* Translate SOCK_ flags to their SS_ variant */
652 645 ssflags = 0;
653 646 if (flags & SOCK_NONBLOCK)
654 647 ssflags |= SS_NONBLOCK;
655 648 if (flags & SOCK_NDELAY)
656 649 ssflags |= SS_NDELAY;
657 650
658 651 if ((so = getsonode(sock, &error, &fp)) == NULL)
659 652 return (set_errno(error));
660 653
661 654 if (name != NULL) {
662 655 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
663 656 if (copyin(namelenp, &namelen, sizeof (namelen))) {
664 657 releasef(sock);
665 658 return (set_errno(EFAULT));
666 659 }
667 660 if (namelen != 0) {
668 661 error = useracc(name, (size_t)namelen, B_WRITE);
669 662 if (error && do_useracc) {
670 663 releasef(sock);
671 664 return (set_errno(EFAULT));
672 665 }
673 666 } else
674 667 name = NULL;
675 668 } else {
676 669 namelen = 0;
677 670 }
678 671
679 672 /*
680 673 * Allocate the user fd before socket_accept() in order to
681 674 * catch EMFILE errors before calling socket_accept().
682 675 */
683 676 if ((nfd = ufalloc(0)) == -1) {
684 677 eprintsoline(so, EMFILE);
685 678 releasef(sock);
686 679 return (set_errno(EMFILE));
687 680 }
688 681 error = socket_accept(so, fp->f_flag, CRED(), &nso);
689 682 if (error) {
690 683 setf(nfd, NULL);
691 684 releasef(sock);
692 685 return (set_errno(error));
693 686 }
694 687
695 688 nvp = SOTOV(nso);
696 689
697 690 ASSERT(MUTEX_NOT_HELD(&nso->so_lock));
698 691 if (namelen != 0) {
699 692 addrlen = so->so_max_addr_len;
700 693 addrp = (struct sockaddr *)kmem_alloc(addrlen, KM_SLEEP);
701 694
702 695 if ((error = socket_getpeername(nso, (struct sockaddr *)addrp,
703 696 &addrlen, B_TRUE, CRED())) == 0) {
704 697 error = copyout_name(name, namelen, namelenp,
705 698 addrp, addrlen);
706 699 } else {
707 700 ASSERT(error == EINVAL || error == ENOTCONN);
708 701 error = ECONNABORTED;
709 702 }
710 703 kmem_free(addrp, so->so_max_addr_len);
711 704 }
712 705
713 706 if (error) {
714 707 setf(nfd, NULL);
715 708 (void) socket_close(nso, 0, CRED());
716 709 socket_destroy(nso);
717 710 releasef(sock);
718 711 return (set_errno(error));
719 712 }
720 713 if (error = falloc(NULL, FWRITE|FREAD, &nfp, NULL)) {
721 714 setf(nfd, NULL);
722 715 (void) socket_close(nso, 0, CRED());
723 716 socket_destroy(nso);
724 717 eprintsoline(so, error);
725 718 releasef(sock);
726 719 return (set_errno(error));
727 720 }
728 721 /*
729 722 * fill in the entries that falloc reserved
730 723 */
731 724 nfp->f_vnode = nvp;
732 725 mutex_exit(&nfp->f_tlock);
733 726 setf(nfd, nfp);
734 727
735 728 /*
736 729 * Act on SOCK_CLOEXEC from flags
737 730 */
738 731 if (flags & SOCK_CLOEXEC) {
739 732 f_setfd(nfd, FD_CLOEXEC);
740 733 }
741 734
742 735 /*
743 736 * Copy FNDELAY and FNONBLOCK from listener to acceptor
744 737 * and from ssflags
745 738 */
746 739 if ((ssflags | so->so_state) & (SS_NDELAY|SS_NONBLOCK)) {
747 740 uint_t oflag = nfp->f_flag;
748 741 int arg = 0;
749 742
750 743 if ((ssflags | so->so_state) & SS_NONBLOCK)
751 744 arg |= FNONBLOCK;
752 745 else if ((ssflags | so->so_state) & SS_NDELAY)
753 746 arg |= FNDELAY;
754 747
755 748 /*
756 749 * This code is a simplification of the F_SETFL code in fcntl()
757 750 * Ignore any errors from VOP_SETFL.
758 751 */
759 752 if ((error = VOP_SETFL(nvp, oflag, arg, nfp->f_cred, NULL))
760 753 != 0) {
761 754 eprintsoline(so, error);
762 755 error = 0;
763 756 } else {
764 757 mutex_enter(&nfp->f_tlock);
765 758 nfp->f_flag &= ~FMASK | (FREAD|FWRITE);
766 759 nfp->f_flag |= arg;
767 760 mutex_exit(&nfp->f_tlock);
768 761 }
769 762 }
770 763 releasef(sock);
771 764 return (nfd);
772 765 }
773 766
774 767 int
775 768 connect(int sock, struct sockaddr *name, socklen_t namelen, int version)
776 769 {
777 770 struct sonode *so;
778 771 file_t *fp;
779 772 int error;
780 773
781 774 dprint(1, ("connect(%d, %p, %d)\n",
782 775 sock, (void *)name, namelen));
783 776
784 777 if ((so = getsonode(sock, &error, &fp)) == NULL)
785 778 return (set_errno(error));
786 779
787 780 /* Allocate and copyin name */
788 781 if (namelen != 0) {
789 782 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
790 783 name = copyin_name(so, name, &namelen, &error);
791 784 if (name == NULL) {
792 785 releasef(sock);
793 786 return (set_errno(error));
794 787 }
795 788 } else
796 789 name = NULL;
797 790
798 791 error = socket_connect(so, name, namelen, fp->f_flag,
799 792 (version != SOV_XPG4_2) ? 0 : _SOCONNECT_XPG4_2, CRED());
800 793 releasef(sock);
801 794 if (name)
802 795 kmem_free(name, (size_t)namelen);
803 796 if (error)
804 797 return (set_errno(error));
805 798 return (0);
806 799 }
807 800
808 801 /*ARGSUSED2*/
809 802 int
810 803 shutdown(int sock, int how, int version)
811 804 {
812 805 struct sonode *so;
813 806 int error;
814 807
815 808 dprint(1, ("shutdown(%d, %d)\n",
816 809 sock, how));
817 810
818 811 if ((so = getsonode(sock, &error, NULL)) == NULL)
819 812 return (set_errno(error));
820 813
821 814 error = socket_shutdown(so, how, CRED());
822 815
823 816 releasef(sock);
824 817 if (error)
825 818 return (set_errno(error));
826 819 return (0);
827 820 }
828 821
829 822 /*
830 823 * Common receive routine.
831 824 */
832 825 static ssize_t
833 826 recvit(int sock, struct nmsghdr *msg, struct uio *uiop, int flags,
834 827 socklen_t *namelenp, socklen_t *controllenp, int *flagsp)
835 828 {
836 829 struct sonode *so;
837 830 file_t *fp;
838 831 void *name;
839 832 socklen_t namelen;
840 833 void *control;
841 834 socklen_t controllen;
842 835 ssize_t len;
843 836 int error;
844 837
845 838 if ((so = getsonode(sock, &error, &fp)) == NULL)
846 839 return (set_errno(error));
847 840
848 841 len = uiop->uio_resid;
849 842 uiop->uio_fmode = fp->f_flag;
850 843 uiop->uio_extflg = UIO_COPY_CACHED;
851 844
852 845 name = msg->msg_name;
853 846 namelen = msg->msg_namelen;
854 847 control = msg->msg_control;
855 848 controllen = msg->msg_controllen;
856 849
857 850 msg->msg_flags = flags & (MSG_OOB | MSG_PEEK | MSG_WAITALL |
858 851 MSG_DONTWAIT | MSG_XPG4_2);
859 852
860 853 error = socket_recvmsg(so, msg, uiop, CRED());
861 854 if (error) {
862 855 releasef(sock);
863 856 return (set_errno(error));
864 857 }
865 858 lwp_stat_update(LWP_STAT_MSGRCV, 1);
866 859 releasef(sock);
867 860
868 861 error = copyout_name(name, namelen, namelenp,
869 862 msg->msg_name, msg->msg_namelen);
870 863 if (error)
871 864 goto err;
872 865
873 866 if (flagsp != NULL) {
874 867 /*
875 868 * Clear internal flag.
876 869 */
877 870 msg->msg_flags &= ~MSG_XPG4_2;
878 871
879 872 /*
880 873 * Determine MSG_CTRUNC. sorecvmsg sets MSG_CTRUNC only
881 874 * when controllen is zero and there is control data to
882 875 * copy out.
883 876 */
884 877 if (controllen != 0 &&
885 878 (msg->msg_controllen > controllen || control == NULL)) {
886 879 dprint(1, ("recvit: CTRUNC %d %d %p\n",
887 880 msg->msg_controllen, controllen, control));
888 881
889 882 msg->msg_flags |= MSG_CTRUNC;
890 883 }
891 884 if (copyout(&msg->msg_flags, flagsp,
892 885 sizeof (msg->msg_flags))) {
893 886 error = EFAULT;
894 887 goto err;
895 888 }
896 889 }
897 890 /*
898 891 * Note: This MUST be done last. There can be no "goto err" after this
899 892 * point since it could make so_closefds run twice on some part
900 893 * of the file descriptor array.
901 894 */
902 895 if (controllen != 0) {
903 896 if (!(flags & MSG_XPG4_2)) {
904 897 /*
905 898 * Good old msg_accrights can only return a multiple
906 899 * of 4 bytes.
907 900 */
908 901 controllen &= ~((int)sizeof (uint32_t) - 1);
909 902 }
910 903 error = copyout_arg(control, controllen, controllenp,
911 904 msg->msg_control, msg->msg_controllen);
912 905 if (error)
913 906 goto err;
914 907
915 908 if (msg->msg_controllen > controllen || control == NULL) {
916 909 if (control == NULL)
917 910 controllen = 0;
918 911 so_closefds(msg->msg_control, msg->msg_controllen,
919 912 !(flags & MSG_XPG4_2), controllen);
920 913 }
921 914 }
922 915 if (msg->msg_namelen != 0)
923 916 kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
924 917 if (msg->msg_controllen != 0)
925 918 kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
926 919 return (len - uiop->uio_resid);
927 920
928 921 err:
929 922 /*
930 923 * If we fail and the control part contains file descriptors
931 924 * we have to close the fd's.
932 925 */
933 926 if (msg->msg_controllen != 0)
934 927 so_closefds(msg->msg_control, msg->msg_controllen,
935 928 !(flags & MSG_XPG4_2), 0);
936 929 if (msg->msg_namelen != 0)
937 930 kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
938 931 if (msg->msg_controllen != 0)
939 932 kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
940 933 return (set_errno(error));
941 934 }
942 935
943 936 /*
944 937 * Native system call
945 938 */
946 939 ssize_t
947 940 recv(int sock, void *buffer, size_t len, int flags)
948 941 {
949 942 struct nmsghdr lmsg;
950 943 struct uio auio;
951 944 struct iovec aiov[1];
952 945
953 946 dprint(1, ("recv(%d, %p, %ld, %d)\n",
954 947 sock, buffer, len, flags));
955 948
956 949 if ((ssize_t)len < 0) {
957 950 return (set_errno(EINVAL));
958 951 }
959 952
960 953 aiov[0].iov_base = buffer;
961 954 aiov[0].iov_len = len;
962 955 auio.uio_loffset = 0;
963 956 auio.uio_iov = aiov;
964 957 auio.uio_iovcnt = 1;
965 958 auio.uio_resid = len;
966 959 auio.uio_segflg = UIO_USERSPACE;
967 960 auio.uio_limit = 0;
968 961
969 962 lmsg.msg_namelen = 0;
970 963 lmsg.msg_controllen = 0;
971 964 lmsg.msg_flags = 0;
972 965 return (recvit(sock, &lmsg, &auio, flags, NULL, NULL, NULL));
973 966 }
974 967
975 968 ssize_t
976 969 recvfrom(int sock, void *buffer, size_t len, int flags, struct sockaddr *name,
977 970 socklen_t *namelenp)
978 971 {
979 972 struct nmsghdr lmsg;
980 973 struct uio auio;
981 974 struct iovec aiov[1];
982 975
983 976 dprint(1, ("recvfrom(%d, %p, %ld, %d, %p, %p)\n",
984 977 sock, buffer, len, flags, (void *)name, (void *)namelenp));
985 978
986 979 if ((ssize_t)len < 0) {
987 980 return (set_errno(EINVAL));
988 981 }
989 982
990 983 aiov[0].iov_base = buffer;
991 984 aiov[0].iov_len = len;
992 985 auio.uio_loffset = 0;
993 986 auio.uio_iov = aiov;
994 987 auio.uio_iovcnt = 1;
995 988 auio.uio_resid = len;
996 989 auio.uio_segflg = UIO_USERSPACE;
997 990 auio.uio_limit = 0;
998 991
999 992 lmsg.msg_name = (char *)name;
1000 993 if (namelenp != NULL) {
1001 994 if (copyin(namelenp, &lmsg.msg_namelen,
1002 995 sizeof (lmsg.msg_namelen)))
1003 996 return (set_errno(EFAULT));
1004 997 } else {
1005 998 lmsg.msg_namelen = 0;
1006 999 }
1007 1000 lmsg.msg_controllen = 0;
1008 1001 lmsg.msg_flags = 0;
1009 1002
1010 1003 return (recvit(sock, &lmsg, &auio, flags, namelenp, NULL, NULL));
1011 1004 }
1012 1005
1013 1006 /*
1014 1007 * Uses the MSG_XPG4_2 flag to determine if the caller is using
1015 1008 * struct omsghdr or struct nmsghdr.
1016 1009 */
1017 1010 ssize_t
1018 1011 recvmsg(int sock, struct nmsghdr *msg, int flags)
1019 1012 {
1020 1013 STRUCT_DECL(nmsghdr, u_lmsg);
1021 1014 STRUCT_HANDLE(nmsghdr, umsgptr);
1022 1015 struct nmsghdr lmsg;
1023 1016 struct uio auio;
1024 - struct iovec aiov[MSG_MAXIOVLEN];
1017 + struct iovec buf[IOV_MAX_STACK], *aiov = buf;
1018 + ssize_t iovsize = 0;
1025 1019 int iovcnt;
1026 - ssize_t len;
1020 + ssize_t len, rval;
1027 1021 int i;
1028 1022 int *flagsp;
1029 1023 model_t model;
1030 1024
1031 1025 dprint(1, ("recvmsg(%d, %p, %d)\n",
1032 1026 sock, (void *)msg, flags));
1033 1027
1034 1028 model = get_udatamodel();
1035 1029 STRUCT_INIT(u_lmsg, model);
1036 1030 STRUCT_SET_HANDLE(umsgptr, model, msg);
1037 1031
1038 1032 if (flags & MSG_XPG4_2) {
1039 1033 if (copyin(msg, STRUCT_BUF(u_lmsg), STRUCT_SIZE(u_lmsg)))
1040 1034 return (set_errno(EFAULT));
1041 1035 flagsp = STRUCT_FADDR(umsgptr, msg_flags);
1042 1036 } else {
1043 1037 /*
1044 1038 * Assumes that nmsghdr and omsghdr are identically shaped
1045 1039 * except for the added msg_flags field.
1046 1040 */
1047 1041 if (copyin(msg, STRUCT_BUF(u_lmsg),
1048 1042 SIZEOF_STRUCT(omsghdr, model)))
1049 1043 return (set_errno(EFAULT));
1050 1044 STRUCT_FSET(u_lmsg, msg_flags, 0);
1051 1045 flagsp = NULL;
1052 1046 }
1053 1047
1054 1048 /*
1055 1049 * Code below us will kmem_alloc memory and hang it
1056 1050 * off msg_control and msg_name fields. This forces
1057 1051 * us to copy the structure to its native form.
1058 1052 */
1059 1053 lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name);
1060 1054 lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen);
1061 1055 lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov);
1062 1056 lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen);
1063 1057 lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control);
1064 1058 lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen);
1065 1059 lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags);
1066 1060
1067 1061 iovcnt = lmsg.msg_iovlen;
1068 1062
1069 - if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) {
1063 + if (iovcnt <= 0 || iovcnt > IOV_MAX) {
1070 1064 return (set_errno(EMSGSIZE));
1071 1065 }
1072 1066
1067 + if (iovcnt > IOV_MAX_STACK) {
1068 + iovsize = iovcnt * sizeof (struct iovec);
1069 + aiov = kmem_alloc(iovsize, KM_SLEEP);
1070 + }
1071 +
1073 1072 #ifdef _SYSCALL32_IMPL
1074 1073 /*
1075 1074 * 32-bit callers need to have their iovec expanded, while ensuring
1076 1075 * that they can't move more than 2Gbytes of data in a single call.
1077 1076 */
1078 1077 if (model == DATAMODEL_ILP32) {
1079 - struct iovec32 aiov32[MSG_MAXIOVLEN];
1078 + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
1079 + ssize_t iov32size;
1080 1080 ssize32_t count32;
1081 1081
1082 - if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32,
1083 - iovcnt * sizeof (struct iovec32)))
1082 + iov32size = iovcnt * sizeof (struct iovec32);
1083 + if (iovsize != 0)
1084 + aiov32 = kmem_alloc(iov32size, KM_SLEEP);
1085 +
1086 + if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32, iov32size)) {
1087 + if (iovsize != 0) {
1088 + kmem_free(aiov32, iov32size);
1089 + kmem_free(aiov, iovsize);
1090 + }
1091 +
1084 1092 return (set_errno(EFAULT));
1093 + }
1085 1094
1086 1095 count32 = 0;
1087 1096 for (i = 0; i < iovcnt; i++) {
1088 1097 ssize32_t iovlen32;
1089 1098
1090 1099 iovlen32 = aiov32[i].iov_len;
1091 1100 count32 += iovlen32;
1092 - if (iovlen32 < 0 || count32 < 0)
1101 + if (iovlen32 < 0 || count32 < 0) {
1102 + if (iovsize != 0) {
1103 + kmem_free(aiov32, iov32size);
1104 + kmem_free(aiov, iovsize);
1105 + }
1106 +
1093 1107 return (set_errno(EINVAL));
1108 + }
1109 +
1094 1110 aiov[i].iov_len = iovlen32;
1095 1111 aiov[i].iov_base =
1096 1112 (caddr_t)(uintptr_t)aiov32[i].iov_base;
1097 1113 }
1114 +
1115 + if (iovsize != 0)
1116 + kmem_free(aiov32, iov32size);
1098 1117 } else
1099 1118 #endif /* _SYSCALL32_IMPL */
1100 1119 if (copyin(lmsg.msg_iov, aiov, iovcnt * sizeof (struct iovec))) {
1120 + if (iovsize != 0)
1121 + kmem_free(aiov, iovsize);
1122 +
1101 1123 return (set_errno(EFAULT));
1102 1124 }
1103 1125 len = 0;
1104 1126 for (i = 0; i < iovcnt; i++) {
1105 1127 ssize_t iovlen = aiov[i].iov_len;
1106 1128 len += iovlen;
1107 1129 if (iovlen < 0 || len < 0) {
1130 + if (iovsize != 0)
1131 + kmem_free(aiov, iovsize);
1132 +
1108 1133 return (set_errno(EINVAL));
1109 1134 }
1110 1135 }
1111 1136 auio.uio_loffset = 0;
1112 1137 auio.uio_iov = aiov;
1113 1138 auio.uio_iovcnt = iovcnt;
1114 1139 auio.uio_resid = len;
1115 1140 auio.uio_segflg = UIO_USERSPACE;
1116 1141 auio.uio_limit = 0;
1117 1142
1118 1143 if (lmsg.msg_control != NULL &&
1119 1144 (do_useracc == 0 ||
1120 1145 useracc(lmsg.msg_control, lmsg.msg_controllen,
1121 1146 B_WRITE) != 0)) {
1147 + if (iovsize != 0)
1148 + kmem_free(aiov, iovsize);
1149 +
1122 1150 return (set_errno(EFAULT));
1123 1151 }
1124 1152
1125 - return (recvit(sock, &lmsg, &auio, flags,
1153 + rval = recvit(sock, &lmsg, &auio, flags,
1126 1154 STRUCT_FADDR(umsgptr, msg_namelen),
1127 - STRUCT_FADDR(umsgptr, msg_controllen), flagsp));
1155 + STRUCT_FADDR(umsgptr, msg_controllen), flagsp);
1156 +
1157 + if (iovsize != 0)
1158 + kmem_free(aiov, iovsize);
1159 +
1160 + return (rval);
1128 1161 }
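
Note the shape of the rewritten exit paths above: because aiov may now live on the kernel heap, recvmsg() can no longer tail-call recvit(); the result is parked in rval so the array can be freed exactly once before returning. The 32-bit branch applies the same discipline to its temporary iovec32 staging copy, freeing both buffers before any error return. A condensed sketch of that error path (mirroring the hunk above, not a complete function):

	if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32, iov32size)) {
		if (iovsize != 0) {
			/* Free the 32-bit staging copy and the expanded array. */
			kmem_free(aiov32, iov32size);
			kmem_free(aiov, iovsize);
		}
		return (set_errno(EFAULT));
	}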
1129 1162
1130 1163 /*
1131 1164 * Common send function.
1132 1165 */
1133 1166 static ssize_t
1134 1167 sendit(int sock, struct nmsghdr *msg, struct uio *uiop, int flags)
1135 1168 {
1136 1169 struct sonode *so;
1137 1170 file_t *fp;
1138 1171 void *name;
1139 1172 socklen_t namelen;
1140 1173 void *control;
1141 1174 socklen_t controllen;
1142 1175 ssize_t len;
1143 1176 int error;
1144 1177
1145 1178 if ((so = getsonode(sock, &error, &fp)) == NULL)
1146 1179 return (set_errno(error));
1147 1180
1148 1181 uiop->uio_fmode = fp->f_flag;
1149 1182
1150 1183 if (so->so_family == AF_UNIX)
1151 1184 uiop->uio_extflg = UIO_COPY_CACHED;
1152 1185 else
1153 1186 uiop->uio_extflg = UIO_COPY_DEFAULT;
1154 1187
1155 1188 /* Allocate and copyin name and control */
1156 1189 name = msg->msg_name;
1157 1190 namelen = msg->msg_namelen;
1158 1191 if (name != NULL && namelen != 0) {
1159 1192 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1160 1193 name = copyin_name(so,
1161 1194 (struct sockaddr *)name,
1162 1195 &namelen, &error);
1163 1196 if (name == NULL)
1164 1197 goto done3;
1165 1198 /* copyin_name null terminates addresses for AF_UNIX */
1166 1199 msg->msg_namelen = namelen;
1167 1200 msg->msg_name = name;
1168 1201 } else {
1169 1202 msg->msg_name = name = NULL;
1170 1203 msg->msg_namelen = namelen = 0;
1171 1204 }
1172 1205
1173 1206 control = msg->msg_control;
1174 1207 controllen = msg->msg_controllen;
1175 1208 if ((control != NULL) && (controllen != 0)) {
1176 1209 /*
1177 1210 * Verify that the length is not excessive to prevent
1178 1211 * an application from consuming all of kernel memory.
1179 1212 */
1180 1213 if (controllen > SO_MAXARGSIZE) {
1181 1214 error = EINVAL;
1182 1215 goto done2;
1183 1216 }
1184 1217 control = kmem_alloc(controllen, KM_SLEEP);
1185 1218
1186 1219 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1187 1220 if (copyin(msg->msg_control, control, controllen)) {
1188 1221 error = EFAULT;
1189 1222 goto done1;
1190 1223 }
1191 1224 msg->msg_control = control;
1192 1225 } else {
1193 1226 msg->msg_control = control = NULL;
1194 1227 msg->msg_controllen = controllen = 0;
1195 1228 }
1196 1229
1197 1230 len = uiop->uio_resid;
1198 1231 msg->msg_flags = flags;
1199 1232
1200 1233 error = socket_sendmsg(so, msg, uiop, CRED());
1201 1234 done1:
1202 1235 if (control != NULL)
1203 1236 kmem_free(control, controllen);
1204 1237 done2:
1205 1238 if (name != NULL)
1206 1239 kmem_free(name, namelen);
1207 1240 done3:
1208 1241 if (error != 0) {
1209 1242 releasef(sock);
1210 1243 return (set_errno(error));
1211 1244 }
1212 1245 lwp_stat_update(LWP_STAT_MSGSND, 1);
1213 1246 releasef(sock);
1214 1247 return (len - uiop->uio_resid);
1215 1248 }
1216 1249
1217 1250 /*
1218 1251 * Native system call
1219 1252 */
1220 1253 ssize_t
1221 1254 send(int sock, void *buffer, size_t len, int flags)
1222 1255 {
1223 1256 struct nmsghdr lmsg;
1224 1257 struct uio auio;
1225 1258 struct iovec aiov[1];
1226 1259
1227 1260 dprint(1, ("send(%d, %p, %ld, %d)\n",
1228 1261 sock, buffer, len, flags));
1229 1262
1230 1263 if ((ssize_t)len < 0) {
1231 1264 return (set_errno(EINVAL));
1232 1265 }
1233 1266
1234 1267 aiov[0].iov_base = buffer;
1235 1268 aiov[0].iov_len = len;
1236 1269 auio.uio_loffset = 0;
1237 1270 auio.uio_iov = aiov;
1238 1271 auio.uio_iovcnt = 1;
1239 1272 auio.uio_resid = len;
1240 1273 auio.uio_segflg = UIO_USERSPACE;
1241 1274 auio.uio_limit = 0;
1242 1275
1243 1276 lmsg.msg_name = NULL;
1244 1277 lmsg.msg_control = NULL;
1245 1278 if (!(flags & MSG_XPG4_2)) {
1246 1279 /*
1247 1280 * In order to be compatible with the libsocket/sockmod
1248 1281 * implementation we set EOR for all send* calls.
1249 1282 */
1250 1283 flags |= MSG_EOR;
1251 1284 }
1252 1285 return (sendit(sock, &lmsg, &auio, flags));
1253 1286 }
1254 1287
1255 1288 /*
1256 1289 * Uses the MSG_XPG4_2 flag to determine if the caller is using
1257 1290 * struct omsghdr or struct nmsghdr.
1258 1291 */
1259 1292 ssize_t
1260 1293 sendmsg(int sock, struct nmsghdr *msg, int flags)
1261 1294 {
1262 1295 struct nmsghdr lmsg;
1263 1296 STRUCT_DECL(nmsghdr, u_lmsg);
1264 1297 struct uio auio;
1265 - struct iovec aiov[MSG_MAXIOVLEN];
1298 + struct iovec buf[IOV_MAX_STACK], *aiov = buf;
1299 + ssize_t iovsize = 0;
1266 1300 int iovcnt;
1267 - ssize_t len;
1301 + ssize_t len, rval;
1268 1302 int i;
1269 1303 model_t model;
1270 1304
1271 1305 dprint(1, ("sendmsg(%d, %p, %d)\n", sock, (void *)msg, flags));
1272 1306
1273 1307 model = get_udatamodel();
1274 1308 STRUCT_INIT(u_lmsg, model);
1275 1309
1276 1310 if (flags & MSG_XPG4_2) {
1277 1311 if (copyin(msg, (char *)STRUCT_BUF(u_lmsg),
1278 1312 STRUCT_SIZE(u_lmsg)))
1279 1313 return (set_errno(EFAULT));
1280 1314 } else {
1281 1315 /*
1282 1316 * Assumes that nmsghdr and omsghdr are identically shaped
1283 1317 * except for the added msg_flags field.
1284 1318 */
1285 1319 if (copyin(msg, (char *)STRUCT_BUF(u_lmsg),
1286 1320 SIZEOF_STRUCT(omsghdr, model)))
1287 1321 return (set_errno(EFAULT));
1288 1322 /*
1289 1323 * In order to be compatible with the libsocket/sockmod
1290 1324 * implementation we set EOR for all send* calls.
1291 1325 */
1292 1326 flags |= MSG_EOR;
1293 1327 }
1294 1328
1295 1329 /*
1296 1330 * Code below us will kmem_alloc memory and hang it
1297 1331 * off msg_control and msg_name fields. This forces
1298 1332 * us to copy the structure to its native form.
1299 1333 */
1300 1334 lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name);
1301 1335 lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen);
1302 1336 lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov);
1303 1337 lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen);
1304 1338 lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control);
1305 1339 lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen);
1306 1340 lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags);
1307 1341
1308 1342 iovcnt = lmsg.msg_iovlen;
1309 1343
1310 - if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) {
1344 + if (iovcnt <= 0 || iovcnt > IOV_MAX) {
1311 1345 /*
1312 1346 * Unless this is XPG 4.2 we allow iovcnt == 0 to
1313 1347 * be compatible with SunOS 4.X and 4.4BSD.
1314 1348 */
1315 1349 if (iovcnt != 0 || (flags & MSG_XPG4_2))
1316 1350 return (set_errno(EMSGSIZE));
1317 1351 }
1318 1352
1353 + if (iovcnt > IOV_MAX_STACK) {
1354 + iovsize = iovcnt * sizeof (struct iovec);
1355 + aiov = kmem_alloc(iovsize, KM_SLEEP);
1356 + }
1357 +
1319 1358 #ifdef _SYSCALL32_IMPL
1320 1359 /*
1321 1360 * 32-bit callers need to have their iovec expanded, while ensuring
1322 1361 * that they can't move more than 2Gbytes of data in a single call.
1323 1362 */
1324 1363 if (model == DATAMODEL_ILP32) {
1325 - struct iovec32 aiov32[MSG_MAXIOVLEN];
1364 + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
1365 + ssize_t iov32size;
1326 1366 ssize32_t count32;
1327 1367
1368 + iov32size = iovcnt * sizeof (struct iovec32);
1369 + if (iovsize != 0)
1370 + aiov32 = kmem_alloc(iov32size, KM_SLEEP);
1371 +
1328 1372 if (iovcnt != 0 &&
1329 - copyin((struct iovec32 *)lmsg.msg_iov, aiov32,
1330 - iovcnt * sizeof (struct iovec32)))
1373 + copyin((struct iovec32 *)lmsg.msg_iov, aiov32, iov32size)) {
1374 + if (iovsize != 0) {
1375 + kmem_free(aiov32, iov32size);
1376 + kmem_free(aiov, iovsize);
1377 + }
1378 +
1331 1379 return (set_errno(EFAULT));
1380 + }
1332 1381
1333 1382 count32 = 0;
1334 1383 for (i = 0; i < iovcnt; i++) {
1335 1384 ssize32_t iovlen32;
1336 1385
1337 1386 iovlen32 = aiov32[i].iov_len;
1338 1387 count32 += iovlen32;
1339 - if (iovlen32 < 0 || count32 < 0)
1388 + if (iovlen32 < 0 || count32 < 0) {
1389 + if (iovsize != 0) {
1390 + kmem_free(aiov32, iov32size);
1391 + kmem_free(aiov, iovsize);
1392 + }
1393 +
1340 1394 return (set_errno(EINVAL));
1395 + }
1396 +
1341 1397 aiov[i].iov_len = iovlen32;
1342 1398 aiov[i].iov_base =
1343 1399 (caddr_t)(uintptr_t)aiov32[i].iov_base;
1344 1400 }
1401 +
1402 + if (iovsize != 0)
1403 + kmem_free(aiov32, iov32size);
1345 1404 } else
1346 1405 #endif /* _SYSCALL32_IMPL */
1347 1406 if (iovcnt != 0 &&
1348 1407 copyin(lmsg.msg_iov, aiov,
1349 1408 (unsigned)iovcnt * sizeof (struct iovec))) {
1409 + if (iovsize != 0)
1410 + kmem_free(aiov, iovsize);
1411 +
1350 1412 return (set_errno(EFAULT));
1351 1413 }
1352 1414 len = 0;
1353 1415 for (i = 0; i < iovcnt; i++) {
1354 1416 ssize_t iovlen = aiov[i].iov_len;
1355 1417 len += iovlen;
1356 1418 if (iovlen < 0 || len < 0) {
1419 + if (iovsize != 0)
1420 + kmem_free(aiov, iovsize);
1421 +
1357 1422 return (set_errno(EINVAL));
1358 1423 }
1359 1424 }
1360 1425 auio.uio_loffset = 0;
1361 1426 auio.uio_iov = aiov;
1362 1427 auio.uio_iovcnt = iovcnt;
1363 1428 auio.uio_resid = len;
1364 1429 auio.uio_segflg = UIO_USERSPACE;
1365 1430 auio.uio_limit = 0;
1366 1431
1367 - return (sendit(sock, &lmsg, &auio, flags));
1432 + rval = sendit(sock, &lmsg, &auio, flags);
1433 +
1434 + if (iovsize != 0)
1435 + kmem_free(aiov, iovsize);
1436 +
1437 + return (rval);
1368 1438 }
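
For reference, the effect of the new bound is visible from userland with an ordinary sendmsg() call. A hypothetical smoke test (not part of this change) that sends one byte per iovec over an AF_UNIX datagram socketpair: under the old MSG_MAXIOVLEN any msg_iovlen above 16 failed with EMSGSIZE, while after this change counts up to IOV_MAX are accepted.

	#include <sys/socket.h>
	#include <sys/uio.h>
	#include <stdio.h>
	#include <string.h>

	#define	NIOV	64	/* well above the old limit of 16 */

	int
	main(void)
	{
		int fds[2];
		char byte = 'x';
		struct iovec iov[NIOV];
		struct msghdr msg;
		int i;

		if (socketpair(AF_UNIX, SOCK_DGRAM, 0, fds) != 0) {
			perror("socketpair");
			return (1);
		}
		for (i = 0; i < NIOV; i++) {
			iov[i].iov_base = &byte;
			iov[i].iov_len = 1;
		}
		(void) memset(&msg, 0, sizeof (msg));
		msg.msg_iov = iov;
		msg.msg_iovlen = NIOV;

		/* Before this fix, sendmsg() failed here with EMSGSIZE. */
		if (sendmsg(fds[0], &msg, 0) < 0) {
			perror("sendmsg");
			return (1);
		}
		(void) printf("sent a datagram built from %d iovecs\n", NIOV);
		return (0);
	}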
1369 1439
1370 1440 ssize_t
1371 1441 sendto(int sock, void *buffer, size_t len, int flags,
1372 1442 struct sockaddr *name, socklen_t namelen)
1373 1443 {
1374 1444 struct nmsghdr lmsg;
1375 1445 struct uio auio;
1376 1446 struct iovec aiov[1];
1377 1447
1378 1448 dprint(1, ("sendto(%d, %p, %ld, %d, %p, %d)\n",
1379 1449 sock, buffer, len, flags, (void *)name, namelen));
1380 1450
1381 1451 if ((ssize_t)len < 0) {
1382 1452 return (set_errno(EINVAL));
1383 1453 }
1384 1454
1385 1455 aiov[0].iov_base = buffer;
1386 1456 aiov[0].iov_len = len;
1387 1457 auio.uio_loffset = 0;
1388 1458 auio.uio_iov = aiov;
1389 1459 auio.uio_iovcnt = 1;
1390 1460 auio.uio_resid = len;
1391 1461 auio.uio_segflg = UIO_USERSPACE;
1392 1462 auio.uio_limit = 0;
1393 1463
1394 1464 lmsg.msg_name = (char *)name;
1395 1465 lmsg.msg_namelen = namelen;
1396 1466 lmsg.msg_control = NULL;
1397 1467 if (!(flags & MSG_XPG4_2)) {
1398 1468 /*
1399 1469 * In order to be compatible with the libsocket/sockmod
1400 1470 * implementation we set EOR for all send* calls.
1401 1471 */
1402 1472 flags |= MSG_EOR;
1403 1473 }
1404 1474 return (sendit(sock, &lmsg, &auio, flags));
1405 1475 }
1406 1476
1407 1477 /*ARGSUSED3*/
1408 1478 int
1409 1479 getpeername(int sock, struct sockaddr *name, socklen_t *namelenp, int version)
1410 1480 {
1411 1481 struct sonode *so;
1412 1482 int error;
1413 1483 socklen_t namelen;
1414 1484 socklen_t sock_addrlen;
1415 1485 struct sockaddr *sock_addrp;
1416 1486
1417 1487 dprint(1, ("getpeername(%d, %p, %p)\n",
1418 1488 sock, (void *)name, (void *)namelenp));
1419 1489
1420 1490 if ((so = getsonode(sock, &error, NULL)) == NULL)
1421 1491 goto bad;
1422 1492
1423 1493 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1424 1494 if (copyin(namelenp, &namelen, sizeof (namelen)) ||
1425 1495 (name == NULL && namelen != 0)) {
1426 1496 error = EFAULT;
1427 1497 goto rel_out;
1428 1498 }
1429 1499 sock_addrlen = so->so_max_addr_len;
1430 1500 sock_addrp = (struct sockaddr *)kmem_alloc(sock_addrlen, KM_SLEEP);
1431 1501
1432 1502 if ((error = socket_getpeername(so, sock_addrp, &sock_addrlen,
1433 1503 B_FALSE, CRED())) == 0) {
1434 1504 ASSERT(sock_addrlen <= so->so_max_addr_len);
1435 1505 error = copyout_name(name, namelen, namelenp,
1436 1506 (void *)sock_addrp, sock_addrlen);
1437 1507 }
1438 1508 kmem_free(sock_addrp, so->so_max_addr_len);
1439 1509 rel_out:
1440 1510 releasef(sock);
1441 1511 bad: return (error != 0 ? set_errno(error) : 0);
1442 1512 }
1443 1513
1444 1514 /*ARGSUSED3*/
1445 1515 int
1446 1516 getsockname(int sock, struct sockaddr *name, socklen_t *namelenp, int version)
1447 1517 {
1448 1518 struct sonode *so;
1449 1519 int error;
1450 1520 socklen_t namelen, sock_addrlen;
1451 1521 struct sockaddr *sock_addrp;
1452 1522
1453 1523 dprint(1, ("getsockname(%d, %p, %p)\n",
1454 1524 sock, (void *)name, (void *)namelenp));
1455 1525
1456 1526 if ((so = getsonode(sock, &error, NULL)) == NULL)
1457 1527 goto bad;
1458 1528
1459 1529 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1460 1530 if (copyin(namelenp, &namelen, sizeof (namelen)) ||
1461 1531 (name == NULL && namelen != 0)) {
1462 1532 error = EFAULT;
1463 1533 goto rel_out;
1464 1534 }
1465 1535
1466 1536 sock_addrlen = so->so_max_addr_len;
1467 1537 sock_addrp = (struct sockaddr *)kmem_alloc(sock_addrlen, KM_SLEEP);
1468 1538 if ((error = socket_getsockname(so, sock_addrp, &sock_addrlen,
1469 1539 CRED())) == 0) {
1470 1540 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1471 1541 ASSERT(sock_addrlen <= so->so_max_addr_len);
1472 1542 error = copyout_name(name, namelen, namelenp,
1473 1543 (void *)sock_addrp, sock_addrlen);
1474 1544 }
1475 1545 kmem_free(sock_addrp, so->so_max_addr_len);
1476 1546 rel_out:
1477 1547 releasef(sock);
1478 1548 bad: return (error != 0 ? set_errno(error) : 0);
1479 1549 }
1480 1550
1481 1551 /*ARGSUSED5*/
1482 1552 int
1483 1553 getsockopt(int sock, int level, int option_name, void *option_value,
1484 1554 socklen_t *option_lenp, int version)
1485 1555 {
1486 1556 struct sonode *so;
1487 1557 socklen_t optlen, optlen_res;
1488 1558 void *optval;
1489 1559 int error;
1490 1560
1491 1561 dprint(1, ("getsockopt(%d, %d, %d, %p, %p)\n",
1492 1562 sock, level, option_name, option_value, (void *)option_lenp));
1493 1563
1494 1564 if ((so = getsonode(sock, &error, NULL)) == NULL)
1495 1565 return (set_errno(error));
1496 1566
1497 1567 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1498 1568 if (copyin(option_lenp, &optlen, sizeof (optlen))) {
1499 1569 releasef(sock);
1500 1570 return (set_errno(EFAULT));
1501 1571 }
1502 1572 /*
1503 1573 * Verify that the length is not excessive to prevent
1504 1574 * an application from consuming all of kernel memory.
1505 1575 */
1506 1576 if (optlen > SO_MAXARGSIZE) {
1507 1577 error = EINVAL;
1508 1578 releasef(sock);
1509 1579 return (set_errno(error));
1510 1580 }
1511 1581 optval = kmem_alloc(optlen, KM_SLEEP);
1512 1582 optlen_res = optlen;
1513 1583 error = socket_getsockopt(so, level, option_name, optval,
1514 1584 &optlen_res, (version != SOV_XPG4_2) ? 0 : _SOGETSOCKOPT_XPG4_2,
1515 1585 CRED());
1516 1586 releasef(sock);
1517 1587 if (error) {
1518 1588 kmem_free(optval, optlen);
1519 1589 return (set_errno(error));
1520 1590 }
1521 1591 error = copyout_arg(option_value, optlen, option_lenp,
1522 1592 optval, optlen_res);
1523 1593 kmem_free(optval, optlen);
1524 1594 if (error)
1525 1595 return (set_errno(error));
1526 1596 return (0);
1527 1597 }
1528 1598
1529 1599 /*ARGSUSED5*/
1530 1600 int
1531 1601 setsockopt(int sock, int level, int option_name, void *option_value,
1532 1602 socklen_t option_len, int version)
1533 1603 {
1534 1604 struct sonode *so;
1535 1605 intptr_t buffer[2];
1536 1606 void *optval = NULL;
1537 1607 int error;
1538 1608
1539 1609 dprint(1, ("setsockopt(%d, %d, %d, %p, %d)\n",
1540 1610 sock, level, option_name, option_value, option_len));
1541 1611
1542 1612 if ((so = getsonode(sock, &error, NULL)) == NULL)
1543 1613 return (set_errno(error));
1544 1614
1545 1615 if (option_value != NULL) {
1546 1616 if (option_len != 0) {
1547 1617 /*
1548 1618 * Verify that the length is not excessive to prevent
1549 1619 * an application from consuming all of kernel memory.
1550 1620 */
1551 1621 if (option_len > SO_MAXARGSIZE) {
1552 1622 error = EINVAL;
1553 1623 goto done2;
1554 1624 }
1555 1625 optval = option_len <= sizeof (buffer) ?
1556 1626 &buffer : kmem_alloc((size_t)option_len, KM_SLEEP);
1557 1627 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1558 1628 if (copyin(option_value, optval, (size_t)option_len)) {
1559 1629 error = EFAULT;
1560 1630 goto done1;
1561 1631 }
1562 1632 }
1563 1633 } else
1564 1634 option_len = 0;
1565 1635
1566 1636 error = socket_setsockopt(so, level, option_name, optval,
1567 1637 (t_uscalar_t)option_len, CRED());
1568 1638 done1:
1569 1639 if (optval != buffer)
1570 1640 kmem_free(optval, (size_t)option_len);
1571 1641 done2:
1572 1642 releasef(sock);
1573 1643 if (error)
1574 1644 return (set_errno(error));
1575 1645 return (0);
1576 1646 }
1577 1647
1578 1648 static int
1579 1649 sockconf_add_sock(int family, int type, int protocol, char *name)
1580 1650 {
1581 1651 int error = 0;
1582 1652 char *kdevpath = NULL;
1583 1653 char *kmodule = NULL;
1584 1654 char *buf = NULL;
1585 1655 size_t pathlen = 0;
1586 1656 struct sockparams *sp;
1587 1657
1588 1658 if (name == NULL)
1589 1659 return (EINVAL);
1590 1660 /*
1591 1661 * Copyin the name.
1592 1662 * This also makes it possible to check for too long pathnames.
1593 1663 * Compress the space needed for the name before passing it
1594 1664 * to soconfig - soconfig will store the string until
1595 1665 * the configuration is removed.
1596 1666 */
1597 1667 buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1598 1668 if ((error = copyinstr(name, buf, MAXPATHLEN, &pathlen)) != 0) {
1599 1669 kmem_free(buf, MAXPATHLEN);
1600 1670 return (error);
1601 1671 }
1602 1672 if (strncmp(buf, "/dev", strlen("/dev")) == 0) {
1603 1673 /* For device */
1604 1674
1605 1675 /*
1606 1676 * Special handling for NCA:
1607 1677 *
1608 1678 * DEV_NCA is never opened even if an application
1609 1679 * requests for AF_NCA. The device opened is instead a
1610 1680 * predefined AF_INET transport (NCA_INET_DEV).
1611 1681 *
1612 1682 * Prior to Volo (PSARC/2007/587) NCA would determine
1613 1683 * the device using a lookup, which worked then because
1614 1684 * all protocols were based on TPI. Since TPI is no
1615 1685 * longer the default, we have to explicitly state
1616 1686 * which device to use.
1617 1687 */
1618 1688 if (strcmp(buf, NCA_DEV) == 0) {
1619 1689 /* only support entry <28, 2, 0> */
1620 1690 if (family != AF_NCA || type != SOCK_STREAM ||
1621 1691 protocol != 0) {
1622 1692 kmem_free(buf, MAXPATHLEN);
1623 1693 return (EINVAL);
1624 1694 }
1625 1695
1626 1696 pathlen = strlen(NCA_INET_DEV) + 1;
1627 1697 kdevpath = kmem_alloc(pathlen, KM_SLEEP);
1628 1698 bcopy(NCA_INET_DEV, kdevpath, pathlen);
1629 1699 kdevpath[pathlen - 1] = '\0';
1630 1700 } else {
1631 1701 kdevpath = kmem_alloc(pathlen, KM_SLEEP);
1632 1702 bcopy(buf, kdevpath, pathlen);
1633 1703 kdevpath[pathlen - 1] = '\0';
1634 1704 }
1635 1705 } else {
1636 1706 /* For socket module */
1637 1707 kmodule = kmem_alloc(pathlen, KM_SLEEP);
1638 1708 bcopy(buf, kmodule, pathlen);
1639 1709 kmodule[pathlen - 1] = '\0';
1640 1710 pathlen = 0;
1641 1711 }
1642 1712 kmem_free(buf, MAXPATHLEN);
1643 1713
1644 1714 /* sockparams_create frees mod name and devpath upon failure */
1645 1715 sp = sockparams_create(family, type, protocol, kmodule,
1646 1716 kdevpath, pathlen, 0, KM_SLEEP, &error);
1647 1717 if (sp != NULL) {
1648 1718 error = sockparams_add(sp);
1649 1719 if (error != 0)
1650 1720 sockparams_destroy(sp);
1651 1721 }
1652 1722
1653 1723 return (error);
1654 1724 }
1655 1725
1656 1726 static int
1657 1727 sockconf_remove_sock(int family, int type, int protocol)
1658 1728 {
1659 1729 return (sockparams_delete(family, type, protocol));
1660 1730 }
1661 1731
1662 1732 static int
1663 1733 sockconfig_remove_filter(const char *uname)
1664 1734 {
1665 1735 char kname[SOF_MAXNAMELEN];
1666 1736 size_t len;
1667 1737 int error;
1668 1738 sof_entry_t *ent;
1669 1739
1670 1740 if ((error = copyinstr(uname, kname, SOF_MAXNAMELEN, &len)) != 0)
1671 1741 return (error);
1672 1742
1673 1743 ent = sof_entry_remove_by_name(kname);
1674 1744 if (ent == NULL)
1675 1745 return (ENXIO);
1676 1746
1677 1747 mutex_enter(&ent->sofe_lock);
1678 1748 ASSERT(!(ent->sofe_flags & SOFEF_CONDEMED));
1679 1749 if (ent->sofe_refcnt == 0) {
1680 1750 mutex_exit(&ent->sofe_lock);
1681 1751 sof_entry_free(ent);
1682 1752 } else {
1683 1753 /* let the last socket free the filter */
1684 1754 ent->sofe_flags |= SOFEF_CONDEMED;
1685 1755 mutex_exit(&ent->sofe_lock);
1686 1756 }
1687 1757
1688 1758 return (0);
1689 1759 }
1690 1760
1691 1761 static int
1692 1762 sockconfig_add_filter(const char *uname, void *ufilpropp)
1693 1763 {
1694 1764 struct sockconfig_filter_props filprop;
1695 1765 sof_entry_t *ent;
1696 1766 int error;
1697 1767 size_t tuplesz, len;
1698 1768 char hintbuf[SOF_MAXNAMELEN];
1699 1769
1700 1770 ent = kmem_zalloc(sizeof (sof_entry_t), KM_SLEEP);
1701 1771 mutex_init(&ent->sofe_lock, NULL, MUTEX_DEFAULT, NULL);
1702 1772
1703 1773 if ((error = copyinstr(uname, ent->sofe_name, SOF_MAXNAMELEN,
1704 1774 &len)) != 0) {
1705 1775 sof_entry_free(ent);
1706 1776 return (error);
1707 1777 }
1708 1778
1709 1779 if (get_udatamodel() == DATAMODEL_NATIVE) {
1710 1780 if (copyin(ufilpropp, &filprop, sizeof (filprop)) != 0) {
1711 1781 sof_entry_free(ent);
1712 1782 return (EFAULT);
1713 1783 }
1714 1784 }
1715 1785 #ifdef _SYSCALL32_IMPL
1716 1786 else {
1717 1787 struct sockconfig_filter_props32 filprop32;
1718 1788
1719 1789 if (copyin(ufilpropp, &filprop32, sizeof (filprop32)) != 0) {
1720 1790 sof_entry_free(ent);
1721 1791 return (EFAULT);
1722 1792 }
1723 1793 filprop.sfp_modname = (char *)(uintptr_t)filprop32.sfp_modname;
1724 1794 filprop.sfp_autoattach = filprop32.sfp_autoattach;
1725 1795 filprop.sfp_hint = filprop32.sfp_hint;
1726 1796 filprop.sfp_hintarg = (char *)(uintptr_t)filprop32.sfp_hintarg;
1727 1797 filprop.sfp_socktuple_cnt = filprop32.sfp_socktuple_cnt;
1728 1798 filprop.sfp_socktuple =
1729 1799 (sof_socktuple_t *)(uintptr_t)filprop32.sfp_socktuple;
1730 1800 }
1731 1801 #endif /* _SYSCALL32_IMPL */
1732 1802
1733 1803 if ((error = copyinstr(filprop.sfp_modname, ent->sofe_modname,
1734 1804 sizeof (ent->sofe_modname), &len)) != 0) {
1735 1805 sof_entry_free(ent);
1736 1806 return (error);
1737 1807 }
1738 1808
1739 1809 /*
1740 1810 * A filter must specify at least one socket tuple.
1741 1811 */
1742 1812 if (filprop.sfp_socktuple_cnt == 0 ||
1743 1813 filprop.sfp_socktuple_cnt > SOF_MAXSOCKTUPLECNT) {
1744 1814 sof_entry_free(ent);
1745 1815 return (EINVAL);
1746 1816 }
1747 1817 ent->sofe_flags = filprop.sfp_autoattach ? SOFEF_AUTO : SOFEF_PROG;
1748 1818 ent->sofe_hint = filprop.sfp_hint;
1749 1819
1750 1820 /*
1751 1821 * Verify the hint, and copy in the hint argument, if necessary.
1752 1822 */
1753 1823 switch (ent->sofe_hint) {
1754 1824 case SOF_HINT_BEFORE:
1755 1825 case SOF_HINT_AFTER:
1756 1826 if ((error = copyinstr(filprop.sfp_hintarg, hintbuf,
1757 1827 sizeof (hintbuf), &len)) != 0) {
1758 1828 sof_entry_free(ent);
1759 1829 return (error);
1760 1830 }
1761 1831 ent->sofe_hintarg = kmem_alloc(len, KM_SLEEP);
1762 1832 bcopy(hintbuf, ent->sofe_hintarg, len);
1763 1833 /* FALLTHRU */
1764 1834 case SOF_HINT_TOP:
1765 1835 case SOF_HINT_BOTTOM:
1766 1836 /* hints cannot be used with programmatic filters */
1767 1837 if (ent->sofe_flags & SOFEF_PROG) {
1768 1838 sof_entry_free(ent);
1769 1839 return (EINVAL);
1770 1840 }
1771 1841 break;
1772 1842 case SOF_HINT_NONE:
1773 1843 break;
1774 1844 default:
1775 1845 /* bad hint value */
1776 1846 sof_entry_free(ent);
1777 1847 return (EINVAL);
1778 1848 }
1779 1849
1780 1850 ent->sofe_socktuple_cnt = filprop.sfp_socktuple_cnt;
1781 1851 tuplesz = sizeof (sof_socktuple_t) * ent->sofe_socktuple_cnt;
1782 1852 ent->sofe_socktuple = kmem_alloc(tuplesz, KM_SLEEP);
1783 1853
1784 1854 if (get_udatamodel() == DATAMODEL_NATIVE) {
1785 1855 if (copyin(filprop.sfp_socktuple, ent->sofe_socktuple,
1786 1856 tuplesz)) {
1787 1857 sof_entry_free(ent);
1788 1858 return (EFAULT);
1789 1859 }
1790 1860 }
1791 1861 #ifdef _SYSCALL32_IMPL
1792 1862 else {
1793 1863 int i;
1794 1864 caddr_t data = (caddr_t)filprop.sfp_socktuple;
1795 1865 sof_socktuple_t *tup = ent->sofe_socktuple;
1796 1866 sof_socktuple32_t tup32;
1797 1867
1799 1869 		for (i = 0; i < ent->sofe_socktuple_cnt; i++, tup++) {
1800 1870 			ASSERT(tup < ent->sofe_socktuple + ent->sofe_socktuple_cnt);
1801 1871
1802 1872 if (copyin(data, &tup32, sizeof (tup32)) != 0) {
1803 1873 sof_entry_free(ent);
1804 1874 return (EFAULT);
1805 1875 }
1806 1876 tup->sofst_family = tup32.sofst_family;
1807 1877 tup->sofst_type = tup32.sofst_type;
1808 1878 tup->sofst_protocol = tup32.sofst_protocol;
1809 1879
1810 1880 data += sizeof (tup32);
1811 1881 }
1812 1882 }
1813 1883 #endif /* _SYSCALL32_IMPL */
1814 1884
1815 1885 /* Sockets can start using the filter as soon as the filter is added */
1816 1886 if ((error = sof_entry_add(ent)) != 0)
1817 1887 sof_entry_free(ent);
1818 1888
1819 1889 return (error);
1820 1890 }
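
For context, a filter registration arrives from user level as a sockconfig_filter_props structure shaped exactly the way the copyin logic above expects. A minimal sketch of what a caller might build follows; the filter module name, the tuple values, and the <sys/sockfilter.h> header location are assumptions for illustration, not taken from this change:

	#include <sys/types.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <sys/sockfilter.h>	/* assumed home of sockconfig_filter_props */

	/* Match TCP over IPv4 and IPv6; field names follow the copyin above. */
	static sof_socktuple_t mytuples[] = {
		{ .sofst_family = AF_INET,  .sofst_type = SOCK_STREAM,
		    .sofst_protocol = IPPROTO_TCP },
		{ .sofst_family = AF_INET6, .sofst_type = SOCK_STREAM,
		    .sofst_protocol = IPPROTO_TCP },
	};

	static struct sockconfig_filter_props myprops = {
		.sfp_modname = "myfiltmod",	/* hypothetical filter module */
		.sfp_autoattach = B_TRUE,	/* SOFEF_AUTO rather than SOFEF_PROG */
		.sfp_hint = SOF_HINT_NONE,	/* so no hint argument is needed */
		.sfp_hintarg = NULL,
		.sfp_socktuple_cnt = 2,
		.sfp_socktuple = mytuples,
	};

The address of such a structure is what sockconfig_add_filter() receives as ufilpropp, via the SOCKCONFIG_ADD_FILTER subcommand of sockconfig() below.
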
1821 1891
1822 1892 /*
1823 1893 * Socket configuration system call. It is used to add and remove
1824 1894 * socket types.
1825 1895 */
1826 1896 int
1827 1897 sockconfig(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
1828 1898 {
1829 1899 int error = 0;
1830 1900
1831 1901 if (secpolicy_net_config(CRED(), B_FALSE) != 0)
1832 1902 return (set_errno(EPERM));
1833 1903
1834 1904 if (sockfs_defer_nl7c_init) {
1835 1905 nl7c_init();
1836 1906 sockfs_defer_nl7c_init = 0;
1837 1907 }
1838 1908
1839 1909 switch (cmd) {
1840 1910 case SOCKCONFIG_ADD_SOCK:
1841 1911 error = sockconf_add_sock((int)(uintptr_t)arg1,
1842 1912 (int)(uintptr_t)arg2, (int)(uintptr_t)arg3, arg4);
1843 1913 break;
1844 1914 case SOCKCONFIG_REMOVE_SOCK:
1845 1915 error = sockconf_remove_sock((int)(uintptr_t)arg1,
1846 1916 (int)(uintptr_t)arg2, (int)(uintptr_t)arg3);
1847 1917 break;
1848 1918 case SOCKCONFIG_ADD_FILTER:
1849 1919 error = sockconfig_add_filter((const char *)arg1, arg2);
1850 1920 break;
1851 1921 case SOCKCONFIG_REMOVE_FILTER:
1852 1922 error = sockconfig_remove_filter((const char *)arg1);
1853 1923 break;
1854 1924 case SOCKCONFIG_GET_SOCKTABLE:
1855 1925 error = sockparams_copyout_socktable((int)(uintptr_t)arg1);
1856 1926 break;
1857 1927 default:
1858 1928 #ifdef DEBUG
1859 1929 		cmn_err(CE_NOTE, "sockconfig: unknown subcommand %d", cmd);
1860 1930 #endif
1861 1931 error = EINVAL;
1862 1932 break;
1863 1933 }
1864 1934
1865 1935 if (error != 0) {
1866 1936 eprintline(error);
1867 1937 return (set_errno(error));
1868 1938 }
1869 1939 return (0);
1870 1940 }
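
These subcommands are driven from user level (historically by the soconfig(1M) utility) and, per the secpolicy_net_config() check above, require network configuration privilege. A rough sketch of a direct invocation; the SYS_sockconfig syscall number and the user-level visibility of the SOCKCONFIG_* constants are assumptions here:

	#include <sys/syscall.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <unistd.h>

	/*
	 * Install the <AF_INET, SOCK_STREAM, IPPROTO_TCP> entry backed by
	 * the "tcp" socket module, then remove it again, mirroring the
	 * SOCKCONFIG_ADD_SOCK/SOCKCONFIG_REMOVE_SOCK cases handled above.
	 */
	static int
	sockconfig_example(void)
	{
		if (syscall(SYS_sockconfig, SOCKCONFIG_ADD_SOCK,
		    AF_INET, SOCK_STREAM, IPPROTO_TCP, "tcp") != 0)
			return (-1);
		return (syscall(SYS_sockconfig, SOCKCONFIG_REMOVE_SOCK,
		    AF_INET, SOCK_STREAM, IPPROTO_TCP));
	}
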
1871 1941
1872 1942
1873 1943 /*
1874 1944  * Sendfile is implemented through two schemes: direct I/O, or
1875 1945  * caching in the filesystem page cache. We cache the input file by
1876 1946 * default and use direct I/O only if sendfile_max_size is set
1877 1947 * appropriately as explained below. Note that this logic is consistent
1878 1948 * with other filesystems where caching is turned on by default
1879 1949 * unless explicitly turned off by using the DIRECTIO ioctl.
1880 1950 *
1881 1951 * We choose a slightly different scheme here. One can turn off
1882 1952 * caching by setting sendfile_max_size to 0. One can also enable
1883 1953 * caching of files <= sendfile_max_size by setting sendfile_max_size
1884 1954  * to an appropriate value. By default, sendfile_max_size is set to the
1885 1955  * maximum value so that all files are cached. In the future, we may provide
1886 1956 * better interfaces for caching the file.
1887 1957 *
1888 1958 * Sendfile through Direct I/O (Zero copy)
1889 1959 * --------------------------------------
1890 1960 *
1891 1961 * As disks are normally slower than the network, we can't have a
1892 1962 * single thread that reads the disk and writes to the network. We
1893 1963 * need to have parallelism. This is done by having the sendfile
1894 1964 * thread create another thread that reads from the filesystem
1895 1965 * and queues it for network processing. In this scheme, the data
1896 1966  * is never copied anywhere, i.e. it is zero copy, unlike the other
1897 1967 * scheme.
1898 1968 *
1899 1969 * We have a sendfile queue (snfq) where each sendfile
1900 1970 * request (snf_req_t) is queued for processing by a thread. Number
1901 1971  * request (snf_req_t) is queued for processing by a thread. The
1902 1972  * number of threads is dynamic, and a thread exits if it idles
1903 1973  * beyond a specified amount of time. When each request (snf_req_t) is
1904 1974 * be consumed by the sendfile thread. snf_deque and snf_enque are
1905 1975 * used for consuming and producing mblks. Size of the filesystem
1906 1976 * read is determined by the tunable (sendfile_read_size). A single
1907 1977 * mblk holds sendfile_read_size worth of data (except the last
1908 1978  * read of the file), which is sent down as a whole to the network.
1909 1979 * sendfile_read_size is set to 1 MB as this seems to be the optimal
1910 1980 * value for the UFS filesystem backed by a striped storage array.
1911 1981 *
1912 1982  * Synchronization between read (producer) and write (consumer) threads.
1913 1983 * --------------------------------------------------------------------
1914 1984 *
1915 1985 * sr_lock protects sr_ib_head and sr_ib_tail. The lock is held while
1916 1986  * adding and deleting items in this list. Errors can happen at any
1917 1987  * time during read or write. There could be unprocessed mblks in the
1918 1988  * sr_ib_XXX list when a read or write error occurs. Whenever an error
1919 1989  * is encountered, we need two things to happen:
1920 1990 *
1921 1991  * a) One of the threads needs to clean the mblks.
1922 1992 * b) When one thread encounters an error, the other should stop.
1923 1993 *
1924 1994 * For (a), we don't want to penalize the reader thread as it could do
1925 1995 * some useful work processing other requests. For (b), the error can
1926 1996 * be detected by examining sr_read_error or sr_write_error.
1927 1997 * sr_lock protects sr_read_error and sr_write_error. If both reader and
1928 1998  * writer encounter errors, we need to report the write error back to
1929 1999  * the application as that's what would have happened if the operations
1930 2000  * were done sequentially. With this in mind, the following should work:
1931 2001 *
1932 2002 * - Check for errors before read or write.
1933 2003  * - If the reader encounters an error, set the error in sr_read_error.
1934 2004  *   Check sr_write_error; if it is set, send cv_signal as it is
1935 2005  *   waiting for the reader to complete. If it is not set, the writer
1936 2006  *   is either running, sinking data to the network, or blocked
1937 2007 * because of flow control. For handling the latter case, we
1938 2008 * always send a signal. In any case, it will examine sr_read_error
1939 2009 * and return. sr_read_error is marked with SR_READ_DONE to tell
1940 2010 * the writer that the reader is done in all the cases.
1941 2011  * - If the writer encounters an error, set the error in sr_write_error.
1942 2012  *   The reader thread is either blocked because of flow control or
1943 2013  *   running, reading data from the disk. For the former, we need to
1944 2014  *   wake up the thread. Again, to keep it simple, we always wake up
1945 2015 * the reader thread. Then, wait for the read thread to complete
1946 2016 * if it is not done yet. Cleanup and return.
1947 2017 *
1948 2018 * High and low water marks for the read thread.
1949 2019 * --------------------------------------------
1950 2020 *
1951 2021 * If sendfile() is used to send data over a slow network, we need to
1952 2022 * make sure that the read thread does not produce data at a faster
1953 2023 * rate than the network. This can happen if the disk is faster than
1954 2024 * the network. In such a case, we don't want to build a very large queue.
1955 2025 * But we would still like to get all of the network throughput possible.
1956 2026  * This implies that the network should never block waiting for data.
1957 2027  * As there are a lot of disk throughput/network throughput combinations
1958 2028  * possible, it is difficult to come up with an accurate number.
1959 2029  * A typical 10K RPM disk has a max seek latency of 17ms and a rotational
1960 2030  * latency of 3ms for reading a disk block. Thus, the total latency to
1961 2031  * initiate a new read, transfer data from the disk and queue it for
1962 2032  * transmission would take a max of about 25ms. Today's max transfer rate
1963 2033  * for a network is 100MB/sec. If the thread is blocked because of flow
1964 2034  * control, it would take 25ms to get new data ready for transmission.
1965 2035  * We have to make sure that the network is not idling while we are
1966 2036  * initiating new transfers. So, at 100MB/sec, to keep the network busy we
1967 2037  * would need 2.5MB of data. Rounding up, we keep the low water mark at 3MB.
1968 2038 * We need to pick a high water mark so that the woken up thread would
1969 2039  * do considerable work before blocking again, to prevent thrashing.
1970 2040  * Currently, we pick this to be 10 times the low water mark (see the sketch below).
1971 2041 *
1972 2042 * Sendfile with segmap caching (One copy from page cache to mblks).
1973 2043 * ----------------------------------------------------------------
1974 2044 *
1975 2045 * We use the segmap cache for caching the file, if the size of file
1976 2046  * is <= sendfile_max_size. In this case we don't use threads, as VM
1977 2047  * is fast enough to keep up with the network. If the underlying
1978 2048 * transport allows, we call segmap_getmapflt() to map MAXBSIZE (8K) worth
1979 2049 * of data into segmap space, and use the virtual address from segmap
1980 2050 * directly through desballoc() to avoid copy. Once the transport is done
1981 2051 * with the data, the mapping will be released through segmap_release()
1982 2052 * called by the call-back routine.
1983 2053 *
1984 2054 * If zero-copy is not allowed by the transport, we simply call VOP_READ()
1985 2055 * to copy the data from the filesystem into our temporary network buffer.
1986 2056 *
1987 2057 * To disable caching, set sendfile_max_size to 0.
1988 2058 */
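
The low water figure above is just a bandwidth-delay product; a small sketch of the same arithmetic, using the latency and bandwidth assumptions stated in the comment (the macro names here are illustrative, not part of the source):

	/* ~25ms worst case to produce fresh data (seek + rotation + transfer) */
	#define	SNF_PRODUCE_LATENCY_MS	25
	/* assumed network drain rate of 100MB/sec */
	#define	SNF_DRAIN_BYTES_PER_SEC	(100 * 1024 * 1024)

	/*
	 * Bytes needed to keep the network busy while one more read completes:
	 * (100MB/sec / 1000) * 25ms ~= 2.5MB, rounded up to the 3MB
	 * SENDFILE_REQ_LOWAT below; the high water mark is 10x that.
	 */
	#define	SNF_CALC_LOWAT	\
		((SNF_DRAIN_BYTES_PER_SEC / 1000) * SNF_PRODUCE_LATENCY_MS)
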
1989 2059
1990 2060 uint_t sendfile_read_size = 1024 * 1024;
1991 2061 #define	SENDFILE_REQ_LOWAT	(3 * 1024 * 1024)
1992 2062 uint_t sendfile_req_lowat = SENDFILE_REQ_LOWAT;
1993 2063 uint_t sendfile_req_hiwat = 10 * SENDFILE_REQ_LOWAT;
1994 2064 struct sendfile_stats sf_stats;
1995 2065 struct sendfile_queue *snfq;
1996 2066 clock_t snfq_timeout;
1997 2067 off64_t sendfile_max_size;
1998 2068
1999 2069 static void snf_enque(snf_req_t *, mblk_t *);
2000 2070 static mblk_t *snf_deque(snf_req_t *);
2001 2071
2002 2072 void
2003 2073 sendfile_init(void)
2004 2074 {
2005 2075 snfq = kmem_zalloc(sizeof (struct sendfile_queue), KM_SLEEP);
2006 2076
2007 2077 mutex_init(&snfq->snfq_lock, NULL, MUTEX_DEFAULT, NULL);
2008 2078 cv_init(&snfq->snfq_cv, NULL, CV_DEFAULT, NULL);
2009 2079 snfq->snfq_max_threads = max_ncpus;
2010 2080 snfq_timeout = SNFQ_TIMEOUT;
2011 2081 /* Cache all files by default. */
2012 2082 sendfile_max_size = MAXOFFSET_T;
2013 2083 }
2014 2084
2015 2085 /*
2016 2086  * Queues an mblk_t for network processing.
2017 2087 */
2018 2088 static void
2019 2089 snf_enque(snf_req_t *sr, mblk_t *mp)
2020 2090 {
2021 2091 mp->b_next = NULL;
2022 2092 mutex_enter(&sr->sr_lock);
2023 2093 if (sr->sr_mp_head == NULL) {
2024 2094 sr->sr_mp_head = sr->sr_mp_tail = mp;
2025 2095 cv_signal(&sr->sr_cv);
2026 2096 } else {
2027 2097 sr->sr_mp_tail->b_next = mp;
2028 2098 sr->sr_mp_tail = mp;
2029 2099 }
2030 2100 sr->sr_qlen += MBLKL(mp);
2031 2101 while ((sr->sr_qlen > sr->sr_hiwat) &&
2032 2102 (sr->sr_write_error == 0)) {
2033 2103 sf_stats.ss_full_waits++;
2034 2104 cv_wait(&sr->sr_cv, &sr->sr_lock);
2035 2105 }
2036 2106 mutex_exit(&sr->sr_lock);
2037 2107 }
2038 2108
2039 2109 /*
2040 2110  * Dequeues an mblk_t for network processing.
2041 2111 */
2042 2112 static mblk_t *
2043 2113 snf_deque(snf_req_t *sr)
2044 2114 {
2045 2115 mblk_t *mp;
2046 2116
2047 2117 mutex_enter(&sr->sr_lock);
2048 2118 /*
2049 2119 	 * If we have encountered an error on read, or the read is
2050 2120 	 * completed and there are no more mblks, return NULL.
2051 2121 	 * We also need to check for a NULL sr_mp_head, as
2052 2122 * the reads could have completed and there is
2053 2123 * nothing more to come.
2054 2124 */
2055 2125 if (((sr->sr_read_error & ~SR_READ_DONE) != 0) ||
2056 2126 ((sr->sr_read_error & SR_READ_DONE) &&
2057 2127 sr->sr_mp_head == NULL)) {
2058 2128 mutex_exit(&sr->sr_lock);
2059 2129 return (NULL);
2060 2130 }
2061 2131 /*
2062 2132 	 * To start with, neither SR_READ_DONE is marked nor
2063 2133 	 * the error is set. When we wake up from cv_wait,
2064 2134 	 * the following are the possibilities:
2065 2135 *
2066 2136 * a) sr_read_error is zero and mblks are queued.
2067 2137 * b) sr_read_error is set to SR_READ_DONE
2068 2138 * and mblks are queued.
2069 2139 * c) sr_read_error is set to SR_READ_DONE
2070 2140 * and no mblks.
2071 2141 * d) sr_read_error is set to some error other
2072 2142 * than SR_READ_DONE.
2073 2143 */
2074 2144
2075 2145 while ((sr->sr_read_error == 0) && (sr->sr_mp_head == NULL)) {
2076 2146 sf_stats.ss_empty_waits++;
2077 2147 cv_wait(&sr->sr_cv, &sr->sr_lock);
2078 2148 }
2079 2149 /* Handle (a) and (b) first - the normal case. */
2080 2150 if (((sr->sr_read_error & ~SR_READ_DONE) == 0) &&
2081 2151 (sr->sr_mp_head != NULL)) {
2082 2152 mp = sr->sr_mp_head;
2083 2153 sr->sr_mp_head = mp->b_next;
2084 2154 sr->sr_qlen -= MBLKL(mp);
2085 2155 if (sr->sr_qlen < sr->sr_lowat)
2086 2156 cv_signal(&sr->sr_cv);
2087 2157 mutex_exit(&sr->sr_lock);
2088 2158 mp->b_next = NULL;
2089 2159 return (mp);
2090 2160 }
2091 2161 /* Handle (c) and (d). */
2092 2162 mutex_exit(&sr->sr_lock);
2093 2163 return (NULL);
2094 2164 }
2095 2165
2096 2166 /*
2097 2167 * Reads data from the filesystem and queues it for network processing.
2098 2168 */
2099 2169 void
2100 2170 snf_async_read(snf_req_t *sr)
2101 2171 {
2102 2172 size_t iosize;
2103 2173 u_offset_t fileoff;
2104 2174 u_offset_t size;
2105 2175 int ret_size;
2106 2176 int error;
2107 2177 file_t *fp;
2108 2178 mblk_t *mp;
2109 2179 struct vnode *vp;
2110 2180 int extra = 0;
2111 2181 int maxblk = 0;
2112 2182 int wroff = 0;
2113 2183 struct sonode *so;
2114 2184
2115 2185 fp = sr->sr_fp;
2116 2186 size = sr->sr_file_size;
2117 2187 fileoff = sr->sr_file_off;
2118 2188
2119 2189 /*
2120 2190 	 * Ignore the error for filesystems that don't support DIRECTIO.
2121 2191 */
2122 2192 (void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_ON, 0,
2123 2193 kcred, NULL, NULL);
2124 2194
2125 2195 vp = sr->sr_vp;
2126 2196 if (vp->v_type == VSOCK) {
2127 2197 stdata_t *stp;
2128 2198
2129 2199 /*
2130 2200 * Get the extra space to insert a header and a trailer.
2131 2201 */
2132 2202 so = VTOSO(vp);
2133 2203 stp = vp->v_stream;
2134 2204 if (stp == NULL) {
2135 2205 wroff = so->so_proto_props.sopp_wroff;
2136 2206 maxblk = so->so_proto_props.sopp_maxblk;
2137 2207 extra = wroff + so->so_proto_props.sopp_tail;
2138 2208 } else {
2139 2209 wroff = (int)(stp->sd_wroff);
2140 2210 maxblk = (int)(stp->sd_maxblk);
2141 2211 extra = wroff + (int)(stp->sd_tail);
2142 2212 }
2143 2213 }
2144 2214
2145 2215 while ((size != 0) && (sr->sr_write_error == 0)) {
2146 2216
2147 2217 iosize = (int)MIN(sr->sr_maxpsz, size);
2148 2218
2149 2219 /*
2150 2220 * Socket filters can limit the mblk size,
2151 2221 * so limit reads to maxblk if there are
2152 2222 * filters present.
2153 2223 */
2154 2224 if (vp->v_type == VSOCK &&
2155 2225 so->so_filter_active > 0 && maxblk != INFPSZ)
2156 2226 iosize = (int)MIN(iosize, maxblk);
2157 2227
2158 2228 if (is_system_labeled()) {
2159 2229 mp = allocb_cred(iosize + extra, CRED(),
2160 2230 curproc->p_pid);
2161 2231 } else {
2162 2232 mp = allocb(iosize + extra, BPRI_MED);
2163 2233 }
2164 2234 if (mp == NULL) {
2165 2235 error = EAGAIN;
2166 2236 break;
2167 2237 }
2168 2238
2169 2239 mp->b_rptr += wroff;
2170 2240
2171 2241 ret_size = soreadfile(fp, mp->b_rptr, fileoff, &error, iosize);
2172 2242
2173 2243 		/* Error or reached EOF? */
2174 2244 if ((error != 0) || (ret_size == 0)) {
2175 2245 freeb(mp);
2176 2246 break;
2177 2247 }
2178 2248 mp->b_wptr = mp->b_rptr + ret_size;
2179 2249
2180 2250 snf_enque(sr, mp);
2181 2251 size -= ret_size;
2182 2252 fileoff += ret_size;
2183 2253 }
2184 2254 (void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_OFF, 0,
2185 2255 kcred, NULL, NULL);
2186 2256 mutex_enter(&sr->sr_lock);
2187 2257 sr->sr_read_error = error;
2188 2258 sr->sr_read_error |= SR_READ_DONE;
2189 2259 cv_signal(&sr->sr_cv);
2190 2260 mutex_exit(&sr->sr_lock);
2191 2261 }
2192 2262
2193 2263 void
2194 2264 snf_async_thread(void)
2195 2265 {
2196 2266 snf_req_t *sr;
2197 2267 callb_cpr_t cprinfo;
2198 2268 clock_t time_left = 1;
2199 2269
2200 2270 CALLB_CPR_INIT(&cprinfo, &snfq->snfq_lock, callb_generic_cpr, "snfq");
2201 2271
2202 2272 mutex_enter(&snfq->snfq_lock);
2203 2273 for (;;) {
2204 2274 /*
2205 2275 		 * If we didn't find an entry, then block until woken up
2206 2276 * again and then look through the queues again.
2207 2277 */
2208 2278 while ((sr = snfq->snfq_req_head) == NULL) {
2209 2279 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2210 2280 if (time_left <= 0) {
2211 2281 snfq->snfq_svc_threads--;
2212 2282 CALLB_CPR_EXIT(&cprinfo);
2213 2283 thread_exit();
2214 2284 /* NOTREACHED */
2215 2285 }
2216 2286 snfq->snfq_idle_cnt++;
2217 2287
2218 2288 time_left = cv_reltimedwait(&snfq->snfq_cv,
2219 2289 &snfq->snfq_lock, snfq_timeout, TR_CLOCK_TICK);
2220 2290 snfq->snfq_idle_cnt--;
2221 2291
2222 2292 CALLB_CPR_SAFE_END(&cprinfo, &snfq->snfq_lock);
2223 2293 }
2224 2294 snfq->snfq_req_head = sr->sr_next;
2225 2295 snfq->snfq_req_cnt--;
2226 2296 mutex_exit(&snfq->snfq_lock);
2227 2297 snf_async_read(sr);
2228 2298 mutex_enter(&snfq->snfq_lock);
2229 2299 }
2230 2300 }
2231 2301
2232 2302
2233 2303 snf_req_t *
2234 2304 create_thread(int operation, struct vnode *vp, file_t *fp,
2235 2305 u_offset_t fileoff, u_offset_t size)
2236 2306 {
2237 2307 snf_req_t *sr;
2238 2308 stdata_t *stp;
2239 2309
2240 2310 sr = (snf_req_t *)kmem_zalloc(sizeof (snf_req_t), KM_SLEEP);
2241 2311
2242 2312 sr->sr_vp = vp;
2243 2313 sr->sr_fp = fp;
2244 2314 stp = vp->v_stream;
2245 2315
2246 2316 /*
2247 2317 	 * Store sd_qn_maxpsz into sr_maxpsz while we have the stream head;
2248 2318 	 * the stream might be closed before the thread returns from snf_async_read.
2249 2319 */
2250 2320 if (stp != NULL && stp->sd_qn_maxpsz > 0) {
2251 2321 sr->sr_maxpsz = MIN(MAXBSIZE, stp->sd_qn_maxpsz);
2252 2322 } else {
2253 2323 sr->sr_maxpsz = MAXBSIZE;
2254 2324 }
2255 2325
2256 2326 sr->sr_operation = operation;
2257 2327 sr->sr_file_off = fileoff;
2258 2328 sr->sr_file_size = size;
2259 2329 sr->sr_hiwat = sendfile_req_hiwat;
2260 2330 sr->sr_lowat = sendfile_req_lowat;
2261 2331 mutex_init(&sr->sr_lock, NULL, MUTEX_DEFAULT, NULL);
2262 2332 cv_init(&sr->sr_cv, NULL, CV_DEFAULT, NULL);
2263 2333 /*
2264 2334 * See whether we need another thread for servicing this
2265 2335 	 * request. If there are already enough requests queued
2266 2336 	 * for the threads, create another one, as long as we do
2267 2337 	 * not exceed snfq_max_threads.
2268 2338 */
2269 2339 mutex_enter(&snfq->snfq_lock);
2270 2340 if (snfq->snfq_req_cnt >= snfq->snfq_idle_cnt &&
2271 2341 snfq->snfq_svc_threads < snfq->snfq_max_threads) {
2272 2342 (void) thread_create(NULL, 0, &snf_async_thread, 0, 0, &p0,
2273 2343 TS_RUN, minclsyspri);
2274 2344 snfq->snfq_svc_threads++;
2275 2345 }
2276 2346 if (snfq->snfq_req_head == NULL) {
2277 2347 snfq->snfq_req_head = snfq->snfq_req_tail = sr;
2278 2348 cv_signal(&snfq->snfq_cv);
2279 2349 } else {
2280 2350 snfq->snfq_req_tail->sr_next = sr;
2281 2351 snfq->snfq_req_tail = sr;
2282 2352 }
2283 2353 snfq->snfq_req_cnt++;
2284 2354 mutex_exit(&snfq->snfq_lock);
2285 2355 return (sr);
2286 2356 }
2287 2357
2288 2358 int
2289 2359 snf_direct_io(file_t *fp, file_t *rfp, u_offset_t fileoff, u_offset_t size,
2290 2360 ssize_t *count)
2291 2361 {
2292 2362 snf_req_t *sr;
2293 2363 mblk_t *mp;
2294 2364 int iosize;
2295 2365 int error = 0;
2296 2366 short fflag;
2297 2367 struct vnode *vp;
2298 2368 int ksize;
2299 2369 struct nmsghdr msg;
2300 2370
2301 2371 ksize = 0;
2302 2372 *count = 0;
2303 2373 bzero(&msg, sizeof (msg));
2304 2374
2305 2375 vp = fp->f_vnode;
2306 2376 fflag = fp->f_flag;
2307 2377 if ((sr = create_thread(READ_OP, vp, rfp, fileoff, size)) == NULL)
2308 2378 return (EAGAIN);
2309 2379
2310 2380 /*
2311 2381 	 * We check for read errors in snf_deque(). It has to check
2312 2382 	 * for a successful READ_DONE and return NULL anyway, so we
2313 2383 	 * might as well make the additional check there.
2314 2384 */
2315 2385 while ((mp = snf_deque(sr)) != NULL) {
2316 2386
2317 2387 if (ISSIG(curthread, JUSTLOOKING)) {
2318 2388 freeb(mp);
2319 2389 error = EINTR;
2320 2390 break;
2321 2391 }
2322 2392 iosize = MBLKL(mp);
2323 2393
2324 2394 error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
2325 2395
2326 2396 if (error != 0) {
2327 2397 if (mp != NULL)
2328 2398 freeb(mp);
2329 2399 break;
2330 2400 }
2331 2401 ksize += iosize;
2332 2402 }
2333 2403 *count = ksize;
2334 2404
2335 2405 mutex_enter(&sr->sr_lock);
2336 2406 sr->sr_write_error = error;
2337 2407 	/* See the big comment above on why we cv_signal here. */
2338 2408 cv_signal(&sr->sr_cv);
2339 2409
2340 2410 	/* Always wait for the reader to complete. */
2341 2411 while (!(sr->sr_read_error & SR_READ_DONE)) {
2342 2412 cv_wait(&sr->sr_cv, &sr->sr_lock);
2343 2413 }
2344 2414 /* If there is no write error, check for read error. */
2345 2415 if (error == 0)
2346 2416 error = (sr->sr_read_error & ~SR_READ_DONE);
2347 2417
2348 2418 if (error != 0) {
2349 2419 mblk_t *next_mp;
2350 2420
2351 2421 mp = sr->sr_mp_head;
2352 2422 while (mp != NULL) {
2353 2423 next_mp = mp->b_next;
2354 2424 mp->b_next = NULL;
2355 2425 freeb(mp);
2356 2426 mp = next_mp;
2357 2427 }
2358 2428 }
2359 2429 mutex_exit(&sr->sr_lock);
2360 2430 kmem_free(sr, sizeof (snf_req_t));
2361 2431 return (error);
2362 2432 }
2363 2433
2364 2434 /* Maximum no. of pages allocated by vpm for sendfile at a time */
2365 2435 #define SNF_VPMMAXPGS (VPMMAXPGS/2)
2366 2436
2367 2437 /*
2368 2438  * Maximum no. of elements in the list returned by vpm, including
2369 2439 * NULL for the last entry
2370 2440 */
2371 2441 #define SNF_MAXVMAPS (SNF_VPMMAXPGS + 1)
2372 2442
2373 2443 typedef struct {
2374 2444 unsigned int snfv_ref;
2375 2445 frtn_t snfv_frtn;
2376 2446 vnode_t *snfv_vp;
2377 2447 struct vmap snfv_vml[SNF_MAXVMAPS];
2378 2448 } snf_vmap_desbinfo;
2379 2449
2380 2450 typedef struct {
2381 2451 frtn_t snfi_frtn;
2382 2452 caddr_t snfi_base;
2383 2453 uint_t snfi_mapoff;
2384 2454 size_t snfi_len;
2385 2455 vnode_t *snfi_vp;
2386 2456 } snf_smap_desbinfo;
2387 2457
2388 2458 /*
2389 2459  * The callback function used for vpm-mapped mblks; called when the last ref
2390 2460  * of the mblk is dropped, which normally occurs when TCP receives the ack.
2391 2461  * But it can also be the driver, due to lazy reclaim.
2392 2462 */
2393 2463 void
2394 2464 snf_vmap_desbfree(snf_vmap_desbinfo *snfv)
2395 2465 {
2396 2466 ASSERT(snfv->snfv_ref != 0);
2397 2467 if (atomic_dec_32_nv(&snfv->snfv_ref) == 0) {
2398 2468 vpm_unmap_pages(snfv->snfv_vml, S_READ);
2399 2469 VN_RELE(snfv->snfv_vp);
2400 2470 kmem_free(snfv, sizeof (snf_vmap_desbinfo));
2401 2471 }
2402 2472 }
2403 2473
2404 2474 /*
2405 2475  * The callback function used for segmap'ped mblks; called when the last ref
2406 2476  * of the mblk is dropped, which normally occurs when TCP receives the ack.
2407 2477  * But it can also be the driver, due to lazy reclaim.
2408 2478 */
2409 2479 void
2410 2480 snf_smap_desbfree(snf_smap_desbinfo *snfi)
2411 2481 {
2412 2482 if (! IS_KPM_ADDR(snfi->snfi_base)) {
2413 2483 /*
2414 2484 * We don't need to call segmap_fault(F_SOFTUNLOCK) for
2415 2485 * segmap_kpm as long as the latter never falls back to
2416 2486 * "use_segmap_range". (See segmap_getmapflt().)
2417 2487 *
2418 2488 		 * Using S_OTHER saves a redundant hat_setref() in
2419 2489 		 * segmap_unlock().
2420 2490 */
2421 2491 (void) segmap_fault(kas.a_hat, segkmap,
2422 2492 (caddr_t)(uintptr_t)(((uintptr_t)snfi->snfi_base +
2423 2493 snfi->snfi_mapoff) & PAGEMASK), snfi->snfi_len,
2424 2494 F_SOFTUNLOCK, S_OTHER);
2425 2495 }
2426 2496 (void) segmap_release(segkmap, snfi->snfi_base, SM_DONTNEED);
2427 2497 VN_RELE(snfi->snfi_vp);
2428 2498 kmem_free(snfi, sizeof (*snfi));
2429 2499 }
2430 2500
2431 2501 /*
2432 2502  * Use segmap or vpm instead of bcopy to send down a desballoca'ed mblk.
2433 2503 * When segmap is used, the mblk contains a segmap slot of no more
2434 2504 * than MAXBSIZE.
2435 2505 *
2436 2506 * With vpm, a maximum of SNF_MAXVMAPS page-sized mappings can be obtained
2437 2507 * in each iteration and sent by socket_sendmblk until an error occurs or
2438 2508 * the requested size has been transferred. An mblk is esballoca'ed from
2439 2509  * each mapped page, and a chain of these mblks is sent to the transport layer.
2440 2510 * vpm will be called to unmap the pages when all mblks have been freed by
2441 2511 * free_func.
2442 2512 *
2443 2513 * At the end of the whole sendfile() operation, we wait till the data from
2444 2514 * the last mblk is ack'ed by the transport before returning so that the
2445 2515 * caller of sendfile() can safely modify the file content.
2446 2516 *
2447 2517 * The caller of this function should make sure that total_size does not exceed
2448 2518 * the actual file size of fvp.
2449 2519 */
2450 2520 int
2451 2521 snf_segmap(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t total_size,
2452 2522 ssize_t *count, boolean_t nowait)
2453 2523 {
2454 2524 caddr_t base;
2455 2525 int mapoff;
2456 2526 vnode_t *vp;
2457 2527 mblk_t *mp = NULL;
2458 2528 int chain_size;
2459 2529 int error;
2460 2530 clock_t deadlk_wait;
2461 2531 short fflag;
2462 2532 int ksize;
2463 2533 struct vattr va;
2464 2534 boolean_t dowait = B_FALSE;
2465 2535 struct nmsghdr msg;
2466 2536
2467 2537 vp = fp->f_vnode;
2468 2538 fflag = fp->f_flag;
2469 2539 ksize = 0;
2470 2540 bzero(&msg, sizeof (msg));
2471 2541
2472 2542 for (;;) {
2473 2543 if (ISSIG(curthread, JUSTLOOKING)) {
2474 2544 error = EINTR;
2475 2545 break;
2476 2546 }
2477 2547
2478 2548 if (vpm_enable) {
2479 2549 snf_vmap_desbinfo *snfv;
2480 2550 mblk_t *nmp;
2481 2551 int mblk_size;
2482 2552 int maxsize;
2483 2553 int i;
2484 2554
2485 2555 mapoff = fileoff & PAGEOFFSET;
2486 2556 maxsize = MIN((SNF_VPMMAXPGS * PAGESIZE), total_size);
2487 2557
2488 2558 snfv = kmem_zalloc(sizeof (snf_vmap_desbinfo),
2489 2559 KM_SLEEP);
2490 2560
2491 2561 /*
2492 2562 * Get vpm mappings for maxsize with read access.
2493 2563 * If the pages aren't available yet, we get
2494 2564 			 * EDEADLK, so wait and try again a little later using
2495 2565 * an increasing wait. We might be here a long time.
2496 2566 *
2497 2567 * If delay_sig returns EINTR, be sure to exit and
2498 2568 * pass it up to the caller.
2499 2569 */
2500 2570 deadlk_wait = 0;
2501 2571 while ((error = vpm_map_pages(fvp, fileoff,
2502 2572 (size_t)maxsize, (VPM_FETCHPAGE), snfv->snfv_vml,
2503 2573 SNF_MAXVMAPS, NULL, S_READ)) == EDEADLK) {
2504 2574 deadlk_wait += (deadlk_wait < 5) ? 1 : 4;
2505 2575 if ((error = delay_sig(deadlk_wait)) != 0) {
2506 2576 break;
2507 2577 }
2508 2578 }
2509 2579 if (error != 0) {
2510 2580 kmem_free(snfv, sizeof (snf_vmap_desbinfo));
2511 2581 error = (error == EINTR) ? EINTR : EIO;
2512 2582 goto out;
2513 2583 }
2514 2584 snfv->snfv_frtn.free_func = snf_vmap_desbfree;
2515 2585 snfv->snfv_frtn.free_arg = (caddr_t)snfv;
2516 2586
2517 2587 /* Construct the mblk chain from the page mappings */
2518 2588 chain_size = 0;
2519 2589 for (i = 0; (snfv->snfv_vml[i].vs_addr != NULL) &&
2520 2590 total_size > 0; i++) {
2521 2591 ASSERT(chain_size < maxsize);
2522 2592 mblk_size = MIN(snfv->snfv_vml[i].vs_len -
2523 2593 mapoff, total_size);
2524 2594 nmp = esballoca(
2525 2595 (uchar_t *)snfv->snfv_vml[i].vs_addr +
2526 2596 mapoff, mblk_size, BPRI_HI,
2527 2597 &snfv->snfv_frtn);
2528 2598
2529 2599 /*
2530 2600 * We return EAGAIN after unmapping the pages
2531 2601 				 * if we cannot allocate the head of the
2532 2602 * chain. Otherwise, we continue sending the
2533 2603 * mblks constructed so far.
2534 2604 */
2535 2605 if (nmp == NULL) {
2536 2606 if (i == 0) {
2537 2607 vpm_unmap_pages(snfv->snfv_vml,
2538 2608 S_READ);
2539 2609 kmem_free(snfv,
2540 2610 sizeof (snf_vmap_desbinfo));
2541 2611 error = EAGAIN;
2542 2612 goto out;
2543 2613 }
2544 2614 break;
2545 2615 }
2546 2616 /* Mark this dblk with the zero-copy flag */
2547 2617 nmp->b_datap->db_struioflag |= STRUIO_ZC;
2548 2618 nmp->b_wptr += mblk_size;
2549 2619 chain_size += mblk_size;
2550 2620 fileoff += mblk_size;
2551 2621 total_size -= mblk_size;
2552 2622 snfv->snfv_ref++;
2553 2623 mapoff = 0;
2554 2624 if (i > 0)
2555 2625 linkb(mp, nmp);
2556 2626 else
2557 2627 mp = nmp;
2558 2628 }
2559 2629 VN_HOLD(fvp);
2560 2630 snfv->snfv_vp = fvp;
2561 2631 } else {
2562 2632 			/* vpm not supported; fall back to segmap */
2563 2633 snf_smap_desbinfo *snfi;
2564 2634
2565 2635 mapoff = fileoff & MAXBOFFSET;
2566 2636 chain_size = MAXBSIZE - mapoff;
2567 2637 if (chain_size > total_size)
2568 2638 chain_size = total_size;
2569 2639 /*
2570 2640 * we don't forcefault because we'll call
2571 2641 * segmap_fault(F_SOFTLOCK) next.
2572 2642 *
2573 2643 * S_READ will get the ref bit set (by either
2574 2644 * segmap_getmapflt() or segmap_fault()) and page
2575 2645 * shared locked.
2576 2646 */
2577 2647 base = segmap_getmapflt(segkmap, fvp, fileoff,
2578 2648 chain_size, segmap_kpm ? SM_FAULT : 0, S_READ);
2579 2649
2580 2650 snfi = kmem_alloc(sizeof (*snfi), KM_SLEEP);
2581 2651 			snfi->snfi_len = (size_t)roundup(mapoff + chain_size,
2582 2652 			    PAGESIZE) - (mapoff & PAGEMASK);
2583 2653 /*
2584 2654 * We must call segmap_fault() even for segmap_kpm
2585 2655 			 * because that's how errors get returned.
2586 2656 * (segmap_getmapflt() never fails but segmap_fault()
2587 2657 * does.)
2588 2658 *
2589 2659 * If the pages aren't available yet, we get
2590 2660 			 * EDEADLK, so wait and try again a little later using
2591 2661 * an increasing wait. We might be here a long time.
2592 2662 *
2593 2663 * If delay_sig returns EINTR, be sure to exit and
2594 2664 * pass it up to the caller.
2595 2665 */
2596 2666 deadlk_wait = 0;
2597 2667 while ((error = FC_ERRNO(segmap_fault(kas.a_hat,
2598 2668 segkmap, (caddr_t)(uintptr_t)(((uintptr_t)base +
2599 2669 mapoff) & PAGEMASK), snfi->snfi_len, F_SOFTLOCK,
2600 2670 S_READ))) == EDEADLK) {
2601 2671 deadlk_wait += (deadlk_wait < 5) ? 1 : 4;
2602 2672 if ((error = delay_sig(deadlk_wait)) != 0) {
2603 2673 break;
2604 2674 }
2605 2675 }
2606 2676 if (error != 0) {
2607 2677 (void) segmap_release(segkmap, base, 0);
2608 2678 kmem_free(snfi, sizeof (*snfi));
2609 2679 error = (error == EINTR) ? EINTR : EIO;
2610 2680 goto out;
2611 2681 }
2612 2682 snfi->snfi_frtn.free_func = snf_smap_desbfree;
2613 2683 snfi->snfi_frtn.free_arg = (caddr_t)snfi;
2614 2684 snfi->snfi_base = base;
2615 2685 snfi->snfi_mapoff = mapoff;
2616 2686 mp = esballoca((uchar_t *)base + mapoff, chain_size,
2617 2687 BPRI_HI, &snfi->snfi_frtn);
2618 2688
2619 2689 if (mp == NULL) {
2620 2690 (void) segmap_fault(kas.a_hat, segkmap,
2621 2691 (caddr_t)(uintptr_t)(((uintptr_t)base +
2622 2692 mapoff) & PAGEMASK), snfi->snfi_len,
2623 2693 F_SOFTUNLOCK, S_OTHER);
2624 2694 (void) segmap_release(segkmap, base, 0);
2625 2695 kmem_free(snfi, sizeof (*snfi));
2626 2696 freemsg(mp);
2627 2697 error = EAGAIN;
2628 2698 goto out;
2629 2699 }
2630 2700 VN_HOLD(fvp);
2631 2701 snfi->snfi_vp = fvp;
2632 2702 mp->b_wptr += chain_size;
2633 2703
2634 2704 /* Mark this dblk with the zero-copy flag */
2635 2705 mp->b_datap->db_struioflag |= STRUIO_ZC;
2636 2706 fileoff += chain_size;
2637 2707 total_size -= chain_size;
2638 2708 }
2639 2709
2640 2710 if (total_size == 0 && !nowait) {
2641 2711 ASSERT(!dowait);
2642 2712 dowait = B_TRUE;
2643 2713 mp->b_datap->db_struioflag |= STRUIO_ZCNOTIFY;
2644 2714 }
2645 2715 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2646 2716 error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
2647 2717 if (error != 0) {
2648 2718 /*
2649 2719 * mp contains the mblks that were not sent by
2650 2720 * socket_sendmblk. Use its size to update *count
2651 2721 */
2652 2722 *count = ksize + (chain_size - msgdsize(mp));
2653 2723 if (mp != NULL)
2654 2724 freemsg(mp);
2655 2725 return (error);
2656 2726 }
2657 2727 ksize += chain_size;
2658 2728 if (total_size == 0)
2659 2729 goto done;
2660 2730
2661 2731 (void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2662 2732 va.va_mask = AT_SIZE;
2663 2733 error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
2664 2734 if (error)
2665 2735 break;
2666 2736 /* Read as much as possible. */
2667 2737 if (fileoff >= va.va_size)
2668 2738 break;
2669 2739 if (total_size + fileoff > va.va_size)
2670 2740 total_size = va.va_size - fileoff;
2671 2741 }
2672 2742 out:
2673 2743 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2674 2744 done:
2675 2745 *count = ksize;
2676 2746 if (dowait) {
2677 2747 stdata_t *stp;
2678 2748
2679 2749 stp = vp->v_stream;
2680 2750 if (stp == NULL) {
2681 2751 struct sonode *so;
2682 2752 so = VTOSO(vp);
2683 2753 error = so_zcopy_wait(so);
2684 2754 } else {
2685 2755 mutex_enter(&stp->sd_lock);
2686 2756 while (!(stp->sd_flag & STZCNOTIFY)) {
2687 2757 if (cv_wait_sig(&stp->sd_zcopy_wait,
2688 2758 &stp->sd_lock) == 0) {
2689 2759 error = EINTR;
2690 2760 break;
2691 2761 }
2692 2762 }
2693 2763 stp->sd_flag &= ~STZCNOTIFY;
2694 2764 mutex_exit(&stp->sd_lock);
2695 2765 }
2696 2766 }
2697 2767 return (error);
2698 2768 }
2699 2769
2700 2770 int
2701 2771 snf_cache(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size,
2702 2772 uint_t maxpsz, ssize_t *count)
2703 2773 {
2704 2774 struct vnode *vp;
2705 2775 mblk_t *mp;
2706 2776 int iosize;
2707 2777 int extra = 0;
2708 2778 int error;
2709 2779 short fflag;
2710 2780 int ksize;
2711 2781 int ioflag;
2712 2782 struct uio auio;
2713 2783 struct iovec aiov;
2714 2784 struct vattr va;
2715 2785 int maxblk = 0;
2716 2786 int wroff = 0;
2717 2787 struct sonode *so;
2718 2788 struct nmsghdr msg;
2719 2789
2720 2790 vp = fp->f_vnode;
2721 2791 if (vp->v_type == VSOCK) {
2722 2792 stdata_t *stp;
2723 2793
2724 2794 /*
2725 2795 * Get the extra space to insert a header and a trailer.
2726 2796 */
2727 2797 so = VTOSO(vp);
2728 2798 stp = vp->v_stream;
2729 2799 if (stp == NULL) {
2730 2800 wroff = so->so_proto_props.sopp_wroff;
2731 2801 maxblk = so->so_proto_props.sopp_maxblk;
2732 2802 extra = wroff + so->so_proto_props.sopp_tail;
2733 2803 } else {
2734 2804 wroff = (int)(stp->sd_wroff);
2735 2805 maxblk = (int)(stp->sd_maxblk);
2736 2806 extra = wroff + (int)(stp->sd_tail);
2737 2807 }
2738 2808 }
2739 2809 bzero(&msg, sizeof (msg));
2740 2810 fflag = fp->f_flag;
2741 2811 ksize = 0;
2742 2812 auio.uio_iov = &aiov;
2743 2813 auio.uio_iovcnt = 1;
2744 2814 auio.uio_segflg = UIO_SYSSPACE;
2745 2815 auio.uio_llimit = MAXOFFSET_T;
2746 2816 auio.uio_fmode = fflag;
2747 2817 auio.uio_extflg = UIO_COPY_CACHED;
2748 2818 ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
2749 2819 /* If read sync is not asked for, filter sync flags */
2750 2820 if ((ioflag & FRSYNC) == 0)
2751 2821 ioflag &= ~(FSYNC|FDSYNC);
2752 2822 for (;;) {
2753 2823 if (ISSIG(curthread, JUSTLOOKING)) {
2754 2824 error = EINTR;
2755 2825 break;
2756 2826 }
2757 2827 iosize = (int)MIN(maxpsz, size);
2758 2828
2759 2829 /*
2760 2830 * Socket filters can limit the mblk size,
2761 2831 * so limit reads to maxblk if there are
2762 2832 * filters present.
2763 2833 */
2764 2834 if (vp->v_type == VSOCK &&
2765 2835 so->so_filter_active > 0 && maxblk != INFPSZ)
2766 2836 iosize = (int)MIN(iosize, maxblk);
2767 2837
2768 2838 if (is_system_labeled()) {
2769 2839 mp = allocb_cred(iosize + extra, CRED(),
2770 2840 curproc->p_pid);
2771 2841 } else {
2772 2842 mp = allocb(iosize + extra, BPRI_MED);
2773 2843 }
2774 2844 if (mp == NULL) {
2775 2845 error = EAGAIN;
2776 2846 break;
2777 2847 }
2778 2848
2779 2849 mp->b_rptr += wroff;
2780 2850
2781 2851 aiov.iov_base = (caddr_t)mp->b_rptr;
2782 2852 aiov.iov_len = iosize;
2783 2853 auio.uio_loffset = fileoff;
2784 2854 auio.uio_resid = iosize;
2785 2855
2786 2856 error = VOP_READ(fvp, &auio, ioflag, fp->f_cred, NULL);
2787 2857 iosize -= auio.uio_resid;
2788 2858
2789 2859 if (error == EINTR && iosize != 0)
2790 2860 error = 0;
2791 2861
2792 2862 if (error != 0 || iosize == 0) {
2793 2863 freeb(mp);
2794 2864 break;
2795 2865 }
2796 2866 mp->b_wptr = mp->b_rptr + iosize;
2797 2867
2798 2868 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2799 2869
2800 2870 error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
2801 2871
2802 2872 if (error != 0) {
2803 2873 *count = ksize;
2804 2874 if (mp != NULL)
2805 2875 freeb(mp);
2806 2876 return (error);
2807 2877 }
2808 2878 ksize += iosize;
2809 2879 size -= iosize;
2810 2880 if (size == 0)
2811 2881 goto done;
2812 2882
2813 2883 fileoff += iosize;
2814 2884 (void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2815 2885 va.va_mask = AT_SIZE;
2816 2886 error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
2817 2887 if (error)
2818 2888 break;
2819 2889 /* Read as much as possible. */
2820 2890 if (fileoff >= va.va_size)
2821 2891 size = 0;
2822 2892 else if (size + fileoff > va.va_size)
2823 2893 size = va.va_size - fileoff;
2824 2894 }
2825 2895 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2826 2896 done:
2827 2897 *count = ksize;
2828 2898 return (error);
2829 2899 }
2830 2900
2831 2901 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
2832 2902 /*
2833 2903 * Largefile support for 32 bit applications only.
2834 2904 */
2835 2905 int
2836 2906 sosendfile64(file_t *fp, file_t *rfp, const struct ksendfilevec64 *sfv,
2837 2907 ssize32_t *count32)
2838 2908 {
2839 2909 ssize32_t sfv_len;
2840 2910 u_offset_t sfv_off, va_size;
2841 2911 struct vnode *vp, *fvp, *realvp;
2842 2912 struct vattr va;
2843 2913 stdata_t *stp;
2844 2914 ssize_t count = 0;
2845 2915 int error = 0;
2846 2916 boolean_t dozcopy = B_FALSE;
2847 2917 uint_t maxpsz;
2848 2918
2849 2919 sfv_len = (ssize32_t)sfv->sfv_len;
2850 2920 if (sfv_len < 0) {
2851 2921 error = EINVAL;
2852 2922 goto out;
2853 2923 }
2854 2924
2855 2925 if (sfv_len == 0) goto out;
2856 2926
2857 2927 sfv_off = (u_offset_t)sfv->sfv_off;
2858 2928
2859 2929 /* Same checks as in pread */
2860 2930 if (sfv_off > MAXOFFSET_T) {
2861 2931 error = EINVAL;
2862 2932 goto out;
2863 2933 }
2864 2934 if (sfv_off + sfv_len > MAXOFFSET_T)
2865 2935 sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);
2866 2936
2867 2937 /*
2868 2938 * There are no more checks on sfv_len. So, we cast it to
2869 2939 * u_offset_t and share the snf_direct_io/snf_cache code between
2870 2940 * 32 bit and 64 bit.
2871 2941 *
2872 2942 * TODO: should do nbl_need_check() like read()?
2873 2943 */
2874 2944 if (sfv_len > sendfile_max_size) {
2875 2945 sf_stats.ss_file_not_cached++;
2876 2946 error = snf_direct_io(fp, rfp, sfv_off, (u_offset_t)sfv_len,
2877 2947 &count);
2878 2948 goto out;
2879 2949 }
2880 2950 fvp = rfp->f_vnode;
2881 2951 if (VOP_REALVP(fvp, &realvp, NULL) == 0)
2882 2952 fvp = realvp;
2883 2953 /*
2884 2954 * Grab the lock as a reader to prevent the file size
2885 2955 * from changing underneath.
2886 2956 */
2887 2957 (void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2888 2958 va.va_mask = AT_SIZE;
2889 2959 error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
2890 2960 va_size = va.va_size;
2891 2961 if ((error != 0) || (va_size == 0) || (sfv_off >= va_size)) {
2892 2962 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2893 2963 goto out;
2894 2964 }
2895 2965 /* Read as much as possible. */
2896 2966 if (sfv_off + sfv_len > va_size)
2897 2967 sfv_len = va_size - sfv_off;
2898 2968
2899 2969 vp = fp->f_vnode;
2900 2970 stp = vp->v_stream;
2901 2971 /*
2902 2972 * When the NOWAIT flag is not set, we enable zero-copy only if the
2903 2973 * transfer size is large enough. This prevents performance loss
2904 2974 * when the caller sends the file piece by piece.
2905 2975 */
2906 2976 if (sfv_len >= MAXBSIZE && (sfv_len >= (va_size >> 1) ||
2907 2977 (sfv->sfv_flag & SFV_NOWAIT) || sfv_len >= 0x1000000) &&
2908 2978 !vn_has_flocks(fvp) && !(fvp->v_flag & VNOMAP)) {
2909 2979 uint_t copyflag;
2910 2980 copyflag = stp != NULL ? stp->sd_copyflag :
2911 2981 VTOSO(vp)->so_proto_props.sopp_zcopyflag;
2912 2982 if ((copyflag & (STZCVMSAFE|STZCVMUNSAFE)) == 0) {
2913 2983 int on = 1;
2914 2984
2915 2985 if (socket_setsockopt(VTOSO(vp), SOL_SOCKET,
2916 2986 SO_SND_COPYAVOID, &on, sizeof (on), CRED()) == 0)
2917 2987 dozcopy = B_TRUE;
2918 2988 } else {
2919 2989 dozcopy = copyflag & STZCVMSAFE;
2920 2990 }
2921 2991 }
2922 2992 if (dozcopy) {
2923 2993 sf_stats.ss_file_segmap++;
2924 2994 error = snf_segmap(fp, fvp, sfv_off, (u_offset_t)sfv_len,
2925 2995 &count, ((sfv->sfv_flag & SFV_NOWAIT) != 0));
2926 2996 } else {
2927 2997 if (vp->v_type == VSOCK && stp == NULL) {
2928 2998 sonode_t *so = VTOSO(vp);
2929 2999 maxpsz = so->so_proto_props.sopp_maxpsz;
2930 3000 } else if (stp != NULL) {
2931 3001 maxpsz = stp->sd_qn_maxpsz;
2932 3002 } else {
2933 3003 maxpsz = maxphys;
2934 3004 }
2935 3005
2936 3006 if (maxpsz == INFPSZ)
2937 3007 maxpsz = maxphys;
2938 3008 else
2939 3009 maxpsz = roundup(maxpsz, MAXBSIZE);
2940 3010 sf_stats.ss_file_cached++;
2941 3011 error = snf_cache(fp, fvp, sfv_off, (u_offset_t)sfv_len,
2942 3012 maxpsz, &count);
2943 3013 }
2944 3014 out:
2945 3015 releasef(sfv->sfv_fd);
2946 3016 *count32 = (ssize32_t)count;
2947 3017 return (error);
2948 3018 }
2949 3019 #endif
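
For completeness, this path is what a 32-bit process reaches through the user-level sendfile(3EXT) family. A minimal user-space sketch, assuming a connected TCP socket and a regular file (error handling kept terse; a production caller would loop, since sendfile() can return a short transfer count):

	#include <sys/sendfile.h>
	#include <sys/stat.h>
	#include <fcntl.h>
	#include <unistd.h>

	/*
	 * Ship an entire regular file down a connected socket. Whether the
	 * kernel takes the direct I/O, zero-copy segmap/vpm, or cached
	 * VOP_READ() path is decided by the size heuristics in sosendfile64().
	 */
	static int
	send_whole_file(int sock_fd, const char *path)
	{
		struct stat st;
		off_t off = 0;
		int fd;

		if ((fd = open(path, O_RDONLY)) == -1)
			return (-1);
		if (fstat(fd, &st) == -1 ||
		    sendfile(sock_fd, fd, &off, (size_t)st.st_size) == -1) {
			(void) close(fd);
			return (-1);
		}
		return (close(fd));
	}
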
2950 3020
2951 3021 #ifdef _SYSCALL32_IMPL
2952 3022 /*
2953 3023 * recv32(), recvfrom32(), send32(), sendto32(): intentionally return a
2954 3024 * ssize_t rather than ssize32_t; see the comments above read32 for details.
2955 3025 */
2956 3026
2957 3027 ssize_t
2958 3028 recv32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags)
2959 3029 {
2960 3030 return (recv(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags));
2961 3031 }
2962 3032
2963 3033 ssize_t
2964 3034 recvfrom32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags,
2965 3035 caddr32_t name, caddr32_t namelenp)
2966 3036 {
2967 3037 return (recvfrom(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags,
2968 3038 (void *)(uintptr_t)name, (void *)(uintptr_t)namelenp));
2969 3039 }
2970 3040
2971 3041 ssize_t
2972 3042 send32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags)
2973 3043 {
2974 3044 return (send(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags));
2975 3045 }
2976 3046
2977 3047 ssize_t
2978 3048 sendto32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags,
2979 3049 caddr32_t name, socklen_t namelen)
2980 3050 {
2981 3051 return (sendto(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags,
2982 3052 (void *)(uintptr_t)name, namelen));
2983 3053 }
2984 3054 #endif /* _SYSCALL32_IMPL */
2985 3055
2986 3056 /*
2987 3057 * Function wrappers (mostly around the sonode switch) for
2988 3058 * backward compatibility.
2989 3059 */
2990 3060
2991 3061 int
2992 3062 soaccept(struct sonode *so, int fflag, struct sonode **nsop)
2993 3063 {
2994 3064 return (socket_accept(so, fflag, CRED(), nsop));
2995 3065 }
2996 3066
2997 3067 int
2998 3068 sobind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
2999 3069 int backlog, int flags)
3000 3070 {
3001 3071 int error;
3002 3072
3003 3073 error = socket_bind(so, name, namelen, flags, CRED());
3004 3074 if (error == 0 && backlog != 0)
3005 3075 return (socket_listen(so, backlog, CRED()));
3006 3076
3007 3077 return (error);
3008 3078 }
3009 3079
3010 3080 int
3011 3081 solisten(struct sonode *so, int backlog)
3012 3082 {
3013 3083 return (socket_listen(so, backlog, CRED()));
3014 3084 }
3015 3085
3016 3086 int
3017 3087 soconnect(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3018 3088 int fflag, int flags)
3019 3089 {
3020 3090 return (socket_connect(so, name, namelen, fflag, flags, CRED()));
3021 3091 }
3022 3092
3023 3093 int
3024 3094 sorecvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
3025 3095 {
3026 3096 return (socket_recvmsg(so, msg, uiop, CRED()));
3027 3097 }
3028 3098
3029 3099 int
3030 3100 sosendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
3031 3101 {
3032 3102 return (socket_sendmsg(so, msg, uiop, CRED()));
3033 3103 }
3034 3104
3035 3105 int
3036 3106 soshutdown(struct sonode *so, int how)
3037 3107 {
3038 3108 return (socket_shutdown(so, how, CRED()));
3039 3109 }
3040 3110
3041 3111 int
3042 3112 sogetsockopt(struct sonode *so, int level, int option_name, void *optval,
3043 3113 socklen_t *optlenp, int flags)
3044 3114 {
3045 3115 return (socket_getsockopt(so, level, option_name, optval, optlenp,
3046 3116 flags, CRED()));
3047 3117 }
3048 3118
3049 3119 int
3050 3120 sosetsockopt(struct sonode *so, int level, int option_name, const void *optval,
3051 3121 t_uscalar_t optlen)
3052 3122 {
3053 3123 return (socket_setsockopt(so, level, option_name, optval, optlen,
3054 3124 CRED()));
3055 3125 }
3056 3126
3057 3127 /*
3058 3128  * Because this is a backward compatibility interface, it only needs to be
3059 3129 * able to handle the creation of TPI sockfs sockets.
3060 3130 */
3061 3131 struct sonode *
3062 3132 socreate(struct sockparams *sp, int family, int type, int protocol, int version,
3063 3133 int *errorp)
3064 3134 {
3065 3135 struct sonode *so;
3066 3136
3067 3137 ASSERT(sp != NULL);
3068 3138
3069 3139 so = sp->sp_smod_info->smod_sock_create_func(sp, family, type, protocol,
3070 3140 version, SOCKET_SLEEP, errorp, CRED());
3071 3141 if (so == NULL) {
3072 3142 SOCKPARAMS_DEC_REF(sp);
3073 3143 } else {
3074 3144 if ((*errorp = SOP_INIT(so, NULL, CRED(), SOCKET_SLEEP)) == 0) {
3075 3145 /* Cannot fail, only bumps so_count */
3076 3146 (void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, CRED(), NULL);
3077 3147 } else {
3078 3148 socket_destroy(so);
3079 3149 so = NULL;
3080 3150 }
3081 3151 }
3082 3152 return (so);
3083 3153 }