illumos-gate Wdiff usr/src/uts/common/inet/tcp/tcp_fusion.c

Print this page

11547 Want connstat(1M) command to display per-connection TCP statistics
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Portions contributed by: Ahmed G <ahmedg@delphix.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/inet/tcp/tcp_fusion.c
          +++ new/usr/src/uts/common/inet/tcp/tcp_fusion.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *

↓ open down ↓

12 lines elided

↑ open up ↑

  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
       23 + * Copyright (c) 2015 by Delphix. All rights reserved.
  23   24   */
  24   25  
  25   26  #include <sys/types.h>
  26   27  #include <sys/stream.h>
  27   28  #include <sys/strsun.h>
  28   29  #include <sys/strsubr.h>
  29   30  #include <sys/debug.h>
  30   31  #include <sys/sdt.h>
  31   32  #include <sys/cmn_err.h>
  32   33  #include <sys/tihdr.h>

  33   34  
  34   35  #include <inet/common.h>
  35   36  #include <inet/optcom.h>
  36   37  #include <inet/ip.h>
  37   38  #include <inet/ip_if.h>
  38   39  #include <inet/ip_impl.h>
  39   40  #include <inet/tcp.h>
  40   41  #include <inet/tcp_impl.h>
  41   42  #include <inet/ipsec_impl.h>
  42   43  #include <inet/ipclassifier.h>
  43   44  #include <inet/ipp_common.h>
  44   45  #include <inet/ip_if.h>
  45   46  
  46   47  /*
  47   48   * This file implements TCP fusion - a protocol-less data path for TCP
  48   49   * loopback connections.  The fusion of two local TCP endpoints occurs
  49   50   * at connection establishment time.  Various conditions (see details
  50   51   * in tcp_fuse()) need to be met for fusion to be successful.  If it
  51   52   * fails, we fall back to the regular TCP data path; if it succeeds,
  52   53   * both endpoints proceed to use tcp_fuse_output() as the transmit path.
  53   54   * tcp_fuse_output() enqueues application data directly onto the peer's
  54   55   * receive queue; no protocol processing is involved.
  55   56   *
  56   57   * Synchronization is handled by squeue and the mutex tcp_non_sq_lock.
  57   58   * One of the requirements for fusion to succeed is that both endpoints
  58   59   * need to be using the same squeue.  This ensures that neither side
  59   60   * can disappear while the other side is still sending data. Flow
  60   61   * control information is manipulated outside the squeue, so the
  61   62   * tcp_non_sq_lock must be held when touching tcp_flow_stopped.
  62   63   */
  63   64  
  64   65  /*
  65   66   * Setting this to B_FALSE means we disable fusion altogether and
  66   67   * loopback connections would go through the regular protocol paths.
            * NOTE(review): no reader of this flag is visible in this part of
            * the file; presumably it is checked before tcp_fuse() is attempted.
  67   68   */
  68   69  boolean_t do_tcp_fusion = B_TRUE;
  69   70  
  70   71  /*
  71   72   * This routine gets called by the eager tcp upon changing state from
  72   73   * SYN_RCVD to ESTABLISHED.  It fuses a direct path between itself
  73   74   * and the active connect tcp such that the regular tcp processings
  74   75   * may be bypassed under allowable circumstances.  Because the fusion
  75   76   * requires both endpoints to be in the same squeue, it does not work
  76   77   * for simultaneous active connects because there is no easy way to
  77   78   * switch from one squeue to another once the connection is created.
  78   79   * This is different from the eager tcp case where we assign it the
  79   80   * same squeue as the one given to the active connect tcp during open.
            *
            * Arguments:
            *   tcp     the eager endpoint; it must be a loopback tcp that is
            *           not fused yet (asserted below).
            *   iphdr   IP header handed to the reversed-quadruplet lookup; it
            *           is interpreted as ipha_t or ip6_t according to the
            *           conn's conn_ipversion.
            *   tcpha   TCP header handed to the same lookup.
            *
            * Nothing is returned.  If no peer is found, the peer does not
            * qualify, or an mblk allocation fails, the function returns with
            * both endpoints left unfused.  Every path taken after a successful
            * lookup drops one reference on the peer conn via CONN_DEC_REF().
  80   81   */
  81   82  void
  82   83  tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcpha_t *tcpha)
  83   84  {
  84   85          conn_t          *peer_connp, *connp = tcp->tcp_connp;
  85   86          tcp_t           *peer_tcp;
  86   87          tcp_stack_t     *tcps = tcp->tcp_tcps;
  87   88          netstack_t      *ns;
  88   89          ip_stack_t      *ipst = tcps->tcps_netstack->netstack_ip;
  89   90  
  90   91          ASSERT(!tcp->tcp_fused);
  91   92          ASSERT(tcp->tcp_loopback);
  92   93          ASSERT(tcp->tcp_loopback_peer == NULL);
  93   94          /*
  94   95           * We need to inherit conn_rcvbuf of the listener tcp,
  95   96           * but we can't really use tcp_listener since we get here after
  96   97           * sending up T_CONN_IND and tcp_tli_accept() may be called
  97   98           * independently, at which point tcp_listener is cleared;
  98   99           * this is why we use tcp_saved_listener. The listener itself
  99  100           * is guaranteed to be around until tcp_accept_finish() is called
 100  101           * on this eager -- this won't happen until we're done since we're
 101  102           * inside the eager's perimeter now.
 102  103           */
 103  104          ASSERT(tcp->tcp_saved_listener != NULL);
 104  105          /*
 105  106           * Lookup peer endpoint; search for the remote endpoint having
 106  107           * the reversed address-port quadruplet in ESTABLISHED state,
 107  108           * which is guaranteed to be unique in the system.  Zone check
 108  109           * is applied accordingly for loopback address, but not for
 109  110           * local address since we want fusion to happen across Zones.
 110  111           */
 111  112          if (connp->conn_ipversion == IPV4_VERSION) {
 112  113                  peer_connp = ipcl_conn_tcp_lookup_reversed_ipv4(connp,
 113  114                      (ipha_t *)iphdr, tcpha, ipst);
 114  115          } else {
 115  116                  peer_connp = ipcl_conn_tcp_lookup_reversed_ipv6(connp,
 116  117                      (ip6_t *)iphdr, tcpha, ipst);
 117  118          }
 118  119  
 119  120          /*
 120  121           * We can only proceed if peer exists, resides in the same squeue
 121  122           * as our conn and is not raw-socket. We also restrict fusion to
 122  123           * endpoints of the same type (STREAMS or non-STREAMS). The squeue
 123  124           * assignment of this eager tcp was done earlier at the time of SYN
 124  125           * processing in ip_fanout_tcp{_v6}.  Note that sharing a squeue by
 125  126           * itself doesn't guarantee a safe condition to fuse, hence we perform
 126  127           * additional tests below.
 127  128           */
 128  129          ASSERT(peer_connp == NULL || peer_connp != connp);
 129  130          if (peer_connp == NULL || peer_connp->conn_sqp != connp->conn_sqp ||
 130  131              !IPCL_IS_TCP(peer_connp) ||
 131  132              IPCL_IS_NONSTR(connp) != IPCL_IS_NONSTR(peer_connp)) {
                           /* A peer was found but does not qualify: count it. */
 132  133                  if (peer_connp != NULL) {
 133  134                          TCP_STAT(tcps, tcp_fusion_unqualified);
 134  135                          CONN_DEC_REF(peer_connp);
 135  136                  }
 136  137                  return;
 137  138          }
 138  139          peer_tcp = peer_connp->conn_tcp;        /* active connect tcp */
 139  140  
 140  141          ASSERT(peer_tcp != NULL && peer_tcp != tcp && !peer_tcp->tcp_fused);
 141  142          ASSERT(peer_tcp->tcp_loopback_peer == NULL);
 142  143          ASSERT(peer_connp->conn_sqp == connp->conn_sqp);
 143  144  
 144  145          /*
 145  146           * Due to IRE changes the peer and us might not agree on tcp_loopback.
 146  147           * We bail in that case.
 147  148           */
 148  149          if (!peer_tcp->tcp_loopback) {
 149  150                  TCP_STAT(tcps, tcp_fusion_unqualified);
 150  151                  CONN_DEC_REF(peer_connp);
 151  152                  return;
 152  153          }
 153  154          /*
 154  155           * Fuse the endpoints; we perform further checks against both
 155  156           * tcp endpoints to ensure that a fusion is allowed to happen.
 156  157           */
                   /*
                    * NOTE(review): neither ns nor the refreshed ipst is read again
                    * in this function; these two stores look like leftovers.
                    */
 157  158          ns = tcps->tcps_netstack;
 158  159          ipst = ns->netstack_ip;
 159  160  
 160  161          if (!tcp->tcp_unfusable && !peer_tcp->tcp_unfusable &&
 161  162              tcp->tcp_xmit_head == NULL && peer_tcp->tcp_xmit_head == NULL) {
 162  163                  mblk_t *mp;
 163  164                  queue_t *peer_rq = peer_connp->conn_rq;
 164  165  
 165  166                  ASSERT(!TCP_IS_DETACHED(peer_tcp));
 166  167                  ASSERT(tcp->tcp_fused_sigurg_mp == NULL);
 167  168                  ASSERT(peer_tcp->tcp_fused_sigurg_mp == NULL);
 168  169  
 169  170                  /*
 170  171                   * We need to drain data on both endpoints during unfuse.
 171  172                   * If we need to send up SIGURG at the time of draining,
 172  173                   * we want to be sure that an mblk is readily available.
 173  174                   * This is why we pre-allocate the M_PCSIG mblks for both
 174  175                   * endpoints which will only be used during/after unfuse.
 175  176                   * The mblk might already exist if we are doing a re-fuse.
                            *
                            * NOTE(review): the two ASSERTs just above require both
                            * sigurg mblks to be NULL at this point, which does not
                            * agree with the re-fuse remark or with the NULL checks
                            * below; confirm which of the two is stale.
 176  177                   */
 177  178                  if (!IPCL_IS_NONSTR(tcp->tcp_connp)) {
 178  179                          ASSERT(!IPCL_IS_NONSTR(peer_tcp->tcp_connp));
 179  180  
 180  181                          if (tcp->tcp_fused_sigurg_mp == NULL) {
 181  182                                  if ((mp = allocb(1, BPRI_HI)) == NULL)
 182  183                                          goto failed;
 183  184                                  tcp->tcp_fused_sigurg_mp = mp;
 184  185                          }
 185  186  
 186  187                          if (peer_tcp->tcp_fused_sigurg_mp == NULL) {
 187  188                                  if ((mp = allocb(1, BPRI_HI)) == NULL)
 188  189                                          goto failed;
 189  190                                  peer_tcp->tcp_fused_sigurg_mp = mp;
 190  191                          }
 191  192  
                                   /*
                                    * Allocate the stroptions message now; it is
                                    * filled in and sent up the peer's read queue
                                    * further down.  Doing it here means this
                                    * function has no allocation of its own left to
                                    * fail once the endpoints are marked fused.
                                    */
 192  193                          if ((mp = allocb(sizeof (struct stroptions),
 193  194                              BPRI_HI)) == NULL)
 194  195                                  goto failed;
 195  196                  }
 196  197  
 197  198                  /* Fuse both endpoints */
 198  199                  peer_tcp->tcp_loopback_peer = tcp;
 199  200                  tcp->tcp_loopback_peer = peer_tcp;
 200  201                  peer_tcp->tcp_fused = tcp->tcp_fused = B_TRUE;
 201  202  
 202  203                  /*
 203  204                   * We never use regular tcp paths in fusion and should
 204  205                   * therefore clear tcp_unsent on both endpoints.  Having
 205  206                   * them set to non-zero values means asking for trouble
 206  207                   * especially after unfuse, where we may end up sending
 207  208                   * through regular tcp paths which expect xmit_list and
 208  209                   * friends to be correctly setup.
 209  210                   */
 210  211                  peer_tcp->tcp_unsent = tcp->tcp_unsent = 0;
 211  212  
 212  213                  tcp_timers_stop(tcp);
 213  214                  tcp_timers_stop(peer_tcp);
 214  215  
 215  216                  /*
 216  217                   * Set receive buffer and max packet size for the
 217  218                   * active open tcp.
 218  219                   * eager's values will be set in tcp_accept_finish.
 219  220                   */
 220  221                  (void) tcp_rwnd_set(peer_tcp, peer_tcp->tcp_connp->conn_rcvbuf);
 221  222  
 222  223                  /*
 223  224                   * Set the write offset value to zero since we won't
 224  225                   * be needing any room for TCP/IP headers.
                            *
                            * In the STREAMS branch mp is the stroptions mblk
                            * allocated above.  That allocation was keyed on tcp's
                            * conn type and this test on the peer's, but both were
                            * checked to be of the same type near the top.
 225  226                   */
 226  227                  if (!IPCL_IS_NONSTR(peer_tcp->tcp_connp)) {
 227  228                          struct stroptions *stropt;
 228  229  
 229  230                          DB_TYPE(mp) = M_SETOPTS;
 230  231                          mp->b_wptr += sizeof (*stropt);
 231  232  
 232  233                          stropt = (struct stroptions *)mp->b_rptr;
 233  234                          stropt->so_flags = SO_WROFF | SO_MAXBLK;
 234  235                          stropt->so_wroff = 0;
 235  236                          stropt->so_maxblk = INFPSZ;
 236  237  
 237  238                          /* Send the options up */
 238  239                          putnext(peer_rq, mp);
 239  240                  } else {
 240  241                          struct sock_proto_props sopp;
 241  242  
 242  243                          /* The peer is a non-STREAMS end point */
 243  244                          ASSERT(IPCL_IS_TCP(peer_connp));
 244  245  
 245  246                          sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_MAXBLK;
 246  247                          sopp.sopp_wroff = 0;
 247  248                          sopp.sopp_maxblk = INFPSZ;
 248  249                          (*peer_connp->conn_upcalls->su_set_proto_props)
 249  250                              (peer_connp->conn_upper_handle, &sopp);
 250  251                  }
 251  252          } else {
                           /* One side is unfusable or has data on its xmit list. */
 252  253                  TCP_STAT(tcps, tcp_fusion_unqualified);
 253  254          }
 254  255          CONN_DEC_REF(peer_connp);
 255  256          return;
 256  257  
 257  258  failed:
                   /*
                    * An allocb() above failed.  Free whichever sigurg mblks are
                    * present on either endpoint and return without fusing.
                    */
 258  259          if (tcp->tcp_fused_sigurg_mp != NULL) {
 259  260                  freeb(tcp->tcp_fused_sigurg_mp);
 260  261                  tcp->tcp_fused_sigurg_mp = NULL;
 261  262          }
 262  263          if (peer_tcp->tcp_fused_sigurg_mp != NULL) {
 263  264                  freeb(peer_tcp->tcp_fused_sigurg_mp);
 264  265                  peer_tcp->tcp_fused_sigurg_mp = NULL;
 265  266          }
 266  267          CONN_DEC_REF(peer_connp);
 267  268  }
 268  269  
 269  270  /*
 270  271   * Unfuse a previously-fused pair of tcp loopback endpoints.
            *
            * tcp may be either endpoint of the pair; its partner is taken from
            * tcp_loopback_peer.  Both must currently be fused to each other, use
            * the same squeue and have tcp_unsent == 0 (all asserted below).
            *
            * In order, this routine cancels any pending push timers, drains the
            * receive data of endpoints that are not detached, lifts flow control
            * on both sides, refreshes the sequence/ack numbers in both header
            * templates and finally clears the fused state on both endpoints.
 271  272   */
 272  273  void
 273  274  tcp_unfuse(tcp_t *tcp)
 274  275  {
 275  276          tcp_t *peer_tcp = tcp->tcp_loopback_peer;
 276  277          tcp_stack_t *tcps = tcp->tcp_tcps;
 277  278  
 278  279          ASSERT(tcp->tcp_fused && peer_tcp != NULL);
 279  280          ASSERT(peer_tcp->tcp_fused && peer_tcp->tcp_loopback_peer == tcp);
 280  281          ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp);
 281  282          ASSERT(tcp->tcp_unsent == 0 && peer_tcp->tcp_unsent == 0);
 282  283  
 283  284          /*
 284  285           * Cancel any pending push timers.
 285  286           */
 286  287          if (tcp->tcp_push_tid != 0) {
 287  288                  (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
 288  289                  tcp->tcp_push_tid = 0;
 289  290          }
 290  291          if (peer_tcp->tcp_push_tid != 0) {
 291  292                  (void) TCP_TIMER_CANCEL(peer_tcp, peer_tcp->tcp_push_tid);
 292  293                  peer_tcp->tcp_push_tid = 0;
 293  294          }
 294  295  
 295  296          /*
 296  297           * Drain any pending data; Note that in case of a detached tcp, the
 297  298           * draining will happen later after the tcp is unfused.  For non-
 298  299           * urgent data, this can be handled by the regular tcp_rcv_drain().
 299  300           * If we have urgent data sitting in the receive list, we will
 300  301           * need to send up a SIGURG signal first before draining the data.
 301  302           * All of these will be handled by the code in tcp_fuse_rcv_drain()
 302  303           * when called from tcp_rcv_drain().
                    *
                    * Here tcp_fuse_rcv_drain() is called directly for each endpoint
                    * that is not detached, and its return value is ignored.
 303  304           */
 304  305          if (!TCP_IS_DETACHED(tcp)) {
 305  306                  (void) tcp_fuse_rcv_drain(tcp->tcp_connp->conn_rq, tcp,
 306  307                      &tcp->tcp_fused_sigurg_mp);
 307  308          }
 308  309          if (!TCP_IS_DETACHED(peer_tcp)) {
 309  310                  (void) tcp_fuse_rcv_drain(peer_tcp->tcp_connp->conn_rq,
 310  311                      peer_tcp,  &peer_tcp->tcp_fused_sigurg_mp);
 311  312          }
 312  313  
                   /*
                    * tcp_flow_stopped is only examined with the endpoint's own
                    * tcp_non_sq_lock held; the two locks are taken one after the
                    * other, never nested.
                    */
 313  314          /* Lift up any flow-control conditions */
 314  315          mutex_enter(&tcp->tcp_non_sq_lock);
 315  316          if (tcp->tcp_flow_stopped) {
 316  317                  tcp_clrqfull(tcp);
 317  318                  TCP_STAT(tcps, tcp_fusion_backenabled);
 318  319          }
 319  320          mutex_exit(&tcp->tcp_non_sq_lock);
 320  321  
 321  322          mutex_enter(&peer_tcp->tcp_non_sq_lock);
 322  323          if (peer_tcp->tcp_flow_stopped) {
 323  324                  tcp_clrqfull(peer_tcp);
 324  325                  TCP_STAT(tcps, tcp_fusion_backenabled);
 325  326          }
 326  327          mutex_exit(&peer_tcp->tcp_non_sq_lock);
 327  328  
 328  329          /*
 329  330           * Update tha_seq and tha_ack in the header template
                    * of each endpoint from its current tcp_snxt and tcp_rnxt
                    * (converted with htonl(), so the template is presumably kept in
                    * network byte order).
 330  331           */
 331  332          tcp->tcp_tcpha->tha_seq = htonl(tcp->tcp_snxt);
 332  333          tcp->tcp_tcpha->tha_ack = htonl(tcp->tcp_rnxt);
 333  334          peer_tcp->tcp_tcpha->tha_seq = htonl(peer_tcp->tcp_snxt);
 334  335          peer_tcp->tcp_tcpha->tha_ack = htonl(peer_tcp->tcp_rnxt);
 335  336  
 336  337          /* Unfuse the endpoints */
 337  338          peer_tcp->tcp_fused = tcp->tcp_fused = B_FALSE;
 338  339          peer_tcp->tcp_loopback_peer = tcp->tcp_loopback_peer = NULL;
 339  340  }
 340  341  
 341  342  /*
 342  343   * Fusion output routine used to handle urgent data sent by STREAMS based
 343  344   * endpoints. This routine is called by tcp_fuse_output() for handling
 344  345   * non-M_DATA mblks.
            *
            * Arguments:
            *   tcp     the sending endpoint; it must be fused and STREAMS based
            *           (asserted below).
            *   mp      an M_PROTO or M_PCPROTO mblk at least as large as a
            *           struct T_exdata_ind, followed by a non-empty M_DATA
            *           b_cont (asserted below).
            *
            * mp is rewritten in place into a T_EXDATA_IND and the peer's receive
            * list is stripped of earlier T_EXDATA_IND headers.  mp itself is not
            * enqueued by this routine.
 345  346   */
 346  347  void
 347  348  tcp_fuse_output_urg(tcp_t *tcp, mblk_t *mp)
 348  349  {
 349  350          mblk_t *mp1;
 350  351          struct T_exdata_ind *tei;
 351  352          tcp_t *peer_tcp = tcp->tcp_loopback_peer;
 352  353          mblk_t *head, *prev_head = NULL;
 353  354          tcp_stack_t     *tcps = tcp->tcp_tcps;
 354  355  
 355  356          ASSERT(tcp->tcp_fused);
 356  357          ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp);
 357  358          ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp));
 358  359          ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
 359  360          ASSERT(mp->b_cont != NULL && DB_TYPE(mp->b_cont) == M_DATA);
 360  361          ASSERT(MBLKL(mp) >= sizeof (*tei) && MBLKL(mp->b_cont) > 0);
 361  362  
 362  363          /*
 363  364           * Urgent data arrives in the form of T_EXDATA_REQ from above.
 364  365           * Each occurrence denotes a new urgent pointer.  For each new
 365  366           * urgent pointer we signal (SIGURG) the receiving app to indicate
 366  367           * that it needs to go into urgent mode.  This is similar to the
 367  368           * urgent data handling in the regular tcp.  We don't need to keep
 368  369           * track of where the urgent pointer is, because each T_EXDATA_REQ
 369  370           * "advances" the urgent pointer for us.
 370  371           *
 371  372           * The actual urgent data carried by T_EXDATA_REQ is then prepended
 372  373           * by a T_EXDATA_IND before being enqueued behind any existing data
 373  374           * destined for the receiving app.  There is only a single urgent
 374  375           * pointer (out-of-band mark) for a given tcp.  If the new urgent
 375  376           * data arrives before the receiving app reads some existing urgent
 376  377           * data, the previous marker is lost.  This behavior is emulated
 377  378           * accordingly below, by removing any existing T_EXDATA_IND messages
 378  379           * and essentially converting old urgent data into non-urgent.
 379  380           */
 380  381          ASSERT(tcp->tcp_valid_bits & TCP_URG_VALID);
 381  382          /* Let sender get out of urgent mode */
 382  383          tcp->tcp_valid_bits &= ~TCP_URG_VALID;
 383  384  
 384  385          /*
 385  386           * This flag indicates that a signal needs to be sent up.
 386  387           * This flag will only get cleared once SIGURG is delivered and
 387  388           * is not affected by the tcp_fused flag -- delivery will still
 388  389           * happen even after an endpoint is unfused, to handle the case
 389  390           * where the sending endpoint immediately closes/unfuses after
 390  391           * sending urgent data and the accept is not yet finished.
 391  392           */
 392  393          peer_tcp->tcp_fused_sigurg = B_TRUE;
 393  394  
 394  395          /* Reuse T_EXDATA_REQ mblk for T_EXDATA_IND */
 395  396          DB_TYPE(mp) = M_PROTO;
 396  397          tei = (struct T_exdata_ind *)mp->b_rptr;
 397  398          tei->PRIM_type = T_EXDATA_IND;
 398  399          tei->MORE_flag = 0;
                   /* &tei[1] is just past one T_exdata_ind: trim mp to that size. */
 399  400          mp->b_wptr = (uchar_t *)&tei[1];
 400  401  
 401  402          TCP_STAT(tcps, tcp_fusion_urg);
 402  403          TCPS_BUMP_MIB(tcps, tcpOutUrg);
 403  404  
                   /*
                    * Walk the b_next chain of the peer's receive list.  prev_head
                    * trails one message behind head so that the chain can be
                    * relinked when a leading T_EXDATA_IND block is removed.
                    */
 404  405          head = peer_tcp->tcp_rcv_list;
 405  406          while (head != NULL) {
 406  407                  /*
 407  408                   * Remove existing T_EXDATA_IND, keep the data which follows
 408  409                   * it and relink our list.  Note that we don't modify the
 409  410                   * tcp_rcv_last_tail since it never points to T_EXDATA_IND.
 410  411                   */
 411  412                  if (DB_TYPE(head) != M_DATA) {
 412  413                          mp1 = head;
 413  414  
 414  415                          ASSERT(DB_TYPE(mp1->b_cont) == M_DATA);
                                   /* The data block takes the header's list slot. */
 415  416                          head = mp1->b_cont;
 416  417                          mp1->b_cont = NULL;
 417  418                          head->b_next = mp1->b_next;
 418  419                          mp1->b_next = NULL;
 419  420                          if (prev_head != NULL)
 420  421                                  prev_head->b_next = head;
 421  422                          if (peer_tcp->tcp_rcv_list == mp1)
 422  423                                  peer_tcp->tcp_rcv_list = head;
 423  424                          if (peer_tcp->tcp_rcv_last_head == mp1)
 424  425                                  peer_tcp->tcp_rcv_last_head = head;
                                   /* mp1 is fully unlinked; only the header is freed. */
 425  426                          freeb(mp1);
 426  427                  }
 427  428                  prev_head = head;
 428  429                  head = head->b_next;
 429  430          }
 430  431  }
 431  432  
 432  433  /*
 433  434   * Fusion output routine, called by tcp_output() and tcp_wput_proto().
 434  435   * If we are modifying any member that can be changed outside the squeue,
 435  436   * like tcp_flow_stopped, we need to take tcp_non_sq_lock.
 436  437   */
 437  438  boolean_t
 438  439  tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size)
 439  440  {
 440  441          conn_t          *connp = tcp->tcp_connp;
 441  442          tcp_t           *peer_tcp = tcp->tcp_loopback_peer;
 442  443          conn_t          *peer_connp = peer_tcp->tcp_connp;
 443  444          boolean_t       flow_stopped, peer_data_queued = B_FALSE;
 444  445          boolean_t       urgent = (DB_TYPE(mp) != M_DATA);
 445  446          boolean_t       push = B_TRUE;
 446  447          mblk_t          *mp1 = mp;
 447  448          uint_t          ip_hdr_len;
 448  449          uint32_t        recv_size = send_size;
 449  450          tcp_stack_t     *tcps = tcp->tcp_tcps;
 450  451          netstack_t      *ns = tcps->tcps_netstack;
 451  452          ip_stack_t      *ipst = ns->netstack_ip;
 452  453          ipsec_stack_t   *ipss = ns->netstack_ipsec;
 453  454          iaflags_t       ixaflags = connp->conn_ixa->ixa_flags;
 454  455          boolean_t       do_ipsec, hooks_out, hooks_in, ipobs_enabled;
 455  456  
 456  457          ASSERT(tcp->tcp_fused);
 457  458          ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp);
 458  459          ASSERT(connp->conn_sqp == peer_connp->conn_sqp);
 459  460          ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_PROTO ||
 460  461              DB_TYPE(mp) == M_PCPROTO);
 461  462  
 462  463          if (send_size == 0) {
 463  464                  freemsg(mp);
 464  465                  return (B_TRUE);
 465  466          }
 466  467  
 467  468          /*
 468  469           * Handle urgent data; we either send up SIGURG to the peer now
 469  470           * or do it later when we drain, in case the peer is detached
 470  471           * or if we're short of memory for M_PCSIG mblk.
 471  472           */
 472  473          if (urgent) {
 473  474                  tcp_fuse_output_urg(tcp, mp);
 474  475  
 475  476                  mp1 = mp->b_cont;
 476  477          }
 477  478  
 478  479          /*
 479  480           * Check that we are still using an IRE_LOCAL or IRE_LOOPBACK before
 480  481           * further processes.
 481  482           */
 482  483          if (!ip_output_verify_local(connp->conn_ixa))
 483  484                  goto unfuse;
 484  485  
 485  486          /*
 486  487           * Build IP and TCP header in case we have something that needs the
 487  488           * headers. Those cases are:
 488  489           * 1. IPsec
 489  490           * 2. IPobs
 490  491           * 3. FW_HOOKS
 491  492           *
 492  493           * If tcp_xmit_mp() fails to dupb() the message, unfuse the connection
 493  494           * and back to regular path.
 494  495           */
 495  496          if (ixaflags & IXAF_IS_IPV4) {
 496  497                  do_ipsec = (ixaflags & IXAF_IPSEC_SECURE) ||
 497  498                      CONN_INBOUND_POLICY_PRESENT(peer_connp, ipss);
 498  499  
 499  500                  hooks_out = HOOKS4_INTERESTED_LOOPBACK_OUT(ipst);
 500  501                  hooks_in = HOOKS4_INTERESTED_LOOPBACK_IN(ipst);
 501  502                  ipobs_enabled = (ipst->ips_ip4_observe.he_interested != 0);
 502  503          } else {
 503  504                  do_ipsec = (ixaflags & IXAF_IPSEC_SECURE) ||
 504  505                      CONN_INBOUND_POLICY_PRESENT_V6(peer_connp, ipss);
 505  506  
 506  507                  hooks_out = HOOKS6_INTERESTED_LOOPBACK_OUT(ipst);
 507  508                  hooks_in = HOOKS6_INTERESTED_LOOPBACK_IN(ipst);
 508  509                  ipobs_enabled = (ipst->ips_ip6_observe.he_interested != 0);
 509  510          }
 510  511  
 511  512          /* We do bitwise 'or' (no short-circuit) for efficiency */
 512  513          if (ipobs_enabled | do_ipsec | hooks_in | hooks_out) {
 513  514                  if ((mp1 = tcp_xmit_mp(tcp, mp1, tcp->tcp_mss, NULL, NULL,
 514  515                      tcp->tcp_snxt, B_TRUE, NULL, B_FALSE)) == NULL)
 515  516                          /* If tcp_xmit_mp fails, use regular path */
 516  517                          goto unfuse;
 517  518  
 518  519                  /*
 519  520                   * Leave all IP relevant processes to ip_output_process_local(),
 520  521                   * which handles IPsec, IPobs, and FW_HOOKS.
 521  522                   */
 522  523                  mp1 = ip_output_process_local(mp1, connp->conn_ixa, hooks_out,
 523  524                      hooks_in, do_ipsec ? peer_connp : NULL);
 524  525  
 525  526                  /* If the message is dropped for any reason. */
 526  527                  if (mp1 == NULL)
 527  528                          goto unfuse;
 528  529  
 529  530                  /*
 530  531                   * Data length might have been changed by FW_HOOKS.
 531  532                   * We assume that the first mblk contains the TCP/IP headers.
 532  533                   */
 533  534                  if (hooks_in || hooks_out) {
 534  535                          tcpha_t *tcpha;
 535  536  
 536  537                          ip_hdr_len = (ixaflags & IXAF_IS_IPV4) ?
 537  538                              IPH_HDR_LENGTH((ipha_t *)mp1->b_rptr) :
 538  539                              ip_hdr_length_v6(mp1, (ip6_t *)mp1->b_rptr);
 539  540  
 540  541                          tcpha = (tcpha_t *)&mp1->b_rptr[ip_hdr_len];
 541  542                          ASSERT((uchar_t *)tcpha + sizeof (tcpha_t) <=
 542  543                              mp1->b_wptr);
 543  544                          recv_size += htonl(tcpha->tha_seq) - tcp->tcp_snxt;
 544  545  
 545  546                  }
 546  547  
 547  548                  /*
 548  549                   * The message duplicated by tcp_xmit_mp is freed.
 549  550                   * Note: the original message passed in remains unchanged.
 550  551                   */
 551  552                  freemsg(mp1);
 552  553          }
 553  554  
 554  555          /*
 555  556           * Enqueue data into the peer's receive list; we may or may not
 556  557           * drain the contents depending on the conditions below.
 557  558           *
 558  559           * For non-STREAMS sockets we normally queue data directly in the
 559  560           * socket by calling the su_recv upcall. However, if the peer is
 560  561           * detached we use tcp_rcv_enqueue() instead. Queued data will be
 561  562           * drained when the accept completes (in tcp_accept_finish()).
 562  563           */
 563  564          if (IPCL_IS_NONSTR(peer_connp) &&
 564  565              !TCP_IS_DETACHED(peer_tcp)) {
 565  566                  int error;
 566  567                  int flags = 0;
 567  568  
 568  569                  if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
 569  570                      (tcp->tcp_urg == tcp->tcp_snxt)) {
 570  571                          flags = MSG_OOB;
 571  572                          (*peer_connp->conn_upcalls->su_signal_oob)
 572  573                              (peer_connp->conn_upper_handle, 0);
 573  574                          tcp->tcp_valid_bits &= ~TCP_URG_VALID;
 574  575                  }
 575  576                  if ((*peer_connp->conn_upcalls->su_recv)(
 576  577                      peer_connp->conn_upper_handle, mp, recv_size,
 577  578                      flags, &error, &push) < 0) {
 578  579                          ASSERT(error != EOPNOTSUPP);
 579  580                          peer_data_queued = B_TRUE;
 580  581                  }
 581  582          } else {
 582  583                  if (IPCL_IS_NONSTR(peer_connp) &&
 583  584                      (tcp->tcp_valid_bits & TCP_URG_VALID) &&
 584  585                      (tcp->tcp_urg == tcp->tcp_snxt)) {
 585  586                          /*
 586  587                           * Can not deal with urgent pointers
 587  588                           * that arrive before the connection has been
 588  589                           * accept()ed.
 589  590                           */
 590  591                          tcp->tcp_valid_bits &= ~TCP_URG_VALID;
 591  592                          freemsg(mp);
 592  593                          return (B_TRUE);
 593  594                  }
 594  595  
 595  596                  tcp_rcv_enqueue(peer_tcp, mp, recv_size,
 596  597                      tcp->tcp_connp->conn_cred);
 597  598  
 598  599                  /* In case it wrapped around and also to keep it constant */
 599  600                  peer_tcp->tcp_rwnd += recv_size;
 600  601          }
 601  602  
 602  603          /*
 603  604           * Exercise flow-control when needed; we will get back-enabled
 604  605           * in either tcp_accept_finish(), tcp_unfuse(), or when data is
 605  606           * consumed. If peer endpoint is detached, we emulate streams flow
 606  607           * control by checking the peer's queue size and high water mark;
 607  608           * otherwise we simply use canputnext() to decide if we need to stop
 608  609           * our flow.
 609  610           *
 610  611           * Since we are accessing our tcp_flow_stopped and might modify it,
 611  612           * we need to take tcp->tcp_non_sq_lock.
 612  613           */
 613  614          mutex_enter(&tcp->tcp_non_sq_lock);
 614  615          flow_stopped = tcp->tcp_flow_stopped;
 615  616          if ((TCP_IS_DETACHED(peer_tcp) &&
 616  617              (peer_tcp->tcp_rcv_cnt >= peer_connp->conn_rcvbuf)) ||
 617  618              (!TCP_IS_DETACHED(peer_tcp) &&
 618  619              !IPCL_IS_NONSTR(peer_connp) && !canputnext(peer_connp->conn_rq))) {
 619  620                  peer_data_queued = B_TRUE;
 620  621          }
 621  622  
 622  623          if (!flow_stopped && (peer_data_queued ||
 623  624              (TCP_UNSENT_BYTES(tcp) >= connp->conn_sndbuf))) {
 624  625                  tcp_setqfull(tcp);
 625  626                  flow_stopped = B_TRUE;
 626  627                  TCP_STAT(tcps, tcp_fusion_flowctl);
 627  628                  DTRACE_PROBE3(tcp__fuse__output__flowctl, tcp_t *, tcp,
 628  629                      uint_t, send_size, uint_t, peer_tcp->tcp_rcv_cnt);
 629  630          } else if (flow_stopped && !peer_data_queued &&
 630  631              (TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat)) {
 631  632                  tcp_clrqfull(tcp);
 632  633                  TCP_STAT(tcps, tcp_fusion_backenabled);
 633  634                  flow_stopped = B_FALSE;
 634  635          }
 635  636          mutex_exit(&tcp->tcp_non_sq_lock);
 636  637  
 637  638          ipst->ips_loopback_packets++;

↓ open down ↓

605 lines elided

↑ open up ↑

 638  639          tcp->tcp_last_sent_len = send_size;
 639  640  
 640  641          /* Need to adjust the following SNMP MIB-related variables */
 641  642          tcp->tcp_snxt += send_size;
 642  643          tcp->tcp_suna = tcp->tcp_snxt;
 643  644          peer_tcp->tcp_rnxt += recv_size;
 644  645          peer_tcp->tcp_last_recv_len = recv_size;
 645  646          peer_tcp->tcp_rack = peer_tcp->tcp_rnxt;
 646  647  
 647  648          TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
      649 +        TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
 648  650          TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, send_size);
      651 +        tcp->tcp_cs.tcp_out_data_bytes += send_size;
      652 +        tcp->tcp_cs.tcp_out_data_segs++;
 649  653  
 650  654          TCPS_BUMP_MIB(tcps, tcpHCInSegs);
 651  655          TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs);
 652  656          TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, send_size);
      657 +        peer_tcp->tcp_cs.tcp_in_data_inorder_bytes += send_size;
      658 +        peer_tcp->tcp_cs.tcp_in_data_inorder_segs++;
 653  659  
 654      -        BUMP_LOCAL(tcp->tcp_obsegs);
 655      -        BUMP_LOCAL(peer_tcp->tcp_ibsegs);
 656      -
 657  660          DTRACE_TCP5(send, void, NULL, ip_xmit_attr_t *, connp->conn_ixa,
 658  661              __dtrace_tcp_void_ip_t *, NULL, tcp_t *, tcp,
 659  662              __dtrace_tcp_tcph_t *, NULL);
 660  663          DTRACE_TCP5(receive, void, NULL, ip_xmit_attr_t *,
 661  664              peer_connp->conn_ixa, __dtrace_tcp_void_ip_t *, NULL,
 662  665              tcp_t *, peer_tcp, __dtrace_tcp_tcph_t *, NULL);
 663  666  
 664  667          if (!IPCL_IS_NONSTR(peer_tcp->tcp_connp) &&
 665  668              !TCP_IS_DETACHED(peer_tcp)) {
 666  669                  /*

 667  670                   * Drain the peer's receive queue if it has urgent data or if
 668  671                   * we're not flow-controlled.
 669  672                   */
 670  673                  if (urgent || !flow_stopped) {
 671  674                          ASSERT(peer_tcp->tcp_rcv_list != NULL);
 672  675                          /*
 673  676                           * For TLI-based streams, a thread in tcp_accept_swap()
 674  677                           * can race with us.  That thread will ensure that the
 675  678                           * correct peer_connp->conn_rq is globally visible
 676  679                           * before peer_tcp->tcp_detached is visible as clear,
 677  680                           * but we must also ensure that the load of conn_rq
 678  681                           * cannot be reordered to be before the tcp_detached
 679  682                           * check.
 680  683                           */
 681  684                          membar_consumer();
 682  685                          (void) tcp_fuse_rcv_drain(peer_connp->conn_rq, peer_tcp,
 683  686                              NULL);
 684  687                  }
 685  688          }
 686  689          return (B_TRUE);
 687  690  unfuse:
 688  691          tcp_unfuse(tcp);
 689  692          return (B_FALSE);
 690  693  }
 691  694  
 692  695  /*
 693  696   * This routine gets called to deliver data upstream on a fused or
 694  697   * previously fused tcp loopback endpoint; the latter happens only
 695  698   * when there is a pending SIGURG signal plus urgent data that could
 696  699   * not be sent upstream earlier.
            *
            * q is the queue handed to putnext() for every message sent up and
            * tcp is the endpoint whose receive list is drained.  sigurg_mpp,
            * when non-NULL and a SIGURG is pending, points at a caller-supplied
            * mblk that is used for the M_PCSIG message; the caller's pointer is
            * set to NULL once the mblk has been taken.
            *
            * Returns B_TRUE when the receive list was drained here, or when the
            * M_PCSIG allocation failed and the push timer was re-armed instead.
            * Returns B_FALSE after sending SIGURG on an endpoint that is no
            * longer fused, leaving the data to the regular tcp_rcv_drain() path.
 697  700   */
 698  701  boolean_t
 699  702  tcp_fuse_rcv_drain(queue_t *q, tcp_t *tcp, mblk_t **sigurg_mpp)
 700  703  {
 701  704          mblk_t *mp;
 702  705          conn_t  *connp = tcp->tcp_connp;
 703  706  
 704  707  #ifdef DEBUG
 705  708          uint_t cnt = 0;
 706  709  #endif
 707  710          tcp_stack_t     *tcps = tcp->tcp_tcps;
 708  711          tcp_t           *peer_tcp = tcp->tcp_loopback_peer;
 709  712  
 710  713          ASSERT(tcp->tcp_loopback);
 711  714          ASSERT(tcp->tcp_fused || tcp->tcp_fused_sigurg);
 712  715          ASSERT(!tcp->tcp_fused || tcp->tcp_loopback_peer != NULL);
 713  716          ASSERT(IPCL_IS_NONSTR(connp) || sigurg_mpp != NULL || tcp->tcp_fused);
 714  717  
 715  718          /* No need for the push timer now, in case it was scheduled */
 716  719          if (tcp->tcp_push_tid != 0) {
 717  720                  (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
 718  721                  tcp->tcp_push_tid = 0;
 719  722          }
 720  723          /*
 721  724           * If there's urgent data sitting in receive list and we didn't
 722  725           * get a chance to send up a SIGURG signal, make sure we send
 723  726           * it first before draining in order to ensure that SIOCATMARK
 724  727           * works properly.
 725  728           */
 726  729          if (tcp->tcp_fused_sigurg) {
 727  730                  ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp));
 728  731  
                           /*
                            * NOTE(review): the flag is cleared before the M_PCSIG
                            * mblk is secured.  If both allocations below fail we
                            * return with the push timer armed but the flag already
                            * B_FALSE, so a later call skips this branch unless
                            * something outside this function sets the flag again.
                            * Confirm the SIGURG cannot be lost on that path.
                            */
 729  732                  tcp->tcp_fused_sigurg = B_FALSE;
 730  733                  /*
 731  734                   * sigurg_mpp is normally NULL, i.e. when we're still
 732  735                   * fused and didn't get here because of tcp_unfuse().
 733  736                   * In this case try hard to allocate the M_PCSIG mblk.
 734  737                   */
 735  738                  if (sigurg_mpp == NULL &&
 736  739                      (mp = allocb(1, BPRI_HI)) == NULL &&
 737  740                      (mp = allocb_tryhard(1)) == NULL) {
 738  741                          /* Alloc failed; try again next time */
 739  742                          tcp->tcp_push_tid = TCP_TIMER(tcp,
 740  743                              tcp_push_timer, tcps->tcps_push_timer_interval);
 741  744                          return (B_TRUE);
 742  745                  } else if (sigurg_mpp != NULL) {
 743  746                          /*
 744  747                           * Use the supplied M_PCSIG mblk; it means we're
 745  748                           * either unfused or in the process of unfusing,
 746  749                           * and the drain must happen now.
 747  750                           */
 748  751                          mp = *sigurg_mpp;
 749  752                          *sigurg_mpp = NULL;
 750  753                  }
 751  754                  ASSERT(mp != NULL);
 752  755  
 753  756                  /* Send up the signal */
 754  757                  DB_TYPE(mp) = M_PCSIG;
 755  758                  *mp->b_wptr++ = (uchar_t)SIGURG;
 756  759                  putnext(q, mp);
 757  760  
 758  761                  /*
 759  762                   * Let the regular tcp_rcv_drain() path handle
 760  763                   * draining the data if we're no longer fused.
 761  764                   */
 762  765                  if (!tcp->tcp_fused)
 763  766                          return (B_FALSE);
 764  767          }
 765  768  
 766  769          /* Drain the data */
 767  770          while ((mp = tcp->tcp_rcv_list) != NULL) {
 768  771                  tcp->tcp_rcv_list = mp->b_next;
 769  772                  mp->b_next = NULL;
 770  773  #ifdef DEBUG
 771  774                  cnt += msgdsize(mp);
 772  775  #endif
 773  776                  ASSERT(!IPCL_IS_NONSTR(connp));
 774  777                  putnext(q, mp);
 775  778                  TCP_STAT(tcps, tcp_fusion_putnext);
 776  779          }
 777  780  
                   /* DEBUG only: bytes sent up must match the queued byte count */
 778  781  #ifdef DEBUG
 779  782          ASSERT(cnt == tcp->tcp_rcv_cnt);
 780  783  #endif
                   /*
                    * The receive list is empty now; reset its bookkeeping and
                    * set the receive window back to the receive buffer size.
                    */
 781  784          tcp->tcp_rcv_last_head = NULL;
 782  785          tcp->tcp_rcv_last_tail = NULL;
 783  786          tcp->tcp_rcv_cnt = 0;
 784  787          tcp->tcp_rwnd = tcp->tcp_connp->conn_rcvbuf;
 785  788  
                   /*
                    * Back-enable the peer if it is flow-stopped and its unsent
                    * bytes are at or below its send low-water mark.  The peer's
                    * tcp_flow_stopped is examined under its tcp_non_sq_lock.
                    */
 786  789          mutex_enter(&peer_tcp->tcp_non_sq_lock);
 787  790          if (peer_tcp->tcp_flow_stopped && (TCP_UNSENT_BYTES(peer_tcp) <=
 788  791              peer_tcp->tcp_connp->conn_sndlowat)) {
 789  792                  tcp_clrqfull(peer_tcp);
 790  793                  TCP_STAT(tcps, tcp_fusion_backenabled);
 791  794          }
 792  795          mutex_exit(&peer_tcp->tcp_non_sq_lock);
 793  796  
 794  797          return (B_TRUE);
 795  798  }
 796  799  
 797  800  /*
 798  801   * Calculate the size of receive buffer for a fused tcp endpoint.
            *
            * The requested rwnd is capped at tcps_max_buf, rounded up to a
            * multiple of PAGESIZE and then, if it exceeds TCP_MAXWIN shifted
            * left by tcp_rcv_ws, reduced to fit.  The result is stored in both
            * conn_rcvbuf and tcp_rwnd and is also the return value.  The
            * endpoint must be fused (asserted).
 799  802   */
 800  803  size_t
 801  804  tcp_fuse_set_rcv_hiwat(tcp_t *tcp, size_t rwnd)
 802  805  {
 803  806          tcp_stack_t     *tcps = tcp->tcp_tcps;
 804  807          uint32_t        max_win;
 805  808  
 806  809          ASSERT(tcp->tcp_fused);
 807  810  
 808  811          /* Ensure that value is within the maximum upper bound */
 809  812          if (rwnd > tcps->tcps_max_buf)
 810  813                  rwnd = tcps->tcps_max_buf;
 811  814          /*
 812  815           * Round up to system page size in case SO_RCVBUF is modified
 813  816           * after SO_SNDBUF; the latter is also similarly rounded up.
 814  817           */
 815  818          rwnd = P2ROUNDUP_TYPED(rwnd, PAGESIZE, size_t);
 816  819          max_win = TCP_MAXWIN << tcp->tcp_rcv_ws;
                   /*
                    * When the rounded value exceeds max_win, fall back to the
                    * largest multiple of tcp_mss that fits in max_win; if that
                    * multiple is smaller than one tcp_mss, use max_win itself.
                    * NOTE(review): the modulo assumes tcp_mss is non-zero here;
                    * confirm against callers.
                    */
 817  820          if (rwnd > max_win) {
 818  821                  rwnd = max_win - (max_win % tcp->tcp_mss);
 819  822                  if (rwnd < tcp->tcp_mss)
 820  823                          rwnd = max_win;
 821  824          }
 822  825  
 823  826          /*
 824  827           * Record high water mark, this is used for flow-control
 825  828           * purposes in tcp_fuse_output().
 826  829           */
 827  830          tcp->tcp_connp->conn_rcvbuf = rwnd;
 828  831          tcp->tcp_rwnd = rwnd;
 829  832          return (rwnd);
 830  833  }
 831  834  
 832  835  /*
 833  836   * Calculate the maximum outstanding unread data block for a fused tcp endpoint.
            *
            * The value returned is the smaller of this endpoint's conn_sndbuf
            * and the peer's conn_rcvbuf, rounded up to a multiple of PAGESIZE
            * and then halved.  The endpoint must be fused with a non-NULL peer
            * whose conn_rcvbuf is non-zero (all asserted).
 834  837   */
 835  838  int
 836  839  tcp_fuse_maxpsz(tcp_t *tcp)
 837  840  {
 838  841          tcp_t *peer_tcp = tcp->tcp_loopback_peer;
 839  842          conn_t *connp = tcp->tcp_connp;
 840  843          uint_t sndbuf = connp->conn_sndbuf;
 841  844          uint_t maxpsz = sndbuf;
 842  845  
 843  846          ASSERT(tcp->tcp_fused);
 844  847          ASSERT(peer_tcp != NULL);
 845  848          ASSERT(peer_tcp->tcp_connp->conn_rcvbuf != 0);
 846  849          /*
 847  850           * In the fused loopback case, we want the stream head to split
 848  851           * up larger writes into smaller chunks for a more accurate flow-
 849  852           * control accounting.  Our maxpsz is half of the sender's send
 850  853           * buffer or the receiver's receive buffer, whichever is smaller.
 851  854           * We round up the buffer to system page size due to the lack of
 852  855           * TCP MSS concept in Fusion.
 853  856           */
 854  857          if (maxpsz > peer_tcp->tcp_connp->conn_rcvbuf)
 855  858                  maxpsz = peer_tcp->tcp_connp->conn_rcvbuf;
                   /* Round up to a page multiple first, then take half of it */
 856  859          maxpsz = P2ROUNDUP_TYPED(maxpsz, PAGESIZE, uint_t) >> 1;
 857  860  
                   /* Computed as uint_t; converted to int by the return type */
 858  861          return (maxpsz);
 859  862  }
 860  863  
 861  864  /*
 862  865   * Called to release flow control.
            *
            * Drains any data queued on tcp's receive list via
            * tcp_fuse_rcv_drain() and then, holding the peer's tcp_non_sq_lock,
            * clears the peer's queue-full state if the peer is flow-stopped and
            * its unsent bytes are at or below its send low-water mark.
            *
            * Both endpoints must be fused to each other, tcp must not be
            * detached, and both conns must share the same conn_sqp (all
            * asserted).
 863  866   */
 864  867  void
 865  868  tcp_fuse_backenable(tcp_t *tcp)
 866  869  {
 867  870          tcp_t *peer_tcp = tcp->tcp_loopback_peer;
 868  871  
 869  872          ASSERT(tcp->tcp_fused);
 870  873          ASSERT(peer_tcp != NULL && peer_tcp->tcp_fused);
 871  874          ASSERT(peer_tcp->tcp_loopback_peer == tcp);
 872  875          ASSERT(!TCP_IS_DETACHED(tcp));
 873  876          ASSERT(tcp->tcp_connp->conn_sqp ==
 874  877              peer_tcp->tcp_connp->conn_sqp);
 875  878  
                   /* Push anything still queued on our receive list upstream */
 876  879          if (tcp->tcp_rcv_list != NULL)
 877  880                  (void) tcp_fuse_rcv_drain(tcp->tcp_connp->conn_rq, tcp, NULL);
 878  881  
                   /* The peer's tcp_flow_stopped is examined under its lock */
 879  882          mutex_enter(&peer_tcp->tcp_non_sq_lock);
 880  883          if (peer_tcp->tcp_flow_stopped &&
 881  884              (TCP_UNSENT_BYTES(peer_tcp) <=
 882  885              peer_tcp->tcp_connp->conn_sndlowat)) {
 883  886                  tcp_clrqfull(peer_tcp);
 884  887          }
 885  888          mutex_exit(&peer_tcp->tcp_non_sq_lock);
 886  889  
                   /*
                    * The statistic is bumped on every call, whether or not
                    * tcp_clrqfull() was invoked above.
                    */
 887  890          TCP_STAT(tcp->tcp_tcps, tcp_fusion_backenabled);
 888  891  }

↓ open down ↓

222 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX