Print this page
11547 Want connstat(1M) command to display per-connection TCP statistics
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Portions contributed by: Ahmed G <ahmedg@delphix.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/inet/tcp/tcp_input.c
          +++ new/usr/src/uts/common/inet/tcp/tcp_input.c
↓ open down ↓ 551 lines elided ↑ open up ↑
 552  552  
 553  553  /*
 554  554   * Add a new piece to the tcp reassembly queue.  If the gap at the beginning
 555  555   * is filled, return as much as we can.  The message passed in may be
 556  556   * multi-part, chained using b_cont.  "start" is the starting sequence
 557  557   * number for this piece.
 558  558   */
 559  559  static mblk_t *
 560  560  tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start)
 561  561  {
 562      -        uint32_t        end;
      562 +        uint32_t        end, bytes;
 563  563          mblk_t          *mp1;
 564  564          mblk_t          *mp2;
 565  565          mblk_t          *next_mp;
 566  566          uint32_t        u1;
 567  567          tcp_stack_t     *tcps = tcp->tcp_tcps;
 568  568  
 569  569  
 570  570          /* Walk through all the new pieces. */
 571  571          do {
 572  572                  ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <=
 573  573                      (uintptr_t)INT_MAX);
 574  574                  end = start + (int)(mp->b_wptr - mp->b_rptr);
 575  575                  next_mp = mp->b_cont;
 576  576                  if (start == end) {
 577  577                          /* Empty.  Blast it. */
 578  578                          freeb(mp);
 579  579                          continue;
 580  580                  }
      581 +                bytes = end - start;
 581  582                  mp->b_cont = NULL;
 582  583                  TCP_REASS_SET_SEQ(mp, start);
 583  584                  TCP_REASS_SET_END(mp, end);
 584  585                  mp1 = tcp->tcp_reass_tail;
 585      -                if (!mp1) {
      586 +                if (mp1 == NULL || SEQ_GEQ(start, TCP_REASS_END(mp1))) {
      587 +                        if (mp1 != NULL) {
      588 +                                /*
      589 +                                 * New stuff is beyond the tail; link it on the
      590 +                                 * end.
      591 +                                 */
      592 +                                mp1->b_cont = mp;
      593 +                        } else {
      594 +                                tcp->tcp_reass_head = mp;
      595 +                        }
 586  596                          tcp->tcp_reass_tail = mp;
 587      -                        tcp->tcp_reass_head = mp;
 588  597                          TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs);
 589      -                        TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes,
 590      -                            end - start);
      598 +                        TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes, bytes);
      599 +                        tcp->tcp_cs.tcp_in_data_unorder_segs++;
      600 +                        tcp->tcp_cs.tcp_in_data_unorder_bytes += bytes;
 591  601                          continue;
 592  602                  }
 593      -                /* New stuff completely beyond tail? */
 594      -                if (SEQ_GEQ(start, TCP_REASS_END(mp1))) {
 595      -                        /* Link it on end. */
 596      -                        mp1->b_cont = mp;
 597      -                        tcp->tcp_reass_tail = mp;
 598      -                        TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs);
 599      -                        TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes,
 600      -                            end - start);
 601      -                        continue;
 602      -                }
 603  603                  mp1 = tcp->tcp_reass_head;
 604  604                  u1 = TCP_REASS_SEQ(mp1);
 605  605                  /* New stuff at the front? */
 606  606                  if (SEQ_LT(start, u1)) {
 607  607                          /* Yes... Check for overlap. */
 608  608                          mp->b_cont = mp1;
 609  609                          tcp->tcp_reass_head = mp;
 610  610                          tcp_reass_elim_overlap(tcp, mp);
 611  611                          continue;
 612  612                  }
↓ open down ↓ 1794 lines elided ↑ open up ↑
2407 2407                   * that it is also updated for tcp structure that belongs to
2408 2408                   * global and listener queues which do not really need updating.
2409 2409                   * But that should not cause any harm.  And it is updated for
2410 2410                   * all kinds of incoming segments, not only for data segments.
2411 2411                   */
2412 2412                  tcp->tcp_last_recv_time = LBOLT_FASTPATH;
2413 2413          }
2414 2414  
2415 2415          flags = (unsigned int)tcpha->tha_flags & 0xFF;
2416 2416  
2417      -        BUMP_LOCAL(tcp->tcp_ibsegs);
     2417 +        TCPS_BUMP_MIB(tcps, tcpHCInSegs);
2418 2418          DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);
2419 2419  
2420 2420          if ((flags & TH_URG) && sqp != NULL) {
2421 2421                  /*
2422 2422                   * TCP can't handle urgent pointers that arrive before
2423 2423                   * the connection has been accept()ed since it can't
2424 2424                   * buffer OOB data.  Discard segment if this happens.
2425 2425                   *
2426 2426                   * We can't just rely on a non-null tcp_listener to indicate
2427 2427                   * that the accept() has completed since unlinking of the
↓ open down ↓ 224 lines elided ↑ open up ↑
2652 2652                                   * final ACK triggers the passive side to
2653 2653                                   * perform fusion in ESTABLISHED state.
2654 2654                                   */
2655 2655                                  if ((ack_mp = tcp_ack_mp(tcp)) != NULL) {
2656 2656                                          if (tcp->tcp_ack_tid != 0) {
2657 2657                                                  (void) TCP_TIMER_CANCEL(tcp,
2658 2658                                                      tcp->tcp_ack_tid);
2659 2659                                                  tcp->tcp_ack_tid = 0;
2660 2660                                          }
2661 2661                                          tcp_send_data(tcp, ack_mp);
2662      -                                        BUMP_LOCAL(tcp->tcp_obsegs);
     2662 +                                        TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
2663 2663                                          TCPS_BUMP_MIB(tcps, tcpOutAck);
2664 2664  
2665 2665                                          if (!IPCL_IS_NONSTR(connp)) {
2666 2666                                                  /* Send up T_CONN_CON */
2667 2667                                                  if (ira->ira_cred != NULL) {
2668 2668                                                          mblk_setcred(mp1,
2669 2669                                                              ira->ira_cred,
2670 2670                                                              ira->ira_cpid);
2671 2671                                                  }
2672 2672                                                  putnext(connp->conn_rq, mp1);
↓ open down ↓ 368 lines elided ↑ open up ↑
3041 3041          }
3042 3042          /*
3043 3043           * rgap is the amount of stuff received out of window.  A negative
3044 3044           * value is the amount out of window.
3045 3045           */
3046 3046          if (rgap < 0) {
3047 3047                  mblk_t  *mp2;
3048 3048  
3049 3049                  if (tcp->tcp_rwnd == 0) {
3050 3050                          TCPS_BUMP_MIB(tcps, tcpInWinProbe);
     3051 +                        tcp->tcp_cs.tcp_in_zwnd_probes++;
3051 3052                  } else {
3052 3053                          TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs);
3053 3054                          TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap);
3054 3055                  }
3055 3056  
3056 3057                  /*
3057 3058                   * seg_len does not include the FIN, so if more than
3058 3059                   * just the FIN is out of window, we act like we don't
3059 3060                   * see it.  (If just the FIN is out of window, rgap
3060 3061                   * will be zero and we will go ahead and acknowledge
↓ open down ↓ 229 lines elided ↑ open up ↑
3290 3291                                      tcp->tcp_reass_tid == 0) {
3291 3292                                          tcp->tcp_reass_tid = TCP_TIMER(tcp,
3292 3293                                              tcp_reass_timer,
3293 3294                                              tcps->tcps_reass_timeout);
3294 3295                                  }
3295 3296                          }
3296 3297                  }
3297 3298          } else if (seg_len > 0) {
3298 3299                  TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs);
3299 3300                  TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len);
     3301 +                tcp->tcp_cs.tcp_in_data_inorder_segs++;
     3302 +                tcp->tcp_cs.tcp_in_data_inorder_bytes += seg_len;
     3303 +
3300 3304                  /*
3301 3305                   * If an out of order FIN was received before, and the seq
3302 3306                   * num and len of the new segment match that of the FIN,
3303 3307                   * put the FIN flag back in.
3304 3308                   */
3305 3309                  if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) &&
3306 3310                      seg_seq + seg_len == tcp->tcp_ofo_fin_seq) {
3307 3311                          flags |= TH_FIN;
3308 3312                          tcp->tcp_valid_bits &= ~TCP_OFO_FIN_VALID;
3309 3313                  }
↓ open down ↓ 829 lines elided ↑ open up ↑
4139 4143                              tcp_drop_ack_unsent_cnt) {
4140 4144                                  TCP_STAT(tcps, tcp_in_ack_unsent_drop);
4141 4145                                  if (tcp->tcp_in_ack_unsent > 2 *
4142 4146                                      tcp_drop_ack_unsent_cnt) {
4143 4147                                          (void) tcp_clean_death(tcp, EPROTO);
4144 4148                                  }
4145 4149                                  return;
4146 4150                          }
4147 4151                          mp = tcp_ack_mp(tcp);
4148 4152                          if (mp != NULL) {
4149      -                                BUMP_LOCAL(tcp->tcp_obsegs);
     4153 +                                TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
4150 4154                                  TCPS_BUMP_MIB(tcps, tcpOutAck);
4151 4155                                  tcp_send_data(tcp, mp);
4152 4156                          }
4153 4157                          return;
4154 4158                  }
4155 4159          } else if (tcp->tcp_is_wnd_shrnk && SEQ_GEQ(seg_ack,
4156 4160              tcp->tcp_snxt_shrunk)) {
4157 4161                          tcp->tcp_is_wnd_shrnk = B_FALSE;
4158 4162          }
4159 4163  
↓ open down ↓ 670 lines elided ↑ open up ↑
4830 4834                              NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size,
4831 4835                              B_TRUE);
4832 4836  
4833 4837                          if (mp1 != NULL) {
4834 4838                                  tcp->tcp_xmit_head->b_prev =
4835 4839                                      (mblk_t *)(intptr_t)gethrtime();
4836 4840                                  tcp->tcp_csuna = tcp->tcp_snxt;
4837 4841                                  TCPS_BUMP_MIB(tcps, tcpRetransSegs);
4838 4842                                  TCPS_UPDATE_MIB(tcps, tcpRetransBytes,
4839 4843                                      snd_size);
     4844 +                                tcp->tcp_cs.tcp_out_retrans_segs++;
     4845 +                                tcp->tcp_cs.tcp_out_retrans_bytes += snd_size;
4840 4846                                  tcp_send_data(tcp, mp1);
4841 4847                          }
4842 4848                  }
4843 4849                  if (flags & TH_NEED_SACK_REXMIT) {
4844 4850                          tcp_sack_rexmit(tcp, &flags);
4845 4851                  }
4846 4852                  /*
4847 4853                   * For TH_LIMIT_XMIT, tcp_wput_data() is called to send
4848 4854                   * out new segment.  Note that tcp_rexmit should not be
4849 4855                   * set, otherwise TH_LIMIT_XMIT should not be set.
↓ open down ↓ 55 lines elided ↑ open up ↑
4905 4911                  flags &= ~TH_SEND_URP_MARK;
4906 4912          }
4907 4913          if (flags & TH_ACK_NEEDED) {
4908 4914                  /*
4909 4915                   * Time to send an ack for some reason.
4910 4916                   */
4911 4917                  mp1 = tcp_ack_mp(tcp);
4912 4918  
4913 4919                  if (mp1 != NULL) {
4914 4920                          tcp_send_data(tcp, mp1);
4915      -                        BUMP_LOCAL(tcp->tcp_obsegs);
     4921 +                        TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
4916 4922                          TCPS_BUMP_MIB(tcps, tcpOutAck);
4917 4923                  }
4918 4924                  if (tcp->tcp_ack_tid != 0) {
4919 4925                          (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
4920 4926                          tcp->tcp_ack_tid = 0;
4921 4927                  }
4922 4928          }
4923 4929          if (flags & TH_ACK_TIMER_NEEDED) {
4924 4930                  /*
4925 4931                   * Arrange for deferred ACK or push wait timeout.
↓ open down ↓ 295 lines elided ↑ open up ↑
5221 5227  static void
5222 5228  tcp_set_rto(tcp_t *tcp, hrtime_t rtt)
5223 5229  {
5224 5230          hrtime_t m = rtt;
5225 5231          hrtime_t sa = tcp->tcp_rtt_sa;
5226 5232          hrtime_t sv = tcp->tcp_rtt_sd;
5227 5233          tcp_stack_t *tcps = tcp->tcp_tcps;
5228 5234  
5229 5235          TCPS_BUMP_MIB(tcps, tcpRttUpdate);
5230 5236          tcp->tcp_rtt_update++;
     5237 +        tcp->tcp_rtt_sum += m;
     5238 +        tcp->tcp_rtt_cnt++;
5231 5239  
5232 5240          /* A nonzero tcp_rtt_sa means this is a new sample. */
5233 5241          if (sa != 0) {
5234 5242                  /*
5235 5243                   * Update average estimator (see section 2.3 of RFC6298):
5236 5244                   *      SRTT = 7/8 SRTT + 1/8 rtt
5237 5245                   *
5238 5246                   * We maintain tcp_rtt_sa as 8 * SRTT, so this reduces to:
5239 5247                   *      tcp_rtt_sa = 7 * SRTT + rtt
5240 5248                   *      tcp_rtt_sa = 7 * (tcp_rtt_sa / 8) + rtt
↓ open down ↓ 547 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX