Print this page
11553 Want pluggable TCP congestion control algorithms
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>
Reviewed by: Robert Mustacchi <robert.mustacchi@joyent.com>


   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011, Joyent, Inc. All rights reserved.
  24  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  25  * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
  26  */
  27 /* Copyright (c) 1990 Mentat Inc. */
  28 
  29 #ifndef _INET_TCP_H
  30 #define _INET_TCP_H
  31 
  32 #ifdef  __cplusplus
  33 extern "C" {
  34 #endif
  35 
  36 #include <sys/inttypes.h>
  37 #include <netinet/ip6.h>
  38 #include <netinet/tcp.h>
  39 #include <sys/socket.h>
  40 #include <sys/socket_proto.h>
  41 #include <sys/md5.h>
  42 #include <inet/common.h>
  43 #include <inet/ip.h>
  44 #include <inet/ip6.h>
  45 #include <inet/mi.h>
  46 #include <inet/mib2.h>
  47 #include <inet/tcp_stack.h>
  48 #include <inet/tcp_sack.h>

  49 
  50 /* TCP states: negative before the connection is established, >= 0 after */
  51 #define TCPS_CLOSED             -6
  52 #define TCPS_IDLE               -5      /* idle (opened, but not bound) */
  53 #define TCPS_BOUND              -4      /* bound, ready to connect or accept */
  54 #define TCPS_LISTEN             -3      /* listening for connection */
  55 #define TCPS_SYN_SENT           -2      /* active, have sent syn */
  56 #define TCPS_SYN_RCVD           -1      /* have received syn (and sent ours) */
  57 /* states < TCPS_ESTABLISHED are those where connection is not established */
  58 #define TCPS_ESTABLISHED        0       /* established */
  59 #define TCPS_CLOSE_WAIT         1       /* received fin, waiting for close */
  60 /* states > TCPS_CLOSE_WAIT are those where user has closed */
  61 #define TCPS_FIN_WAIT_1         2       /* have closed and sent fin */
  62 #define TCPS_CLOSING            3       /* closed, exchanged FIN, await ACK */
  63 #define TCPS_LAST_ACK           4       /* had fin and close; await FIN ACK */
  64 /* states > TCPS_CLOSE_WAIT && < TCPS_FIN_WAIT_2 await ACK of FIN */
  65 #define TCPS_FIN_WAIT_2         5       /* have closed, fin is acked */
  66 #define TCPS_TIME_WAIT          6       /* in 2*msl quiet wait after close */
  67 
  68 /*


 135 struct conn_s;
 136 struct tcp_listen_cnt_s;
 137 
 138 /*
 139  * Control structure for each open TCP stream,
 140  * defined only within the kernel or for a kmem user.
 141  * NOTE: tcp_reinit_values MUST have a line for each field in this structure!
 142  */
 143 #if (defined(_KERNEL) || defined(_KMEMUSER))
 144 
 145 typedef struct tcp_s {
 146         struct tcp_s    *tcp_time_wait_next;
 147                                 /* Pointer to next T/W block */
 148         struct tcp_s    *tcp_time_wait_prev;
 149                                 /* Pointer to previous T/W block */
 150         int64_t         tcp_time_wait_expire;
 151 
 152         struct conn_s   *tcp_connp;     /* back pointer to conn_t */
 153         tcp_stack_t     *tcp_tcps;      /* back pointer to tcp_stack_t */
 154 



 155         int32_t tcp_state;
 156         int32_t tcp_rcv_ws;             /* My window scale power */
 157         int32_t tcp_snd_ws;             /* Sender's window scale power */
 158         uint32_t tcp_ts_recent;         /* Timestamp of earliest unacked */
 159                                         /*  data segment */
 160         clock_t tcp_rto;                /* Round trip timeout */
 161         int64_t tcp_last_rcv_lbolt;
 162                                 /* lbolt on last packet, used for PAWS */
 163         uint32_t tcp_rto_initial;       /* Initial RTO */
 164         uint32_t tcp_rto_min;           /* Minimum RTO */
 165         uint32_t tcp_rto_max;           /* Maximum RTO */
 166 
 167         uint32_t tcp_snxt;              /* Sender's next seq num */
 168         uint32_t tcp_swnd;              /* Sender's window (relative to suna) */
 169         uint32_t tcp_mss;               /* Max segment size */
 170         uint32_t tcp_iss;               /* Initial send seq num */
 171         uint32_t tcp_rnxt;              /* Seq we expect to recv next */
 172         uint32_t tcp_rwnd;
 173 
 174         /* Fields arranged in approximate access order along main paths */


 489         uint32_t                tcp_fin_wait_2_flush_interval;
 490 
 491         tcp_conn_stats_t        tcp_cs;
 492 
 493 #ifdef DEBUG
 494         pc_t                    tcmp_stk[15];   /* DEBUG builds only: 15 pc_t slots; presumably filled via TCP_DEBUG_GETPCSTACK -- TODO confirm */
 495 #endif
 496 } tcp_t;
 497 
 498 #ifdef DEBUG    /* DEBUG: call getpcstack(buffer, depth), discarding its result */
 499 #define TCP_DEBUG_GETPCSTACK(buffer, depth)     ((void) getpcstack(buffer, \
 500                                                     depth))
 501 #else           /* !DEBUG: the macro expands to nothing */
 502 #define TCP_DEBUG_GETPCSTACK(buffer, depth)
 503 #endif          /* DEBUG */
 504 
 505 extern void     tcp_conn_reclaim(void *);
 506 extern void     tcp_free(tcp_t *tcp);
 507 extern void     tcp_ddi_g_init(void);
 508 extern void     tcp_ddi_g_destroy(void);
 509 extern void     *tcp_get_conn(void *arg, tcp_stack_t *);
 510 extern mblk_t   *tcp_snmp_get(queue_t *, mblk_t *, boolean_t);
 511 extern int      tcp_snmp_set(queue_t *, int, int, uchar_t *, int len);
 512 
 513 /* Byte budget used to size tf_pad in tf_t, to avoid false cache line sharing. */
 514 #define TF_CACHEL_PAD   64
 515 
 516 /*
 517  * The TCP Fanout structure for bind and acceptor hashes.
 518  * The hash tables and their linkage (tcp_*_hash, tcp_ptp*hn) are
 519  * protected by the per-bucket tf_lock.  Each tcp_t
 520  * inserted in the list points back at this lock using tcp_*_lockp.
 521  *
 522  * The bind and acceptor hash queues are lists of tcp_t.
 523  */
 524 /* listener hash and acceptor hash queue head */
 525 typedef struct tf_s {
 526         tcp_t           *tf_tcp;
 527         kmutex_t        tf_lock;
 528         unsigned char   tf_pad[TF_CACHEL_PAD -
 529             (sizeof (tcp_t *) + sizeof (kmutex_t))];




   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011, Joyent, Inc. All rights reserved.
  24  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  25  * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
  26  */
  27 /* Copyright (c) 1990 Mentat Inc. */
  28 
  29 #ifndef _INET_TCP_H
  30 #define _INET_TCP_H
  31 
  32 #ifdef  __cplusplus
  33 extern "C" {
  34 #endif
  35 
  36 #include <sys/inttypes.h>
  37 #include <netinet/ip6.h>
  38 #include <netinet/tcp.h>
  39 #include <sys/socket.h>
  40 #include <sys/socket_proto.h>
  41 #include <sys/md5.h>
  42 #include <inet/common.h>
  43 #include <inet/ip.h>
  44 #include <inet/ip6.h>
  45 #include <inet/mi.h>
  46 #include <inet/mib2.h>
  47 #include <inet/tcp_stack.h>
  48 #include <inet/tcp_sack.h>
  49 #include <inet/cc.h>
  50 
  51 /* TCP states: negative before the connection is established, >= 0 after */
  52 #define TCPS_CLOSED             -6
  53 #define TCPS_IDLE               -5      /* idle (opened, but not bound) */
  54 #define TCPS_BOUND              -4      /* bound, ready to connect or accept */
  55 #define TCPS_LISTEN             -3      /* listening for connection */
  56 #define TCPS_SYN_SENT           -2      /* active, have sent syn */
  57 #define TCPS_SYN_RCVD           -1      /* have received syn (and sent ours) */
  58 /* states < TCPS_ESTABLISHED are those where connection is not established */
  59 #define TCPS_ESTABLISHED        0       /* established */
  60 #define TCPS_CLOSE_WAIT         1       /* received fin, waiting for close */
  61 /* states > TCPS_CLOSE_WAIT are those where user has closed */
  62 #define TCPS_FIN_WAIT_1         2       /* have closed and sent fin */
  63 #define TCPS_CLOSING            3       /* closed, exchanged FIN, await ACK */
  64 #define TCPS_LAST_ACK           4       /* had fin and close; await FIN ACK */
  65 /* states > TCPS_CLOSE_WAIT && < TCPS_FIN_WAIT_2 await ACK of FIN */
  66 #define TCPS_FIN_WAIT_2         5       /* have closed, fin is acked */
  67 #define TCPS_TIME_WAIT          6       /* in 2*msl quiet wait after close */
  68 
  69 /*


 136 struct conn_s;
 137 struct tcp_listen_cnt_s;
 138 
 139 /*
 140  * Control structure for each open TCP stream,
 141  * defined only within the kernel or for a kmem user.
 142  * NOTE: tcp_reinit_values MUST have a line for each field in this structure!
 143  */
 144 #if (defined(_KERNEL) || defined(_KMEMUSER))
 145 
 146 typedef struct tcp_s {
 147         struct tcp_s    *tcp_time_wait_next;
 148                                 /* Pointer to next T/W block */
 149         struct tcp_s    *tcp_time_wait_prev;
 150                                 /* Pointer to previous T/W block */
 151         int64_t         tcp_time_wait_expire;
 152 
 153         struct conn_s   *tcp_connp;     /* back pointer to conn_t */
 154         tcp_stack_t     *tcp_tcps;      /* back pointer to tcp_stack_t */
 155 
 156         struct cc_algo  *tcp_cc_algo;   /* congestion control algorithm */
 157         struct cc_var   tcp_ccv;        /* congestion control specific vars */
 158 
 159         int32_t tcp_state;
 160         int32_t tcp_rcv_ws;             /* My window scale power */
 161         int32_t tcp_snd_ws;             /* Sender's window scale power */
 162         uint32_t tcp_ts_recent;         /* Timestamp of earliest unacked */
 163                                         /*  data segment */
 164         clock_t tcp_rto;                /* Round trip timeout */
 165         int64_t tcp_last_rcv_lbolt;
 166                                 /* lbolt on last packet, used for PAWS */
 167         uint32_t tcp_rto_initial;       /* Initial RTO */
 168         uint32_t tcp_rto_min;           /* Minimum RTO */
 169         uint32_t tcp_rto_max;           /* Maximum RTO */
 170 
 171         uint32_t tcp_snxt;              /* Sender's next seq num */
 172         uint32_t tcp_swnd;              /* Sender's window (relative to suna) */
 173         uint32_t tcp_mss;               /* Max segment size */
 174         uint32_t tcp_iss;               /* Initial send seq num */
 175         uint32_t tcp_rnxt;              /* Seq we expect to recv next */
 176         uint32_t tcp_rwnd;
 177 
 178         /* Fields arranged in approximate access order along main paths */


 493         uint32_t                tcp_fin_wait_2_flush_interval;
 494 
 495         tcp_conn_stats_t        tcp_cs;
 496 
 497 #ifdef DEBUG
 498         pc_t                    tcmp_stk[15];   /* DEBUG builds only: 15 pc_t slots; presumably filled via TCP_DEBUG_GETPCSTACK -- TODO confirm */
 499 #endif
 500 } tcp_t;
 501 
 502 #ifdef DEBUG    /* DEBUG: call getpcstack(buffer, depth), discarding its result */
 503 #define TCP_DEBUG_GETPCSTACK(buffer, depth)     ((void) getpcstack(buffer, \
 504                                                     depth))
 505 #else           /* !DEBUG: the macro expands to nothing */
 506 #define TCP_DEBUG_GETPCSTACK(buffer, depth)
 507 #endif          /* DEBUG */
 508 
 509 extern void     tcp_conn_reclaim(void *);
 510 extern void     tcp_free(tcp_t *tcp);
 511 extern void     tcp_ddi_g_init(void);
 512 extern void     tcp_ddi_g_destroy(void);
 513 extern conn_t   *tcp_get_conn(void *arg, tcp_stack_t *);
 514 extern mblk_t   *tcp_snmp_get(queue_t *, mblk_t *, boolean_t);
 515 extern int      tcp_snmp_set(queue_t *, int, int, uchar_t *, int len);
 516 
 517 /* Byte budget used to size tf_pad in tf_t, to avoid false cache line sharing. */
 518 #define TF_CACHEL_PAD   64
 519 
 520 /*
 521  * The TCP Fanout structure for bind and acceptor hashes.
 522  * The hash tables and their linkage (tcp_*_hash, tcp_ptp*hn) are
 523  * protected by the per-bucket tf_lock.  Each tcp_t
 524  * inserted in the list points back at this lock using tcp_*_lockp.
 525  *
 526  * The bind and acceptor hash queues are lists of tcp_t.
 527  */
 528 /* listener hash and acceptor hash queue head */
 529 typedef struct tf_s {
 530         tcp_t           *tf_tcp;
 531         kmutex_t        tf_lock;
 532         unsigned char   tf_pad[TF_CACHEL_PAD -
 533             (sizeof (tcp_t *) + sizeof (kmutex_t))];