Print this page
11554 Want TCP_CONGESTION socket option
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>


   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  24  * Copyright 2016 Joyent, Inc.
  25  * Copyright (c) 2016 by Delphix. All rights reserved.
  26  */
  27 
  28 #include <sys/types.h>
  29 #include <sys/stream.h>
  30 #define _SUN_TPI_VERSION 2
  31 #include <sys/tihdr.h>
  32 #include <sys/socket.h>
  33 #include <sys/xti_xtiopt.h>
  34 #include <sys/xti_inet.h>
  35 #include <sys/policy.h>
  36 

  37 #include <inet/common.h>
  38 #include <netinet/ip6.h>
  39 #include <inet/ip.h>
  40 
  41 #include <netinet/in.h>
  42 #include <netinet/tcp.h>
  43 #include <inet/optcom.h>
  44 #include <inet/proto_set.h>
  45 #include <inet/tcp_impl.h>
  46 
  47 static int      tcp_opt_default(queue_t *, int, int, uchar_t *);
  48 
  49 /*
  50  * Table of all known options handled on a TCP protocol stack.
  51  *
  52  * Note: This table contains options processed by both TCP and IP levels
  53  *       and is the superset of options that can be performed on a TCP over IP
  54  *       stack.
  55  */
  56 opdes_t tcp_opt_arr[] = {


 124 
 125 { TCP_KEEPIDLE, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 126 
 127 { TCP_KEEPCNT, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 128 
 129 { TCP_KEEPINTVL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 130 
 131 { TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
 132         sizeof (int), 0 },
 133 
 134 { TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 135 
 136 { TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
 137 
 138 { TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
 139 
 140 { TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
 141 
 142 { TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 143 



 144 { IP_OPTIONS,   IPPROTO_IP, OA_RW, OA_RW, OP_NP,
 145         (OP_VARLEN|OP_NODEFAULT),
 146         IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
 147 { T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
 148         (OP_VARLEN|OP_NODEFAULT),
 149         IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
 150 
 151 { IP_TOS,       IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 152 { T_IP_TOS,     IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 153 { IP_TTL,       IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
 154         sizeof (int), -1 /* not initialized */ },
 155 
 156 { IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
 157         sizeof (ipsec_req_t), -1 /* not initialized */ },
 158 
 159 { IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
 160         sizeof (int),   0 /* no ifindex */ },
 161 
 162 { IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
 163         sizeof (int), 0 },


 416                  * TCP_KEEPIDLE expects value in seconds, but
 417                  * tcp_ka_interval is in milliseconds.
 418                  */
 419                 case TCP_KEEPIDLE:
 420                         *i1 = tcp->tcp_ka_interval / 1000;
 421                         return (sizeof (int));
 422                 case TCP_KEEPCNT:
 423                         *i1 = tcp->tcp_ka_cnt;
 424                         return (sizeof (int));
 425 
 426                 /*
 427                  * TCP_KEEPINTVL expects value in seconds, but
 428                  * tcp_ka_rinterval is in milliseconds.
 429                  */
 430                 case TCP_KEEPINTVL:
 431                         *i1 = tcp->tcp_ka_rinterval / 1000;
 432                         return (sizeof (int));
 433                 case TCP_KEEPALIVE_ABORT_THRESHOLD:
 434                         *i1 = tcp->tcp_ka_abort_thres;
 435                         return (sizeof (int));







 436                 case TCP_CORK:
 437                         *i1 = tcp->tcp_cork;
 438                         return (sizeof (int));
 439                 case TCP_RTO_INITIAL:
 440                         *i1 = tcp->tcp_rto_initial;
 441                         return (sizeof (uint32_t));
 442                 case TCP_RTO_MIN:
 443                         *i1 = tcp->tcp_rto_min;
 444                         return (sizeof (uint32_t));
 445                 case TCP_RTO_MAX:
 446                         *i1 = tcp->tcp_rto_max;
 447                         return (sizeof (uint32_t));
 448                 case TCP_LINGER2:
 449                         *i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS;
 450                         return (sizeof (int));
 451                 }
 452                 break;
 453         case IPPROTO_IP:
 454                 if (connp->conn_family != AF_INET)
 455                         return (-1);


 837                                         return (EINVAL);
 838                                 tcp->tcp_ka_abort_thres =
 839                                     (*i1 * tcp->tcp_ka_cnt * 1000);
 840                         }
 841                         tcp->tcp_ka_rinterval = *i1 * 1000;
 842                         break;
 843                 case TCP_KEEPALIVE_ABORT_THRESHOLD:
 844                         if (!checkonly) {
 845                                 if (*i1 <
 846                                     tcps->tcps_keepalive_abort_interval_low ||
 847                                     *i1 >
 848                                     tcps->tcps_keepalive_abort_interval_high) {
 849                                         *outlenp = 0;
 850                                         return (EINVAL);
 851                                 }
 852                                 tcp->tcp_ka_abort_thres = *i1;
 853                                 tcp->tcp_ka_cnt = 0;
 854                                 tcp->tcp_ka_rinterval = 0;
 855                         }
 856                         break;



































 857                 case TCP_CORK:
 858                         if (!checkonly) {
 859                                 /*
 860                                  * If tcp->tcp_cork was set and is now
 861                                  * being unset, we have to make sure that
 862                                  * the remaining data gets sent out. Also
 863                                  * unset tcp->tcp_cork so that tcp_wput_data()
 864                                  * can send data even if it is less than MSS.
 865                                  */
 866                                 if (tcp->tcp_cork && onoff == 0 &&
 867                                     tcp->tcp_unsent > 0) {
 868                                         tcp->tcp_cork = B_FALSE;
 869                                         tcp_wput_data(tcp, NULL, B_FALSE);
 870                                 }
 871                                 tcp->tcp_cork = onoff;
 872                         }
 873                         break;
 874                 case TCP_RTO_INITIAL:
 875                         if (checkonly || val == 0)
 876                                 break;




   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  24  * Copyright 2019 Joyent, Inc.
  25  * Copyright (c) 2016 by Delphix. All rights reserved.
  26  */
  27 
  28 #include <sys/types.h>
  29 #include <sys/stream.h>
  30 #define _SUN_TPI_VERSION 2
  31 #include <sys/tihdr.h>
  32 #include <sys/socket.h>
  33 #include <sys/xti_xtiopt.h>
  34 #include <sys/xti_inet.h>
  35 #include <sys/policy.h>
  36 
  37 #include <inet/cc.h>
  38 #include <inet/common.h>
  39 #include <netinet/ip6.h>
  40 #include <inet/ip.h>
  41 
  42 #include <netinet/in.h>
  43 #include <netinet/tcp.h>
  44 #include <inet/optcom.h>
  45 #include <inet/proto_set.h>
  46 #include <inet/tcp_impl.h>
  47 
  48 static int      tcp_opt_default(queue_t *, int, int, uchar_t *);
  49 
  50 /*
  51  * Table of all known options handled on a TCP protocol stack.
  52  *
  53  * Note: This table contains options processed by both TCP and IP levels
  54  *       and is the superset of options that can be performed on a TCP over IP
  55  *       stack.
  56  */
  57 opdes_t tcp_opt_arr[] = {


 125 
 126 { TCP_KEEPIDLE, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 127 
 128 { TCP_KEEPCNT, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 129 
 130 { TCP_KEEPINTVL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 131 
 132 { TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
 133         sizeof (int), 0 },
 134 
 135 { TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 136 
 137 { TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
 138 
 139 { TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
 140 
 141 { TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
 142 
 143 { TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 144 
 145 { TCP_CONGESTION, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
 146         OP_VARLEN, CC_ALGO_NAME_MAX, 0 },
 147 
 148 { IP_OPTIONS,   IPPROTO_IP, OA_RW, OA_RW, OP_NP,
 149         (OP_VARLEN|OP_NODEFAULT),
 150         IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
 151 { T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
 152         (OP_VARLEN|OP_NODEFAULT),
 153         IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
 154 
 155 { IP_TOS,       IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 156 { T_IP_TOS,     IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
 157 { IP_TTL,       IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
 158         sizeof (int), -1 /* not initialized */ },
 159 
 160 { IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
 161         sizeof (ipsec_req_t), -1 /* not initialized */ },
 162 
 163 { IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
 164         sizeof (int),   0 /* no ifindex */ },
 165 
 166 { IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
 167         sizeof (int), 0 },


 420                  * TCP_KEEPIDLE expects value in seconds, but
 421                  * tcp_ka_interval is in milliseconds.
 422                  */
 423                 case TCP_KEEPIDLE:
 424                         *i1 = tcp->tcp_ka_interval / 1000;
 425                         return (sizeof (int));
 426                 case TCP_KEEPCNT:
 427                         *i1 = tcp->tcp_ka_cnt;
 428                         return (sizeof (int));
 429 
 430                 /*
 431                  * TCP_KEEPINTVL expects value in seconds, but
 432                  * tcp_ka_rinterval is in milliseconds.
 433                  */
 434                 case TCP_KEEPINTVL:
 435                         *i1 = tcp->tcp_ka_rinterval / 1000;
 436                         return (sizeof (int));
 437                 case TCP_KEEPALIVE_ABORT_THRESHOLD:
 438                         *i1 = tcp->tcp_ka_abort_thres;
 439                         return (sizeof (int));
 440                 case TCP_CONGESTION: {
 441                         size_t len = strlcpy((char *)ptr, CC_ALGO(tcp)->name,
 442                             CC_ALGO_NAME_MAX);
 443                         if (len >= CC_ALGO_NAME_MAX)
 444                                 return (-1);
 445                         return (len + 1);
 446                 }
 447                 case TCP_CORK:
 448                         *i1 = tcp->tcp_cork;
 449                         return (sizeof (int));
 450                 case TCP_RTO_INITIAL:
 451                         *i1 = tcp->tcp_rto_initial;
 452                         return (sizeof (uint32_t));
 453                 case TCP_RTO_MIN:
 454                         *i1 = tcp->tcp_rto_min;
 455                         return (sizeof (uint32_t));
 456                 case TCP_RTO_MAX:
 457                         *i1 = tcp->tcp_rto_max;
 458                         return (sizeof (uint32_t));
 459                 case TCP_LINGER2:
 460                         *i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS;
 461                         return (sizeof (int));
 462                 }
 463                 break;
 464         case IPPROTO_IP:
 465                 if (connp->conn_family != AF_INET)
 466                         return (-1);


 848                                         return (EINVAL);
 849                                 tcp->tcp_ka_abort_thres =
 850                                     (*i1 * tcp->tcp_ka_cnt * 1000);
 851                         }
 852                         tcp->tcp_ka_rinterval = *i1 * 1000;
 853                         break;
 854                 case TCP_KEEPALIVE_ABORT_THRESHOLD:
 855                         if (!checkonly) {
 856                                 if (*i1 <
 857                                     tcps->tcps_keepalive_abort_interval_low ||
 858                                     *i1 >
 859                                     tcps->tcps_keepalive_abort_interval_high) {
 860                                         *outlenp = 0;
 861                                         return (EINVAL);
 862                                 }
 863                                 tcp->tcp_ka_abort_thres = *i1;
 864                                 tcp->tcp_ka_cnt = 0;
 865                                 tcp->tcp_ka_rinterval = 0;
 866                         }
 867                         break;
 868                 case TCP_CONGESTION: {
 869                         struct cc_algo *algo;
 870 
 871                         if (checkonly) {
 872                                 break;
 873                         }
 874 
 875                         /*
 876                          * Make sure the string is NUL-terminated. Some
 877                          * consumers pass only the number of characters
 878                          * in the string, and don't include the NUL
 879                          * terminator, so we set it for them.
 880                          */
 881                         if (inlen < CC_ALGO_NAME_MAX) {
 882                                 invalp[inlen] = '\0';
 883                         }
 884                         invalp[CC_ALGO_NAME_MAX - 1] = '\0';
 885 
 886                         if ((algo = cc_load_algo((char *)invalp)) == NULL) {
 887                                 return (ENOENT);
 888                         }
 889 
 890                         if (CC_ALGO(tcp)->cb_destroy != NULL) {
 891                                 CC_ALGO(tcp)->cb_destroy(&tcp->tcp_ccv);
 892                         }
 893 
 894                         CC_DATA(tcp) = NULL;
 895                         CC_ALGO(tcp) = algo;
 896 
 897                         if (CC_ALGO(tcp)->cb_init != NULL) {
 898                                 VERIFY0(CC_ALGO(tcp)->cb_init(&tcp->tcp_ccv));
 899                         }
 900 
 901                         break;
 902                 }
 903                 case TCP_CORK:
 904                         if (!checkonly) {
 905                                 /*
 906                                  * If tcp->tcp_cork was set and is now
 907                                  * being unset, we have to make sure that
 908                                  * the remaining data gets sent out. Also
 909                                  * unset tcp->tcp_cork so that tcp_wput_data()
 910                                  * can send data even if it is less than MSS.
 911                                  */
 912                                 if (tcp->tcp_cork && onoff == 0 &&
 913                                     tcp->tcp_unsent > 0) {
 914                                         tcp->tcp_cork = B_FALSE;
 915                                         tcp_wput_data(tcp, NULL, B_FALSE);
 916                                 }
 917                                 tcp->tcp_cork = onoff;
 918                         }
 919                         break;
 920                 case TCP_RTO_INITIAL:
 921                         if (checkonly || val == 0)
 922                                 break;