1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011, Joyent Inc. All rights reserved.
  24  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  25  */
  26 /* Copyright (c) 1990 Mentat Inc. */
  27 
  28 #include <inet/ip.h>
  29 #include <inet/tcp_impl.h>
  30 #include <sys/multidata.h>
  31 #include <sys/sunddi.h>
  32 
  33 /* Max size IP datagram is 64k - 1 */
  34 #define TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcpha_t)))
  35 #define TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcpha_t)))
  36 
  37 /* Max of the above */
  38 #define TCP_MSS_MAX             TCP_MSS_MAX_IPV4
  39 
  40 #define TCP_XMIT_LOWATER        4096
  41 #define TCP_XMIT_HIWATER        49152
  42 #define TCP_RECV_LOWATER        2048
  43 #define TCP_RECV_HIWATER        128000
  44 
  45 /*
  46  * Set the RFC 1948 pass phrase
  47  */
  48 /* ARGSUSED */
  49 static int
  50 tcp_set_1948phrase(void *cbarg,  cred_t *cr, mod_prop_info_t *pinfo,
  51     const char *ifname, const void* pr_val, uint_t flags)
  52 {
  53         tcp_stack_t     *tcps = (tcp_stack_t *)cbarg;
  54 
  55         if (flags & MOD_PROP_DEFAULT)
  56                 return (ENOTSUP);
  57 
  58         /*
  59          * Basically, value contains a new pass phrase.  Pass it along!
  60          */
  61         tcp_iss_key_init((uint8_t *)pr_val, strlen(pr_val), tcps);
  62         return (0);
  63 }
  64 
  65 /*
  66  * returns the current list of listener limit configuration.
  67  */
  68 /* ARGSUSED */
  69 static int
  70 tcp_listener_conf_get(void *cbarg, mod_prop_info_t *pinfo, const char *ifname,
  71     void *val, uint_t psize, uint_t flags)
  72 {
  73         tcp_stack_t     *tcps = (tcp_stack_t *)cbarg;
  74         tcp_listener_t  *tl;
  75         char            *pval = val;
  76         size_t          nbytes = 0, tbytes = 0;
  77         uint_t          size;
  78         int             err = 0;
  79 
  80         bzero(pval, psize);
  81         size = psize;
  82 
  83         if (flags & (MOD_PROP_DEFAULT|MOD_PROP_PERM|MOD_PROP_POSSIBLE))
  84                 return (0);
  85 
  86         mutex_enter(&tcps->tcps_listener_conf_lock);
  87         for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL;
  88             tl = list_next(&tcps->tcps_listener_conf, tl)) {
  89                 if (psize == size)
  90                         nbytes = snprintf(pval, size, "%d:%d",  tl->tl_port,
  91                             tl->tl_ratio);
  92                 else
  93                         nbytes = snprintf(pval, size, ",%d:%d",  tl->tl_port,
  94                             tl->tl_ratio);
  95                 size -= nbytes;
  96                 pval += nbytes;
  97                 tbytes += nbytes;
  98                 if (tbytes >= psize) {
  99                         /* Buffer overflow, stop copying information */
 100                         err = ENOBUFS;
 101                         break;
 102                 }
 103         }
 104 
 105         mutex_exit(&tcps->tcps_listener_conf_lock);
 106         return (err);
 107 }
 108 
 109 /*
 110  * add a new listener limit configuration.
 111  */
 112 /* ARGSUSED */
 113 static int
 114 tcp_listener_conf_add(void *cbarg, cred_t *cr, mod_prop_info_t *pinfo,
 115     const char *ifname, const void* pval, uint_t flags)
 116 {
 117         tcp_listener_t  *new_tl;
 118         tcp_listener_t  *tl;
 119         long            lport;
 120         long            ratio;
 121         char            *colon;
 122         tcp_stack_t     *tcps = (tcp_stack_t *)cbarg;
 123 
 124         if (flags & MOD_PROP_DEFAULT)
 125                 return (ENOTSUP);
 126 
 127         if (ddi_strtol(pval, &colon, 10, &lport) != 0 || lport <= 0 ||
 128             lport > USHRT_MAX || *colon != ':') {
 129                 return (EINVAL);
 130         }
 131         if (ddi_strtol(colon + 1, NULL, 10, &ratio) != 0 || ratio <= 0)
 132                 return (EINVAL);
 133 
 134         mutex_enter(&tcps->tcps_listener_conf_lock);
 135         for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL;
 136             tl = list_next(&tcps->tcps_listener_conf, tl)) {
 137                 /* There is an existing entry, so update its ratio value. */
 138                 if (tl->tl_port == lport) {
 139                         tl->tl_ratio = ratio;
 140                         mutex_exit(&tcps->tcps_listener_conf_lock);
 141                         return (0);
 142                 }
 143         }
 144 
 145         if ((new_tl = kmem_alloc(sizeof (tcp_listener_t), KM_NOSLEEP)) ==
 146             NULL) {
 147                 mutex_exit(&tcps->tcps_listener_conf_lock);
 148                 return (ENOMEM);
 149         }
 150 
 151         new_tl->tl_port = lport;
 152         new_tl->tl_ratio = ratio;
 153         list_insert_tail(&tcps->tcps_listener_conf, new_tl);
 154         mutex_exit(&tcps->tcps_listener_conf_lock);
 155         return (0);
 156 }
 157 
 158 /*
 159  * remove a listener limit configuration.
 160  */
 161 /* ARGSUSED */
 162 static int
 163 tcp_listener_conf_del(void *cbarg, cred_t *cr, mod_prop_info_t *pinfo,
 164     const char *ifname, const void* pval, uint_t flags)
 165 {
 166         tcp_listener_t  *tl;
 167         long            lport;
 168         tcp_stack_t     *tcps = (tcp_stack_t *)cbarg;
 169 
 170         if (flags & MOD_PROP_DEFAULT)
 171                 return (ENOTSUP);
 172 
 173         if (ddi_strtol(pval, NULL, 10, &lport) != 0 || lport <= 0 ||
 174             lport > USHRT_MAX) {
 175                 return (EINVAL);
 176         }
 177         mutex_enter(&tcps->tcps_listener_conf_lock);
 178         for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL;
 179             tl = list_next(&tcps->tcps_listener_conf, tl)) {
 180                 if (tl->tl_port == lport) {
 181                         list_remove(&tcps->tcps_listener_conf, tl);
 182                         mutex_exit(&tcps->tcps_listener_conf_lock);
 183                         kmem_free(tl, sizeof (tcp_listener_t));
 184                         return (0);
 185                 }
 186         }
 187         mutex_exit(&tcps->tcps_listener_conf_lock);
 188         return (ESRCH);
 189 }
 190 
 191 /*
 192  * Special checkers for smallest/largest anonymous port so they don't
 193  * ever happen to be (largest < smallest).
 194  */
 195 /* ARGSUSED */
 196 static int
 197 tcp_smallest_anon_set(void *cbarg, cred_t *cr, mod_prop_info_t *pinfo,
 198     const char *ifname, const void *pval, uint_t flags)
 199 {
 200         unsigned long new_value;
 201         tcp_stack_t *tcps = (tcp_stack_t *)cbarg;
 202         int err;
 203 
 204         if ((err = mod_uint32_value(pval, pinfo, flags, &new_value)) != 0)
 205                 return (err);
 206         /* mod_uint32_value() + pinfo guarantees we're in TCP port range. */
 207         if ((uint32_t)new_value > tcps->tcps_largest_anon_port)
 208                 return (ERANGE);
 209         pinfo->prop_cur_uval = (uint32_t)new_value;
 210         return (0);
 211 }
 212 
 213 /* ARGSUSED */
 214 static int
 215 tcp_largest_anon_set(void *cbarg, cred_t *cr, mod_prop_info_t *pinfo,
 216     const char *ifname, const void *pval, uint_t flags)
 217 {
 218         unsigned long new_value;
 219         tcp_stack_t *tcps = (tcp_stack_t *)cbarg;
 220         int err;
 221 
 222         if ((err = mod_uint32_value(pval, pinfo, flags, &new_value)) != 0)
 223                 return (err);
 224         /* mod_uint32_value() + pinfo guarantees we're in TCP port range. */
 225         if ((uint32_t)new_value < tcps->tcps_smallest_anon_port)
 226                 return (ERANGE);
 227         pinfo->prop_cur_uval = (uint32_t)new_value;
 228         return (0);
 229 }
 230 
 231 /*
 232  * All of these are alterable, within the min/max values given, at run time.
 233  *
 234  * Note: All those tunables which do not start with "_" are Committed and
 235  * therefore are public. See PSARC 2010/080.
 236  */
 237 mod_prop_info_t tcp_propinfo_tbl[] = {
 238         /* tunable - 0 */
 239         { "_time_wait_interval", MOD_PROTO_TCP,
 240             mod_set_uint32, mod_get_uint32,
 241             {1*SECONDS, 10*MINUTES, 1*MINUTES}, {1*MINUTES} },
 242 
 243         { "_conn_req_max_q", MOD_PROTO_TCP,
 244             mod_set_uint32, mod_get_uint32,
 245             {1, UINT32_MAX, 128}, {128} },
 246 
 247         { "_conn_req_max_q0", MOD_PROTO_TCP,
 248             mod_set_uint32, mod_get_uint32,
 249             {0, UINT32_MAX, 1024}, {1024} },
 250 
 251         { "_conn_req_min", MOD_PROTO_TCP,
 252             mod_set_uint32, mod_get_uint32,
 253             {1, 1024, 1}, {1} },
 254 
 255         { "_conn_grace_period", MOD_PROTO_TCP,
 256             mod_set_uint32, mod_get_uint32,
 257             {0*MS, 20*SECONDS, 0*MS}, {0*MS} },
 258 
 259         { "_cwnd_max", MOD_PROTO_TCP,
 260             mod_set_uint32, mod_get_uint32,
 261             {128, (1<<30), 1024*1024}, {1024*1024} },
 262 
 263         { "_debug", MOD_PROTO_TCP,
 264             mod_set_uint32, mod_get_uint32,
 265             {0, 10, 0}, {0} },
 266 
 267         { "smallest_nonpriv_port", MOD_PROTO_TCP,
 268             mod_set_uint32, mod_get_uint32,
 269             {1024, (32*1024), 1024}, {1024} },
 270 
 271         { "_ip_abort_cinterval", MOD_PROTO_TCP,
 272             mod_set_uint32, mod_get_uint32,
 273             {1*SECONDS, UINT32_MAX, 3*MINUTES}, {3*MINUTES} },
 274 
 275         { "_ip_abort_linterval", MOD_PROTO_TCP,
 276             mod_set_uint32, mod_get_uint32,
 277             {1*SECONDS, UINT32_MAX, 3*MINUTES}, {3*MINUTES} },
 278 
 279         /* tunable - 10 */
 280         { "_ip_abort_interval", MOD_PROTO_TCP,
 281             mod_set_uint32, mod_get_uint32,
 282             {500*MS, UINT32_MAX, 5*MINUTES}, {5*MINUTES} },
 283 
 284         { "_ip_notify_cinterval", MOD_PROTO_TCP,
 285             mod_set_uint32, mod_get_uint32,
 286             {1*SECONDS, UINT32_MAX, 10*SECONDS},
 287             {10*SECONDS} },
 288 
 289         { "_ip_notify_interval", MOD_PROTO_TCP,
 290             mod_set_uint32, mod_get_uint32,
 291             {500*MS, UINT32_MAX, 10*SECONDS}, {10*SECONDS} },
 292 
 293         { "_ipv4_ttl", MOD_PROTO_TCP,
 294             mod_set_uint32, mod_get_uint32,
 295             {1, 255, 64}, {64} },
 296 
 297         { "_keepalive_interval", MOD_PROTO_TCP,
 298             mod_set_uint32, mod_get_uint32,
 299             {10*SECONDS, 10*DAYS, 2*HOURS}, {2*HOURS} },
 300 
 301         { "_maxpsz_multiplier", MOD_PROTO_TCP,
 302             mod_set_uint32, mod_get_uint32,
 303             {0, 100, 10}, {10} },
 304 
 305         { "_mss_def_ipv4", MOD_PROTO_TCP,
 306             mod_set_uint32, mod_get_uint32,
 307             {1, TCP_MSS_MAX_IPV4, 536}, {536} },
 308 
 309         { "_mss_max_ipv4", MOD_PROTO_TCP,
 310             mod_set_uint32, mod_get_uint32,
 311             {1, TCP_MSS_MAX_IPV4, TCP_MSS_MAX_IPV4},
 312             {TCP_MSS_MAX_IPV4} },
 313 
 314         { "_mss_min", MOD_PROTO_TCP,
 315             mod_set_uint32, mod_get_uint32,
 316             {1, TCP_MSS_MAX, 108}, {108} },
 317 
 318         { "_naglim_def", MOD_PROTO_TCP,
 319             mod_set_uint32, mod_get_uint32,
 320             {1, (64*1024)-1, (4*1024)-1}, {(4*1024)-1} },
 321 
 322         /* tunable - 20 */
 323         { "_rexmit_interval_initial", MOD_PROTO_TCP,
 324             mod_set_uint32, mod_get_uint32,
 325             {1*MS, 20*SECONDS, 1*SECONDS}, {1*SECONDS} },
 326 
 327         { "_rexmit_interval_max", MOD_PROTO_TCP,
 328             mod_set_uint32, mod_get_uint32,
 329             {1*MS, 2*HOURS, 60*SECONDS}, {60*SECONDS} },
 330 
 331         { "_rexmit_interval_min", MOD_PROTO_TCP,
 332             mod_set_uint32, mod_get_uint32,
 333             {1*MS, 2*HOURS, 400*MS}, {400*MS} },
 334 
 335         { "_deferred_ack_interval", MOD_PROTO_TCP,
 336             mod_set_uint32, mod_get_uint32,
 337             {1*MS, 1*MINUTES, 100*MS}, {100*MS} },
 338 
 339         { "_snd_lowat_fraction", MOD_PROTO_TCP,
 340             mod_set_uint32, mod_get_uint32,
 341             {0, 16, 0}, {0} },
 342 
 343         { "_dupack_fast_retransmit", MOD_PROTO_TCP,
 344             mod_set_uint32, mod_get_uint32,
 345             {1, 10000, 3}, {3} },
 346 
 347         { "_ignore_path_mtu", MOD_PROTO_TCP,
 348             mod_set_boolean, mod_get_boolean,
 349             {B_FALSE}, {B_FALSE} },
 350 
 351         { "smallest_anon_port", MOD_PROTO_TCP,
 352             tcp_smallest_anon_set, mod_get_uint32,
 353             {1024, ULP_MAX_PORT, 32*1024}, {32*1024} },
 354 
 355         { "largest_anon_port", MOD_PROTO_TCP,
 356             tcp_largest_anon_set, mod_get_uint32,
 357             {1024, ULP_MAX_PORT, ULP_MAX_PORT},
 358             {ULP_MAX_PORT} },
 359 
 360         { "send_maxbuf", MOD_PROTO_TCP,
 361             mod_set_uint32, mod_get_uint32,
 362             {TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_HIWATER},
 363             {TCP_XMIT_HIWATER} },
 364 
 365         /* tunable - 30 */
 366         { "_xmit_lowat", MOD_PROTO_TCP,
 367             mod_set_uint32, mod_get_uint32,
 368             {TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_LOWATER},
 369             {TCP_XMIT_LOWATER} },
 370 
 371         { "recv_maxbuf", MOD_PROTO_TCP,
 372             mod_set_uint32, mod_get_uint32,
 373             {TCP_RECV_LOWATER, (1<<30), TCP_RECV_HIWATER},
 374             {TCP_RECV_HIWATER} },
 375 
 376         { "_recv_hiwat_minmss", MOD_PROTO_TCP,
 377             mod_set_uint32, mod_get_uint32,
 378             {1, 65536, 4}, {4} },
 379 
 380         { "_fin_wait_2_flush_interval", MOD_PROTO_TCP,
 381             mod_set_uint32, mod_get_uint32,
 382             {1*SECONDS, 2*HOURS, 60*SECONDS},
 383             {60*SECONDS} },
 384 
 385         { "_max_buf", MOD_PROTO_TCP,
 386             mod_set_uint32, mod_get_uint32,
 387             {8192, (1<<30), 1024*1024}, {1024*1024} },
 388 
 389         /*
 390          * Question:  What default value should I set for tcp_strong_iss?
 391          */
 392         { "_strong_iss", MOD_PROTO_TCP,
 393             mod_set_uint32, mod_get_uint32,
 394             {0, 2, 1}, {1} },
 395 
 396         { "_rtt_updates", MOD_PROTO_TCP,
 397             mod_set_uint32, mod_get_uint32,
 398             {0, 65536, 20}, {20} },
 399 
 400         { "_wscale_always", MOD_PROTO_TCP,
 401             mod_set_boolean, mod_get_boolean,
 402             {B_TRUE}, {B_TRUE} },
 403 
 404         { "_tstamp_always", MOD_PROTO_TCP,
 405             mod_set_boolean, mod_get_boolean,
 406             {B_FALSE}, {B_FALSE} },
 407 
 408         { "_tstamp_if_wscale", MOD_PROTO_TCP,
 409             mod_set_boolean, mod_get_boolean,
 410             {B_TRUE}, {B_TRUE} },
 411 
 412         /* tunable - 40 */
 413         { "_rexmit_interval_extra", MOD_PROTO_TCP,
 414             mod_set_uint32, mod_get_uint32,
 415             {0*MS, 2*HOURS, 0*MS}, {0*MS} },
 416 
 417         { "_deferred_acks_max", MOD_PROTO_TCP,
 418             mod_set_uint32, mod_get_uint32,
 419             {0, 16, 2}, {2} },
 420 
 421         { "_slow_start_after_idle", MOD_PROTO_TCP,
 422             mod_set_uint32, mod_get_uint32,
 423             {0, 16384, 0}, {0} },
 424 
 425         { "_slow_start_initial", MOD_PROTO_TCP,
 426             mod_set_uint32, mod_get_uint32,
 427             {0, 16, 0}, {0} },
 428 
 429         { "sack", MOD_PROTO_TCP,
 430             mod_set_uint32, mod_get_uint32,
 431             {0, 2, 2}, {2} },
 432 
 433         { "_ipv6_hoplimit", MOD_PROTO_TCP,
 434             mod_set_uint32, mod_get_uint32,
 435             {0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS},
 436             {IPV6_DEFAULT_HOPS} },
 437 
 438         { "_mss_def_ipv6", MOD_PROTO_TCP,
 439             mod_set_uint32, mod_get_uint32,
 440             {1, TCP_MSS_MAX_IPV6, 1220}, {1220} },
 441 
 442         { "_mss_max_ipv6", MOD_PROTO_TCP,
 443             mod_set_uint32, mod_get_uint32,
 444             {1, TCP_MSS_MAX_IPV6, TCP_MSS_MAX_IPV6},
 445             {TCP_MSS_MAX_IPV6} },
 446 
 447         { "_rev_src_routes", MOD_PROTO_TCP,
 448             mod_set_boolean, mod_get_boolean,
 449             {B_FALSE}, {B_FALSE} },
 450 
 451         { "_local_dack_interval", MOD_PROTO_TCP,
 452             mod_set_uint32, mod_get_uint32,
 453             {10*MS, 500*MS, 50*MS}, {50*MS} },
 454 
 455         /* tunable - 50 */
 456         { "_local_dacks_max", MOD_PROTO_TCP,
 457             mod_set_uint32, mod_get_uint32,
 458             {0, 16, 8}, {8} },
 459 
 460         { "ecn", MOD_PROTO_TCP,
 461             mod_set_uint32, mod_get_uint32,
 462             {0, 2, 1}, {1} },
 463 
 464         { "_rst_sent_rate_enabled", MOD_PROTO_TCP,
 465             mod_set_boolean, mod_get_boolean,
 466             {B_TRUE}, {B_TRUE} },
 467 
 468         { "_rst_sent_rate", MOD_PROTO_TCP,
 469             mod_set_uint32, mod_get_uint32,
 470             {0, UINT32_MAX, 40}, {40} },
 471 
 472         { "_push_timer_interval", MOD_PROTO_TCP,
 473             mod_set_uint32, mod_get_uint32,
 474             {0, 100*MS, 50*MS}, {50*MS} },
 475 
 476         { "_use_smss_as_mss_opt", MOD_PROTO_TCP,
 477             mod_set_boolean, mod_get_boolean,
 478             {B_FALSE}, {B_FALSE} },
 479 
 480         { "_keepalive_abort_interval", MOD_PROTO_TCP,
 481             mod_set_uint32, mod_get_uint32,
 482             {0, UINT32_MAX, 8*MINUTES}, {8*MINUTES} },
 483 
 484         /*
 485          * tcp_wroff_xtra is the extra space in front of TCP/IP header for link
 486          * layer header.  It has to be a multiple of 8.
 487          */
 488         { "_wroff_xtra", MOD_PROTO_TCP,
 489             mod_set_aligned, mod_get_uint32,
 490             {0, 256, 32}, {32} },
 491 
 492         { "_dev_flow_ctl", MOD_PROTO_TCP,
 493             mod_set_boolean, mod_get_boolean,
 494             {B_FALSE}, {B_FALSE} },
 495 
 496         { "_reass_timeout", MOD_PROTO_TCP,
 497             mod_set_uint32, mod_get_uint32,
 498             {0, UINT32_MAX, 100*SECONDS}, {100*SECONDS} },
 499 
 500         /* tunable - 60 */
 501         { "extra_priv_ports", MOD_PROTO_TCP,
 502             mod_set_extra_privports, mod_get_extra_privports,
 503             {1, ULP_MAX_PORT, 0}, {0} },
 504 
 505         { "_1948_phrase", MOD_PROTO_TCP,
 506             tcp_set_1948phrase, NULL, {0}, {0} },
 507 
 508         { "_listener_limit_conf", MOD_PROTO_TCP,
 509             NULL, tcp_listener_conf_get, {0}, {0} },
 510 
 511         { "_listener_limit_conf_add", MOD_PROTO_TCP,
 512             tcp_listener_conf_add, NULL, {0}, {0} },
 513 
 514         { "_listener_limit_conf_del", MOD_PROTO_TCP,
 515             tcp_listener_conf_del, NULL, {0}, {0} },
 516 
 517         { "_iss_incr", MOD_PROTO_TCP,
 518             mod_set_uint32, mod_get_uint32,
 519             {1, ISS_INCR, ISS_INCR},
 520             {ISS_INCR} },
 521 
 522         { "?", MOD_PROTO_TCP, NULL, mod_get_allprop, {0}, {0} },
 523 
 524         { NULL, 0, NULL, NULL, {0}, {0} }
 525 };
 526 
 527 int tcp_propinfo_count = A_CNT(tcp_propinfo_tbl);