1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, Joyent Inc. All rights reserved. 24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved. 25 */ 26 /* Copyright (c) 1990 Mentat Inc. */ 27 28 #include <inet/ip.h> 29 #include <inet/tcp_impl.h> 30 #include <sys/multidata.h> 31 #include <sys/sunddi.h> 32 33 /* Max size IP datagram is 64k - 1 */ 34 #define TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcpha_t))) 35 #define TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcpha_t))) 36 37 /* Max of the above */ 38 #define TCP_MSS_MAX TCP_MSS_MAX_IPV4 39 40 #define TCP_XMIT_LOWATER 4096 41 #define TCP_XMIT_HIWATER 49152 42 #define TCP_RECV_LOWATER 2048 43 #define TCP_RECV_HIWATER 128000 44 45 /* 46 * Set the RFC 1948 pass phrase 47 */ 48 /* ARGSUSED */ 49 static int 50 tcp_set_1948phrase(void *cbarg, cred_t *cr, mod_prop_info_t *pinfo, 51 const char *ifname, const void* pr_val, uint_t flags) 52 { 53 tcp_stack_t *tcps = (tcp_stack_t *)cbarg; 54 55 if (flags & MOD_PROP_DEFAULT) 56 return (ENOTSUP); 57 58 /* 59 * Basically, value contains a new pass phrase. Pass it along! 60 */ 61 tcp_iss_key_init((uint8_t *)pr_val, strlen(pr_val), tcps); 62 return (0); 63 } 64 65 /* 66 * returns the current list of listener limit configuration. 67 */ 68 /* ARGSUSED */ 69 static int 70 tcp_listener_conf_get(void *cbarg, mod_prop_info_t *pinfo, const char *ifname, 71 void *val, uint_t psize, uint_t flags) 72 { 73 tcp_stack_t *tcps = (tcp_stack_t *)cbarg; 74 tcp_listener_t *tl; 75 char *pval = val; 76 size_t nbytes = 0, tbytes = 0; 77 uint_t size; 78 int err = 0; 79 80 bzero(pval, psize); 81 size = psize; 82 83 if (flags & (MOD_PROP_DEFAULT|MOD_PROP_PERM|MOD_PROP_POSSIBLE)) 84 return (0); 85 86 mutex_enter(&tcps->tcps_listener_conf_lock); 87 for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL; 88 tl = list_next(&tcps->tcps_listener_conf, tl)) { 89 if (psize == size) 90 nbytes = snprintf(pval, size, "%d:%d", tl->tl_port, 91 tl->tl_ratio); 92 else 93 nbytes = snprintf(pval, size, ",%d:%d", tl->tl_port, 94 tl->tl_ratio); 95 size -= nbytes; 96 pval += nbytes; 97 tbytes += nbytes; 98 if (tbytes >= psize) { 99 /* Buffer overflow, stop copying information */ 100 err = ENOBUFS; 101 break; 102 } 103 } 104 105 mutex_exit(&tcps->tcps_listener_conf_lock); 106 return (err); 107 } 108 109 /* 110 * add a new listener limit configuration. 111 */ 112 /* ARGSUSED */ 113 static int 114 tcp_listener_conf_add(void *cbarg, cred_t *cr, mod_prop_info_t *pinfo, 115 const char *ifname, const void* pval, uint_t flags) 116 { 117 tcp_listener_t *new_tl; 118 tcp_listener_t *tl; 119 long lport; 120 long ratio; 121 char *colon; 122 tcp_stack_t *tcps = (tcp_stack_t *)cbarg; 123 124 if (flags & MOD_PROP_DEFAULT) 125 return (ENOTSUP); 126 127 if (ddi_strtol(pval, &colon, 10, &lport) != 0 || lport <= 0 || 128 lport > USHRT_MAX || *colon != ':') { 129 return (EINVAL); 130 } 131 if (ddi_strtol(colon + 1, NULL, 10, &ratio) != 0 || ratio <= 0) 132 return (EINVAL); 133 134 mutex_enter(&tcps->tcps_listener_conf_lock); 135 for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL; 136 tl = list_next(&tcps->tcps_listener_conf, tl)) { 137 /* There is an existing entry, so update its ratio value. */ 138 if (tl->tl_port == lport) { 139 tl->tl_ratio = ratio; 140 mutex_exit(&tcps->tcps_listener_conf_lock); 141 return (0); 142 } 143 } 144 145 if ((new_tl = kmem_alloc(sizeof (tcp_listener_t), KM_NOSLEEP)) == 146 NULL) { 147 mutex_exit(&tcps->tcps_listener_conf_lock); 148 return (ENOMEM); 149 } 150 151 new_tl->tl_port = lport; 152 new_tl->tl_ratio = ratio; 153 list_insert_tail(&tcps->tcps_listener_conf, new_tl); 154 mutex_exit(&tcps->tcps_listener_conf_lock); 155 return (0); 156 } 157 158 /* 159 * remove a listener limit configuration. 160 */ 161 /* ARGSUSED */ 162 static int 163 tcp_listener_conf_del(void *cbarg, cred_t *cr, mod_prop_info_t *pinfo, 164 const char *ifname, const void* pval, uint_t flags) 165 { 166 tcp_listener_t *tl; 167 long lport; 168 tcp_stack_t *tcps = (tcp_stack_t *)cbarg; 169 170 if (flags & MOD_PROP_DEFAULT) 171 return (ENOTSUP); 172 173 if (ddi_strtol(pval, NULL, 10, &lport) != 0 || lport <= 0 || 174 lport > USHRT_MAX) { 175 return (EINVAL); 176 } 177 mutex_enter(&tcps->tcps_listener_conf_lock); 178 for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL; 179 tl = list_next(&tcps->tcps_listener_conf, tl)) { 180 if (tl->tl_port == lport) { 181 list_remove(&tcps->tcps_listener_conf, tl); 182 mutex_exit(&tcps->tcps_listener_conf_lock); 183 kmem_free(tl, sizeof (tcp_listener_t)); 184 return (0); 185 } 186 } 187 mutex_exit(&tcps->tcps_listener_conf_lock); 188 return (ESRCH); 189 } 190 191 /* 192 * Special checkers for smallest/largest anonymous port so they don't 193 * ever happen to be (largest < smallest). 194 */ 195 /* ARGSUSED */ 196 static int 197 tcp_smallest_anon_set(void *cbarg, cred_t *cr, mod_prop_info_t *pinfo, 198 const char *ifname, const void *pval, uint_t flags) 199 { 200 unsigned long new_value; 201 tcp_stack_t *tcps = (tcp_stack_t *)cbarg; 202 int err; 203 204 if ((err = mod_uint32_value(pval, pinfo, flags, &new_value)) != 0) 205 return (err); 206 /* mod_uint32_value() + pinfo guarantees we're in TCP port range. */ 207 if ((uint32_t)new_value > tcps->tcps_largest_anon_port) 208 return (ERANGE); 209 pinfo->prop_cur_uval = (uint32_t)new_value; 210 return (0); 211 } 212 213 /* ARGSUSED */ 214 static int 215 tcp_largest_anon_set(void *cbarg, cred_t *cr, mod_prop_info_t *pinfo, 216 const char *ifname, const void *pval, uint_t flags) 217 { 218 unsigned long new_value; 219 tcp_stack_t *tcps = (tcp_stack_t *)cbarg; 220 int err; 221 222 if ((err = mod_uint32_value(pval, pinfo, flags, &new_value)) != 0) 223 return (err); 224 /* mod_uint32_value() + pinfo guarantees we're in TCP port range. */ 225 if ((uint32_t)new_value < tcps->tcps_smallest_anon_port) 226 return (ERANGE); 227 pinfo->prop_cur_uval = (uint32_t)new_value; 228 return (0); 229 } 230 231 /* 232 * All of these are alterable, within the min/max values given, at run time. 233 * 234 * Note: All those tunables which do not start with "_" are Committed and 235 * therefore are public. See PSARC 2010/080. 236 */ 237 mod_prop_info_t tcp_propinfo_tbl[] = { 238 /* tunable - 0 */ 239 { "_time_wait_interval", MOD_PROTO_TCP, 240 mod_set_uint32, mod_get_uint32, 241 {1*SECONDS, 10*MINUTES, 1*MINUTES}, {1*MINUTES} }, 242 243 { "_conn_req_max_q", MOD_PROTO_TCP, 244 mod_set_uint32, mod_get_uint32, 245 {1, UINT32_MAX, 128}, {128} }, 246 247 { "_conn_req_max_q0", MOD_PROTO_TCP, 248 mod_set_uint32, mod_get_uint32, 249 {0, UINT32_MAX, 1024}, {1024} }, 250 251 { "_conn_req_min", MOD_PROTO_TCP, 252 mod_set_uint32, mod_get_uint32, 253 {1, 1024, 1}, {1} }, 254 255 { "_conn_grace_period", MOD_PROTO_TCP, 256 mod_set_uint32, mod_get_uint32, 257 {0*MS, 20*SECONDS, 0*MS}, {0*MS} }, 258 259 { "_cwnd_max", MOD_PROTO_TCP, 260 mod_set_uint32, mod_get_uint32, 261 {128, (1<<30), 1024*1024}, {1024*1024} }, 262 263 { "_debug", MOD_PROTO_TCP, 264 mod_set_uint32, mod_get_uint32, 265 {0, 10, 0}, {0} }, 266 267 { "smallest_nonpriv_port", MOD_PROTO_TCP, 268 mod_set_uint32, mod_get_uint32, 269 {1024, (32*1024), 1024}, {1024} }, 270 271 { "_ip_abort_cinterval", MOD_PROTO_TCP, 272 mod_set_uint32, mod_get_uint32, 273 {1*SECONDS, UINT32_MAX, 3*MINUTES}, {3*MINUTES} }, 274 275 { "_ip_abort_linterval", MOD_PROTO_TCP, 276 mod_set_uint32, mod_get_uint32, 277 {1*SECONDS, UINT32_MAX, 3*MINUTES}, {3*MINUTES} }, 278 279 /* tunable - 10 */ 280 { "_ip_abort_interval", MOD_PROTO_TCP, 281 mod_set_uint32, mod_get_uint32, 282 {500*MS, UINT32_MAX, 5*MINUTES}, {5*MINUTES} }, 283 284 { "_ip_notify_cinterval", MOD_PROTO_TCP, 285 mod_set_uint32, mod_get_uint32, 286 {1*SECONDS, UINT32_MAX, 10*SECONDS}, 287 {10*SECONDS} }, 288 289 { "_ip_notify_interval", MOD_PROTO_TCP, 290 mod_set_uint32, mod_get_uint32, 291 {500*MS, UINT32_MAX, 10*SECONDS}, {10*SECONDS} }, 292 293 { "_ipv4_ttl", MOD_PROTO_TCP, 294 mod_set_uint32, mod_get_uint32, 295 {1, 255, 64}, {64} }, 296 297 { "_keepalive_interval", MOD_PROTO_TCP, 298 mod_set_uint32, mod_get_uint32, 299 {10*SECONDS, 10*DAYS, 2*HOURS}, {2*HOURS} }, 300 301 { "_maxpsz_multiplier", MOD_PROTO_TCP, 302 mod_set_uint32, mod_get_uint32, 303 {0, 100, 10}, {10} }, 304 305 { "_mss_def_ipv4", MOD_PROTO_TCP, 306 mod_set_uint32, mod_get_uint32, 307 {1, TCP_MSS_MAX_IPV4, 536}, {536} }, 308 309 { "_mss_max_ipv4", MOD_PROTO_TCP, 310 mod_set_uint32, mod_get_uint32, 311 {1, TCP_MSS_MAX_IPV4, TCP_MSS_MAX_IPV4}, 312 {TCP_MSS_MAX_IPV4} }, 313 314 { "_mss_min", MOD_PROTO_TCP, 315 mod_set_uint32, mod_get_uint32, 316 {1, TCP_MSS_MAX, 108}, {108} }, 317 318 { "_naglim_def", MOD_PROTO_TCP, 319 mod_set_uint32, mod_get_uint32, 320 {1, (64*1024)-1, (4*1024)-1}, {(4*1024)-1} }, 321 322 /* tunable - 20 */ 323 { "_rexmit_interval_initial", MOD_PROTO_TCP, 324 mod_set_uint32, mod_get_uint32, 325 {1*MS, 20*SECONDS, 1*SECONDS}, {1*SECONDS} }, 326 327 { "_rexmit_interval_max", MOD_PROTO_TCP, 328 mod_set_uint32, mod_get_uint32, 329 {1*MS, 2*HOURS, 60*SECONDS}, {60*SECONDS} }, 330 331 { "_rexmit_interval_min", MOD_PROTO_TCP, 332 mod_set_uint32, mod_get_uint32, 333 {1*MS, 2*HOURS, 400*MS}, {400*MS} }, 334 335 { "_deferred_ack_interval", MOD_PROTO_TCP, 336 mod_set_uint32, mod_get_uint32, 337 {1*MS, 1*MINUTES, 100*MS}, {100*MS} }, 338 339 { "_snd_lowat_fraction", MOD_PROTO_TCP, 340 mod_set_uint32, mod_get_uint32, 341 {0, 16, 0}, {0} }, 342 343 { "_dupack_fast_retransmit", MOD_PROTO_TCP, 344 mod_set_uint32, mod_get_uint32, 345 {1, 10000, 3}, {3} }, 346 347 { "_ignore_path_mtu", MOD_PROTO_TCP, 348 mod_set_boolean, mod_get_boolean, 349 {B_FALSE}, {B_FALSE} }, 350 351 { "smallest_anon_port", MOD_PROTO_TCP, 352 tcp_smallest_anon_set, mod_get_uint32, 353 {1024, ULP_MAX_PORT, 32*1024}, {32*1024} }, 354 355 { "largest_anon_port", MOD_PROTO_TCP, 356 tcp_largest_anon_set, mod_get_uint32, 357 {1024, ULP_MAX_PORT, ULP_MAX_PORT}, 358 {ULP_MAX_PORT} }, 359 360 { "send_maxbuf", MOD_PROTO_TCP, 361 mod_set_uint32, mod_get_uint32, 362 {TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_HIWATER}, 363 {TCP_XMIT_HIWATER} }, 364 365 /* tunable - 30 */ 366 { "_xmit_lowat", MOD_PROTO_TCP, 367 mod_set_uint32, mod_get_uint32, 368 {TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_LOWATER}, 369 {TCP_XMIT_LOWATER} }, 370 371 { "recv_maxbuf", MOD_PROTO_TCP, 372 mod_set_uint32, mod_get_uint32, 373 {TCP_RECV_LOWATER, (1<<30), TCP_RECV_HIWATER}, 374 {TCP_RECV_HIWATER} }, 375 376 { "_recv_hiwat_minmss", MOD_PROTO_TCP, 377 mod_set_uint32, mod_get_uint32, 378 {1, 65536, 4}, {4} }, 379 380 { "_fin_wait_2_flush_interval", MOD_PROTO_TCP, 381 mod_set_uint32, mod_get_uint32, 382 {1*SECONDS, 2*HOURS, 60*SECONDS}, 383 {60*SECONDS} }, 384 385 { "_max_buf", MOD_PROTO_TCP, 386 mod_set_uint32, mod_get_uint32, 387 {8192, (1<<30), 1024*1024}, {1024*1024} }, 388 389 /* 390 * Question: What default value should I set for tcp_strong_iss? 391 */ 392 { "_strong_iss", MOD_PROTO_TCP, 393 mod_set_uint32, mod_get_uint32, 394 {0, 2, 1}, {1} }, 395 396 { "_rtt_updates", MOD_PROTO_TCP, 397 mod_set_uint32, mod_get_uint32, 398 {0, 65536, 20}, {20} }, 399 400 { "_wscale_always", MOD_PROTO_TCP, 401 mod_set_boolean, mod_get_boolean, 402 {B_TRUE}, {B_TRUE} }, 403 404 { "_tstamp_always", MOD_PROTO_TCP, 405 mod_set_boolean, mod_get_boolean, 406 {B_FALSE}, {B_FALSE} }, 407 408 { "_tstamp_if_wscale", MOD_PROTO_TCP, 409 mod_set_boolean, mod_get_boolean, 410 {B_TRUE}, {B_TRUE} }, 411 412 /* tunable - 40 */ 413 { "_rexmit_interval_extra", MOD_PROTO_TCP, 414 mod_set_uint32, mod_get_uint32, 415 {0*MS, 2*HOURS, 0*MS}, {0*MS} }, 416 417 { "_deferred_acks_max", MOD_PROTO_TCP, 418 mod_set_uint32, mod_get_uint32, 419 {0, 16, 2}, {2} }, 420 421 { "_slow_start_after_idle", MOD_PROTO_TCP, 422 mod_set_uint32, mod_get_uint32, 423 {0, 16384, 0}, {0} }, 424 425 { "_slow_start_initial", MOD_PROTO_TCP, 426 mod_set_uint32, mod_get_uint32, 427 {0, 16, 0}, {0} }, 428 429 { "sack", MOD_PROTO_TCP, 430 mod_set_uint32, mod_get_uint32, 431 {0, 2, 2}, {2} }, 432 433 { "_ipv6_hoplimit", MOD_PROTO_TCP, 434 mod_set_uint32, mod_get_uint32, 435 {0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS}, 436 {IPV6_DEFAULT_HOPS} }, 437 438 { "_mss_def_ipv6", MOD_PROTO_TCP, 439 mod_set_uint32, mod_get_uint32, 440 {1, TCP_MSS_MAX_IPV6, 1220}, {1220} }, 441 442 { "_mss_max_ipv6", MOD_PROTO_TCP, 443 mod_set_uint32, mod_get_uint32, 444 {1, TCP_MSS_MAX_IPV6, TCP_MSS_MAX_IPV6}, 445 {TCP_MSS_MAX_IPV6} }, 446 447 { "_rev_src_routes", MOD_PROTO_TCP, 448 mod_set_boolean, mod_get_boolean, 449 {B_FALSE}, {B_FALSE} }, 450 451 { "_local_dack_interval", MOD_PROTO_TCP, 452 mod_set_uint32, mod_get_uint32, 453 {10*MS, 500*MS, 50*MS}, {50*MS} }, 454 455 /* tunable - 50 */ 456 { "_local_dacks_max", MOD_PROTO_TCP, 457 mod_set_uint32, mod_get_uint32, 458 {0, 16, 8}, {8} }, 459 460 { "ecn", MOD_PROTO_TCP, 461 mod_set_uint32, mod_get_uint32, 462 {0, 2, 1}, {1} }, 463 464 { "_rst_sent_rate_enabled", MOD_PROTO_TCP, 465 mod_set_boolean, mod_get_boolean, 466 {B_TRUE}, {B_TRUE} }, 467 468 { "_rst_sent_rate", MOD_PROTO_TCP, 469 mod_set_uint32, mod_get_uint32, 470 {0, UINT32_MAX, 40}, {40} }, 471 472 { "_push_timer_interval", MOD_PROTO_TCP, 473 mod_set_uint32, mod_get_uint32, 474 {0, 100*MS, 50*MS}, {50*MS} }, 475 476 { "_use_smss_as_mss_opt", MOD_PROTO_TCP, 477 mod_set_boolean, mod_get_boolean, 478 {B_FALSE}, {B_FALSE} }, 479 480 { "_keepalive_abort_interval", MOD_PROTO_TCP, 481 mod_set_uint32, mod_get_uint32, 482 {0, UINT32_MAX, 8*MINUTES}, {8*MINUTES} }, 483 484 /* 485 * tcp_wroff_xtra is the extra space in front of TCP/IP header for link 486 * layer header. It has to be a multiple of 8. 487 */ 488 { "_wroff_xtra", MOD_PROTO_TCP, 489 mod_set_aligned, mod_get_uint32, 490 {0, 256, 32}, {32} }, 491 492 { "_dev_flow_ctl", MOD_PROTO_TCP, 493 mod_set_boolean, mod_get_boolean, 494 {B_FALSE}, {B_FALSE} }, 495 496 { "_reass_timeout", MOD_PROTO_TCP, 497 mod_set_uint32, mod_get_uint32, 498 {0, UINT32_MAX, 100*SECONDS}, {100*SECONDS} }, 499 500 /* tunable - 60 */ 501 { "extra_priv_ports", MOD_PROTO_TCP, 502 mod_set_extra_privports, mod_get_extra_privports, 503 {1, ULP_MAX_PORT, 0}, {0} }, 504 505 { "_1948_phrase", MOD_PROTO_TCP, 506 tcp_set_1948phrase, NULL, {0}, {0} }, 507 508 { "_listener_limit_conf", MOD_PROTO_TCP, 509 NULL, tcp_listener_conf_get, {0}, {0} }, 510 511 { "_listener_limit_conf_add", MOD_PROTO_TCP, 512 tcp_listener_conf_add, NULL, {0}, {0} }, 513 514 { "_listener_limit_conf_del", MOD_PROTO_TCP, 515 tcp_listener_conf_del, NULL, {0}, {0} }, 516 517 { "_iss_incr", MOD_PROTO_TCP, 518 mod_set_uint32, mod_get_uint32, 519 {1, ISS_INCR, ISS_INCR}, 520 {ISS_INCR} }, 521 522 { "?", MOD_PROTO_TCP, NULL, mod_get_allprop, {0}, {0} }, 523 524 { NULL, 0, NULL, NULL, {0}, {0} } 525 }; 526 527 int tcp_propinfo_count = A_CNT(tcp_propinfo_tbl);