1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, Joyent Inc. All rights reserved.
24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25 */
26 /* Copyright (c) 1990 Mentat Inc. */
27
28 #include <inet/ip.h>
29 #include <inet/tcp_impl.h>
30 #include <sys/multidata.h>
31 #include <sys/sunddi.h>
32
/*
 * Max size IP datagram is 64k - 1 (IP_MAXPACKET).  The largest MSS for each
 * IP version is that size less the IP header and TCP header structures.
 */
#define TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcpha_t)))
#define TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcpha_t)))

/*
 * Max of the above.
 * NOTE(review): this presumes sizeof (ipha_t) <= sizeof (ip6_t), so that the
 * IPv4 value is the larger of the two -- confirm against the header
 * definitions.
 */
#define TCP_MSS_MAX TCP_MSS_MAX_IPV4

/*
 * Transmit/receive buffer water marks.  These are referenced by the
 * "send_maxbuf", "_xmit_lowat" and "recv_maxbuf" entries of
 * tcp_propinfo_tbl[] in this file.
 */
#define TCP_XMIT_LOWATER 4096
#define TCP_XMIT_HIWATER 49152
#define TCP_RECV_LOWATER 2048
#define TCP_RECV_HIWATER 128000
44
45 /*
46 * Set the RFC 1948 pass phrase
47 */
48 /* ARGSUSED */
49 static int
50 tcp_set_1948phrase(void *cbarg, cred_t *cr, mod_prop_info_t *pinfo,
51 const char *ifname, const void* pr_val, uint_t flags)
52 {
53 tcp_stack_t *tcps = (tcp_stack_t *)cbarg;
54
55 if (flags & MOD_PROP_DEFAULT)
56 return (ENOTSUP);
57
58 /*
59 * Basically, value contains a new pass phrase. Pass it along!
60 */
61 tcp_iss_key_init((uint8_t *)pr_val, strlen(pr_val), tcps);
62 return (0);
63 }
64
65 /*
66 * returns the current list of listener limit configuration.
67 */
68 /* ARGSUSED */
69 static int
70 tcp_listener_conf_get(void *cbarg, mod_prop_info_t *pinfo, const char *ifname,
71 void *val, uint_t psize, uint_t flags)
72 {
73 tcp_stack_t *tcps = (tcp_stack_t *)cbarg;
74 tcp_listener_t *tl;
75 char *pval = val;
76 size_t nbytes = 0, tbytes = 0;
77 uint_t size;
78 int err = 0;
79
80 bzero(pval, psize);
81 size = psize;
82
83 if (flags & (MOD_PROP_DEFAULT|MOD_PROP_PERM|MOD_PROP_POSSIBLE))
84 return (0);
85
86 mutex_enter(&tcps->tcps_listener_conf_lock);
87 for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL;
88 tl = list_next(&tcps->tcps_listener_conf, tl)) {
89 if (psize == size)
90 nbytes = snprintf(pval, size, "%d:%d", tl->tl_port,
91 tl->tl_ratio);
92 else
93 nbytes = snprintf(pval, size, ",%d:%d", tl->tl_port,
94 tl->tl_ratio);
95 size -= nbytes;
96 pval += nbytes;
97 tbytes += nbytes;
98 if (tbytes >= psize) {
99 /* Buffer overflow, stop copying information */
100 err = ENOBUFS;
101 break;
102 }
103 }
104
105 mutex_exit(&tcps->tcps_listener_conf_lock);
106 return (err);
107 }
108
109 /*
110 * add a new listener limit configuration.
111 */
112 /* ARGSUSED */
113 static int
114 tcp_listener_conf_add(void *cbarg, cred_t *cr, mod_prop_info_t *pinfo,
115 const char *ifname, const void* pval, uint_t flags)
116 {
117 tcp_listener_t *new_tl;
118 tcp_listener_t *tl;
119 long lport;
120 long ratio;
121 char *colon;
122 tcp_stack_t *tcps = (tcp_stack_t *)cbarg;
123
124 if (flags & MOD_PROP_DEFAULT)
125 return (ENOTSUP);
126
127 if (ddi_strtol(pval, &colon, 10, &lport) != 0 || lport <= 0 ||
128 lport > USHRT_MAX || *colon != ':') {
129 return (EINVAL);
130 }
131 if (ddi_strtol(colon + 1, NULL, 10, &ratio) != 0 || ratio <= 0)
132 return (EINVAL);
133
134 mutex_enter(&tcps->tcps_listener_conf_lock);
135 for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL;
136 tl = list_next(&tcps->tcps_listener_conf, tl)) {
137 /* There is an existing entry, so update its ratio value. */
138 if (tl->tl_port == lport) {
139 tl->tl_ratio = ratio;
140 mutex_exit(&tcps->tcps_listener_conf_lock);
141 return (0);
142 }
143 }
144
145 if ((new_tl = kmem_alloc(sizeof (tcp_listener_t), KM_NOSLEEP)) ==
146 NULL) {
147 mutex_exit(&tcps->tcps_listener_conf_lock);
148 return (ENOMEM);
149 }
150
151 new_tl->tl_port = lport;
152 new_tl->tl_ratio = ratio;
153 list_insert_tail(&tcps->tcps_listener_conf, new_tl);
154 mutex_exit(&tcps->tcps_listener_conf_lock);
155 return (0);
156 }
157
158 /*
159 * remove a listener limit configuration.
160 */
161 /* ARGSUSED */
162 static int
163 tcp_listener_conf_del(void *cbarg, cred_t *cr, mod_prop_info_t *pinfo,
164 const char *ifname, const void* pval, uint_t flags)
165 {
166 tcp_listener_t *tl;
167 long lport;
168 tcp_stack_t *tcps = (tcp_stack_t *)cbarg;
169
170 if (flags & MOD_PROP_DEFAULT)
171 return (ENOTSUP);
172
173 if (ddi_strtol(pval, NULL, 10, &lport) != 0 || lport <= 0 ||
174 lport > USHRT_MAX) {
175 return (EINVAL);
176 }
177 mutex_enter(&tcps->tcps_listener_conf_lock);
178 for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL;
179 tl = list_next(&tcps->tcps_listener_conf, tl)) {
180 if (tl->tl_port == lport) {
181 list_remove(&tcps->tcps_listener_conf, tl);
182 mutex_exit(&tcps->tcps_listener_conf_lock);
183 kmem_free(tl, sizeof (tcp_listener_t));
184 return (0);
185 }
186 }
187 mutex_exit(&tcps->tcps_listener_conf_lock);
188 return (ESRCH);
189 }
190
191 /*
192 * Special checkers for smallest/largest anonymous port so they don't
193 * ever happen to be (largest < smallest).
194 */
195 /* ARGSUSED */
196 static int
197 tcp_smallest_anon_set(void *cbarg, cred_t *cr, mod_prop_info_t *pinfo,
198 const char *ifname, const void *pval, uint_t flags)
199 {
200 unsigned long new_value;
201 tcp_stack_t *tcps = (tcp_stack_t *)cbarg;
202 int err;
203
204 if ((err = mod_uint32_value(pval, pinfo, flags, &new_value)) != 0)
205 return (err);
206 /* mod_uint32_value() + pinfo guarantees we're in TCP port range. */
207 if ((uint32_t)new_value > tcps->tcps_largest_anon_port)
208 return (ERANGE);
209 pinfo->prop_cur_uval = (uint32_t)new_value;
210 return (0);
211 }
212
213 /* ARGSUSED */
214 static int
215 tcp_largest_anon_set(void *cbarg, cred_t *cr, mod_prop_info_t *pinfo,
216 const char *ifname, const void *pval, uint_t flags)
217 {
218 unsigned long new_value;
219 tcp_stack_t *tcps = (tcp_stack_t *)cbarg;
220 int err;
221
222 if ((err = mod_uint32_value(pval, pinfo, flags, &new_value)) != 0)
223 return (err);
224 /* mod_uint32_value() + pinfo guarantees we're in TCP port range. */
225 if ((uint32_t)new_value < tcps->tcps_smallest_anon_port)
226 return (ERANGE);
227 pinfo->prop_cur_uval = (uint32_t)new_value;
228 return (0);
229 }
230
/*
 * All of these are alterable, within the min/max values given, at run time.
 *
 * Note: All those tunables which do not start with "_" are Committed and
 * therefore are public. See PSARC 2010/080.
 *
 * Each entry lists the property name, MOD_PROTO_TCP, the set callback and
 * the get callback, followed by two brace groups.  A NULL callback appears
 * on entries that are only read ("_listener_limit_conf", "?") or only
 * written ("_1948_phrase", "_listener_limit_conf_add/_del").
 * NOTE(review): for numeric entries the first brace group looks like
 * {min, max, default} and the second like the current value -- confirm
 * against the mod_prop_info_t definition.
 *
 * The "tunable - N" markers give the zero-based index of the entry that
 * follows them.
 * NOTE(review): presumably entries are referenced by index elsewhere, so new
 * entries should not be inserted in the middle of the table -- confirm.
 *
 * The table ends with the "?" entry (mod_get_allprop) and an all-NULL
 * terminator.
 */
mod_prop_info_t tcp_propinfo_tbl[] = {
	/* tunable - 0 */
	{ "_time_wait_interval", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1*SECONDS, 10*MINUTES, 1*MINUTES}, {1*MINUTES} },

	{ "_conn_req_max_q", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1, UINT32_MAX, 128}, {128} },

	{ "_conn_req_max_q0", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, UINT32_MAX, 1024}, {1024} },

	{ "_conn_req_min", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1, 1024, 1}, {1} },

	{ "_conn_grace_period", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0*MS, 20*SECONDS, 0*MS}, {0*MS} },

	{ "_cwnd_max", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {128, (1<<30), 1024*1024}, {1024*1024} },

	{ "_debug", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 10, 0}, {0} },

	{ "smallest_nonpriv_port", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1024, (32*1024), 1024}, {1024} },

	{ "_ip_abort_cinterval", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1*SECONDS, UINT32_MAX, 3*MINUTES}, {3*MINUTES} },

	{ "_ip_abort_linterval", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1*SECONDS, UINT32_MAX, 3*MINUTES}, {3*MINUTES} },

	/* tunable - 10 */
	{ "_ip_abort_interval", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {500*MS, UINT32_MAX, 5*MINUTES}, {5*MINUTES} },

	{ "_ip_notify_cinterval", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1*SECONDS, UINT32_MAX, 10*SECONDS},
	    {10*SECONDS} },

	{ "_ip_notify_interval", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {500*MS, UINT32_MAX, 10*SECONDS}, {10*SECONDS} },

	{ "_ipv4_ttl", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1, 255, 64}, {64} },

	{ "_keepalive_interval", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {10*SECONDS, 10*DAYS, 2*HOURS}, {2*HOURS} },

	{ "_maxpsz_multiplier", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 100, 10}, {10} },

	{ "_mss_def_ipv4", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1, TCP_MSS_MAX_IPV4, 536}, {536} },

	{ "_mss_max_ipv4", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1, TCP_MSS_MAX_IPV4, TCP_MSS_MAX_IPV4},
	    {TCP_MSS_MAX_IPV4} },

	{ "_mss_min", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1, TCP_MSS_MAX, 108}, {108} },

	{ "_naglim_def", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1, (64*1024)-1, (4*1024)-1}, {(4*1024)-1} },

	/* tunable - 20 */
	{ "_rexmit_interval_initial", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1*MS, 20*SECONDS, 1*SECONDS}, {1*SECONDS} },

	{ "_rexmit_interval_max", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1*MS, 2*HOURS, 60*SECONDS}, {60*SECONDS} },

	{ "_rexmit_interval_min", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1*MS, 2*HOURS, 400*MS}, {400*MS} },

	{ "_deferred_ack_interval", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1*MS, 1*MINUTES, 100*MS}, {100*MS} },

	{ "_snd_lowat_fraction", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 16, 0}, {0} },

	{ "_dupack_fast_retransmit", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1, 10000, 3}, {3} },

	{ "_ignore_path_mtu", MOD_PROTO_TCP,
	    mod_set_boolean, mod_get_boolean,
	    {B_FALSE}, {B_FALSE} },

	/*
	 * The two anonymous port bounds use the custom setters defined in
	 * this file so that largest can never drop below smallest.
	 */
	{ "smallest_anon_port", MOD_PROTO_TCP,
	    tcp_smallest_anon_set, mod_get_uint32,
	    {1024, ULP_MAX_PORT, 32*1024}, {32*1024} },

	{ "largest_anon_port", MOD_PROTO_TCP,
	    tcp_largest_anon_set, mod_get_uint32,
	    {1024, ULP_MAX_PORT, ULP_MAX_PORT},
	    {ULP_MAX_PORT} },

	{ "send_maxbuf", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_HIWATER},
	    {TCP_XMIT_HIWATER} },

	/* tunable - 30 */
	{ "_xmit_lowat", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_LOWATER},
	    {TCP_XMIT_LOWATER} },

	{ "recv_maxbuf", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {TCP_RECV_LOWATER, (1<<30), TCP_RECV_HIWATER},
	    {TCP_RECV_HIWATER} },

	{ "_recv_hiwat_minmss", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1, 65536, 4}, {4} },

	{ "_fin_wait_2_flush_interval", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1*SECONDS, 2*HOURS, 60*SECONDS},
	    {60*SECONDS} },

	{ "_max_buf", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {8192, (1<<30), 1024*1024}, {1024*1024} },

	/*
	 * Open question left by the original author: what default value
	 * should be set for tcp_strong_iss?  The entry below uses 1.
	 */
	{ "_strong_iss", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 2, 1}, {1} },

	{ "_rtt_updates", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 65536, 20}, {20} },

	{ "_wscale_always", MOD_PROTO_TCP,
	    mod_set_boolean, mod_get_boolean,
	    {B_TRUE}, {B_TRUE} },

	{ "_tstamp_always", MOD_PROTO_TCP,
	    mod_set_boolean, mod_get_boolean,
	    {B_FALSE}, {B_FALSE} },

	{ "_tstamp_if_wscale", MOD_PROTO_TCP,
	    mod_set_boolean, mod_get_boolean,
	    {B_TRUE}, {B_TRUE} },

	/* tunable - 40 */
	{ "_rexmit_interval_extra", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0*MS, 2*HOURS, 0*MS}, {0*MS} },

	{ "_deferred_acks_max", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 16, 2}, {2} },

	{ "_slow_start_after_idle", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 16384, 0}, {0} },

	{ "_slow_start_initial", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 16, 0}, {0} },

	{ "sack", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 2, 2}, {2} },

	{ "_ipv6_hoplimit", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS},
	    {IPV6_DEFAULT_HOPS} },

	{ "_mss_def_ipv6", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1, TCP_MSS_MAX_IPV6, 1220}, {1220} },

	{ "_mss_max_ipv6", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1, TCP_MSS_MAX_IPV6, TCP_MSS_MAX_IPV6},
	    {TCP_MSS_MAX_IPV6} },

	{ "_rev_src_routes", MOD_PROTO_TCP,
	    mod_set_boolean, mod_get_boolean,
	    {B_FALSE}, {B_FALSE} },

	{ "_local_dack_interval", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {10*MS, 500*MS, 50*MS}, {50*MS} },

	/* tunable - 50 */
	{ "_local_dacks_max", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 16, 8}, {8} },

	{ "ecn", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 2, 1}, {1} },

	{ "_rst_sent_rate_enabled", MOD_PROTO_TCP,
	    mod_set_boolean, mod_get_boolean,
	    {B_TRUE}, {B_TRUE} },

	{ "_rst_sent_rate", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, UINT32_MAX, 40}, {40} },

	{ "_push_timer_interval", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, 100*MS, 50*MS}, {50*MS} },

	{ "_use_smss_as_mss_opt", MOD_PROTO_TCP,
	    mod_set_boolean, mod_get_boolean,
	    {B_FALSE}, {B_FALSE} },

	{ "_keepalive_abort_interval", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, UINT32_MAX, 8*MINUTES}, {8*MINUTES} },

	/*
	 * tcp_wroff_xtra is the extra space in front of TCP/IP header for link
	 * layer header. It has to be a multiple of 8, which is why this entry
	 * uses mod_set_aligned rather than mod_set_uint32.
	 */
	{ "_wroff_xtra", MOD_PROTO_TCP,
	    mod_set_aligned, mod_get_uint32,
	    {0, 256, 32}, {32} },

	{ "_dev_flow_ctl", MOD_PROTO_TCP,
	    mod_set_boolean, mod_get_boolean,
	    {B_FALSE}, {B_FALSE} },

	{ "_reass_timeout", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {0, UINT32_MAX, 100*SECONDS}, {100*SECONDS} },

	/* tunable - 60 */
	{ "extra_priv_ports", MOD_PROTO_TCP,
	    mod_set_extra_privports, mod_get_extra_privports,
	    {1, ULP_MAX_PORT, 0}, {0} },

	/* Write-only: sets the RFC 1948 pass phrase. */
	{ "_1948_phrase", MOD_PROTO_TCP,
	    tcp_set_1948phrase, NULL, {0}, {0} },

	/* Read-only: lists the listener limit configuration. */
	{ "_listener_limit_conf", MOD_PROTO_TCP,
	    NULL, tcp_listener_conf_get, {0}, {0} },

	/* Write-only: adds or updates a "port:ratio" listener limit. */
	{ "_listener_limit_conf_add", MOD_PROTO_TCP,
	    tcp_listener_conf_add, NULL, {0}, {0} },

	/* Write-only: removes the listener limit for a port. */
	{ "_listener_limit_conf_del", MOD_PROTO_TCP,
	    tcp_listener_conf_del, NULL, {0}, {0} },

	{ "_iss_incr", MOD_PROTO_TCP,
	    mod_set_uint32, mod_get_uint32,
	    {1, ISS_INCR, ISS_INCR},
	    {ISS_INCR} },

	{ "?", MOD_PROTO_TCP, NULL, mod_get_allprop, {0}, {0} },

	/* Terminator. */
	{ NULL, 0, NULL, NULL, {0}, {0} }
};
526
/*
 * Size of tcp_propinfo_tbl[] as computed by A_CNT().
 * NOTE(review): presumably A_CNT() yields the number of array elements, in
 * which case this count includes the "?" entry and the NULL terminator --
 * confirm before using it as a loop bound.
 */
int tcp_propinfo_count = A_CNT(tcp_propinfo_tbl);