1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 /* This file contains all TCP kernel socket related functions. */
27
28 #include <sys/types.h>
29 #include <sys/strlog.h>
30 #include <sys/policy.h>
31 #include <sys/sockio.h>
32 #include <sys/strsubr.h>
33 #include <sys/strsun.h>
34 #include <sys/squeue_impl.h>
35 #include <sys/squeue.h>
36 #define _SUN_TPI_VERSION 2
37 #include <sys/tihdr.h>
38 #include <sys/timod.h>
39 #include <sys/tpicommon.h>
40 #include <sys/socketvar.h>
41
42 #include <inet/common.h>
43 #include <inet/proto_set.h>
44 #include <inet/ip.h>
45 #include <inet/tcp.h>
46 #include <inet/tcp_impl.h>
47
48 static void tcp_activate(sock_lower_handle_t, sock_upper_handle_t,
49 sock_upcalls_t *, int, cred_t *);
50 static int tcp_accept(sock_lower_handle_t, sock_lower_handle_t,
51 sock_upper_handle_t, cred_t *);
52 static int tcp_bind(sock_lower_handle_t, struct sockaddr *,
53 socklen_t, cred_t *);
54 static int tcp_listen(sock_lower_handle_t, int, cred_t *);
55 static int tcp_connect(sock_lower_handle_t, const struct sockaddr *,
56 socklen_t, sock_connid_t *, cred_t *);
57 static int tcp_getpeername(sock_lower_handle_t, struct sockaddr *,
58 socklen_t *, cred_t *);
59 static int tcp_getsockname(sock_lower_handle_t, struct sockaddr *,
60 socklen_t *, cred_t *);
61 static int tcp_getsockopt(sock_lower_handle_t, int, int, void *,
62 socklen_t *, cred_t *);
63 static int tcp_setsockopt(sock_lower_handle_t, int, int, const void *,
64 socklen_t, cred_t *);
65 static int tcp_sendmsg(sock_lower_handle_t, mblk_t *, struct nmsghdr *,
66 cred_t *);
67 static int tcp_shutdown(sock_lower_handle_t, int, cred_t *);
68 static void tcp_clr_flowctrl(sock_lower_handle_t);
69 static int tcp_ioctl(sock_lower_handle_t, int, intptr_t, int, int32_t *,
70 cred_t *);
71 static int tcp_close(sock_lower_handle_t, int, cred_t *);
72
73 sock_downcalls_t sock_tcp_downcalls = {
74 tcp_activate,
75 tcp_accept,
76 tcp_bind,
77 tcp_listen,
78 tcp_connect,
79 tcp_getpeername,
80 tcp_getsockname,
81 tcp_getsockopt,
82 tcp_setsockopt,
83 tcp_sendmsg,
84 NULL,
85 NULL,
86 NULL,
87 tcp_shutdown,
88 tcp_clr_flowctrl,
89 tcp_ioctl,
90 tcp_close,
91 };
92
93 /* ARGSUSED */
94 static void
95 tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle,
96 sock_upcalls_t *sock_upcalls, int flags, cred_t *cr)
97 {
98 conn_t *connp = (conn_t *)proto_handle;
99 struct sock_proto_props sopp;
100 extern struct module_info tcp_rinfo;
101
102 ASSERT(connp->conn_upper_handle == NULL);
103
104 /* All Solaris components should pass a cred for this operation. */
105 ASSERT(cr != NULL);
106
107 sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
108 SOCKOPT_MAXPSZ | SOCKOPT_MAXBLK | SOCKOPT_RCVTIMER |
109 SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ;
110
111 sopp.sopp_rxhiwat = SOCKET_RECVHIWATER;
112 sopp.sopp_rxlowat = SOCKET_RECVLOWATER;
113 sopp.sopp_maxpsz = INFPSZ;
114 sopp.sopp_maxblk = INFPSZ;
115 sopp.sopp_rcvtimer = SOCKET_TIMER_INTERVAL;
116 sopp.sopp_rcvthresh = SOCKET_RECVHIWATER >> 3;
117 sopp.sopp_maxaddrlen = sizeof (sin6_t);
118 sopp.sopp_minpsz = (tcp_rinfo.mi_minpsz == 1) ? 0 :
119 tcp_rinfo.mi_minpsz;
120
121 connp->conn_upcalls = sock_upcalls;
122 connp->conn_upper_handle = sock_handle;
123
124 ASSERT(connp->conn_rcvbuf != 0 &&
125 connp->conn_rcvbuf == connp->conn_tcp->tcp_rwnd);
126 (*sock_upcalls->su_set_proto_props)(sock_handle, &sopp);
127 }
128
129 /*ARGSUSED*/
130 static int
131 tcp_accept(sock_lower_handle_t lproto_handle,
132 sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
133 cred_t *cr)
134 {
135 conn_t *lconnp, *econnp;
136 tcp_t *listener, *eager;
137
138 /* All Solaris components should pass a cred for this operation. */
139 ASSERT(cr != NULL);
140
141 /*
142 * KSSL can move a socket from one listener to another, in which
143 * case `lproto_handle' points to the new listener. To ensure that
144 * the original listener is used the information is obtained from
145 * the eager.
146 */
147 econnp = (conn_t *)eproto_handle;
148 eager = econnp->conn_tcp;
149 ASSERT(IPCL_IS_NONSTR(econnp));
150 ASSERT(eager->tcp_listener != NULL);
151 listener = eager->tcp_listener;
152 lconnp = (conn_t *)listener->tcp_connp;
153 ASSERT(listener->tcp_state == TCPS_LISTEN);
154 ASSERT(lconnp->conn_upper_handle != NULL);
155
156 /*
157 * It is possible for the accept thread to race with the thread that
158 * made the su_newconn upcall in tcp_newconn_notify. Both
159 * tcp_newconn_notify and tcp_accept require that conn_upper_handle
160 * and conn_upcalls be set before returning, so they both write to
161 * them. However, we're guaranteed that the value written is the same
162 * for both threads.
163 */
164 ASSERT(econnp->conn_upper_handle == NULL ||
165 econnp->conn_upper_handle == sock_handle);
166 ASSERT(econnp->conn_upcalls == NULL ||
167 econnp->conn_upcalls == lconnp->conn_upcalls);
168 econnp->conn_upper_handle = sock_handle;
169 econnp->conn_upcalls = lconnp->conn_upcalls;
170
171 ASSERT(econnp->conn_netstack ==
172 listener->tcp_connp->conn_netstack);
173 ASSERT(eager->tcp_tcps == listener->tcp_tcps);
174
175 /*
176 * We should have a minimum of 2 references on the conn at this
177 * point. One for TCP and one for the newconn notification
178 * (which is now taken over by IP). In the normal case we would
179 * also have another reference (making a total of 3) for the conn
180 * being in the classifier hash list. However the eager could have
181 * received an RST subsequently and tcp_closei_local could have
182 * removed the eager from the classifier hash list, hence we can't
183 * assert that reference.
184 */
185 ASSERT(econnp->conn_ref >= 2);
186
187 mutex_enter(&listener->tcp_eager_lock);
188 /*
189 * Non-STREAMS listeners never defer the notification of new
190 * connections.
191 */
192 ASSERT(!listener->tcp_eager_prev_q0->tcp_conn_def_q0);
193 tcp_eager_unlink(eager);
194 mutex_exit(&listener->tcp_eager_lock);
195 CONN_DEC_REF(listener->tcp_connp);
196
197 return ((eager->tcp_state < TCPS_ESTABLISHED) ? ECONNABORTED : 0);
198 }
199
200 static int
201 tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
202 socklen_t len, cred_t *cr)
203 {
204 int error;
205 conn_t *connp = (conn_t *)proto_handle;
206
207 /* All Solaris components should pass a cred for this operation. */
208 ASSERT(cr != NULL);
209 ASSERT(connp->conn_upper_handle != NULL);
210
211 error = squeue_synch_enter(connp, NULL);
212 if (error != 0) {
213 /* failed to enter */
214 return (ENOSR);
215 }
216
217 /* binding to a NULL address really means unbind */
218 if (sa == NULL) {
219 if (connp->conn_tcp->tcp_state < TCPS_LISTEN)
220 error = tcp_do_unbind(connp);
221 else
222 error = EINVAL;
223 } else {
224 error = tcp_do_bind(connp, sa, len, cr, B_TRUE);
225 }
226
227 squeue_synch_exit(connp);
228
229 if (error < 0) {
230 if (error == -TOUTSTATE)
231 error = EINVAL;
232 else
233 error = proto_tlitosyserr(-error);
234 }
235
236 return (error);
237 }
238
239 /* ARGSUSED */
240 static int
241 tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
242 {
243 conn_t *connp = (conn_t *)proto_handle;
244 tcp_t *tcp = connp->conn_tcp;
245 int error;
246
247 ASSERT(connp->conn_upper_handle != NULL);
248
249 /* All Solaris components should pass a cred for this operation. */
250 ASSERT(cr != NULL);
251
252 error = squeue_synch_enter(connp, NULL);
253 if (error != 0) {
254 /* failed to enter */
255 return (ENOBUFS);
256 }
257
258 error = tcp_do_listen(connp, NULL, 0, backlog, cr, B_FALSE);
259 if (error == 0) {
260 /*
261 * sockfs needs to know what's the maximum number of socket
262 * that can be queued on the listener.
263 */
264 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
265 SOCK_OPCTL_ENAB_ACCEPT,
266 (uintptr_t)(tcp->tcp_conn_req_max +
267 tcp->tcp_tcps->tcps_conn_req_max_q0));
268 } else if (error < 0) {
269 if (error == -TOUTSTATE)
270 error = EINVAL;
271 else
272 error = proto_tlitosyserr(-error);
273 }
274 squeue_synch_exit(connp);
275 return (error);
276 }
277
278 static int
279 tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
280 socklen_t len, sock_connid_t *id, cred_t *cr)
281 {
282 conn_t *connp = (conn_t *)proto_handle;
283 int error;
284
285 ASSERT(connp->conn_upper_handle != NULL);
286
287 /* All Solaris components should pass a cred for this operation. */
288 ASSERT(cr != NULL);
289
290 error = proto_verify_ip_addr(connp->conn_family, sa, len);
291 if (error != 0) {
292 return (error);
293 }
294
295 error = squeue_synch_enter(connp, NULL);
296 if (error != 0) {
297 /* failed to enter */
298 return (ENOSR);
299 }
300
301 /*
302 * TCP supports quick connect, so no need to do an implicit bind
303 */
304 error = tcp_do_connect(connp, sa, len, cr, curproc->p_pid);
305 if (error == 0) {
306 *id = connp->conn_tcp->tcp_connid;
307 } else if (error < 0) {
308 if (error == -TOUTSTATE) {
309 switch (connp->conn_tcp->tcp_state) {
310 case TCPS_SYN_SENT:
311 error = EALREADY;
312 break;
313 case TCPS_ESTABLISHED:
314 error = EISCONN;
315 break;
316 case TCPS_LISTEN:
317 error = EOPNOTSUPP;
318 break;
319 default:
320 error = EINVAL;
321 break;
322 }
323 } else {
324 error = proto_tlitosyserr(-error);
325 }
326 }
327
328 if (connp->conn_tcp->tcp_loopback) {
329 struct sock_proto_props sopp;
330
331 sopp.sopp_flags = SOCKOPT_LOOPBACK;
332 sopp.sopp_loopback = B_TRUE;
333
334 (*connp->conn_upcalls->su_set_proto_props)(
335 connp->conn_upper_handle, &sopp);
336 }
337 done:
338 squeue_synch_exit(connp);
339
340 return ((error == 0) ? EINPROGRESS : error);
341 }
342
343 /* ARGSUSED3 */
344 static int
345 tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr,
346 socklen_t *addrlenp, cred_t *cr)
347 {
348 conn_t *connp = (conn_t *)proto_handle;
349 tcp_t *tcp = connp->conn_tcp;
350
351 /* All Solaris components should pass a cred for this operation. */
352 ASSERT(cr != NULL);
353
354 ASSERT(tcp != NULL);
355 if (tcp->tcp_state < TCPS_SYN_RCVD)
356 return (ENOTCONN);
357
358 return (conn_getpeername(connp, addr, addrlenp));
359 }
360
361 /* ARGSUSED3 */
362 static int
363 tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr,
364 socklen_t *addrlenp, cred_t *cr)
365 {
366 conn_t *connp = (conn_t *)proto_handle;
367
368 /* All Solaris components should pass a cred for this operation. */
369 ASSERT(cr != NULL);
370
371 return (conn_getsockname(connp, addr, addrlenp));
372 }
373
374 /* returns UNIX error, the optlen is a value-result arg */
375 static int
376 tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
377 void *optvalp, socklen_t *optlen, cred_t *cr)
378 {
379 conn_t *connp = (conn_t *)proto_handle;
380 int error;
381 t_uscalar_t max_optbuf_len;
382 void *optvalp_buf;
383 int len;
384
385 ASSERT(connp->conn_upper_handle != NULL);
386
387 /* All Solaris components should pass a cred for this operation. */
388 ASSERT(cr != NULL);
389
390 error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
391 tcp_opt_obj.odb_opt_des_arr,
392 tcp_opt_obj.odb_opt_arr_cnt,
393 B_FALSE, B_TRUE, cr);
394 if (error != 0) {
395 if (error < 0) {
396 error = proto_tlitosyserr(-error);
397 }
398 return (error);
399 }
400
401 optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
402
403 error = squeue_synch_enter(connp, NULL);
404 if (error == ENOMEM) {
405 kmem_free(optvalp_buf, max_optbuf_len);
406 return (ENOMEM);
407 }
408
409 len = tcp_opt_get(connp, level, option_name, optvalp_buf);
410 squeue_synch_exit(connp);
411
412 if (len == -1) {
413 kmem_free(optvalp_buf, max_optbuf_len);
414 return (EINVAL);
415 }
416
417 /*
418 * update optlen and copy option value
419 */
420 t_uscalar_t size = MIN(len, *optlen);
421
422 bcopy(optvalp_buf, optvalp, size);
423 bcopy(&size, optlen, sizeof (size));
424
425 kmem_free(optvalp_buf, max_optbuf_len);
426 return (0);
427 }
428
429 static int
430 tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
431 const void *optvalp, socklen_t optlen, cred_t *cr)
432 {
433 conn_t *connp = (conn_t *)proto_handle;
434 int error;
435
436 ASSERT(connp->conn_upper_handle != NULL);
437
438 /* All Solaris components should pass a cred for this operation. */
439 ASSERT(cr != NULL);
440
441 /*
442 * Entering the squeue synchronously can result in a context switch,
443 * which can cause a rather sever performance degradation. So we try to
444 * handle whatever options we can without entering the squeue.
445 */
446 if (level == IPPROTO_TCP) {
447 switch (option_name) {
448 case TCP_NODELAY:
449 if (optlen != sizeof (int32_t))
450 return (EINVAL);
451 mutex_enter(&connp->conn_tcp->tcp_non_sq_lock);
452 connp->conn_tcp->tcp_naglim = *(int *)optvalp ? 1 :
453 connp->conn_tcp->tcp_mss;
454 mutex_exit(&connp->conn_tcp->tcp_non_sq_lock);
455 return (0);
456 default:
457 break;
458 }
459 }
460
461 error = squeue_synch_enter(connp, NULL);
462 if (error == ENOMEM) {
463 return (ENOMEM);
464 }
465
466 error = proto_opt_check(level, option_name, optlen, NULL,
467 tcp_opt_obj.odb_opt_des_arr,
468 tcp_opt_obj.odb_opt_arr_cnt,
469 B_TRUE, B_FALSE, cr);
470
471 if (error != 0) {
472 if (error < 0) {
473 error = proto_tlitosyserr(-error);
474 }
475 squeue_synch_exit(connp);
476 return (error);
477 }
478
479 error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name,
480 optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp,
481 NULL, cr);
482 squeue_synch_exit(connp);
483
484 ASSERT(error >= 0);
485
486 return (error);
487 }
488
489 /* ARGSUSED */
490 static int
491 tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
492 cred_t *cr)
493 {
494 tcp_t *tcp;
495 uint32_t msize;
496 conn_t *connp = (conn_t *)proto_handle;
497 int32_t tcpstate;
498
499 /* All Solaris components should pass a cred for this operation. */
500 ASSERT(cr != NULL);
501
502 ASSERT(connp->conn_ref >= 2);
503 ASSERT(connp->conn_upper_handle != NULL);
504
505 if (msg->msg_controllen != 0) {
506 freemsg(mp);
507 return (EOPNOTSUPP);
508 }
509
510 switch (DB_TYPE(mp)) {
511 case M_DATA:
512 tcp = connp->conn_tcp;
513 ASSERT(tcp != NULL);
514
515 tcpstate = tcp->tcp_state;
516 if (tcpstate < TCPS_ESTABLISHED) {
517 freemsg(mp);
518 /*
519 * We return ENOTCONN if the endpoint is trying to
520 * connect or has never been connected, and EPIPE if it
521 * has been disconnected. The connection id helps us
522 * distinguish between the last two cases.
523 */
524 return ((tcpstate == TCPS_SYN_SENT) ? ENOTCONN :
525 ((tcp->tcp_connid > 0) ? EPIPE : ENOTCONN));
526 } else if (tcpstate > TCPS_CLOSE_WAIT) {
527 freemsg(mp);
528 return (EPIPE);
529 }
530
531 msize = msgdsize(mp);
532
533 mutex_enter(&tcp->tcp_non_sq_lock);
534 tcp->tcp_squeue_bytes += msize;
535 /*
536 * Squeue Flow Control
537 */
538 if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) {
539 tcp_setqfull(tcp);
540 }
541 mutex_exit(&tcp->tcp_non_sq_lock);
542
543 /*
544 * The application may pass in an address in the msghdr, but
545 * we ignore the address on connection-oriented sockets.
546 * Just like BSD this code does not generate an error for
547 * TCP (a CONNREQUIRED socket) when sending to an address
548 * passed in with sendto/sendmsg. Instead the data is
549 * delivered on the connection as if no address had been
550 * supplied.
551 */
552 CONN_INC_REF(connp);
553
554 if (msg->msg_flags & MSG_OOB) {
555 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent,
556 connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
557 } else {
558 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output,
559 connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
560 }
561
562 return (0);
563
564 default:
565 ASSERT(0);
566 }
567
568 freemsg(mp);
569 return (0);
570 }
571
572 /* ARGSUSED */
573 static int
574 tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
575 {
576 conn_t *connp = (conn_t *)proto_handle;
577 tcp_t *tcp = connp->conn_tcp;
578
579 ASSERT(connp->conn_upper_handle != NULL);
580
581 /* All Solaris components should pass a cred for this operation. */
582 ASSERT(cr != NULL);
583
584 /*
585 * X/Open requires that we check the connected state.
586 */
587 if (tcp->tcp_state < TCPS_SYN_SENT)
588 return (ENOTCONN);
589
590 /* shutdown the send side */
591 if (how != SHUT_RD) {
592 mblk_t *bp;
593
594 bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL);
595 CONN_INC_REF(connp);
596 SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output,
597 connp, NULL, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT);
598
599 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
600 SOCK_OPCTL_SHUT_SEND, 0);
601 }
602
603 /* shutdown the recv side */
604 if (how != SHUT_WR)
605 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
606 SOCK_OPCTL_SHUT_RECV, 0);
607
608 return (0);
609 }
610
611 static void
612 tcp_clr_flowctrl(sock_lower_handle_t proto_handle)
613 {
614 conn_t *connp = (conn_t *)proto_handle;
615 tcp_t *tcp = connp->conn_tcp;
616 mblk_t *mp;
617 int error;
618
619 ASSERT(connp->conn_upper_handle != NULL);
620
621 /*
622 * If tcp->tcp_rsrv_mp == NULL, it means that tcp_clr_flowctrl()
623 * is currently running.
624 */
625 mutex_enter(&tcp->tcp_rsrv_mp_lock);
626 if ((mp = tcp->tcp_rsrv_mp) == NULL) {
627 mutex_exit(&tcp->tcp_rsrv_mp_lock);
628 return;
629 }
630 tcp->tcp_rsrv_mp = NULL;
631 mutex_exit(&tcp->tcp_rsrv_mp_lock);
632
633 error = squeue_synch_enter(connp, mp);
634 ASSERT(error == 0);
635
636 mutex_enter(&tcp->tcp_rsrv_mp_lock);
637 tcp->tcp_rsrv_mp = mp;
638 mutex_exit(&tcp->tcp_rsrv_mp_lock);
639
640 if (tcp->tcp_fused) {
641 tcp_fuse_backenable(tcp);
642 } else {
643 tcp->tcp_rwnd = connp->conn_rcvbuf;
644 /*
645 * Send back a window update immediately if TCP is above
646 * ESTABLISHED state and the increase of the rcv window
647 * that the other side knows is at least 1 MSS after flow
648 * control is lifted.
649 */
650 if (tcp->tcp_state >= TCPS_ESTABLISHED &&
651 tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
652 tcp_xmit_ctl(NULL, tcp,
653 (tcp->tcp_swnd == 0) ? tcp->tcp_suna :
654 tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
655 }
656 }
657
658 squeue_synch_exit(connp);
659 }
660
661 /* ARGSUSED */
662 static int
663 tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
664 int mode, int32_t *rvalp, cred_t *cr)
665 {
666 conn_t *connp = (conn_t *)proto_handle;
667 int error;
668
669 ASSERT(connp->conn_upper_handle != NULL);
670
671 /* All Solaris components should pass a cred for this operation. */
672 ASSERT(cr != NULL);
673
674 /*
675 * If we don't have a helper stream then create one.
676 * ip_create_helper_stream takes care of locking the conn_t,
677 * so this check for NULL is just a performance optimization.
678 */
679 if (connp->conn_helper_info == NULL) {
680 tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps;
681
682 /*
683 * Create a helper stream for non-STREAMS socket.
684 */
685 error = ip_create_helper_stream(connp, tcps->tcps_ldi_ident);
686 if (error != 0) {
687 ip0dbg(("tcp_ioctl: create of IP helper stream "
688 "failed %d\n", error));
689 return (error);
690 }
691 }
692
693 switch (cmd) {
694 case ND_SET:
695 case ND_GET:
696 case _SIOCSOCKFALLBACK:
697 case TCP_IOC_ABORT_CONN:
698 case TI_GETPEERNAME:
699 case TI_GETMYNAME:
700 ip1dbg(("tcp_ioctl: cmd 0x%x on non streams socket",
701 cmd));
702 error = EINVAL;
703 break;
704 default:
705 /*
706 * If the conn is not closing, pass on to IP using
707 * helper stream. Bump the ioctlref to prevent tcp_close
708 * from closing the rq/wq out from underneath the ioctl
709 * if it ends up queued or aborted/interrupted.
710 */
711 mutex_enter(&connp->conn_lock);
712 if (connp->conn_state_flags & (CONN_CLOSING)) {
713 mutex_exit(&connp->conn_lock);
714 error = EINVAL;
715 break;
716 }
717 CONN_INC_IOCTLREF_LOCKED(connp);
718 error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
719 cmd, arg, mode, cr, rvalp);
720 CONN_DEC_IOCTLREF(connp);
721 break;
722 }
723 return (error);
724 }
725
726 /* ARGSUSED */
727 static int
728 tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
729 {
730 conn_t *connp = (conn_t *)proto_handle;
731
732 ASSERT(connp->conn_upper_handle != NULL);
733
734 /* All Solaris components should pass a cred for this operation. */
735 ASSERT(cr != NULL);
736
737 tcp_close_common(connp, flags);
738
739 ip_free_helper_stream(connp);
740
741 /*
742 * Drop IP's reference on the conn. This is the last reference
743 * on the connp if the state was less than established. If the
744 * connection has gone into timewait state, then we will have
745 * one ref for the TCP and one more ref (total of two) for the
746 * classifier connected hash list (a timewait connections stays
747 * in connected hash till closed).
748 *
749 * We can't assert the references because there might be other
750 * transient reference places because of some walkers or queued
751 * packets in squeue for the timewait state.
752 */
753 CONN_DEC_REF(connp);
754
755 /*
756 * EINPROGRESS tells sockfs to wait for a 'closed' upcall before
757 * freeing the socket.
758 */
759 return (EINPROGRESS);
760 }
761
762 /* ARGSUSED */
763 sock_lower_handle_t
764 tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
765 uint_t *smodep, int *errorp, int flags, cred_t *credp)
766 {
767 conn_t *connp;
768 boolean_t isv6 = family == AF_INET6;
769
770 if (type != SOCK_STREAM || (family != AF_INET && family != AF_INET6) ||
771 (proto != 0 && proto != IPPROTO_TCP)) {
772 *errorp = EPROTONOSUPPORT;
773 return (NULL);
774 }
775
776 connp = tcp_create_common(credp, isv6, B_TRUE, errorp);
777 if (connp == NULL) {
778 return (NULL);
779 }
780
781 /*
782 * Put the ref for TCP. Ref for IP was already put
783 * by ipcl_conn_create. Also make the conn_t globally
784 * visible to walkers
785 */
786 mutex_enter(&connp->conn_lock);
787 CONN_INC_REF_LOCKED(connp);
788 ASSERT(connp->conn_ref == 2);
789 connp->conn_state_flags &= ~CONN_INCIPIENT;
790
791 connp->conn_flags |= IPCL_NONSTR;
792 mutex_exit(&connp->conn_lock);
793
794 ASSERT(errorp != NULL);
795 *errorp = 0;
796 *sock_downcalls = &sock_tcp_downcalls;
797 *smodep = SM_CONNREQUIRED | SM_EXDATA | SM_ACCEPTSUPP |
798 SM_SENDFILESUPP;
799
800 return ((sock_lower_handle_t)connp);
801 }
802
803 /*
804 * tcp_fallback
805 *
806 * A direct socket is falling back to using STREAMS. The queue
807 * that is being passed down was created using tcp_open() with
808 * the SO_FALLBACK flag set. As a result, the queue is not
809 * associated with a conn, and the q_ptrs instead contain the
810 * dev and minor area that should be used.
811 *
812 * The 'issocket' flag indicates whether the FireEngine
813 * optimizations should be used. The common case would be that
814 * optimizations are enabled, and they might be subsequently
815 * disabled using the _SIOCSOCKFALLBACK ioctl.
816 */
817
818 /*
819 * An active connection is falling back to TPI. Gather all the information
820 * required by the STREAM head and TPI sonode and send it up.
821 */
822 static void
823 tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
824 boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb,
825 sock_quiesce_arg_t *arg)
826 {
827 conn_t *connp = tcp->tcp_connp;
828 struct stroptions *stropt;
829 struct T_capability_ack tca;
830 struct sockaddr_in6 laddr, faddr;
831 socklen_t laddrlen, faddrlen;
832 short opts;
833 int error;
834 mblk_t *mp, *mpnext;
835
836 connp->conn_dev = (dev_t)RD(q)->q_ptr;
837 connp->conn_minor_arena = WR(q)->q_ptr;
838
839 RD(q)->q_ptr = WR(q)->q_ptr = connp;
840
841 connp->conn_rq = RD(q);
842 connp->conn_wq = WR(q);
843
844 WR(q)->q_qinfo = &tcp_sock_winit;
845
846 if (!issocket)
847 tcp_use_pure_tpi(tcp);
848
849 /*
850 * free the helper stream
851 */
852 ip_free_helper_stream(connp);
853
854 /*
855 * Notify the STREAM head about options
856 */
857 DB_TYPE(stropt_mp) = M_SETOPTS;
858 stropt = (struct stroptions *)stropt_mp->b_rptr;
859 stropt_mp->b_wptr += sizeof (struct stroptions);
860 stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
861
862 stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 :
863 tcp->tcp_tcps->tcps_wroff_xtra);
864 if (tcp->tcp_snd_sack_ok)
865 stropt->so_wroff += TCPOPT_MAX_SACK_LEN;
866 stropt->so_hiwat = connp->conn_rcvbuf;
867 stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
868
869 putnext(RD(q), stropt_mp);
870
871 /*
872 * Collect the information needed to sync with the sonode
873 */
874 tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID);
875
876 laddrlen = faddrlen = sizeof (sin6_t);
877 (void) tcp_getsockname((sock_lower_handle_t)connp,
878 (struct sockaddr *)&laddr, &laddrlen, CRED());
879 error = tcp_getpeername((sock_lower_handle_t)connp,
880 (struct sockaddr *)&faddr, &faddrlen, CRED());
881 if (error != 0)
882 faddrlen = 0;
883
884 opts = 0;
885 if (connp->conn_oobinline)
886 opts |= SO_OOBINLINE;
887 if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
888 opts |= SO_DONTROUTE;
889
890 /*
891 * Notify the socket that the protocol is now quiescent,
892 * and it's therefore safe move data from the socket
893 * to the stream head.
894 */
895 mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca,
896 (struct sockaddr *)&laddr, laddrlen,
897 (struct sockaddr *)&faddr, faddrlen, opts);
898
899 while (mp != NULL) {
900 mpnext = mp->b_next;
901 tcp->tcp_rcv_list = mp->b_next;
902 mp->b_next = NULL;
903 putnext(q, mp);
904 mp = mpnext;
905 }
906 ASSERT(tcp->tcp_rcv_last_head == NULL);
907 ASSERT(tcp->tcp_rcv_last_tail == NULL);
908 ASSERT(tcp->tcp_rcv_cnt == 0);
909
910 /*
911 * All eagers in q0 are marked as being non-STREAM, so they will
912 * make su_newconn upcalls when the handshake completes, which
913 * will fail (resulting in the conn being closed). So we just blow
914 * off everything in q0 instead of waiting for the inevitable.
915 */
916 if (tcp->tcp_conn_req_cnt_q0 != 0)
917 tcp_eager_cleanup(tcp, B_TRUE);
918 }
919
920 /*
921 * An eager is falling back to TPI. All we have to do is send
922 * up a T_CONN_IND.
923 */
924 static void
925 tcp_fallback_eager(tcp_t *eager, boolean_t issocket,
926 so_proto_quiesced_cb_t quiesced_cb, sock_quiesce_arg_t *arg)
927 {
928 conn_t *connp = eager->tcp_connp;
929 tcp_t *listener = eager->tcp_listener;
930 mblk_t *mp;
931
932 ASSERT(listener != NULL);
933
934 /*
935 * Notify the socket that the protocol is now quiescent,
936 * and it's therefore safe move data from the socket
937 * to tcp's rcv queue.
938 */
939 mp = (*quiesced_cb)(connp->conn_upper_handle, arg, NULL, NULL, 0,
940 NULL, 0, 0);
941
942 if (mp != NULL) {
943 ASSERT(eager->tcp_rcv_cnt == 0);
944
945 eager->tcp_rcv_list = mp;
946 eager->tcp_rcv_cnt = msgdsize(mp);
947 while (mp->b_next != NULL) {
948 mp = mp->b_next;
949 eager->tcp_rcv_cnt += msgdsize(mp);
950 }
951 eager->tcp_rcv_last_head = mp;
952 while (mp->b_cont)
953 mp = mp->b_cont;
954 eager->tcp_rcv_last_tail = mp;
955 if (eager->tcp_rcv_cnt > eager->tcp_rwnd)
956 eager->tcp_rwnd = 0;
957 else
958 eager->tcp_rwnd -= eager->tcp_rcv_cnt;
959 }
960
961 if (!issocket)
962 eager->tcp_issocket = B_FALSE;
963 /*
964 * The stream for this eager does not yet exist, so mark it as
965 * being detached.
966 */
967 eager->tcp_detached = B_TRUE;
968 eager->tcp_hard_binding = B_TRUE;
969 connp->conn_rq = listener->tcp_connp->conn_rq;
970 connp->conn_wq = listener->tcp_connp->conn_wq;
971
972 /* Send up the connection indication */
973 mp = eager->tcp_conn.tcp_eager_conn_ind;
974 ASSERT(mp != NULL);
975 eager->tcp_conn.tcp_eager_conn_ind = NULL;
976
977 /*
978 * TLI/XTI applications will get confused by
979 * sending eager as an option since it violates
980 * the option semantics. So remove the eager as
981 * option since TLI/XTI app doesn't need it anyway.
982 */
983 if (!issocket) {
984 struct T_conn_ind *conn_ind;
985
986 conn_ind = (struct T_conn_ind *)mp->b_rptr;
987 conn_ind->OPT_length = 0;
988 conn_ind->OPT_offset = 0;
989 }
990
991 /*
992 * Sockfs guarantees that the listener will not be closed
993 * during fallback. So we can safely use the listener's queue.
994 */
995 putnext(listener->tcp_connp->conn_rq, mp);
996 }
997
998
999 int
1000 tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
1001 boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb,
1002 sock_quiesce_arg_t *arg)
1003 {
1004 tcp_t *tcp;
1005 conn_t *connp = (conn_t *)proto_handle;
1006 int error;
1007 mblk_t *stropt_mp;
1008 mblk_t *ordrel_mp;
1009
1010 tcp = connp->conn_tcp;
1011
1012 stropt_mp = allocb_wait(sizeof (struct stroptions), BPRI_HI, STR_NOSIG,
1013 NULL);
1014
1015 /* Pre-allocate the T_ordrel_ind mblk. */
1016 ASSERT(tcp->tcp_ordrel_mp == NULL);
1017 ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI,
1018 STR_NOSIG, NULL);
1019 ordrel_mp->b_datap->db_type = M_PROTO;
1020 ((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND;
1021 ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind);
1022
1023 /*
1024 * Enter the squeue so that no new packets can come in
1025 */
1026 error = squeue_synch_enter(connp, NULL);
1027 if (error != 0) {
1028 /* failed to enter, free all the pre-allocated messages. */
1029 freeb(stropt_mp);
1030 freeb(ordrel_mp);
1031 return (ENOMEM);
1032 }
1033
1034 /*
1035 * Both endpoints must be of the same type (either STREAMS or
1036 * non-STREAMS) for fusion to be enabled. So if we are fused,
1037 * we have to unfuse.
1038 */
1039 if (tcp->tcp_fused)
1040 tcp_unfuse(tcp);
1041
1042 if (tcp->tcp_listener != NULL) {
1043 /* The eager will deal with opts when accept() is called */
1044 freeb(stropt_mp);
1045 tcp_fallback_eager(tcp, direct_sockfs, quiesced_cb, arg);
1046 } else {
1047 tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs,
1048 quiesced_cb, arg);
1049 }
1050
1051 /*
1052 * No longer a direct socket
1053 *
1054 * Note that we intentionally leave the upper_handle and upcalls
1055 * intact, since eagers may still be using them.
1056 */
1057 connp->conn_flags &= ~IPCL_NONSTR;
1058 tcp->tcp_ordrel_mp = ordrel_mp;
1059
1060 /*
1061 * There should be atleast two ref's (IP + TCP)
1062 */
1063 ASSERT(connp->conn_ref >= 2);
1064 squeue_synch_exit(connp);
1065
1066 return (0);
1067 }
1068
1069 /*
1070 * Notifies a non-STREAMS based listener about a new connection. This
1071 * function is executed on the *eager*'s squeue once the 3 way handshake
1072 * has completed. Note that the behavior differs from STREAMS, where the
1073 * T_CONN_IND is sent up by tcp_send_conn_ind() while on the *listener*'s
1074 * squeue.
1075 *
1076 * Returns B_TRUE if the notification succeeded and an upper handle was
1077 * obtained. `tcp' should be closed on failure.
1078 */
1079 boolean_t
1080 tcp_newconn_notify(tcp_t *tcp, ip_recv_attr_t *ira)
1081 {
1082 tcp_t *listener = tcp->tcp_listener;
1083 conn_t *lconnp = listener->tcp_connp;
1084 conn_t *econnp = tcp->tcp_connp;
1085 tcp_t *tail;
1086 ipaddr_t *addr_cache;
1087 sock_upper_handle_t upper;
1088 struct sock_proto_props sopp;
1089
1090 mutex_enter(&listener->tcp_eager_lock);
1091 /*
1092 * Take the eager out, if it is in the list of droppable eagers
1093 * as we are here because the 3W handshake is over.
1094 */
1095 MAKE_UNDROPPABLE(tcp);
1096 /*
1097 * The eager already has an extra ref put in tcp_input_data
1098 * so that it stays till accept comes back even though it
1099 * might get into TCPS_CLOSED as a result of a TH_RST etc.
1100 */
1101 ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
1102 listener->tcp_conn_req_cnt_q0--;
1103 listener->tcp_conn_req_cnt_q++;
1104
1105 /* Move from SYN_RCVD to ESTABLISHED list */
1106 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = tcp->tcp_eager_prev_q0;
1107 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp->tcp_eager_next_q0;
1108 tcp->tcp_eager_prev_q0 = NULL;
1109 tcp->tcp_eager_next_q0 = NULL;
1110
1111 /*
1112 * Insert at end of the queue because connections are accepted
1113 * in chronological order. Leaving the older connections at front
1114 * of the queue helps reducing search time.
1115 */
1116 tail = listener->tcp_eager_last_q;
1117 if (tail != NULL)
1118 tail->tcp_eager_next_q = tcp;
1119 else
1120 listener->tcp_eager_next_q = tcp;
1121 listener->tcp_eager_last_q = tcp;
1122 tcp->tcp_eager_next_q = NULL;
1123
1124 /* we have timed out before */
1125 if (tcp->tcp_syn_rcvd_timeout != 0) {
1126 tcp->tcp_syn_rcvd_timeout = 0;
1127 listener->tcp_syn_rcvd_timeout--;
1128 if (listener->tcp_syn_defense &&
1129 listener->tcp_syn_rcvd_timeout <=
1130 (listener->tcp_tcps->tcps_conn_req_max_q0 >> 5) &&
1131 10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() -
1132 listener->tcp_last_rcv_lbolt)) {
1133 /*
1134 * Turn off the defense mode if we
1135 * believe the SYN attack is over.
1136 */
1137 listener->tcp_syn_defense = B_FALSE;
1138 if (listener->tcp_ip_addr_cache) {
1139 kmem_free((void *)listener->tcp_ip_addr_cache,
1140 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
1141 listener->tcp_ip_addr_cache = NULL;
1142 }
1143 }
1144 }
1145 addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
1146 if (addr_cache != NULL) {
1147 /*
1148 * We have finished a 3-way handshake with this
1149 * remote host. This proves the IP addr is good.
1150 * Cache it!
1151 */
1152 addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] =
1153 tcp->tcp_connp->conn_faddr_v4;
1154 }
1155 mutex_exit(&listener->tcp_eager_lock);
1156
1157 /*
1158 * Notify the ULP about the newconn. It is guaranteed that no
1159 * tcp_accept() call will be made for the eager if the
1160 * notification fails.
1161 */
1162 if ((upper = (*lconnp->conn_upcalls->su_newconn)
1163 (lconnp->conn_upper_handle, (sock_lower_handle_t)econnp,
1164 &sock_tcp_downcalls, ira->ira_cred, ira->ira_cpid,
1165 &econnp->conn_upcalls)) == NULL) {
1166 return (B_FALSE);
1167 }
1168 econnp->conn_upper_handle = upper;
1169
1170 tcp->tcp_detached = B_FALSE;
1171 tcp->tcp_hard_binding = B_FALSE;
1172 tcp->tcp_tconnind_started = B_TRUE;
1173
1174 if (econnp->conn_keepalive) {
1175 tcp->tcp_ka_last_intrvl = 0;
1176 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
1177 tcp->tcp_ka_interval);
1178 }
1179
1180 /* Update the necessary parameters */
1181 tcp_get_proto_props(tcp, &sopp);
1182
1183 (*econnp->conn_upcalls->su_set_proto_props)
1184 (econnp->conn_upper_handle, &sopp);
1185
1186 return (B_TRUE);
1187 }