1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 /* This file contains all TCP kernel socket related functions. */ 27 28 #include <sys/types.h> 29 #include <sys/strlog.h> 30 #include <sys/policy.h> 31 #include <sys/sockio.h> 32 #include <sys/strsubr.h> 33 #include <sys/strsun.h> 34 #include <sys/squeue_impl.h> 35 #include <sys/squeue.h> 36 #define _SUN_TPI_VERSION 2 37 #include <sys/tihdr.h> 38 #include <sys/timod.h> 39 #include <sys/tpicommon.h> 40 #include <sys/socketvar.h> 41 42 #include <inet/common.h> 43 #include <inet/proto_set.h> 44 #include <inet/ip.h> 45 #include <inet/tcp.h> 46 #include <inet/tcp_impl.h> 47 48 static void tcp_activate(sock_lower_handle_t, sock_upper_handle_t, 49 sock_upcalls_t *, int, cred_t *); 50 static int tcp_accept(sock_lower_handle_t, sock_lower_handle_t, 51 sock_upper_handle_t, cred_t *); 52 static int tcp_bind(sock_lower_handle_t, struct sockaddr *, 53 socklen_t, cred_t *); 54 static int tcp_listen(sock_lower_handle_t, int, cred_t *); 55 static int tcp_connect(sock_lower_handle_t, const struct sockaddr *, 56 socklen_t, sock_connid_t *, cred_t *); 57 static int tcp_getpeername(sock_lower_handle_t, struct sockaddr *, 58 socklen_t *, cred_t *); 59 static int tcp_getsockname(sock_lower_handle_t, struct sockaddr *, 60 socklen_t *, cred_t *); 61 static int tcp_getsockopt(sock_lower_handle_t, int, int, void *, 62 socklen_t *, cred_t *); 63 static int tcp_setsockopt(sock_lower_handle_t, int, int, const void *, 64 socklen_t, cred_t *); 65 static int tcp_sendmsg(sock_lower_handle_t, mblk_t *, struct nmsghdr *, 66 cred_t *); 67 static int tcp_shutdown(sock_lower_handle_t, int, cred_t *); 68 static void tcp_clr_flowctrl(sock_lower_handle_t); 69 static int tcp_ioctl(sock_lower_handle_t, int, intptr_t, int, int32_t *, 70 cred_t *); 71 static int tcp_close(sock_lower_handle_t, int, cred_t *); 72 73 sock_downcalls_t sock_tcp_downcalls = { 74 tcp_activate, 75 tcp_accept, 76 tcp_bind, 77 tcp_listen, 78 tcp_connect, 79 tcp_getpeername, 80 tcp_getsockname, 81 tcp_getsockopt, 82 tcp_setsockopt, 83 tcp_sendmsg, 84 NULL, 85 NULL, 86 NULL, 87 tcp_shutdown, 88 tcp_clr_flowctrl, 89 tcp_ioctl, 90 tcp_close, 91 }; 92 93 /* ARGSUSED */ 94 static void 95 tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle, 96 sock_upcalls_t *sock_upcalls, int flags, cred_t *cr) 97 { 98 conn_t *connp = (conn_t *)proto_handle; 99 struct sock_proto_props sopp; 100 extern struct module_info tcp_rinfo; 101 102 ASSERT(connp->conn_upper_handle == NULL); 103 104 /* All Solaris components should pass a cred for this operation. */ 105 ASSERT(cr != NULL); 106 107 sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | 108 SOCKOPT_MAXPSZ | SOCKOPT_MAXBLK | SOCKOPT_RCVTIMER | 109 SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ; 110 111 sopp.sopp_rxhiwat = SOCKET_RECVHIWATER; 112 sopp.sopp_rxlowat = SOCKET_RECVLOWATER; 113 sopp.sopp_maxpsz = INFPSZ; 114 sopp.sopp_maxblk = INFPSZ; 115 sopp.sopp_rcvtimer = SOCKET_TIMER_INTERVAL; 116 sopp.sopp_rcvthresh = SOCKET_RECVHIWATER >> 3; 117 sopp.sopp_maxaddrlen = sizeof (sin6_t); 118 sopp.sopp_minpsz = (tcp_rinfo.mi_minpsz == 1) ? 0 : 119 tcp_rinfo.mi_minpsz; 120 121 connp->conn_upcalls = sock_upcalls; 122 connp->conn_upper_handle = sock_handle; 123 124 ASSERT(connp->conn_rcvbuf != 0 && 125 connp->conn_rcvbuf == connp->conn_tcp->tcp_rwnd); 126 (*sock_upcalls->su_set_proto_props)(sock_handle, &sopp); 127 } 128 129 /*ARGSUSED*/ 130 static int 131 tcp_accept(sock_lower_handle_t lproto_handle, 132 sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle, 133 cred_t *cr) 134 { 135 conn_t *lconnp, *econnp; 136 tcp_t *listener, *eager; 137 138 /* 139 * KSSL can move a socket from one listener to another, in which 140 * case `lproto_handle' points to the new listener. To ensure that 141 * the original listener is used the information is obtained from 142 * the eager. 143 */ 144 econnp = (conn_t *)eproto_handle; 145 eager = econnp->conn_tcp; 146 ASSERT(IPCL_IS_NONSTR(econnp)); 147 ASSERT(eager->tcp_listener != NULL); 148 listener = eager->tcp_listener; 149 lconnp = (conn_t *)listener->tcp_connp; 150 ASSERT(listener->tcp_state == TCPS_LISTEN); 151 ASSERT(lconnp->conn_upper_handle != NULL); 152 153 /* 154 * It is possible for the accept thread to race with the thread that 155 * made the su_newconn upcall in tcp_newconn_notify. Both 156 * tcp_newconn_notify and tcp_accept require that conn_upper_handle 157 * and conn_upcalls be set before returning, so they both write to 158 * them. However, we're guaranteed that the value written is the same 159 * for both threads. 160 */ 161 ASSERT(econnp->conn_upper_handle == NULL || 162 econnp->conn_upper_handle == sock_handle); 163 ASSERT(econnp->conn_upcalls == NULL || 164 econnp->conn_upcalls == lconnp->conn_upcalls); 165 econnp->conn_upper_handle = sock_handle; 166 econnp->conn_upcalls = lconnp->conn_upcalls; 167 168 ASSERT(econnp->conn_netstack == 169 listener->tcp_connp->conn_netstack); 170 ASSERT(eager->tcp_tcps == listener->tcp_tcps); 171 172 /* 173 * We should have a minimum of 2 references on the conn at this 174 * point. One for TCP and one for the newconn notification 175 * (which is now taken over by IP). In the normal case we would 176 * also have another reference (making a total of 3) for the conn 177 * being in the classifier hash list. However the eager could have 178 * received an RST subsequently and tcp_closei_local could have 179 * removed the eager from the classifier hash list, hence we can't 180 * assert that reference. 181 */ 182 ASSERT(econnp->conn_ref >= 2); 183 184 mutex_enter(&listener->tcp_eager_lock); 185 /* 186 * Non-STREAMS listeners never defer the notification of new 187 * connections. 188 */ 189 ASSERT(!listener->tcp_eager_prev_q0->tcp_conn_def_q0); 190 tcp_eager_unlink(eager); 191 mutex_exit(&listener->tcp_eager_lock); 192 CONN_DEC_REF(listener->tcp_connp); 193 194 return ((eager->tcp_state < TCPS_ESTABLISHED) ? ECONNABORTED : 0); 195 } 196 197 static int 198 tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, 199 socklen_t len, cred_t *cr) 200 { 201 int error; 202 conn_t *connp = (conn_t *)proto_handle; 203 204 /* All Solaris components should pass a cred for this operation. */ 205 ASSERT(cr != NULL); 206 ASSERT(connp->conn_upper_handle != NULL); 207 208 error = squeue_synch_enter(connp, NULL); 209 if (error != 0) { 210 /* failed to enter */ 211 return (ENOSR); 212 } 213 214 /* binding to a NULL address really means unbind */ 215 if (sa == NULL) { 216 if (connp->conn_tcp->tcp_state < TCPS_LISTEN) 217 error = tcp_do_unbind(connp); 218 else 219 error = EINVAL; 220 } else { 221 error = tcp_do_bind(connp, sa, len, cr, B_TRUE); 222 } 223 224 squeue_synch_exit(connp); 225 226 if (error < 0) { 227 if (error == -TOUTSTATE) 228 error = EINVAL; 229 else 230 error = proto_tlitosyserr(-error); 231 } 232 233 return (error); 234 } 235 236 /* ARGSUSED */ 237 static int 238 tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr) 239 { 240 conn_t *connp = (conn_t *)proto_handle; 241 tcp_t *tcp = connp->conn_tcp; 242 int error; 243 244 ASSERT(connp->conn_upper_handle != NULL); 245 246 /* All Solaris components should pass a cred for this operation. */ 247 ASSERT(cr != NULL); 248 249 error = squeue_synch_enter(connp, NULL); 250 if (error != 0) { 251 /* failed to enter */ 252 return (ENOBUFS); 253 } 254 255 error = tcp_do_listen(connp, NULL, 0, backlog, cr, B_FALSE); 256 if (error == 0) { 257 /* 258 * sockfs needs to know what's the maximum number of socket 259 * that can be queued on the listener. 260 */ 261 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 262 SOCK_OPCTL_ENAB_ACCEPT, 263 (uintptr_t)(tcp->tcp_conn_req_max + 264 tcp->tcp_tcps->tcps_conn_req_max_q0)); 265 } else if (error < 0) { 266 if (error == -TOUTSTATE) 267 error = EINVAL; 268 else 269 error = proto_tlitosyserr(-error); 270 } 271 squeue_synch_exit(connp); 272 return (error); 273 } 274 275 static int 276 tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, 277 socklen_t len, sock_connid_t *id, cred_t *cr) 278 { 279 conn_t *connp = (conn_t *)proto_handle; 280 int error; 281 282 ASSERT(connp->conn_upper_handle != NULL); 283 284 /* All Solaris components should pass a cred for this operation. */ 285 ASSERT(cr != NULL); 286 287 error = proto_verify_ip_addr(connp->conn_family, sa, len); 288 if (error != 0) { 289 return (error); 290 } 291 292 error = squeue_synch_enter(connp, NULL); 293 if (error != 0) { 294 /* failed to enter */ 295 return (ENOSR); 296 } 297 298 /* 299 * TCP supports quick connect, so no need to do an implicit bind 300 */ 301 error = tcp_do_connect(connp, sa, len, cr, curproc->p_pid); 302 if (error == 0) { 303 *id = connp->conn_tcp->tcp_connid; 304 } else if (error < 0) { 305 if (error == -TOUTSTATE) { 306 switch (connp->conn_tcp->tcp_state) { 307 case TCPS_SYN_SENT: 308 error = EALREADY; 309 break; 310 case TCPS_ESTABLISHED: 311 error = EISCONN; 312 break; 313 case TCPS_LISTEN: 314 error = EOPNOTSUPP; 315 break; 316 default: 317 error = EINVAL; 318 break; 319 } 320 } else { 321 error = proto_tlitosyserr(-error); 322 } 323 } 324 325 if (connp->conn_tcp->tcp_loopback) { 326 struct sock_proto_props sopp; 327 328 sopp.sopp_flags = SOCKOPT_LOOPBACK; 329 sopp.sopp_loopback = B_TRUE; 330 331 (*connp->conn_upcalls->su_set_proto_props)( 332 connp->conn_upper_handle, &sopp); 333 } 334 done: 335 squeue_synch_exit(connp); 336 337 return ((error == 0) ? EINPROGRESS : error); 338 } 339 340 /* ARGSUSED3 */ 341 int 342 tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr, 343 socklen_t *addrlenp, cred_t *cr) 344 { 345 conn_t *connp = (conn_t *)proto_handle; 346 tcp_t *tcp = connp->conn_tcp; 347 348 /* All Solaris components should pass a cred for this operation. */ 349 ASSERT(cr != NULL); 350 351 ASSERT(tcp != NULL); 352 if (tcp->tcp_state < TCPS_SYN_RCVD) 353 return (ENOTCONN); 354 355 return (conn_getpeername(connp, addr, addrlenp)); 356 } 357 358 /* ARGSUSED3 */ 359 int 360 tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr, 361 socklen_t *addrlenp, cred_t *cr) 362 { 363 conn_t *connp = (conn_t *)proto_handle; 364 365 /* All Solaris components should pass a cred for this operation. */ 366 ASSERT(cr != NULL); 367 368 return (conn_getsockname(connp, addr, addrlenp)); 369 } 370 371 /* returns UNIX error, the optlen is a value-result arg */ 372 static int 373 tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 374 void *optvalp, socklen_t *optlen, cred_t *cr) 375 { 376 conn_t *connp = (conn_t *)proto_handle; 377 int error; 378 t_uscalar_t max_optbuf_len; 379 void *optvalp_buf; 380 int len; 381 382 ASSERT(connp->conn_upper_handle != NULL); 383 384 error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, 385 tcp_opt_obj.odb_opt_des_arr, 386 tcp_opt_obj.odb_opt_arr_cnt, 387 B_FALSE, B_TRUE, cr); 388 if (error != 0) { 389 if (error < 0) { 390 error = proto_tlitosyserr(-error); 391 } 392 return (error); 393 } 394 395 optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP); 396 397 error = squeue_synch_enter(connp, NULL); 398 if (error == ENOMEM) { 399 kmem_free(optvalp_buf, max_optbuf_len); 400 return (ENOMEM); 401 } 402 403 len = tcp_opt_get(connp, level, option_name, optvalp_buf); 404 squeue_synch_exit(connp); 405 406 if (len == -1) { 407 kmem_free(optvalp_buf, max_optbuf_len); 408 return (EINVAL); 409 } 410 411 /* 412 * update optlen and copy option value 413 */ 414 t_uscalar_t size = MIN(len, *optlen); 415 416 bcopy(optvalp_buf, optvalp, size); 417 bcopy(&size, optlen, sizeof (size)); 418 419 kmem_free(optvalp_buf, max_optbuf_len); 420 return (0); 421 } 422 423 static int 424 tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 425 const void *optvalp, socklen_t optlen, cred_t *cr) 426 { 427 conn_t *connp = (conn_t *)proto_handle; 428 int error; 429 430 ASSERT(connp->conn_upper_handle != NULL); 431 /* 432 * Entering the squeue synchronously can result in a context switch, 433 * which can cause a rather sever performance degradation. So we try to 434 * handle whatever options we can without entering the squeue. 435 */ 436 if (level == IPPROTO_TCP) { 437 switch (option_name) { 438 case TCP_NODELAY: 439 if (optlen != sizeof (int32_t)) 440 return (EINVAL); 441 mutex_enter(&connp->conn_tcp->tcp_non_sq_lock); 442 connp->conn_tcp->tcp_naglim = *(int *)optvalp ? 1 : 443 connp->conn_tcp->tcp_mss; 444 mutex_exit(&connp->conn_tcp->tcp_non_sq_lock); 445 return (0); 446 default: 447 break; 448 } 449 } 450 451 error = squeue_synch_enter(connp, NULL); 452 if (error == ENOMEM) { 453 return (ENOMEM); 454 } 455 456 error = proto_opt_check(level, option_name, optlen, NULL, 457 tcp_opt_obj.odb_opt_des_arr, 458 tcp_opt_obj.odb_opt_arr_cnt, 459 B_TRUE, B_FALSE, cr); 460 461 if (error != 0) { 462 if (error < 0) { 463 error = proto_tlitosyserr(-error); 464 } 465 squeue_synch_exit(connp); 466 return (error); 467 } 468 469 error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name, 470 optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp, 471 NULL, cr); 472 squeue_synch_exit(connp); 473 474 ASSERT(error >= 0); 475 476 return (error); 477 } 478 479 /* ARGSUSED */ 480 static int 481 tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, 482 cred_t *cr) 483 { 484 tcp_t *tcp; 485 uint32_t msize; 486 conn_t *connp = (conn_t *)proto_handle; 487 int32_t tcpstate; 488 489 /* All Solaris components should pass a cred for this operation. */ 490 ASSERT(cr != NULL); 491 492 ASSERT(connp->conn_ref >= 2); 493 ASSERT(connp->conn_upper_handle != NULL); 494 495 if (msg->msg_controllen != 0) { 496 freemsg(mp); 497 return (EOPNOTSUPP); 498 } 499 500 switch (DB_TYPE(mp)) { 501 case M_DATA: 502 tcp = connp->conn_tcp; 503 ASSERT(tcp != NULL); 504 505 tcpstate = tcp->tcp_state; 506 if (tcpstate < TCPS_ESTABLISHED) { 507 freemsg(mp); 508 /* 509 * We return ENOTCONN if the endpoint is trying to 510 * connect or has never been connected, and EPIPE if it 511 * has been disconnected. The connection id helps us 512 * distinguish between the last two cases. 513 */ 514 return ((tcpstate == TCPS_SYN_SENT) ? ENOTCONN : 515 ((tcp->tcp_connid > 0) ? EPIPE : ENOTCONN)); 516 } else if (tcpstate > TCPS_CLOSE_WAIT) { 517 freemsg(mp); 518 return (EPIPE); 519 } 520 521 msize = msgdsize(mp); 522 523 mutex_enter(&tcp->tcp_non_sq_lock); 524 tcp->tcp_squeue_bytes += msize; 525 /* 526 * Squeue Flow Control 527 */ 528 if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) { 529 tcp_setqfull(tcp); 530 } 531 mutex_exit(&tcp->tcp_non_sq_lock); 532 533 /* 534 * The application may pass in an address in the msghdr, but 535 * we ignore the address on connection-oriented sockets. 536 * Just like BSD this code does not generate an error for 537 * TCP (a CONNREQUIRED socket) when sending to an address 538 * passed in with sendto/sendmsg. Instead the data is 539 * delivered on the connection as if no address had been 540 * supplied. 541 */ 542 CONN_INC_REF(connp); 543 544 if (msg->msg_flags & MSG_OOB) { 545 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent, 546 connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); 547 } else { 548 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, 549 connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); 550 } 551 552 return (0); 553 554 default: 555 ASSERT(0); 556 } 557 558 freemsg(mp); 559 return (0); 560 } 561 562 /* ARGSUSED */ 563 static int 564 tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) 565 { 566 conn_t *connp = (conn_t *)proto_handle; 567 tcp_t *tcp = connp->conn_tcp; 568 569 ASSERT(connp->conn_upper_handle != NULL); 570 571 /* All Solaris components should pass a cred for this operation. */ 572 ASSERT(cr != NULL); 573 574 /* 575 * X/Open requires that we check the connected state. 576 */ 577 if (tcp->tcp_state < TCPS_SYN_SENT) 578 return (ENOTCONN); 579 580 /* shutdown the send side */ 581 if (how != SHUT_RD) { 582 mblk_t *bp; 583 584 bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL); 585 CONN_INC_REF(connp); 586 SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output, 587 connp, NULL, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT); 588 589 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 590 SOCK_OPCTL_SHUT_SEND, 0); 591 } 592 593 /* shutdown the recv side */ 594 if (how != SHUT_WR) 595 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 596 SOCK_OPCTL_SHUT_RECV, 0); 597 598 return (0); 599 } 600 601 static void 602 tcp_clr_flowctrl(sock_lower_handle_t proto_handle) 603 { 604 conn_t *connp = (conn_t *)proto_handle; 605 tcp_t *tcp = connp->conn_tcp; 606 mblk_t *mp; 607 int error; 608 609 ASSERT(connp->conn_upper_handle != NULL); 610 611 /* 612 * If tcp->tcp_rsrv_mp == NULL, it means that tcp_clr_flowctrl() 613 * is currently running. 614 */ 615 mutex_enter(&tcp->tcp_rsrv_mp_lock); 616 if ((mp = tcp->tcp_rsrv_mp) == NULL) { 617 mutex_exit(&tcp->tcp_rsrv_mp_lock); 618 return; 619 } 620 tcp->tcp_rsrv_mp = NULL; 621 mutex_exit(&tcp->tcp_rsrv_mp_lock); 622 623 error = squeue_synch_enter(connp, mp); 624 ASSERT(error == 0); 625 626 mutex_enter(&tcp->tcp_rsrv_mp_lock); 627 tcp->tcp_rsrv_mp = mp; 628 mutex_exit(&tcp->tcp_rsrv_mp_lock); 629 630 if (tcp->tcp_fused) { 631 tcp_fuse_backenable(tcp); 632 } else { 633 tcp->tcp_rwnd = connp->conn_rcvbuf; 634 /* 635 * Send back a window update immediately if TCP is above 636 * ESTABLISHED state and the increase of the rcv window 637 * that the other side knows is at least 1 MSS after flow 638 * control is lifted. 639 */ 640 if (tcp->tcp_state >= TCPS_ESTABLISHED && 641 tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) { 642 tcp_xmit_ctl(NULL, tcp, 643 (tcp->tcp_swnd == 0) ? tcp->tcp_suna : 644 tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); 645 } 646 } 647 648 squeue_synch_exit(connp); 649 } 650 651 /* ARGSUSED */ 652 static int 653 tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, 654 int mode, int32_t *rvalp, cred_t *cr) 655 { 656 conn_t *connp = (conn_t *)proto_handle; 657 int error; 658 659 ASSERT(connp->conn_upper_handle != NULL); 660 661 /* All Solaris components should pass a cred for this operation. */ 662 ASSERT(cr != NULL); 663 664 /* 665 * If we don't have a helper stream then create one. 666 * ip_create_helper_stream takes care of locking the conn_t, 667 * so this check for NULL is just a performance optimization. 668 */ 669 if (connp->conn_helper_info == NULL) { 670 tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps; 671 672 /* 673 * Create a helper stream for non-STREAMS socket. 674 */ 675 error = ip_create_helper_stream(connp, tcps->tcps_ldi_ident); 676 if (error != 0) { 677 ip0dbg(("tcp_ioctl: create of IP helper stream " 678 "failed %d\n", error)); 679 return (error); 680 } 681 } 682 683 switch (cmd) { 684 case ND_SET: 685 case ND_GET: 686 case _SIOCSOCKFALLBACK: 687 case TCP_IOC_ABORT_CONN: 688 case TI_GETPEERNAME: 689 case TI_GETMYNAME: 690 ip1dbg(("tcp_ioctl: cmd 0x%x on non streams socket", 691 cmd)); 692 error = EINVAL; 693 break; 694 default: 695 /* 696 * If the conn is not closing, pass on to IP using 697 * helper stream. Bump the ioctlref to prevent tcp_close 698 * from closing the rq/wq out from underneath the ioctl 699 * if it ends up queued or aborted/interrupted. 700 */ 701 mutex_enter(&connp->conn_lock); 702 if (connp->conn_state_flags & (CONN_CLOSING)) { 703 mutex_exit(&connp->conn_lock); 704 error = EINVAL; 705 break; 706 } 707 CONN_INC_IOCTLREF_LOCKED(connp); 708 error = ldi_ioctl(connp->conn_helper_info->iphs_handle, 709 cmd, arg, mode, cr, rvalp); 710 CONN_DEC_IOCTLREF(connp); 711 break; 712 } 713 return (error); 714 } 715 716 /* ARGSUSED */ 717 static int 718 tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) 719 { 720 conn_t *connp = (conn_t *)proto_handle; 721 722 ASSERT(connp->conn_upper_handle != NULL); 723 724 /* All Solaris components should pass a cred for this operation. */ 725 ASSERT(cr != NULL); 726 727 tcp_close_common(connp, flags); 728 729 ip_free_helper_stream(connp); 730 731 /* 732 * Drop IP's reference on the conn. This is the last reference 733 * on the connp if the state was less than established. If the 734 * connection has gone into timewait state, then we will have 735 * one ref for the TCP and one more ref (total of two) for the 736 * classifier connected hash list (a timewait connections stays 737 * in connected hash till closed). 738 * 739 * We can't assert the references because there might be other 740 * transient reference places because of some walkers or queued 741 * packets in squeue for the timewait state. 742 */ 743 CONN_DEC_REF(connp); 744 745 /* 746 * EINPROGRESS tells sockfs to wait for a 'closed' upcall before 747 * freeing the socket. 748 */ 749 return (EINPROGRESS); 750 } 751 752 /* ARGSUSED */ 753 sock_lower_handle_t 754 tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, 755 uint_t *smodep, int *errorp, int flags, cred_t *credp) 756 { 757 conn_t *connp; 758 boolean_t isv6 = family == AF_INET6; 759 760 if (type != SOCK_STREAM || (family != AF_INET && family != AF_INET6) || 761 (proto != 0 && proto != IPPROTO_TCP)) { 762 *errorp = EPROTONOSUPPORT; 763 return (NULL); 764 } 765 766 connp = tcp_create_common(credp, isv6, B_TRUE, errorp); 767 if (connp == NULL) { 768 return (NULL); 769 } 770 771 /* 772 * Put the ref for TCP. Ref for IP was already put 773 * by ipcl_conn_create. Also make the conn_t globally 774 * visible to walkers. 775 */ 776 mutex_enter(&connp->conn_lock); 777 CONN_INC_REF_LOCKED(connp); 778 ASSERT(connp->conn_ref == 2); 779 connp->conn_state_flags &= ~CONN_INCIPIENT; 780 781 connp->conn_flags |= IPCL_NONSTR; 782 mutex_exit(&connp->conn_lock); 783 784 ASSERT(errorp != NULL); 785 *errorp = 0; 786 *sock_downcalls = &sock_tcp_downcalls; 787 *smodep = SM_CONNREQUIRED | SM_EXDATA | SM_ACCEPTSUPP | 788 SM_SENDFILESUPP; 789 790 return ((sock_lower_handle_t)connp); 791 } 792 793 /* 794 * tcp_fallback 795 * 796 * A direct socket is falling back to using STREAMS. The queue 797 * that is being passed down was created using tcp_open() with 798 * the SO_FALLBACK flag set. As a result, the queue is not 799 * associated with a conn, and the q_ptrs instead contain the 800 * dev and minor area that should be used. 801 * 802 * The 'issocket' flag indicates whether the FireEngine 803 * optimizations should be used. The common case would be that 804 * optimizations are enabled, and they might be subsequently 805 * disabled using the _SIOCSOCKFALLBACK ioctl. 806 */ 807 808 /* 809 * An active connection is falling back to TPI. Gather all the information 810 * required by the STREAM head and TPI sonode and send it up. 811 */ 812 static void 813 tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q, 814 boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb, 815 sock_quiesce_arg_t *arg) 816 { 817 conn_t *connp = tcp->tcp_connp; 818 struct stroptions *stropt; 819 struct T_capability_ack tca; 820 struct sockaddr_in6 laddr, faddr; 821 socklen_t laddrlen, faddrlen; 822 short opts; 823 int error; 824 mblk_t *mp, *mpnext; 825 826 connp->conn_dev = (dev_t)RD(q)->q_ptr; 827 connp->conn_minor_arena = WR(q)->q_ptr; 828 829 RD(q)->q_ptr = WR(q)->q_ptr = connp; 830 831 connp->conn_rq = RD(q); 832 connp->conn_wq = WR(q); 833 834 WR(q)->q_qinfo = &tcp_sock_winit; 835 836 if (!issocket) 837 tcp_use_pure_tpi(tcp); 838 839 /* 840 * free the helper stream 841 */ 842 ip_free_helper_stream(connp); 843 844 /* 845 * Notify the STREAM head about options 846 */ 847 DB_TYPE(stropt_mp) = M_SETOPTS; 848 stropt = (struct stroptions *)stropt_mp->b_rptr; 849 stropt_mp->b_wptr += sizeof (struct stroptions); 850 stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK; 851 852 stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 : 853 tcp->tcp_tcps->tcps_wroff_xtra); 854 if (tcp->tcp_snd_sack_ok) 855 stropt->so_wroff += TCPOPT_MAX_SACK_LEN; 856 stropt->so_hiwat = connp->conn_rcvbuf; 857 stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE); 858 859 putnext(RD(q), stropt_mp); 860 861 /* 862 * Collect the information needed to sync with the sonode 863 */ 864 tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID); 865 866 laddrlen = faddrlen = sizeof (sin6_t); 867 (void) tcp_getsockname((sock_lower_handle_t)connp, 868 (struct sockaddr *)&laddr, &laddrlen, CRED()); 869 error = tcp_getpeername((sock_lower_handle_t)connp, 870 (struct sockaddr *)&faddr, &faddrlen, CRED()); 871 if (error != 0) 872 faddrlen = 0; 873 874 opts = 0; 875 if (connp->conn_oobinline) 876 opts |= SO_OOBINLINE; 877 if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE) 878 opts |= SO_DONTROUTE; 879 880 /* 881 * Notify the socket that the protocol is now quiescent, 882 * and it's therefore safe move data from the socket 883 * to the stream head. 884 */ 885 mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca, 886 (struct sockaddr *)&laddr, laddrlen, 887 (struct sockaddr *)&faddr, faddrlen, opts); 888 889 while (mp != NULL) { 890 mpnext = mp->b_next; 891 tcp->tcp_rcv_list = mp->b_next; 892 mp->b_next = NULL; 893 putnext(q, mp); 894 mp = mpnext; 895 } 896 ASSERT(tcp->tcp_rcv_last_head == NULL); 897 ASSERT(tcp->tcp_rcv_last_tail == NULL); 898 ASSERT(tcp->tcp_rcv_cnt == 0); 899 900 /* 901 * All eagers in q0 are marked as being non-STREAM, so they will 902 * make su_newconn upcalls when the handshake completes, which 903 * will fail (resulting in the conn being closed). So we just blow 904 * off everything in q0 instead of waiting for the inevitable. 905 */ 906 if (tcp->tcp_conn_req_cnt_q0 != 0) 907 tcp_eager_cleanup(tcp, B_TRUE); 908 } 909 910 /* 911 * An eager is falling back to TPI. All we have to do is send 912 * up a T_CONN_IND. 913 */ 914 static void 915 tcp_fallback_eager(tcp_t *eager, boolean_t issocket, 916 so_proto_quiesced_cb_t quiesced_cb, sock_quiesce_arg_t *arg) 917 { 918 conn_t *connp = eager->tcp_connp; 919 tcp_t *listener = eager->tcp_listener; 920 mblk_t *mp; 921 922 ASSERT(listener != NULL); 923 924 /* 925 * Notify the socket that the protocol is now quiescent, 926 * and it's therefore safe move data from the socket 927 * to tcp's rcv queue. 928 */ 929 mp = (*quiesced_cb)(connp->conn_upper_handle, arg, NULL, NULL, 0, 930 NULL, 0, 0); 931 932 if (mp != NULL) { 933 ASSERT(eager->tcp_rcv_cnt == 0); 934 935 eager->tcp_rcv_list = mp; 936 eager->tcp_rcv_cnt = msgdsize(mp); 937 while (mp->b_next != NULL) { 938 mp = mp->b_next; 939 eager->tcp_rcv_cnt += msgdsize(mp); 940 } 941 eager->tcp_rcv_last_head = mp; 942 while (mp->b_cont) 943 mp = mp->b_cont; 944 eager->tcp_rcv_last_tail = mp; 945 if (eager->tcp_rcv_cnt > eager->tcp_rwnd) 946 eager->tcp_rwnd = 0; 947 else 948 eager->tcp_rwnd -= eager->tcp_rcv_cnt; 949 } 950 951 if (!issocket) 952 eager->tcp_issocket = B_FALSE; 953 /* 954 * The stream for this eager does not yet exist, so mark it as 955 * being detached. 956 */ 957 eager->tcp_detached = B_TRUE; 958 eager->tcp_hard_binding = B_TRUE; 959 connp->conn_rq = listener->tcp_connp->conn_rq; 960 connp->conn_wq = listener->tcp_connp->conn_wq; 961 962 /* Send up the connection indication */ 963 mp = eager->tcp_conn.tcp_eager_conn_ind; 964 ASSERT(mp != NULL); 965 eager->tcp_conn.tcp_eager_conn_ind = NULL; 966 967 /* 968 * TLI/XTI applications will get confused by 969 * sending eager as an option since it violates 970 * the option semantics. So remove the eager as 971 * option since TLI/XTI app doesn't need it anyway. 972 */ 973 if (!issocket) { 974 struct T_conn_ind *conn_ind; 975 976 conn_ind = (struct T_conn_ind *)mp->b_rptr; 977 conn_ind->OPT_length = 0; 978 conn_ind->OPT_offset = 0; 979 } 980 981 /* 982 * Sockfs guarantees that the listener will not be closed 983 * during fallback. So we can safely use the listener's queue. 984 */ 985 putnext(listener->tcp_connp->conn_rq, mp); 986 } 987 988 989 int 990 tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, 991 boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb, 992 sock_quiesce_arg_t *arg) 993 { 994 tcp_t *tcp; 995 conn_t *connp = (conn_t *)proto_handle; 996 int error; 997 mblk_t *stropt_mp; 998 mblk_t *ordrel_mp; 999 1000 tcp = connp->conn_tcp; 1001 1002 stropt_mp = allocb_wait(sizeof (struct stroptions), BPRI_HI, STR_NOSIG, 1003 NULL); 1004 1005 /* Pre-allocate the T_ordrel_ind mblk. */ 1006 ASSERT(tcp->tcp_ordrel_mp == NULL); 1007 ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI, 1008 STR_NOSIG, NULL); 1009 ordrel_mp->b_datap->db_type = M_PROTO; 1010 ((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND; 1011 ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind); 1012 1013 /* 1014 * Enter the squeue so that no new packets can come in 1015 */ 1016 error = squeue_synch_enter(connp, NULL); 1017 if (error != 0) { 1018 /* failed to enter, free all the pre-allocated messages. */ 1019 freeb(stropt_mp); 1020 freeb(ordrel_mp); 1021 return (ENOMEM); 1022 } 1023 1024 /* 1025 * Both endpoints must be of the same type (either STREAMS or 1026 * non-STREAMS) for fusion to be enabled. So if we are fused, 1027 * we have to unfuse. 1028 */ 1029 if (tcp->tcp_fused) 1030 tcp_unfuse(tcp); 1031 1032 if (tcp->tcp_listener != NULL) { 1033 /* The eager will deal with opts when accept() is called */ 1034 freeb(stropt_mp); 1035 tcp_fallback_eager(tcp, direct_sockfs, quiesced_cb, arg); 1036 } else { 1037 tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs, 1038 quiesced_cb, arg); 1039 } 1040 1041 /* 1042 * No longer a direct socket 1043 * 1044 * Note that we intentionally leave the upper_handle and upcalls 1045 * intact, since eagers may still be using them. 1046 */ 1047 connp->conn_flags &= ~IPCL_NONSTR; 1048 tcp->tcp_ordrel_mp = ordrel_mp; 1049 1050 /* 1051 * There should be atleast two ref's (IP + TCP) 1052 */ 1053 ASSERT(connp->conn_ref >= 2); 1054 squeue_synch_exit(connp); 1055 1056 return (0); 1057 } 1058 1059 /* 1060 * Notifies a non-STREAMS based listener about a new connection. This 1061 * function is executed on the *eager*'s squeue once the 3 way handshake 1062 * has completed. Note that the behavior differs from STREAMS, where the 1063 * T_CONN_IND is sent up by tcp_send_conn_ind() while on the *listener*'s 1064 * squeue. 1065 * 1066 * Returns B_TRUE if the notification succeeded and an upper handle was 1067 * obtained. `tcp' should be closed on failure. 1068 */ 1069 boolean_t 1070 tcp_newconn_notify(tcp_t *tcp, ip_recv_attr_t *ira) 1071 { 1072 tcp_t *listener = tcp->tcp_listener; 1073 conn_t *lconnp = listener->tcp_connp; 1074 conn_t *econnp = tcp->tcp_connp; 1075 tcp_t *tail; 1076 ipaddr_t *addr_cache; 1077 sock_upper_handle_t upper; 1078 struct sock_proto_props sopp; 1079 1080 mutex_enter(&listener->tcp_eager_lock); 1081 /* 1082 * Take the eager out, if it is in the list of droppable eagers 1083 * as we are here because the 3W handshake is over. 1084 */ 1085 MAKE_UNDROPPABLE(tcp); 1086 /* 1087 * The eager already has an extra ref put in tcp_input_data 1088 * so that it stays till accept comes back even though it 1089 * might get into TCPS_CLOSED as a result of a TH_RST etc. 1090 */ 1091 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 1092 listener->tcp_conn_req_cnt_q0--; 1093 listener->tcp_conn_req_cnt_q++; 1094 1095 /* Move from SYN_RCVD to ESTABLISHED list */ 1096 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = tcp->tcp_eager_prev_q0; 1097 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp->tcp_eager_next_q0; 1098 tcp->tcp_eager_prev_q0 = NULL; 1099 tcp->tcp_eager_next_q0 = NULL; 1100 1101 /* 1102 * Insert at end of the queue because connections are accepted 1103 * in chronological order. Leaving the older connections at front 1104 * of the queue helps reducing search time. 1105 */ 1106 tail = listener->tcp_eager_last_q; 1107 if (tail != NULL) 1108 tail->tcp_eager_next_q = tcp; 1109 else 1110 listener->tcp_eager_next_q = tcp; 1111 listener->tcp_eager_last_q = tcp; 1112 tcp->tcp_eager_next_q = NULL; 1113 1114 /* we have timed out before */ 1115 if (tcp->tcp_syn_rcvd_timeout != 0) { 1116 tcp->tcp_syn_rcvd_timeout = 0; 1117 listener->tcp_syn_rcvd_timeout--; 1118 if (listener->tcp_syn_defense && 1119 listener->tcp_syn_rcvd_timeout <= 1120 (listener->tcp_tcps->tcps_conn_req_max_q0 >> 5) && 1121 10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() - 1122 listener->tcp_last_rcv_lbolt)) { 1123 /* 1124 * Turn off the defense mode if we 1125 * believe the SYN attack is over. 1126 */ 1127 listener->tcp_syn_defense = B_FALSE; 1128 if (listener->tcp_ip_addr_cache) { 1129 kmem_free((void *)listener->tcp_ip_addr_cache, 1130 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 1131 listener->tcp_ip_addr_cache = NULL; 1132 } 1133 } 1134 } 1135 addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache); 1136 if (addr_cache != NULL) { 1137 /* 1138 * We have finished a 3-way handshake with this 1139 * remote host. This proves the IP addr is good. 1140 * Cache it! 1141 */ 1142 addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] = 1143 tcp->tcp_connp->conn_faddr_v4; 1144 } 1145 mutex_exit(&listener->tcp_eager_lock); 1146 1147 /* 1148 * Notify the ULP about the newconn. It is guaranteed that no 1149 * tcp_accept() call will be made for the eager if the 1150 * notification fails. 1151 */ 1152 if ((upper = (*lconnp->conn_upcalls->su_newconn) 1153 (lconnp->conn_upper_handle, (sock_lower_handle_t)econnp, 1154 &sock_tcp_downcalls, ira->ira_cred, ira->ira_cpid, 1155 &econnp->conn_upcalls)) == NULL) { 1156 return (B_FALSE); 1157 } 1158 econnp->conn_upper_handle = upper; 1159 1160 tcp->tcp_detached = B_FALSE; 1161 tcp->tcp_hard_binding = B_FALSE; 1162 tcp->tcp_tconnind_started = B_TRUE; 1163 1164 if (econnp->conn_keepalive) { 1165 tcp->tcp_ka_last_intrvl = 0; 1166 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer, 1167 tcp->tcp_ka_interval); 1168 } 1169 1170 /* Update the necessary parameters */ 1171 tcp_get_proto_props(tcp, &sopp); 1172 1173 (*econnp->conn_upcalls->su_set_proto_props) 1174 (econnp->conn_upper_handle, &sopp); 1175 1176 return (B_TRUE); 1177 }