1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 /* This file contains all TCP kernel socket related functions. */ 27 28 #include <sys/types.h> 29 #include <sys/strlog.h> 30 #include <sys/policy.h> 31 #include <sys/sockio.h> 32 #include <sys/strsubr.h> 33 #include <sys/strsun.h> 34 #include <sys/squeue_impl.h> 35 #include <sys/squeue.h> 36 #define _SUN_TPI_VERSION 2 37 #include <sys/tihdr.h> 38 #include <sys/timod.h> 39 #include <sys/tpicommon.h> 40 #include <sys/socketvar.h> 41 42 #include <inet/common.h> 43 #include <inet/proto_set.h> 44 #include <inet/ip.h> 45 #include <inet/tcp.h> 46 #include <inet/tcp_impl.h> 47 48 static void tcp_activate(sock_lower_handle_t, sock_upper_handle_t, 49 sock_upcalls_t *, int, cred_t *); 50 static int tcp_accept(sock_lower_handle_t, sock_lower_handle_t, 51 sock_upper_handle_t, cred_t *); 52 static int tcp_bind(sock_lower_handle_t, struct sockaddr *, 53 socklen_t, cred_t *); 54 static int tcp_listen(sock_lower_handle_t, int, cred_t *); 55 static int tcp_connect(sock_lower_handle_t, const struct sockaddr *, 56 socklen_t, sock_connid_t *, cred_t *); 57 static int tcp_getpeername(sock_lower_handle_t, struct sockaddr *, 58 socklen_t *, cred_t *); 59 static int tcp_getsockname(sock_lower_handle_t, struct sockaddr *, 60 socklen_t *, cred_t *); 61 static int tcp_getsockopt(sock_lower_handle_t, int, int, void *, 62 socklen_t *, cred_t *); 63 static int tcp_setsockopt(sock_lower_handle_t, int, int, const void *, 64 socklen_t, cred_t *); 65 static int tcp_sendmsg(sock_lower_handle_t, mblk_t *, struct nmsghdr *, 66 cred_t *); 67 static int tcp_shutdown(sock_lower_handle_t, int, cred_t *); 68 static void tcp_clr_flowctrl(sock_lower_handle_t); 69 static int tcp_ioctl(sock_lower_handle_t, int, intptr_t, int, int32_t *, 70 cred_t *); 71 static int tcp_close(sock_lower_handle_t, int, cred_t *); 72 73 sock_downcalls_t sock_tcp_downcalls = { 74 tcp_activate, 75 tcp_accept, 76 tcp_bind, 77 tcp_listen, 78 tcp_connect, 79 tcp_getpeername, 80 tcp_getsockname, 81 tcp_getsockopt, 82 tcp_setsockopt, 83 tcp_sendmsg, 84 NULL, 85 NULL, 86 NULL, 87 tcp_shutdown, 88 tcp_clr_flowctrl, 89 tcp_ioctl, 90 tcp_close, 91 }; 92 93 /* ARGSUSED */ 94 static void 95 tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle, 96 sock_upcalls_t *sock_upcalls, int flags, cred_t *cr) 97 { 98 conn_t *connp = (conn_t *)proto_handle; 99 struct sock_proto_props sopp; 100 extern struct module_info tcp_rinfo; 101 102 ASSERT(connp->conn_upper_handle == NULL); 103 104 /* All Solaris components should pass a cred for this operation. */ 105 ASSERT(cr != NULL); 106 107 sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | 108 SOCKOPT_MAXPSZ | SOCKOPT_MAXBLK | SOCKOPT_RCVTIMER | 109 SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ; 110 111 sopp.sopp_rxhiwat = SOCKET_RECVHIWATER; 112 sopp.sopp_rxlowat = SOCKET_RECVLOWATER; 113 sopp.sopp_maxpsz = INFPSZ; 114 sopp.sopp_maxblk = INFPSZ; 115 sopp.sopp_rcvtimer = SOCKET_TIMER_INTERVAL; 116 sopp.sopp_rcvthresh = SOCKET_RECVHIWATER >> 3; 117 sopp.sopp_maxaddrlen = sizeof (sin6_t); 118 sopp.sopp_minpsz = (tcp_rinfo.mi_minpsz == 1) ? 0 : 119 tcp_rinfo.mi_minpsz; 120 121 connp->conn_upcalls = sock_upcalls; 122 connp->conn_upper_handle = sock_handle; 123 124 ASSERT(connp->conn_rcvbuf != 0 && 125 connp->conn_rcvbuf == connp->conn_tcp->tcp_rwnd); 126 (*sock_upcalls->su_set_proto_props)(sock_handle, &sopp); 127 } 128 129 /*ARGSUSED*/ 130 static int 131 tcp_accept(sock_lower_handle_t lproto_handle, 132 sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle, 133 cred_t *cr) 134 { 135 conn_t *lconnp, *econnp; 136 tcp_t *listener, *eager; 137 138 /* 139 * KSSL can move a socket from one listener to another, in which 140 * case `lproto_handle' points to the new listener. To ensure that 141 * the original listener is used the information is obtained from 142 * the eager. 143 */ 144 econnp = (conn_t *)eproto_handle; 145 eager = econnp->conn_tcp; 146 ASSERT(IPCL_IS_NONSTR(econnp)); 147 ASSERT(eager->tcp_listener != NULL); 148 listener = eager->tcp_listener; 149 lconnp = (conn_t *)listener->tcp_connp; 150 ASSERT(listener->tcp_state == TCPS_LISTEN); 151 ASSERT(lconnp->conn_upper_handle != NULL); 152 153 /* 154 * It is possible for the accept thread to race with the thread that 155 * made the su_newconn upcall in tcp_newconn_notify. Both 156 * tcp_newconn_notify and tcp_accept require that conn_upper_handle 157 * and conn_upcalls be set before returning, so they both write to 158 * them. However, we're guaranteed that the value written is the same 159 * for both threads. 160 */ 161 ASSERT(econnp->conn_upper_handle == NULL || 162 econnp->conn_upper_handle == sock_handle); 163 ASSERT(econnp->conn_upcalls == NULL || 164 econnp->conn_upcalls == lconnp->conn_upcalls); 165 econnp->conn_upper_handle = sock_handle; 166 econnp->conn_upcalls = lconnp->conn_upcalls; 167 168 ASSERT(econnp->conn_netstack == 169 listener->tcp_connp->conn_netstack); 170 ASSERT(eager->tcp_tcps == listener->tcp_tcps); 171 172 /* 173 * We should have a minimum of 2 references on the conn at this 174 * point. One for TCP and one for the newconn notification 175 * (which is now taken over by IP). In the normal case we would 176 * also have another reference (making a total of 3) for the conn 177 * being in the classifier hash list. However the eager could have 178 * received an RST subsequently and tcp_closei_local could have 179 * removed the eager from the classifier hash list, hence we can't 180 * assert that reference. 181 */ 182 ASSERT(econnp->conn_ref >= 2); 183 184 mutex_enter(&listener->tcp_eager_lock); 185 /* 186 * Non-STREAMS listeners never defer the notification of new 187 * connections. 188 */ 189 ASSERT(!listener->tcp_eager_prev_q0->tcp_conn_def_q0); 190 tcp_eager_unlink(eager); 191 mutex_exit(&listener->tcp_eager_lock); 192 CONN_DEC_REF(listener->tcp_connp); 193 194 return ((eager->tcp_state < TCPS_ESTABLISHED) ? ECONNABORTED : 0); 195 } 196 197 static int 198 tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, 199 socklen_t len, cred_t *cr) 200 { 201 int error; 202 conn_t *connp = (conn_t *)proto_handle; 203 204 /* All Solaris components should pass a cred for this operation. */ 205 ASSERT(cr != NULL); 206 ASSERT(connp->conn_upper_handle != NULL); 207 208 error = squeue_synch_enter(connp, NULL); 209 if (error != 0) { 210 /* failed to enter */ 211 return (ENOSR); 212 } 213 214 /* binding to a NULL address really means unbind */ 215 if (sa == NULL) { 216 if (connp->conn_tcp->tcp_state < TCPS_LISTEN) 217 error = tcp_do_unbind(connp); 218 else 219 error = EINVAL; 220 } else { 221 error = tcp_do_bind(connp, sa, len, cr, B_TRUE); 222 } 223 224 squeue_synch_exit(connp); 225 226 if (error < 0) { 227 if (error == -TOUTSTATE) 228 error = EINVAL; 229 else 230 error = proto_tlitosyserr(-error); 231 } 232 233 return (error); 234 } 235 236 /* ARGSUSED */ 237 static int 238 tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr) 239 { 240 conn_t *connp = (conn_t *)proto_handle; 241 tcp_t *tcp = connp->conn_tcp; 242 int error; 243 244 ASSERT(connp->conn_upper_handle != NULL); 245 246 /* All Solaris components should pass a cred for this operation. */ 247 ASSERT(cr != NULL); 248 249 error = squeue_synch_enter(connp, NULL); 250 if (error != 0) { 251 /* failed to enter */ 252 return (ENOBUFS); 253 } 254 255 error = tcp_do_listen(connp, NULL, 0, backlog, cr, B_FALSE); 256 if (error == 0) { 257 /* 258 * sockfs needs to know what's the maximum number of socket 259 * that can be queued on the listener. 260 */ 261 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 262 SOCK_OPCTL_ENAB_ACCEPT, 263 (uintptr_t)(tcp->tcp_conn_req_max + 264 tcp->tcp_tcps->tcps_conn_req_max_q0)); 265 } else if (error < 0) { 266 if (error == -TOUTSTATE) 267 error = EINVAL; 268 else 269 error = proto_tlitosyserr(-error); 270 } 271 squeue_synch_exit(connp); 272 return (error); 273 } 274 275 static int 276 tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, 277 socklen_t len, sock_connid_t *id, cred_t *cr) 278 { 279 conn_t *connp = (conn_t *)proto_handle; 280 int error; 281 282 ASSERT(connp->conn_upper_handle != NULL); 283 284 /* All Solaris components should pass a cred for this operation. */ 285 ASSERT(cr != NULL); 286 287 error = proto_verify_ip_addr(connp->conn_family, sa, len); 288 if (error != 0) { 289 return (error); 290 } 291 292 error = squeue_synch_enter(connp, NULL); 293 if (error != 0) { 294 /* failed to enter */ 295 return (ENOSR); 296 } 297 298 /* 299 * TCP supports quick connect, so no need to do an implicit bind 300 */ 301 error = tcp_do_connect(connp, sa, len, cr, curproc->p_pid); 302 if (error == 0) { 303 *id = connp->conn_tcp->tcp_connid; 304 } else if (error < 0) { 305 if (error == -TOUTSTATE) { 306 switch (connp->conn_tcp->tcp_state) { 307 case TCPS_SYN_SENT: 308 error = EALREADY; 309 break; 310 case TCPS_ESTABLISHED: 311 error = EISCONN; 312 break; 313 case TCPS_LISTEN: 314 error = EOPNOTSUPP; 315 break; 316 default: 317 error = EINVAL; 318 break; 319 } 320 } else { 321 error = proto_tlitosyserr(-error); 322 } 323 } 324 325 if (connp->conn_tcp->tcp_loopback) { 326 struct sock_proto_props sopp; 327 328 sopp.sopp_flags = SOCKOPT_LOOPBACK; 329 sopp.sopp_loopback = B_TRUE; 330 331 (*connp->conn_upcalls->su_set_proto_props)( 332 connp->conn_upper_handle, &sopp); 333 } 334 squeue_synch_exit(connp); 335 336 return ((error == 0) ? EINPROGRESS : error); 337 } 338 339 /* ARGSUSED3 */ 340 int 341 tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr, 342 socklen_t *addrlenp, cred_t *cr) 343 { 344 conn_t *connp = (conn_t *)proto_handle; 345 tcp_t *tcp = connp->conn_tcp; 346 347 /* All Solaris components should pass a cred for this operation. */ 348 ASSERT(cr != NULL); 349 350 ASSERT(tcp != NULL); 351 if (tcp->tcp_state < TCPS_SYN_RCVD) 352 return (ENOTCONN); 353 354 return (conn_getpeername(connp, addr, addrlenp)); 355 } 356 357 /* ARGSUSED3 */ 358 int 359 tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr, 360 socklen_t *addrlenp, cred_t *cr) 361 { 362 conn_t *connp = (conn_t *)proto_handle; 363 364 /* All Solaris components should pass a cred for this operation. */ 365 ASSERT(cr != NULL); 366 367 return (conn_getsockname(connp, addr, addrlenp)); 368 } 369 370 /* returns UNIX error, the optlen is a value-result arg */ 371 static int 372 tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 373 void *optvalp, socklen_t *optlen, cred_t *cr) 374 { 375 conn_t *connp = (conn_t *)proto_handle; 376 int error; 377 t_uscalar_t max_optbuf_len; 378 void *optvalp_buf; 379 int len; 380 381 ASSERT(connp->conn_upper_handle != NULL); 382 383 error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, 384 tcp_opt_obj.odb_opt_des_arr, 385 tcp_opt_obj.odb_opt_arr_cnt, 386 B_FALSE, B_TRUE, cr); 387 if (error != 0) { 388 if (error < 0) { 389 error = proto_tlitosyserr(-error); 390 } 391 return (error); 392 } 393 394 optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP); 395 396 error = squeue_synch_enter(connp, NULL); 397 if (error == ENOMEM) { 398 kmem_free(optvalp_buf, max_optbuf_len); 399 return (ENOMEM); 400 } 401 402 len = tcp_opt_get(connp, level, option_name, optvalp_buf); 403 squeue_synch_exit(connp); 404 405 if (len == -1) { 406 kmem_free(optvalp_buf, max_optbuf_len); 407 return (EINVAL); 408 } 409 410 /* 411 * update optlen and copy option value 412 */ 413 t_uscalar_t size = MIN(len, *optlen); 414 415 bcopy(optvalp_buf, optvalp, size); 416 bcopy(&size, optlen, sizeof (size)); 417 418 kmem_free(optvalp_buf, max_optbuf_len); 419 return (0); 420 } 421 422 static int 423 tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 424 const void *optvalp, socklen_t optlen, cred_t *cr) 425 { 426 conn_t *connp = (conn_t *)proto_handle; 427 int error; 428 429 ASSERT(connp->conn_upper_handle != NULL); 430 /* 431 * Entering the squeue synchronously can result in a context switch, 432 * which can cause a rather sever performance degradation. So we try to 433 * handle whatever options we can without entering the squeue. 434 */ 435 if (level == IPPROTO_TCP) { 436 switch (option_name) { 437 case TCP_NODELAY: 438 if (optlen != sizeof (int32_t)) 439 return (EINVAL); 440 mutex_enter(&connp->conn_tcp->tcp_non_sq_lock); 441 connp->conn_tcp->tcp_naglim = *(int *)optvalp ? 1 : 442 connp->conn_tcp->tcp_mss; 443 mutex_exit(&connp->conn_tcp->tcp_non_sq_lock); 444 return (0); 445 default: 446 break; 447 } 448 } 449 450 error = squeue_synch_enter(connp, NULL); 451 if (error == ENOMEM) { 452 return (ENOMEM); 453 } 454 455 error = proto_opt_check(level, option_name, optlen, NULL, 456 tcp_opt_obj.odb_opt_des_arr, 457 tcp_opt_obj.odb_opt_arr_cnt, 458 B_TRUE, B_FALSE, cr); 459 460 if (error != 0) { 461 if (error < 0) { 462 error = proto_tlitosyserr(-error); 463 } 464 squeue_synch_exit(connp); 465 return (error); 466 } 467 468 error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name, 469 optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp, 470 NULL, cr); 471 squeue_synch_exit(connp); 472 473 ASSERT(error >= 0); 474 475 return (error); 476 } 477 478 /* ARGSUSED */ 479 static int 480 tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, 481 cred_t *cr) 482 { 483 tcp_t *tcp; 484 uint32_t msize; 485 conn_t *connp = (conn_t *)proto_handle; 486 int32_t tcpstate; 487 488 /* All Solaris components should pass a cred for this operation. */ 489 ASSERT(cr != NULL); 490 491 ASSERT(connp->conn_ref >= 2); 492 ASSERT(connp->conn_upper_handle != NULL); 493 494 if (msg->msg_controllen != 0) { 495 freemsg(mp); 496 return (EOPNOTSUPP); 497 } 498 499 switch (DB_TYPE(mp)) { 500 case M_DATA: 501 tcp = connp->conn_tcp; 502 ASSERT(tcp != NULL); 503 504 tcpstate = tcp->tcp_state; 505 if (tcpstate < TCPS_ESTABLISHED) { 506 freemsg(mp); 507 /* 508 * We return ENOTCONN if the endpoint is trying to 509 * connect or has never been connected, and EPIPE if it 510 * has been disconnected. The connection id helps us 511 * distinguish between the last two cases. 512 */ 513 return ((tcpstate == TCPS_SYN_SENT) ? ENOTCONN : 514 ((tcp->tcp_connid > 0) ? EPIPE : ENOTCONN)); 515 } else if (tcpstate > TCPS_CLOSE_WAIT) { 516 freemsg(mp); 517 return (EPIPE); 518 } 519 520 msize = msgdsize(mp); 521 522 mutex_enter(&tcp->tcp_non_sq_lock); 523 tcp->tcp_squeue_bytes += msize; 524 /* 525 * Squeue Flow Control 526 */ 527 if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) { 528 tcp_setqfull(tcp); 529 } 530 mutex_exit(&tcp->tcp_non_sq_lock); 531 532 /* 533 * The application may pass in an address in the msghdr, but 534 * we ignore the address on connection-oriented sockets. 535 * Just like BSD this code does not generate an error for 536 * TCP (a CONNREQUIRED socket) when sending to an address 537 * passed in with sendto/sendmsg. Instead the data is 538 * delivered on the connection as if no address had been 539 * supplied. 540 */ 541 CONN_INC_REF(connp); 542 543 if (msg->msg_flags & MSG_OOB) { 544 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent, 545 connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); 546 } else { 547 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, 548 connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); 549 } 550 551 return (0); 552 553 default: 554 ASSERT(0); 555 } 556 557 freemsg(mp); 558 return (0); 559 } 560 561 /* ARGSUSED */ 562 static int 563 tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) 564 { 565 conn_t *connp = (conn_t *)proto_handle; 566 tcp_t *tcp = connp->conn_tcp; 567 568 ASSERT(connp->conn_upper_handle != NULL); 569 570 /* All Solaris components should pass a cred for this operation. */ 571 ASSERT(cr != NULL); 572 573 /* 574 * X/Open requires that we check the connected state. 575 */ 576 if (tcp->tcp_state < TCPS_SYN_SENT) 577 return (ENOTCONN); 578 579 /* shutdown the send side */ 580 if (how != SHUT_RD) { 581 mblk_t *bp; 582 583 bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL); 584 CONN_INC_REF(connp); 585 SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output, 586 connp, NULL, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT); 587 588 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 589 SOCK_OPCTL_SHUT_SEND, 0); 590 } 591 592 /* shutdown the recv side */ 593 if (how != SHUT_WR) 594 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 595 SOCK_OPCTL_SHUT_RECV, 0); 596 597 return (0); 598 } 599 600 static void 601 tcp_clr_flowctrl(sock_lower_handle_t proto_handle) 602 { 603 conn_t *connp = (conn_t *)proto_handle; 604 tcp_t *tcp = connp->conn_tcp; 605 mblk_t *mp; 606 int error; 607 608 ASSERT(connp->conn_upper_handle != NULL); 609 610 /* 611 * If tcp->tcp_rsrv_mp == NULL, it means that tcp_clr_flowctrl() 612 * is currently running. 613 */ 614 mutex_enter(&tcp->tcp_rsrv_mp_lock); 615 if ((mp = tcp->tcp_rsrv_mp) == NULL) { 616 mutex_exit(&tcp->tcp_rsrv_mp_lock); 617 return; 618 } 619 tcp->tcp_rsrv_mp = NULL; 620 mutex_exit(&tcp->tcp_rsrv_mp_lock); 621 622 error = squeue_synch_enter(connp, mp); 623 ASSERT(error == 0); 624 625 mutex_enter(&tcp->tcp_rsrv_mp_lock); 626 tcp->tcp_rsrv_mp = mp; 627 mutex_exit(&tcp->tcp_rsrv_mp_lock); 628 629 if (tcp->tcp_fused) { 630 tcp_fuse_backenable(tcp); 631 } else { 632 tcp->tcp_rwnd = connp->conn_rcvbuf; 633 /* 634 * Send back a window update immediately if TCP is above 635 * ESTABLISHED state and the increase of the rcv window 636 * that the other side knows is at least 1 MSS after flow 637 * control is lifted. 638 */ 639 if (tcp->tcp_state >= TCPS_ESTABLISHED && 640 tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) { 641 tcp_xmit_ctl(NULL, tcp, 642 (tcp->tcp_swnd == 0) ? tcp->tcp_suna : 643 tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); 644 } 645 } 646 647 squeue_synch_exit(connp); 648 } 649 650 /* ARGSUSED */ 651 static int 652 tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, 653 int mode, int32_t *rvalp, cred_t *cr) 654 { 655 conn_t *connp = (conn_t *)proto_handle; 656 int error; 657 658 ASSERT(connp->conn_upper_handle != NULL); 659 660 /* All Solaris components should pass a cred for this operation. */ 661 ASSERT(cr != NULL); 662 663 /* 664 * If we don't have a helper stream then create one. 665 * ip_create_helper_stream takes care of locking the conn_t, 666 * so this check for NULL is just a performance optimization. 667 */ 668 if (connp->conn_helper_info == NULL) { 669 tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps; 670 671 /* 672 * Create a helper stream for non-STREAMS socket. 673 */ 674 error = ip_create_helper_stream(connp, tcps->tcps_ldi_ident); 675 if (error != 0) { 676 ip0dbg(("tcp_ioctl: create of IP helper stream " 677 "failed %d\n", error)); 678 return (error); 679 } 680 } 681 682 switch (cmd) { 683 case ND_SET: 684 case ND_GET: 685 case _SIOCSOCKFALLBACK: 686 case TCP_IOC_ABORT_CONN: 687 case TI_GETPEERNAME: 688 case TI_GETMYNAME: 689 ip1dbg(("tcp_ioctl: cmd 0x%x on non streams socket", 690 cmd)); 691 error = EINVAL; 692 break; 693 default: 694 /* 695 * If the conn is not closing, pass on to IP using 696 * helper stream. Bump the ioctlref to prevent tcp_close 697 * from closing the rq/wq out from underneath the ioctl 698 * if it ends up queued or aborted/interrupted. 699 */ 700 mutex_enter(&connp->conn_lock); 701 if (connp->conn_state_flags & (CONN_CLOSING)) { 702 mutex_exit(&connp->conn_lock); 703 error = EINVAL; 704 break; 705 } 706 CONN_INC_IOCTLREF_LOCKED(connp); 707 error = ldi_ioctl(connp->conn_helper_info->iphs_handle, 708 cmd, arg, mode, cr, rvalp); 709 CONN_DEC_IOCTLREF(connp); 710 break; 711 } 712 return (error); 713 } 714 715 /* ARGSUSED */ 716 static int 717 tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) 718 { 719 conn_t *connp = (conn_t *)proto_handle; 720 721 ASSERT(connp->conn_upper_handle != NULL); 722 723 /* All Solaris components should pass a cred for this operation. */ 724 ASSERT(cr != NULL); 725 726 tcp_close_common(connp, flags); 727 728 ip_free_helper_stream(connp); 729 730 /* 731 * Drop IP's reference on the conn. This is the last reference 732 * on the connp if the state was less than established. If the 733 * connection has gone into timewait state, then we will have 734 * one ref for the TCP and one more ref (total of two) for the 735 * classifier connected hash list (a timewait connections stays 736 * in connected hash till closed). 737 * 738 * We can't assert the references because there might be other 739 * transient reference places because of some walkers or queued 740 * packets in squeue for the timewait state. 741 */ 742 CONN_DEC_REF(connp); 743 744 /* 745 * EINPROGRESS tells sockfs to wait for a 'closed' upcall before 746 * freeing the socket. 747 */ 748 return (EINPROGRESS); 749 } 750 751 /* ARGSUSED */ 752 sock_lower_handle_t 753 tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, 754 uint_t *smodep, int *errorp, int flags, cred_t *credp) 755 { 756 conn_t *connp; 757 boolean_t isv6 = family == AF_INET6; 758 759 if (type != SOCK_STREAM || (family != AF_INET && family != AF_INET6) || 760 (proto != 0 && proto != IPPROTO_TCP)) { 761 *errorp = EPROTONOSUPPORT; 762 return (NULL); 763 } 764 765 connp = tcp_create_common(credp, isv6, B_TRUE, errorp); 766 if (connp == NULL) { 767 return (NULL); 768 } 769 770 /* 771 * Put the ref for TCP. Ref for IP was already put 772 * by ipcl_conn_create. Also make the conn_t globally 773 * visible to walkers. 774 */ 775 mutex_enter(&connp->conn_lock); 776 CONN_INC_REF_LOCKED(connp); 777 ASSERT(connp->conn_ref == 2); 778 connp->conn_state_flags &= ~CONN_INCIPIENT; 779 780 connp->conn_flags |= IPCL_NONSTR; 781 mutex_exit(&connp->conn_lock); 782 783 ASSERT(errorp != NULL); 784 *errorp = 0; 785 *sock_downcalls = &sock_tcp_downcalls; 786 *smodep = SM_CONNREQUIRED | SM_EXDATA | SM_ACCEPTSUPP | 787 SM_SENDFILESUPP; 788 789 return ((sock_lower_handle_t)connp); 790 } 791 792 /* 793 * tcp_fallback 794 * 795 * A direct socket is falling back to using STREAMS. The queue 796 * that is being passed down was created using tcp_open() with 797 * the SO_FALLBACK flag set. As a result, the queue is not 798 * associated with a conn, and the q_ptrs instead contain the 799 * dev and minor area that should be used. 800 * 801 * The 'issocket' flag indicates whether the FireEngine 802 * optimizations should be used. The common case would be that 803 * optimizations are enabled, and they might be subsequently 804 * disabled using the _SIOCSOCKFALLBACK ioctl. 805 */ 806 807 /* 808 * An active connection is falling back to TPI. Gather all the information 809 * required by the STREAM head and TPI sonode and send it up. 810 */ 811 static void 812 tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q, 813 boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb, 814 sock_quiesce_arg_t *arg) 815 { 816 conn_t *connp = tcp->tcp_connp; 817 struct stroptions *stropt; 818 struct T_capability_ack tca; 819 struct sockaddr_in6 laddr, faddr; 820 socklen_t laddrlen, faddrlen; 821 short opts; 822 int error; 823 mblk_t *mp, *mpnext; 824 825 connp->conn_dev = (dev_t)RD(q)->q_ptr; 826 connp->conn_minor_arena = WR(q)->q_ptr; 827 828 RD(q)->q_ptr = WR(q)->q_ptr = connp; 829 830 connp->conn_rq = RD(q); 831 connp->conn_wq = WR(q); 832 833 WR(q)->q_qinfo = &tcp_sock_winit; 834 835 if (!issocket) 836 tcp_use_pure_tpi(tcp); 837 838 /* 839 * free the helper stream 840 */ 841 ip_free_helper_stream(connp); 842 843 /* 844 * Notify the STREAM head about options 845 */ 846 DB_TYPE(stropt_mp) = M_SETOPTS; 847 stropt = (struct stroptions *)stropt_mp->b_rptr; 848 stropt_mp->b_wptr += sizeof (struct stroptions); 849 stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK; 850 851 stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 : 852 tcp->tcp_tcps->tcps_wroff_xtra); 853 if (tcp->tcp_snd_sack_ok) 854 stropt->so_wroff += TCPOPT_MAX_SACK_LEN; 855 stropt->so_hiwat = connp->conn_rcvbuf; 856 stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE); 857 858 putnext(RD(q), stropt_mp); 859 860 /* 861 * Collect the information needed to sync with the sonode 862 */ 863 tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID); 864 865 laddrlen = faddrlen = sizeof (sin6_t); 866 (void) tcp_getsockname((sock_lower_handle_t)connp, 867 (struct sockaddr *)&laddr, &laddrlen, CRED()); 868 error = tcp_getpeername((sock_lower_handle_t)connp, 869 (struct sockaddr *)&faddr, &faddrlen, CRED()); 870 if (error != 0) 871 faddrlen = 0; 872 873 opts = 0; 874 if (connp->conn_oobinline) 875 opts |= SO_OOBINLINE; 876 if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE) 877 opts |= SO_DONTROUTE; 878 879 /* 880 * Notify the socket that the protocol is now quiescent, 881 * and it's therefore safe move data from the socket 882 * to the stream head. 883 */ 884 mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca, 885 (struct sockaddr *)&laddr, laddrlen, 886 (struct sockaddr *)&faddr, faddrlen, opts); 887 888 while (mp != NULL) { 889 mpnext = mp->b_next; 890 tcp->tcp_rcv_list = mp->b_next; 891 mp->b_next = NULL; 892 putnext(q, mp); 893 mp = mpnext; 894 } 895 ASSERT(tcp->tcp_rcv_last_head == NULL); 896 ASSERT(tcp->tcp_rcv_last_tail == NULL); 897 ASSERT(tcp->tcp_rcv_cnt == 0); 898 899 /* 900 * All eagers in q0 are marked as being non-STREAM, so they will 901 * make su_newconn upcalls when the handshake completes, which 902 * will fail (resulting in the conn being closed). So we just blow 903 * off everything in q0 instead of waiting for the inevitable. 904 */ 905 if (tcp->tcp_conn_req_cnt_q0 != 0) 906 tcp_eager_cleanup(tcp, B_TRUE); 907 } 908 909 /* 910 * An eager is falling back to TPI. All we have to do is send 911 * up a T_CONN_IND. 912 */ 913 static void 914 tcp_fallback_eager(tcp_t *eager, boolean_t issocket, 915 so_proto_quiesced_cb_t quiesced_cb, sock_quiesce_arg_t *arg) 916 { 917 conn_t *connp = eager->tcp_connp; 918 tcp_t *listener = eager->tcp_listener; 919 mblk_t *mp; 920 921 ASSERT(listener != NULL); 922 923 /* 924 * Notify the socket that the protocol is now quiescent, 925 * and it's therefore safe move data from the socket 926 * to tcp's rcv queue. 927 */ 928 mp = (*quiesced_cb)(connp->conn_upper_handle, arg, NULL, NULL, 0, 929 NULL, 0, 0); 930 931 if (mp != NULL) { 932 ASSERT(eager->tcp_rcv_cnt == 0); 933 934 eager->tcp_rcv_list = mp; 935 eager->tcp_rcv_cnt = msgdsize(mp); 936 while (mp->b_next != NULL) { 937 mp = mp->b_next; 938 eager->tcp_rcv_cnt += msgdsize(mp); 939 } 940 eager->tcp_rcv_last_head = mp; 941 while (mp->b_cont) 942 mp = mp->b_cont; 943 eager->tcp_rcv_last_tail = mp; 944 if (eager->tcp_rcv_cnt > eager->tcp_rwnd) 945 eager->tcp_rwnd = 0; 946 else 947 eager->tcp_rwnd -= eager->tcp_rcv_cnt; 948 } 949 950 if (!issocket) 951 eager->tcp_issocket = B_FALSE; 952 /* 953 * The stream for this eager does not yet exist, so mark it as 954 * being detached. 955 */ 956 eager->tcp_detached = B_TRUE; 957 eager->tcp_hard_binding = B_TRUE; 958 connp->conn_rq = listener->tcp_connp->conn_rq; 959 connp->conn_wq = listener->tcp_connp->conn_wq; 960 961 /* Send up the connection indication */ 962 mp = eager->tcp_conn.tcp_eager_conn_ind; 963 ASSERT(mp != NULL); 964 eager->tcp_conn.tcp_eager_conn_ind = NULL; 965 966 /* 967 * TLI/XTI applications will get confused by 968 * sending eager as an option since it violates 969 * the option semantics. So remove the eager as 970 * option since TLI/XTI app doesn't need it anyway. 971 */ 972 if (!issocket) { 973 struct T_conn_ind *conn_ind; 974 975 conn_ind = (struct T_conn_ind *)mp->b_rptr; 976 conn_ind->OPT_length = 0; 977 conn_ind->OPT_offset = 0; 978 } 979 980 /* 981 * Sockfs guarantees that the listener will not be closed 982 * during fallback. So we can safely use the listener's queue. 983 */ 984 putnext(listener->tcp_connp->conn_rq, mp); 985 } 986 987 988 int 989 tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, 990 boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb, 991 sock_quiesce_arg_t *arg) 992 { 993 tcp_t *tcp; 994 conn_t *connp = (conn_t *)proto_handle; 995 int error; 996 mblk_t *stropt_mp; 997 mblk_t *ordrel_mp; 998 999 tcp = connp->conn_tcp; 1000 1001 stropt_mp = allocb_wait(sizeof (struct stroptions), BPRI_HI, STR_NOSIG, 1002 NULL); 1003 1004 /* Pre-allocate the T_ordrel_ind mblk. */ 1005 ASSERT(tcp->tcp_ordrel_mp == NULL); 1006 ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI, 1007 STR_NOSIG, NULL); 1008 ordrel_mp->b_datap->db_type = M_PROTO; 1009 ((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND; 1010 ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind); 1011 1012 /* 1013 * Enter the squeue so that no new packets can come in 1014 */ 1015 error = squeue_synch_enter(connp, NULL); 1016 if (error != 0) { 1017 /* failed to enter, free all the pre-allocated messages. */ 1018 freeb(stropt_mp); 1019 freeb(ordrel_mp); 1020 return (ENOMEM); 1021 } 1022 1023 /* 1024 * Both endpoints must be of the same type (either STREAMS or 1025 * non-STREAMS) for fusion to be enabled. So if we are fused, 1026 * we have to unfuse. 1027 */ 1028 if (tcp->tcp_fused) 1029 tcp_unfuse(tcp); 1030 1031 if (tcp->tcp_listener != NULL) { 1032 /* The eager will deal with opts when accept() is called */ 1033 freeb(stropt_mp); 1034 tcp_fallback_eager(tcp, direct_sockfs, quiesced_cb, arg); 1035 } else { 1036 tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs, 1037 quiesced_cb, arg); 1038 } 1039 1040 /* 1041 * No longer a direct socket 1042 * 1043 * Note that we intentionally leave the upper_handle and upcalls 1044 * intact, since eagers may still be using them. 1045 */ 1046 connp->conn_flags &= ~IPCL_NONSTR; 1047 tcp->tcp_ordrel_mp = ordrel_mp; 1048 1049 /* 1050 * There should be atleast two ref's (IP + TCP) 1051 */ 1052 ASSERT(connp->conn_ref >= 2); 1053 squeue_synch_exit(connp); 1054 1055 return (0); 1056 } 1057 1058 /* 1059 * Notifies a non-STREAMS based listener about a new connection. This 1060 * function is executed on the *eager*'s squeue once the 3 way handshake 1061 * has completed. Note that the behavior differs from STREAMS, where the 1062 * T_CONN_IND is sent up by tcp_send_conn_ind() while on the *listener*'s 1063 * squeue. 1064 * 1065 * Returns B_TRUE if the notification succeeded and an upper handle was 1066 * obtained. `tcp' should be closed on failure. 1067 */ 1068 boolean_t 1069 tcp_newconn_notify(tcp_t *tcp, ip_recv_attr_t *ira) 1070 { 1071 tcp_t *listener = tcp->tcp_listener; 1072 conn_t *lconnp = listener->tcp_connp; 1073 conn_t *econnp = tcp->tcp_connp; 1074 tcp_t *tail; 1075 ipaddr_t *addr_cache; 1076 sock_upper_handle_t upper; 1077 struct sock_proto_props sopp; 1078 1079 mutex_enter(&listener->tcp_eager_lock); 1080 /* 1081 * Take the eager out, if it is in the list of droppable eagers 1082 * as we are here because the 3W handshake is over. 1083 */ 1084 MAKE_UNDROPPABLE(tcp); 1085 /* 1086 * The eager already has an extra ref put in tcp_input_data 1087 * so that it stays till accept comes back even though it 1088 * might get into TCPS_CLOSED as a result of a TH_RST etc. 1089 */ 1090 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 1091 listener->tcp_conn_req_cnt_q0--; 1092 listener->tcp_conn_req_cnt_q++; 1093 1094 /* Move from SYN_RCVD to ESTABLISHED list */ 1095 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = tcp->tcp_eager_prev_q0; 1096 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp->tcp_eager_next_q0; 1097 tcp->tcp_eager_prev_q0 = NULL; 1098 tcp->tcp_eager_next_q0 = NULL; 1099 1100 /* 1101 * Insert at end of the queue because connections are accepted 1102 * in chronological order. Leaving the older connections at front 1103 * of the queue helps reducing search time. 1104 */ 1105 tail = listener->tcp_eager_last_q; 1106 if (tail != NULL) 1107 tail->tcp_eager_next_q = tcp; 1108 else 1109 listener->tcp_eager_next_q = tcp; 1110 listener->tcp_eager_last_q = tcp; 1111 tcp->tcp_eager_next_q = NULL; 1112 1113 /* we have timed out before */ 1114 if (tcp->tcp_syn_rcvd_timeout != 0) { 1115 tcp->tcp_syn_rcvd_timeout = 0; 1116 listener->tcp_syn_rcvd_timeout--; 1117 if (listener->tcp_syn_defense && 1118 listener->tcp_syn_rcvd_timeout <= 1119 (listener->tcp_tcps->tcps_conn_req_max_q0 >> 5) && 1120 10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() - 1121 listener->tcp_last_rcv_lbolt)) { 1122 /* 1123 * Turn off the defense mode if we 1124 * believe the SYN attack is over. 1125 */ 1126 listener->tcp_syn_defense = B_FALSE; 1127 if (listener->tcp_ip_addr_cache) { 1128 kmem_free((void *)listener->tcp_ip_addr_cache, 1129 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 1130 listener->tcp_ip_addr_cache = NULL; 1131 } 1132 } 1133 } 1134 addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache); 1135 if (addr_cache != NULL) { 1136 /* 1137 * We have finished a 3-way handshake with this 1138 * remote host. This proves the IP addr is good. 1139 * Cache it! 1140 */ 1141 addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] = 1142 tcp->tcp_connp->conn_faddr_v4; 1143 } 1144 mutex_exit(&listener->tcp_eager_lock); 1145 1146 /* 1147 * Notify the ULP about the newconn. It is guaranteed that no 1148 * tcp_accept() call will be made for the eager if the 1149 * notification fails. 1150 */ 1151 if ((upper = (*lconnp->conn_upcalls->su_newconn) 1152 (lconnp->conn_upper_handle, (sock_lower_handle_t)econnp, 1153 &sock_tcp_downcalls, ira->ira_cred, ira->ira_cpid, 1154 &econnp->conn_upcalls)) == NULL) { 1155 return (B_FALSE); 1156 } 1157 econnp->conn_upper_handle = upper; 1158 1159 tcp->tcp_detached = B_FALSE; 1160 tcp->tcp_hard_binding = B_FALSE; 1161 tcp->tcp_tconnind_started = B_TRUE; 1162 1163 if (econnp->conn_keepalive) { 1164 tcp->tcp_ka_last_intrvl = 0; 1165 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer, 1166 tcp->tcp_ka_interval); 1167 } 1168 1169 /* Update the necessary parameters */ 1170 tcp_get_proto_props(tcp, &sopp); 1171 1172 (*econnp->conn_upcalls->su_set_proto_props) 1173 (econnp->conn_upper_handle, &sopp); 1174 1175 return (B_TRUE); 1176 }