1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 /* This file contains all TCP kernel socket related functions. */ 27 28 #include <sys/types.h> 29 #include <sys/strlog.h> 30 #include <sys/policy.h> 31 #include <sys/sockio.h> 32 #include <sys/strsubr.h> 33 #include <sys/strsun.h> 34 #include <sys/squeue_impl.h> 35 #include <sys/squeue.h> 36 #define _SUN_TPI_VERSION 2 37 #include <sys/tihdr.h> 38 #include <sys/timod.h> 39 #include <sys/tpicommon.h> 40 #include <sys/socketvar.h> 41 42 #include <inet/common.h> 43 #include <inet/proto_set.h> 44 #include <inet/ip.h> 45 #include <inet/tcp.h> 46 #include <inet/tcp_impl.h> 47 48 static void tcp_activate(sock_lower_handle_t, sock_upper_handle_t, 49 sock_upcalls_t *, int, cred_t *); 50 static int tcp_accept(sock_lower_handle_t, sock_lower_handle_t, 51 sock_upper_handle_t, cred_t *); 52 static int tcp_bind(sock_lower_handle_t, struct sockaddr *, 53 socklen_t, cred_t *); 54 static int tcp_listen(sock_lower_handle_t, int, cred_t *); 55 static int tcp_connect(sock_lower_handle_t, const struct sockaddr *, 56 socklen_t, sock_connid_t *, cred_t *); 57 static int tcp_getpeername(sock_lower_handle_t, struct sockaddr *, 58 socklen_t *, cred_t *); 59 static int tcp_getsockname(sock_lower_handle_t, struct sockaddr *, 60 socklen_t *, cred_t *); 61 static int tcp_getsockopt(sock_lower_handle_t, int, int, void *, 62 socklen_t *, cred_t *); 63 static int tcp_setsockopt(sock_lower_handle_t, int, int, const void *, 64 socklen_t, cred_t *); 65 static int tcp_sendmsg(sock_lower_handle_t, mblk_t *, struct nmsghdr *, 66 cred_t *); 67 static int tcp_shutdown(sock_lower_handle_t, int, cred_t *); 68 static void tcp_clr_flowctrl(sock_lower_handle_t); 69 static int tcp_ioctl(sock_lower_handle_t, int, intptr_t, int, int32_t *, 70 cred_t *); 71 static int tcp_close(sock_lower_handle_t, int, cred_t *); 72 73 sock_downcalls_t sock_tcp_downcalls = { 74 tcp_activate, 75 tcp_accept, 76 tcp_bind, 77 tcp_listen, 78 tcp_connect, 79 tcp_getpeername, 80 tcp_getsockname, 81 tcp_getsockopt, 82 tcp_setsockopt, 83 tcp_sendmsg, 84 NULL, 85 NULL, 86 NULL, 87 tcp_shutdown, 88 tcp_clr_flowctrl, 89 tcp_ioctl, 90 tcp_close, 91 }; 92 93 /* ARGSUSED */ 94 static void 95 tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle, 96 sock_upcalls_t *sock_upcalls, int flags, cred_t *cr) 97 { 98 conn_t *connp = (conn_t *)proto_handle; 99 struct sock_proto_props sopp; 100 extern struct module_info tcp_rinfo; 101 102 ASSERT(connp->conn_upper_handle == NULL); 103 104 /* All Solaris components should pass a cred for this operation. */ 105 ASSERT(cr != NULL); 106 107 sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | 108 SOCKOPT_MAXPSZ | SOCKOPT_MAXBLK | SOCKOPT_RCVTIMER | 109 SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ; 110 111 sopp.sopp_rxhiwat = SOCKET_RECVHIWATER; 112 sopp.sopp_rxlowat = SOCKET_RECVLOWATER; 113 sopp.sopp_maxpsz = INFPSZ; 114 sopp.sopp_maxblk = INFPSZ; 115 sopp.sopp_rcvtimer = SOCKET_TIMER_INTERVAL; 116 sopp.sopp_rcvthresh = SOCKET_RECVHIWATER >> 3; 117 sopp.sopp_maxaddrlen = sizeof (sin6_t); 118 sopp.sopp_minpsz = (tcp_rinfo.mi_minpsz == 1) ? 0 : 119 tcp_rinfo.mi_minpsz; 120 121 connp->conn_upcalls = sock_upcalls; 122 connp->conn_upper_handle = sock_handle; 123 124 ASSERT(connp->conn_rcvbuf != 0 && 125 connp->conn_rcvbuf == connp->conn_tcp->tcp_rwnd); 126 (*sock_upcalls->su_set_proto_props)(sock_handle, &sopp); 127 } 128 129 /*ARGSUSED*/ 130 static int 131 tcp_accept(sock_lower_handle_t lproto_handle, 132 sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle, 133 cred_t *cr) 134 { 135 conn_t *lconnp, *econnp; 136 tcp_t *listener, *eager; 137 138 /* All Solaris components should pass a cred for this operation. */ 139 ASSERT(cr != NULL); 140 141 /* 142 * KSSL can move a socket from one listener to another, in which 143 * case `lproto_handle' points to the new listener. To ensure that 144 * the original listener is used the information is obtained from 145 * the eager. 146 */ 147 econnp = (conn_t *)eproto_handle; 148 eager = econnp->conn_tcp; 149 ASSERT(IPCL_IS_NONSTR(econnp)); 150 ASSERT(eager->tcp_listener != NULL); 151 listener = eager->tcp_listener; 152 lconnp = (conn_t *)listener->tcp_connp; 153 ASSERT(listener->tcp_state == TCPS_LISTEN); 154 ASSERT(lconnp->conn_upper_handle != NULL); 155 156 /* 157 * It is possible for the accept thread to race with the thread that 158 * made the su_newconn upcall in tcp_newconn_notify. Both 159 * tcp_newconn_notify and tcp_accept require that conn_upper_handle 160 * and conn_upcalls be set before returning, so they both write to 161 * them. However, we're guaranteed that the value written is the same 162 * for both threads. 163 */ 164 ASSERT(econnp->conn_upper_handle == NULL || 165 econnp->conn_upper_handle == sock_handle); 166 ASSERT(econnp->conn_upcalls == NULL || 167 econnp->conn_upcalls == lconnp->conn_upcalls); 168 econnp->conn_upper_handle = sock_handle; 169 econnp->conn_upcalls = lconnp->conn_upcalls; 170 171 ASSERT(econnp->conn_netstack == 172 listener->tcp_connp->conn_netstack); 173 ASSERT(eager->tcp_tcps == listener->tcp_tcps); 174 175 /* 176 * We should have a minimum of 2 references on the conn at this 177 * point. One for TCP and one for the newconn notification 178 * (which is now taken over by IP). In the normal case we would 179 * also have another reference (making a total of 3) for the conn 180 * being in the classifier hash list. However the eager could have 181 * received an RST subsequently and tcp_closei_local could have 182 * removed the eager from the classifier hash list, hence we can't 183 * assert that reference. 184 */ 185 ASSERT(econnp->conn_ref >= 2); 186 187 mutex_enter(&listener->tcp_eager_lock); 188 /* 189 * Non-STREAMS listeners never defer the notification of new 190 * connections. 191 */ 192 ASSERT(!listener->tcp_eager_prev_q0->tcp_conn_def_q0); 193 tcp_eager_unlink(eager); 194 mutex_exit(&listener->tcp_eager_lock); 195 CONN_DEC_REF(listener->tcp_connp); 196 197 return ((eager->tcp_state < TCPS_ESTABLISHED) ? ECONNABORTED : 0); 198 } 199 200 static int 201 tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, 202 socklen_t len, cred_t *cr) 203 { 204 int error; 205 conn_t *connp = (conn_t *)proto_handle; 206 207 /* All Solaris components should pass a cred for this operation. */ 208 ASSERT(cr != NULL); 209 ASSERT(connp->conn_upper_handle != NULL); 210 211 error = squeue_synch_enter(connp, NULL); 212 if (error != 0) { 213 /* failed to enter */ 214 return (ENOSR); 215 } 216 217 /* binding to a NULL address really means unbind */ 218 if (sa == NULL) { 219 if (connp->conn_tcp->tcp_state < TCPS_LISTEN) 220 error = tcp_do_unbind(connp); 221 else 222 error = EINVAL; 223 } else { 224 error = tcp_do_bind(connp, sa, len, cr, B_TRUE); 225 } 226 227 squeue_synch_exit(connp); 228 229 if (error < 0) { 230 if (error == -TOUTSTATE) 231 error = EINVAL; 232 else 233 error = proto_tlitosyserr(-error); 234 } 235 236 return (error); 237 } 238 239 /* ARGSUSED */ 240 static int 241 tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr) 242 { 243 conn_t *connp = (conn_t *)proto_handle; 244 tcp_t *tcp = connp->conn_tcp; 245 int error; 246 247 ASSERT(connp->conn_upper_handle != NULL); 248 249 /* All Solaris components should pass a cred for this operation. */ 250 ASSERT(cr != NULL); 251 252 error = squeue_synch_enter(connp, NULL); 253 if (error != 0) { 254 /* failed to enter */ 255 return (ENOBUFS); 256 } 257 258 error = tcp_do_listen(connp, NULL, 0, backlog, cr, B_FALSE); 259 if (error == 0) { 260 /* 261 * sockfs needs to know what's the maximum number of socket 262 * that can be queued on the listener. 263 */ 264 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 265 SOCK_OPCTL_ENAB_ACCEPT, 266 (uintptr_t)(tcp->tcp_conn_req_max + 267 tcp->tcp_tcps->tcps_conn_req_max_q0)); 268 } else if (error < 0) { 269 if (error == -TOUTSTATE) 270 error = EINVAL; 271 else 272 error = proto_tlitosyserr(-error); 273 } 274 squeue_synch_exit(connp); 275 return (error); 276 } 277 278 static int 279 tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, 280 socklen_t len, sock_connid_t *id, cred_t *cr) 281 { 282 conn_t *connp = (conn_t *)proto_handle; 283 int error; 284 285 ASSERT(connp->conn_upper_handle != NULL); 286 287 /* All Solaris components should pass a cred for this operation. */ 288 ASSERT(cr != NULL); 289 290 error = proto_verify_ip_addr(connp->conn_family, sa, len); 291 if (error != 0) { 292 return (error); 293 } 294 295 error = squeue_synch_enter(connp, NULL); 296 if (error != 0) { 297 /* failed to enter */ 298 return (ENOSR); 299 } 300 301 /* 302 * TCP supports quick connect, so no need to do an implicit bind 303 */ 304 error = tcp_do_connect(connp, sa, len, cr, curproc->p_pid); 305 if (error == 0) { 306 *id = connp->conn_tcp->tcp_connid; 307 } else if (error < 0) { 308 if (error == -TOUTSTATE) { 309 switch (connp->conn_tcp->tcp_state) { 310 case TCPS_SYN_SENT: 311 error = EALREADY; 312 break; 313 case TCPS_ESTABLISHED: 314 error = EISCONN; 315 break; 316 case TCPS_LISTEN: 317 error = EOPNOTSUPP; 318 break; 319 default: 320 error = EINVAL; 321 break; 322 } 323 } else { 324 error = proto_tlitosyserr(-error); 325 } 326 } 327 328 if (connp->conn_tcp->tcp_loopback) { 329 struct sock_proto_props sopp; 330 331 sopp.sopp_flags = SOCKOPT_LOOPBACK; 332 sopp.sopp_loopback = B_TRUE; 333 334 (*connp->conn_upcalls->su_set_proto_props)( 335 connp->conn_upper_handle, &sopp); 336 } 337 done: 338 squeue_synch_exit(connp); 339 340 return ((error == 0) ? EINPROGRESS : error); 341 } 342 343 /* ARGSUSED3 */ 344 static int 345 tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr, 346 socklen_t *addrlenp, cred_t *cr) 347 { 348 conn_t *connp = (conn_t *)proto_handle; 349 tcp_t *tcp = connp->conn_tcp; 350 351 /* All Solaris components should pass a cred for this operation. */ 352 ASSERT(cr != NULL); 353 354 ASSERT(tcp != NULL); 355 if (tcp->tcp_state < TCPS_SYN_RCVD) 356 return (ENOTCONN); 357 358 return (conn_getpeername(connp, addr, addrlenp)); 359 } 360 361 /* ARGSUSED3 */ 362 static int 363 tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr, 364 socklen_t *addrlenp, cred_t *cr) 365 { 366 conn_t *connp = (conn_t *)proto_handle; 367 368 /* All Solaris components should pass a cred for this operation. */ 369 ASSERT(cr != NULL); 370 371 return (conn_getsockname(connp, addr, addrlenp)); 372 } 373 374 /* returns UNIX error, the optlen is a value-result arg */ 375 static int 376 tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 377 void *optvalp, socklen_t *optlen, cred_t *cr) 378 { 379 conn_t *connp = (conn_t *)proto_handle; 380 int error; 381 t_uscalar_t max_optbuf_len; 382 void *optvalp_buf; 383 int len; 384 385 ASSERT(connp->conn_upper_handle != NULL); 386 387 /* All Solaris components should pass a cred for this operation. */ 388 ASSERT(cr != NULL); 389 390 error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, 391 tcp_opt_obj.odb_opt_des_arr, 392 tcp_opt_obj.odb_opt_arr_cnt, 393 B_FALSE, B_TRUE, cr); 394 if (error != 0) { 395 if (error < 0) { 396 error = proto_tlitosyserr(-error); 397 } 398 return (error); 399 } 400 401 optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP); 402 403 error = squeue_synch_enter(connp, NULL); 404 if (error == ENOMEM) { 405 kmem_free(optvalp_buf, max_optbuf_len); 406 return (ENOMEM); 407 } 408 409 len = tcp_opt_get(connp, level, option_name, optvalp_buf); 410 squeue_synch_exit(connp); 411 412 if (len == -1) { 413 kmem_free(optvalp_buf, max_optbuf_len); 414 return (EINVAL); 415 } 416 417 /* 418 * update optlen and copy option value 419 */ 420 t_uscalar_t size = MIN(len, *optlen); 421 422 bcopy(optvalp_buf, optvalp, size); 423 bcopy(&size, optlen, sizeof (size)); 424 425 kmem_free(optvalp_buf, max_optbuf_len); 426 return (0); 427 } 428 429 static int 430 tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 431 const void *optvalp, socklen_t optlen, cred_t *cr) 432 { 433 conn_t *connp = (conn_t *)proto_handle; 434 int error; 435 436 ASSERT(connp->conn_upper_handle != NULL); 437 438 /* All Solaris components should pass a cred for this operation. */ 439 ASSERT(cr != NULL); 440 441 /* 442 * Entering the squeue synchronously can result in a context switch, 443 * which can cause a rather sever performance degradation. So we try to 444 * handle whatever options we can without entering the squeue. 445 */ 446 if (level == IPPROTO_TCP) { 447 switch (option_name) { 448 case TCP_NODELAY: 449 if (optlen != sizeof (int32_t)) 450 return (EINVAL); 451 mutex_enter(&connp->conn_tcp->tcp_non_sq_lock); 452 connp->conn_tcp->tcp_naglim = *(int *)optvalp ? 1 : 453 connp->conn_tcp->tcp_mss; 454 mutex_exit(&connp->conn_tcp->tcp_non_sq_lock); 455 return (0); 456 default: 457 break; 458 } 459 } 460 461 error = squeue_synch_enter(connp, NULL); 462 if (error == ENOMEM) { 463 return (ENOMEM); 464 } 465 466 error = proto_opt_check(level, option_name, optlen, NULL, 467 tcp_opt_obj.odb_opt_des_arr, 468 tcp_opt_obj.odb_opt_arr_cnt, 469 B_TRUE, B_FALSE, cr); 470 471 if (error != 0) { 472 if (error < 0) { 473 error = proto_tlitosyserr(-error); 474 } 475 squeue_synch_exit(connp); 476 return (error); 477 } 478 479 error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name, 480 optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp, 481 NULL, cr); 482 squeue_synch_exit(connp); 483 484 ASSERT(error >= 0); 485 486 return (error); 487 } 488 489 /* ARGSUSED */ 490 static int 491 tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, 492 cred_t *cr) 493 { 494 tcp_t *tcp; 495 uint32_t msize; 496 conn_t *connp = (conn_t *)proto_handle; 497 int32_t tcpstate; 498 499 /* All Solaris components should pass a cred for this operation. */ 500 ASSERT(cr != NULL); 501 502 ASSERT(connp->conn_ref >= 2); 503 ASSERT(connp->conn_upper_handle != NULL); 504 505 if (msg->msg_controllen != 0) { 506 freemsg(mp); 507 return (EOPNOTSUPP); 508 } 509 510 switch (DB_TYPE(mp)) { 511 case M_DATA: 512 tcp = connp->conn_tcp; 513 ASSERT(tcp != NULL); 514 515 tcpstate = tcp->tcp_state; 516 if (tcpstate < TCPS_ESTABLISHED) { 517 freemsg(mp); 518 /* 519 * We return ENOTCONN if the endpoint is trying to 520 * connect or has never been connected, and EPIPE if it 521 * has been disconnected. The connection id helps us 522 * distinguish between the last two cases. 523 */ 524 return ((tcpstate == TCPS_SYN_SENT) ? ENOTCONN : 525 ((tcp->tcp_connid > 0) ? EPIPE : ENOTCONN)); 526 } else if (tcpstate > TCPS_CLOSE_WAIT) { 527 freemsg(mp); 528 return (EPIPE); 529 } 530 531 msize = msgdsize(mp); 532 533 mutex_enter(&tcp->tcp_non_sq_lock); 534 tcp->tcp_squeue_bytes += msize; 535 /* 536 * Squeue Flow Control 537 */ 538 if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) { 539 tcp_setqfull(tcp); 540 } 541 mutex_exit(&tcp->tcp_non_sq_lock); 542 543 /* 544 * The application may pass in an address in the msghdr, but 545 * we ignore the address on connection-oriented sockets. 546 * Just like BSD this code does not generate an error for 547 * TCP (a CONNREQUIRED socket) when sending to an address 548 * passed in with sendto/sendmsg. Instead the data is 549 * delivered on the connection as if no address had been 550 * supplied. 551 */ 552 CONN_INC_REF(connp); 553 554 if (msg->msg_flags & MSG_OOB) { 555 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent, 556 connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); 557 } else { 558 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, 559 connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); 560 } 561 562 return (0); 563 564 default: 565 ASSERT(0); 566 } 567 568 freemsg(mp); 569 return (0); 570 } 571 572 /* ARGSUSED */ 573 static int 574 tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) 575 { 576 conn_t *connp = (conn_t *)proto_handle; 577 tcp_t *tcp = connp->conn_tcp; 578 579 ASSERT(connp->conn_upper_handle != NULL); 580 581 /* All Solaris components should pass a cred for this operation. */ 582 ASSERT(cr != NULL); 583 584 /* 585 * X/Open requires that we check the connected state. 586 */ 587 if (tcp->tcp_state < TCPS_SYN_SENT) 588 return (ENOTCONN); 589 590 /* shutdown the send side */ 591 if (how != SHUT_RD) { 592 mblk_t *bp; 593 594 bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL); 595 CONN_INC_REF(connp); 596 SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output, 597 connp, NULL, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT); 598 599 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 600 SOCK_OPCTL_SHUT_SEND, 0); 601 } 602 603 /* shutdown the recv side */ 604 if (how != SHUT_WR) 605 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 606 SOCK_OPCTL_SHUT_RECV, 0); 607 608 return (0); 609 } 610 611 static void 612 tcp_clr_flowctrl(sock_lower_handle_t proto_handle) 613 { 614 conn_t *connp = (conn_t *)proto_handle; 615 tcp_t *tcp = connp->conn_tcp; 616 mblk_t *mp; 617 int error; 618 619 ASSERT(connp->conn_upper_handle != NULL); 620 621 /* 622 * If tcp->tcp_rsrv_mp == NULL, it means that tcp_clr_flowctrl() 623 * is currently running. 624 */ 625 mutex_enter(&tcp->tcp_rsrv_mp_lock); 626 if ((mp = tcp->tcp_rsrv_mp) == NULL) { 627 mutex_exit(&tcp->tcp_rsrv_mp_lock); 628 return; 629 } 630 tcp->tcp_rsrv_mp = NULL; 631 mutex_exit(&tcp->tcp_rsrv_mp_lock); 632 633 error = squeue_synch_enter(connp, mp); 634 ASSERT(error == 0); 635 636 mutex_enter(&tcp->tcp_rsrv_mp_lock); 637 tcp->tcp_rsrv_mp = mp; 638 mutex_exit(&tcp->tcp_rsrv_mp_lock); 639 640 if (tcp->tcp_fused) { 641 tcp_fuse_backenable(tcp); 642 } else { 643 tcp->tcp_rwnd = connp->conn_rcvbuf; 644 /* 645 * Send back a window update immediately if TCP is above 646 * ESTABLISHED state and the increase of the rcv window 647 * that the other side knows is at least 1 MSS after flow 648 * control is lifted. 649 */ 650 if (tcp->tcp_state >= TCPS_ESTABLISHED && 651 tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) { 652 tcp_xmit_ctl(NULL, tcp, 653 (tcp->tcp_swnd == 0) ? tcp->tcp_suna : 654 tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); 655 } 656 } 657 658 squeue_synch_exit(connp); 659 } 660 661 /* ARGSUSED */ 662 static int 663 tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, 664 int mode, int32_t *rvalp, cred_t *cr) 665 { 666 conn_t *connp = (conn_t *)proto_handle; 667 int error; 668 669 ASSERT(connp->conn_upper_handle != NULL); 670 671 /* All Solaris components should pass a cred for this operation. */ 672 ASSERT(cr != NULL); 673 674 /* 675 * If we don't have a helper stream then create one. 676 * ip_create_helper_stream takes care of locking the conn_t, 677 * so this check for NULL is just a performance optimization. 678 */ 679 if (connp->conn_helper_info == NULL) { 680 tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps; 681 682 /* 683 * Create a helper stream for non-STREAMS socket. 684 */ 685 error = ip_create_helper_stream(connp, tcps->tcps_ldi_ident); 686 if (error != 0) { 687 ip0dbg(("tcp_ioctl: create of IP helper stream " 688 "failed %d\n", error)); 689 return (error); 690 } 691 } 692 693 switch (cmd) { 694 case ND_SET: 695 case ND_GET: 696 case _SIOCSOCKFALLBACK: 697 case TCP_IOC_ABORT_CONN: 698 case TI_GETPEERNAME: 699 case TI_GETMYNAME: 700 ip1dbg(("tcp_ioctl: cmd 0x%x on non streams socket", 701 cmd)); 702 error = EINVAL; 703 break; 704 default: 705 /* 706 * If the conn is not closing, pass on to IP using 707 * helper stream. Bump the ioctlref to prevent tcp_close 708 * from closing the rq/wq out from underneath the ioctl 709 * if it ends up queued or aborted/interrupted. 710 */ 711 mutex_enter(&connp->conn_lock); 712 if (connp->conn_state_flags & (CONN_CLOSING)) { 713 mutex_exit(&connp->conn_lock); 714 error = EINVAL; 715 break; 716 } 717 CONN_INC_IOCTLREF_LOCKED(connp); 718 error = ldi_ioctl(connp->conn_helper_info->iphs_handle, 719 cmd, arg, mode, cr, rvalp); 720 CONN_DEC_IOCTLREF(connp); 721 break; 722 } 723 return (error); 724 } 725 726 /* ARGSUSED */ 727 static int 728 tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) 729 { 730 conn_t *connp = (conn_t *)proto_handle; 731 732 ASSERT(connp->conn_upper_handle != NULL); 733 734 /* All Solaris components should pass a cred for this operation. */ 735 ASSERT(cr != NULL); 736 737 tcp_close_common(connp, flags); 738 739 ip_free_helper_stream(connp); 740 741 /* 742 * Drop IP's reference on the conn. This is the last reference 743 * on the connp if the state was less than established. If the 744 * connection has gone into timewait state, then we will have 745 * one ref for the TCP and one more ref (total of two) for the 746 * classifier connected hash list (a timewait connections stays 747 * in connected hash till closed). 748 * 749 * We can't assert the references because there might be other 750 * transient reference places because of some walkers or queued 751 * packets in squeue for the timewait state. 752 */ 753 CONN_DEC_REF(connp); 754 755 /* 756 * EINPROGRESS tells sockfs to wait for a 'closed' upcall before 757 * freeing the socket. 758 */ 759 return (EINPROGRESS); 760 } 761 762 /* ARGSUSED */ 763 sock_lower_handle_t 764 tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, 765 uint_t *smodep, int *errorp, int flags, cred_t *credp) 766 { 767 conn_t *connp; 768 boolean_t isv6 = family == AF_INET6; 769 770 if (type != SOCK_STREAM || (family != AF_INET && family != AF_INET6) || 771 (proto != 0 && proto != IPPROTO_TCP)) { 772 *errorp = EPROTONOSUPPORT; 773 return (NULL); 774 } 775 776 connp = tcp_create_common(credp, isv6, B_TRUE, errorp); 777 if (connp == NULL) { 778 return (NULL); 779 } 780 781 /* 782 * Put the ref for TCP. Ref for IP was already put 783 * by ipcl_conn_create. Also make the conn_t globally 784 * visible to walkers 785 */ 786 mutex_enter(&connp->conn_lock); 787 CONN_INC_REF_LOCKED(connp); 788 ASSERT(connp->conn_ref == 2); 789 connp->conn_state_flags &= ~CONN_INCIPIENT; 790 791 connp->conn_flags |= IPCL_NONSTR; 792 mutex_exit(&connp->conn_lock); 793 794 ASSERT(errorp != NULL); 795 *errorp = 0; 796 *sock_downcalls = &sock_tcp_downcalls; 797 *smodep = SM_CONNREQUIRED | SM_EXDATA | SM_ACCEPTSUPP | 798 SM_SENDFILESUPP; 799 800 return ((sock_lower_handle_t)connp); 801 } 802 803 /* 804 * tcp_fallback 805 * 806 * A direct socket is falling back to using STREAMS. The queue 807 * that is being passed down was created using tcp_open() with 808 * the SO_FALLBACK flag set. As a result, the queue is not 809 * associated with a conn, and the q_ptrs instead contain the 810 * dev and minor area that should be used. 811 * 812 * The 'issocket' flag indicates whether the FireEngine 813 * optimizations should be used. The common case would be that 814 * optimizations are enabled, and they might be subsequently 815 * disabled using the _SIOCSOCKFALLBACK ioctl. 816 */ 817 818 /* 819 * An active connection is falling back to TPI. Gather all the information 820 * required by the STREAM head and TPI sonode and send it up. 821 */ 822 static void 823 tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q, 824 boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb, 825 sock_quiesce_arg_t *arg) 826 { 827 conn_t *connp = tcp->tcp_connp; 828 struct stroptions *stropt; 829 struct T_capability_ack tca; 830 struct sockaddr_in6 laddr, faddr; 831 socklen_t laddrlen, faddrlen; 832 short opts; 833 int error; 834 mblk_t *mp, *mpnext; 835 836 connp->conn_dev = (dev_t)RD(q)->q_ptr; 837 connp->conn_minor_arena = WR(q)->q_ptr; 838 839 RD(q)->q_ptr = WR(q)->q_ptr = connp; 840 841 connp->conn_rq = RD(q); 842 connp->conn_wq = WR(q); 843 844 WR(q)->q_qinfo = &tcp_sock_winit; 845 846 if (!issocket) 847 tcp_use_pure_tpi(tcp); 848 849 /* 850 * free the helper stream 851 */ 852 ip_free_helper_stream(connp); 853 854 /* 855 * Notify the STREAM head about options 856 */ 857 DB_TYPE(stropt_mp) = M_SETOPTS; 858 stropt = (struct stroptions *)stropt_mp->b_rptr; 859 stropt_mp->b_wptr += sizeof (struct stroptions); 860 stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK; 861 862 stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 : 863 tcp->tcp_tcps->tcps_wroff_xtra); 864 if (tcp->tcp_snd_sack_ok) 865 stropt->so_wroff += TCPOPT_MAX_SACK_LEN; 866 stropt->so_hiwat = connp->conn_rcvbuf; 867 stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE); 868 869 putnext(RD(q), stropt_mp); 870 871 /* 872 * Collect the information needed to sync with the sonode 873 */ 874 tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID); 875 876 laddrlen = faddrlen = sizeof (sin6_t); 877 (void) tcp_getsockname((sock_lower_handle_t)connp, 878 (struct sockaddr *)&laddr, &laddrlen, CRED()); 879 error = tcp_getpeername((sock_lower_handle_t)connp, 880 (struct sockaddr *)&faddr, &faddrlen, CRED()); 881 if (error != 0) 882 faddrlen = 0; 883 884 opts = 0; 885 if (connp->conn_oobinline) 886 opts |= SO_OOBINLINE; 887 if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE) 888 opts |= SO_DONTROUTE; 889 890 /* 891 * Notify the socket that the protocol is now quiescent, 892 * and it's therefore safe move data from the socket 893 * to the stream head. 894 */ 895 mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca, 896 (struct sockaddr *)&laddr, laddrlen, 897 (struct sockaddr *)&faddr, faddrlen, opts); 898 899 while (mp != NULL) { 900 mpnext = mp->b_next; 901 tcp->tcp_rcv_list = mp->b_next; 902 mp->b_next = NULL; 903 putnext(q, mp); 904 mp = mpnext; 905 } 906 ASSERT(tcp->tcp_rcv_last_head == NULL); 907 ASSERT(tcp->tcp_rcv_last_tail == NULL); 908 ASSERT(tcp->tcp_rcv_cnt == 0); 909 910 /* 911 * All eagers in q0 are marked as being non-STREAM, so they will 912 * make su_newconn upcalls when the handshake completes, which 913 * will fail (resulting in the conn being closed). So we just blow 914 * off everything in q0 instead of waiting for the inevitable. 915 */ 916 if (tcp->tcp_conn_req_cnt_q0 != 0) 917 tcp_eager_cleanup(tcp, B_TRUE); 918 } 919 920 /* 921 * An eager is falling back to TPI. All we have to do is send 922 * up a T_CONN_IND. 923 */ 924 static void 925 tcp_fallback_eager(tcp_t *eager, boolean_t issocket, 926 so_proto_quiesced_cb_t quiesced_cb, sock_quiesce_arg_t *arg) 927 { 928 conn_t *connp = eager->tcp_connp; 929 tcp_t *listener = eager->tcp_listener; 930 mblk_t *mp; 931 932 ASSERT(listener != NULL); 933 934 /* 935 * Notify the socket that the protocol is now quiescent, 936 * and it's therefore safe move data from the socket 937 * to tcp's rcv queue. 938 */ 939 mp = (*quiesced_cb)(connp->conn_upper_handle, arg, NULL, NULL, 0, 940 NULL, 0, 0); 941 942 if (mp != NULL) { 943 ASSERT(eager->tcp_rcv_cnt == 0); 944 945 eager->tcp_rcv_list = mp; 946 eager->tcp_rcv_cnt = msgdsize(mp); 947 while (mp->b_next != NULL) { 948 mp = mp->b_next; 949 eager->tcp_rcv_cnt += msgdsize(mp); 950 } 951 eager->tcp_rcv_last_head = mp; 952 while (mp->b_cont) 953 mp = mp->b_cont; 954 eager->tcp_rcv_last_tail = mp; 955 if (eager->tcp_rcv_cnt > eager->tcp_rwnd) 956 eager->tcp_rwnd = 0; 957 else 958 eager->tcp_rwnd -= eager->tcp_rcv_cnt; 959 } 960 961 if (!issocket) 962 eager->tcp_issocket = B_FALSE; 963 /* 964 * The stream for this eager does not yet exist, so mark it as 965 * being detached. 966 */ 967 eager->tcp_detached = B_TRUE; 968 eager->tcp_hard_binding = B_TRUE; 969 connp->conn_rq = listener->tcp_connp->conn_rq; 970 connp->conn_wq = listener->tcp_connp->conn_wq; 971 972 /* Send up the connection indication */ 973 mp = eager->tcp_conn.tcp_eager_conn_ind; 974 ASSERT(mp != NULL); 975 eager->tcp_conn.tcp_eager_conn_ind = NULL; 976 977 /* 978 * TLI/XTI applications will get confused by 979 * sending eager as an option since it violates 980 * the option semantics. So remove the eager as 981 * option since TLI/XTI app doesn't need it anyway. 982 */ 983 if (!issocket) { 984 struct T_conn_ind *conn_ind; 985 986 conn_ind = (struct T_conn_ind *)mp->b_rptr; 987 conn_ind->OPT_length = 0; 988 conn_ind->OPT_offset = 0; 989 } 990 991 /* 992 * Sockfs guarantees that the listener will not be closed 993 * during fallback. So we can safely use the listener's queue. 994 */ 995 putnext(listener->tcp_connp->conn_rq, mp); 996 } 997 998 999 int 1000 tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, 1001 boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb, 1002 sock_quiesce_arg_t *arg) 1003 { 1004 tcp_t *tcp; 1005 conn_t *connp = (conn_t *)proto_handle; 1006 int error; 1007 mblk_t *stropt_mp; 1008 mblk_t *ordrel_mp; 1009 1010 tcp = connp->conn_tcp; 1011 1012 stropt_mp = allocb_wait(sizeof (struct stroptions), BPRI_HI, STR_NOSIG, 1013 NULL); 1014 1015 /* Pre-allocate the T_ordrel_ind mblk. */ 1016 ASSERT(tcp->tcp_ordrel_mp == NULL); 1017 ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI, 1018 STR_NOSIG, NULL); 1019 ordrel_mp->b_datap->db_type = M_PROTO; 1020 ((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND; 1021 ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind); 1022 1023 /* 1024 * Enter the squeue so that no new packets can come in 1025 */ 1026 error = squeue_synch_enter(connp, NULL); 1027 if (error != 0) { 1028 /* failed to enter, free all the pre-allocated messages. */ 1029 freeb(stropt_mp); 1030 freeb(ordrel_mp); 1031 return (ENOMEM); 1032 } 1033 1034 /* 1035 * Both endpoints must be of the same type (either STREAMS or 1036 * non-STREAMS) for fusion to be enabled. So if we are fused, 1037 * we have to unfuse. 1038 */ 1039 if (tcp->tcp_fused) 1040 tcp_unfuse(tcp); 1041 1042 if (tcp->tcp_listener != NULL) { 1043 /* The eager will deal with opts when accept() is called */ 1044 freeb(stropt_mp); 1045 tcp_fallback_eager(tcp, direct_sockfs, quiesced_cb, arg); 1046 } else { 1047 tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs, 1048 quiesced_cb, arg); 1049 } 1050 1051 /* 1052 * No longer a direct socket 1053 * 1054 * Note that we intentionally leave the upper_handle and upcalls 1055 * intact, since eagers may still be using them. 1056 */ 1057 connp->conn_flags &= ~IPCL_NONSTR; 1058 tcp->tcp_ordrel_mp = ordrel_mp; 1059 1060 /* 1061 * There should be atleast two ref's (IP + TCP) 1062 */ 1063 ASSERT(connp->conn_ref >= 2); 1064 squeue_synch_exit(connp); 1065 1066 return (0); 1067 } 1068 1069 /* 1070 * Notifies a non-STREAMS based listener about a new connection. This 1071 * function is executed on the *eager*'s squeue once the 3 way handshake 1072 * has completed. Note that the behavior differs from STREAMS, where the 1073 * T_CONN_IND is sent up by tcp_send_conn_ind() while on the *listener*'s 1074 * squeue. 1075 * 1076 * Returns B_TRUE if the notification succeeded and an upper handle was 1077 * obtained. `tcp' should be closed on failure. 1078 */ 1079 boolean_t 1080 tcp_newconn_notify(tcp_t *tcp, ip_recv_attr_t *ira) 1081 { 1082 tcp_t *listener = tcp->tcp_listener; 1083 conn_t *lconnp = listener->tcp_connp; 1084 conn_t *econnp = tcp->tcp_connp; 1085 tcp_t *tail; 1086 ipaddr_t *addr_cache; 1087 sock_upper_handle_t upper; 1088 struct sock_proto_props sopp; 1089 1090 mutex_enter(&listener->tcp_eager_lock); 1091 /* 1092 * Take the eager out, if it is in the list of droppable eagers 1093 * as we are here because the 3W handshake is over. 1094 */ 1095 MAKE_UNDROPPABLE(tcp); 1096 /* 1097 * The eager already has an extra ref put in tcp_input_data 1098 * so that it stays till accept comes back even though it 1099 * might get into TCPS_CLOSED as a result of a TH_RST etc. 1100 */ 1101 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 1102 listener->tcp_conn_req_cnt_q0--; 1103 listener->tcp_conn_req_cnt_q++; 1104 1105 /* Move from SYN_RCVD to ESTABLISHED list */ 1106 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = tcp->tcp_eager_prev_q0; 1107 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp->tcp_eager_next_q0; 1108 tcp->tcp_eager_prev_q0 = NULL; 1109 tcp->tcp_eager_next_q0 = NULL; 1110 1111 /* 1112 * Insert at end of the queue because connections are accepted 1113 * in chronological order. Leaving the older connections at front 1114 * of the queue helps reducing search time. 1115 */ 1116 tail = listener->tcp_eager_last_q; 1117 if (tail != NULL) 1118 tail->tcp_eager_next_q = tcp; 1119 else 1120 listener->tcp_eager_next_q = tcp; 1121 listener->tcp_eager_last_q = tcp; 1122 tcp->tcp_eager_next_q = NULL; 1123 1124 /* we have timed out before */ 1125 if (tcp->tcp_syn_rcvd_timeout != 0) { 1126 tcp->tcp_syn_rcvd_timeout = 0; 1127 listener->tcp_syn_rcvd_timeout--; 1128 if (listener->tcp_syn_defense && 1129 listener->tcp_syn_rcvd_timeout <= 1130 (listener->tcp_tcps->tcps_conn_req_max_q0 >> 5) && 1131 10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() - 1132 listener->tcp_last_rcv_lbolt)) { 1133 /* 1134 * Turn off the defense mode if we 1135 * believe the SYN attack is over. 1136 */ 1137 listener->tcp_syn_defense = B_FALSE; 1138 if (listener->tcp_ip_addr_cache) { 1139 kmem_free((void *)listener->tcp_ip_addr_cache, 1140 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 1141 listener->tcp_ip_addr_cache = NULL; 1142 } 1143 } 1144 } 1145 addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache); 1146 if (addr_cache != NULL) { 1147 /* 1148 * We have finished a 3-way handshake with this 1149 * remote host. This proves the IP addr is good. 1150 * Cache it! 1151 */ 1152 addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] = 1153 tcp->tcp_connp->conn_faddr_v4; 1154 } 1155 mutex_exit(&listener->tcp_eager_lock); 1156 1157 /* 1158 * Notify the ULP about the newconn. It is guaranteed that no 1159 * tcp_accept() call will be made for the eager if the 1160 * notification fails. 1161 */ 1162 if ((upper = (*lconnp->conn_upcalls->su_newconn) 1163 (lconnp->conn_upper_handle, (sock_lower_handle_t)econnp, 1164 &sock_tcp_downcalls, ira->ira_cred, ira->ira_cpid, 1165 &econnp->conn_upcalls)) == NULL) { 1166 return (B_FALSE); 1167 } 1168 econnp->conn_upper_handle = upper; 1169 1170 tcp->tcp_detached = B_FALSE; 1171 tcp->tcp_hard_binding = B_FALSE; 1172 tcp->tcp_tconnind_started = B_TRUE; 1173 1174 if (econnp->conn_keepalive) { 1175 tcp->tcp_ka_last_intrvl = 0; 1176 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer, 1177 tcp->tcp_ka_interval); 1178 } 1179 1180 /* Update the necessary parameters */ 1181 tcp_get_proto_props(tcp, &sopp); 1182 1183 (*econnp->conn_upcalls->su_set_proto_props) 1184 (econnp->conn_upper_handle, &sopp); 1185 1186 return (B_TRUE); 1187 }