1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /* This file contains all TCP kernel socket related functions. */
  27 
  28 #include <sys/types.h>
  29 #include <sys/strlog.h>
  30 #include <sys/policy.h>
  31 #include <sys/sockio.h>
  32 #include <sys/strsubr.h>
  33 #include <sys/strsun.h>
  34 #include <sys/squeue_impl.h>
  35 #include <sys/squeue.h>
  36 #define _SUN_TPI_VERSION 2
  37 #include <sys/tihdr.h>
  38 #include <sys/timod.h>
  39 #include <sys/tpicommon.h>
  40 #include <sys/socketvar.h>
  41 
  42 #include <inet/common.h>
  43 #include <inet/proto_set.h>
  44 #include <inet/ip.h>
  45 #include <inet/tcp.h>
  46 #include <inet/tcp_impl.h>
  47 
  48 static void     tcp_activate(sock_lower_handle_t, sock_upper_handle_t,
  49                     sock_upcalls_t *, int, cred_t *);
  50 static int      tcp_accept(sock_lower_handle_t, sock_lower_handle_t,
  51                     sock_upper_handle_t, cred_t *);
  52 static int      tcp_bind(sock_lower_handle_t, struct sockaddr *,
  53                     socklen_t, cred_t *);
  54 static int      tcp_listen(sock_lower_handle_t, int, cred_t *);
  55 static int      tcp_connect(sock_lower_handle_t, const struct sockaddr *,
  56                     socklen_t, sock_connid_t *, cred_t *);
  57 static int      tcp_getpeername(sock_lower_handle_t, struct sockaddr *,
  58                     socklen_t *, cred_t *);
  59 static int      tcp_getsockname(sock_lower_handle_t, struct sockaddr *,
  60                     socklen_t *, cred_t *);
  61 static int      tcp_getsockopt(sock_lower_handle_t, int, int, void *,
  62                     socklen_t *, cred_t *);
  63 static int      tcp_setsockopt(sock_lower_handle_t, int, int, const void *,
  64                     socklen_t, cred_t *);
  65 static int      tcp_sendmsg(sock_lower_handle_t, mblk_t *, struct nmsghdr *,
  66                     cred_t *);
  67 static int      tcp_shutdown(sock_lower_handle_t, int, cred_t *);
  68 static void     tcp_clr_flowctrl(sock_lower_handle_t);
  69 static int      tcp_ioctl(sock_lower_handle_t, int, intptr_t, int, int32_t *,
  70                     cred_t *);
  71 static int      tcp_close(sock_lower_handle_t, int, cred_t *);
  72 
  73 sock_downcalls_t sock_tcp_downcalls = {
  74         tcp_activate,
  75         tcp_accept,
  76         tcp_bind,
  77         tcp_listen,
  78         tcp_connect,
  79         tcp_getpeername,
  80         tcp_getsockname,
  81         tcp_getsockopt,
  82         tcp_setsockopt,
  83         tcp_sendmsg,
  84         NULL,
  85         NULL,
  86         NULL,
  87         tcp_shutdown,
  88         tcp_clr_flowctrl,
  89         tcp_ioctl,
  90         tcp_close,
  91 };
  92 
  93 /* ARGSUSED */
  94 static void
  95 tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle,
  96     sock_upcalls_t *sock_upcalls, int flags, cred_t *cr)
  97 {
  98         conn_t *connp = (conn_t *)proto_handle;
  99         struct sock_proto_props sopp;
 100         extern struct module_info tcp_rinfo;
 101 
 102         ASSERT(connp->conn_upper_handle == NULL);
 103 
 104         /* All Solaris components should pass a cred for this operation. */
 105         ASSERT(cr != NULL);
 106 
 107         sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
 108             SOCKOPT_MAXPSZ | SOCKOPT_MAXBLK | SOCKOPT_RCVTIMER |
 109             SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ;
 110 
 111         sopp.sopp_rxhiwat = SOCKET_RECVHIWATER;
 112         sopp.sopp_rxlowat = SOCKET_RECVLOWATER;
 113         sopp.sopp_maxpsz = INFPSZ;
 114         sopp.sopp_maxblk = INFPSZ;
 115         sopp.sopp_rcvtimer = SOCKET_TIMER_INTERVAL;
 116         sopp.sopp_rcvthresh = SOCKET_RECVHIWATER >> 3;
 117         sopp.sopp_maxaddrlen = sizeof (sin6_t);
 118         sopp.sopp_minpsz = (tcp_rinfo.mi_minpsz == 1) ? 0 :
 119             tcp_rinfo.mi_minpsz;
 120 
 121         connp->conn_upcalls = sock_upcalls;
 122         connp->conn_upper_handle = sock_handle;
 123 
 124         ASSERT(connp->conn_rcvbuf != 0 &&
 125             connp->conn_rcvbuf == connp->conn_tcp->tcp_rwnd);
 126         (*sock_upcalls->su_set_proto_props)(sock_handle, &sopp);
 127 }
 128 
 129 /*ARGSUSED*/
 130 static int
 131 tcp_accept(sock_lower_handle_t lproto_handle,
 132     sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
 133     cred_t *cr)
 134 {
 135         conn_t *lconnp, *econnp;
 136         tcp_t *listener, *eager;
 137 
 138         /*
 139          * KSSL can move a socket from one listener to another, in which
 140          * case `lproto_handle' points to the new listener. To ensure that
 141          * the original listener is used the information is obtained from
 142          * the eager.
 143          */
 144         econnp = (conn_t *)eproto_handle;
 145         eager = econnp->conn_tcp;
 146         ASSERT(IPCL_IS_NONSTR(econnp));
 147         ASSERT(eager->tcp_listener != NULL);
 148         listener = eager->tcp_listener;
 149         lconnp = (conn_t *)listener->tcp_connp;
 150         ASSERT(listener->tcp_state == TCPS_LISTEN);
 151         ASSERT(lconnp->conn_upper_handle != NULL);
 152 
 153         /*
 154          * It is possible for the accept thread to race with the thread that
 155          * made the su_newconn upcall in tcp_newconn_notify. Both
 156          * tcp_newconn_notify and tcp_accept require that conn_upper_handle
 157          * and conn_upcalls be set before returning, so they both write to
 158          * them. However, we're guaranteed that the value written is the same
 159          * for both threads.
 160          */
 161         ASSERT(econnp->conn_upper_handle == NULL ||
 162             econnp->conn_upper_handle == sock_handle);
 163         ASSERT(econnp->conn_upcalls == NULL ||
 164             econnp->conn_upcalls == lconnp->conn_upcalls);
 165         econnp->conn_upper_handle = sock_handle;
 166         econnp->conn_upcalls = lconnp->conn_upcalls;
 167 
 168         ASSERT(econnp->conn_netstack ==
 169             listener->tcp_connp->conn_netstack);
 170         ASSERT(eager->tcp_tcps == listener->tcp_tcps);
 171 
 172         /*
 173          * We should have a minimum of 2 references on the conn at this
 174          * point. One for TCP and one for the newconn notification
 175          * (which is now taken over by IP). In the normal case we would
 176          * also have another reference (making a total of 3) for the conn
 177          * being in the classifier hash list. However the eager could have
 178          * received an RST subsequently and tcp_closei_local could have
 179          * removed the eager from the classifier hash list, hence we can't
 180          * assert that reference.
 181          */
 182         ASSERT(econnp->conn_ref >= 2);
 183 
 184         mutex_enter(&listener->tcp_eager_lock);
 185         /*
 186          * Non-STREAMS listeners never defer the notification of new
 187          * connections.
 188          */
 189         ASSERT(!listener->tcp_eager_prev_q0->tcp_conn_def_q0);
 190         tcp_eager_unlink(eager);
 191         mutex_exit(&listener->tcp_eager_lock);
 192         CONN_DEC_REF(listener->tcp_connp);
 193 
 194         return ((eager->tcp_state < TCPS_ESTABLISHED) ? ECONNABORTED : 0);
 195 }
 196 
 197 static int
 198 tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
 199     socklen_t len, cred_t *cr)
 200 {
 201         int             error;
 202         conn_t          *connp = (conn_t *)proto_handle;
 203 
 204         /* All Solaris components should pass a cred for this operation. */
 205         ASSERT(cr != NULL);
 206         ASSERT(connp->conn_upper_handle != NULL);
 207 
 208         error = squeue_synch_enter(connp, NULL);
 209         if (error != 0) {
 210                 /* failed to enter */
 211                 return (ENOSR);
 212         }
 213 
 214         /* binding to a NULL address really means unbind */
 215         if (sa == NULL) {
 216                 if (connp->conn_tcp->tcp_state < TCPS_LISTEN)
 217                         error = tcp_do_unbind(connp);
 218                 else
 219                         error = EINVAL;
 220         } else {
 221                 error = tcp_do_bind(connp, sa, len, cr, B_TRUE);
 222         }
 223 
 224         squeue_synch_exit(connp);
 225 
 226         if (error < 0) {
 227                 if (error == -TOUTSTATE)
 228                         error = EINVAL;
 229                 else
 230                         error = proto_tlitosyserr(-error);
 231         }
 232 
 233         return (error);
 234 }
 235 
 236 /* ARGSUSED */
 237 static int
 238 tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
 239 {
 240         conn_t  *connp = (conn_t *)proto_handle;
 241         tcp_t   *tcp = connp->conn_tcp;
 242         int     error;
 243 
 244         ASSERT(connp->conn_upper_handle != NULL);
 245 
 246         /* All Solaris components should pass a cred for this operation. */
 247         ASSERT(cr != NULL);
 248 
 249         error = squeue_synch_enter(connp, NULL);
 250         if (error != 0) {
 251                 /* failed to enter */
 252                 return (ENOBUFS);
 253         }
 254 
 255         error = tcp_do_listen(connp, NULL, 0, backlog, cr, B_FALSE);
 256         if (error == 0) {
 257                 /*
 258                  * sockfs needs to know what's the maximum number of socket
 259                  * that can be queued on the listener.
 260                  */
 261                 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
 262                     SOCK_OPCTL_ENAB_ACCEPT,
 263                     (uintptr_t)(tcp->tcp_conn_req_max +
 264                     tcp->tcp_tcps->tcps_conn_req_max_q0));
 265         } else if (error < 0) {
 266                 if (error == -TOUTSTATE)
 267                         error = EINVAL;
 268                 else
 269                         error = proto_tlitosyserr(-error);
 270         }
 271         squeue_synch_exit(connp);
 272         return (error);
 273 }
 274 
 275 static int
 276 tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
 277     socklen_t len, sock_connid_t *id, cred_t *cr)
 278 {
 279         conn_t          *connp = (conn_t *)proto_handle;
 280         int             error;
 281 
 282         ASSERT(connp->conn_upper_handle != NULL);
 283 
 284         /* All Solaris components should pass a cred for this operation. */
 285         ASSERT(cr != NULL);
 286 
 287         error = proto_verify_ip_addr(connp->conn_family, sa, len);
 288         if (error != 0) {
 289                 return (error);
 290         }
 291 
 292         error = squeue_synch_enter(connp, NULL);
 293         if (error != 0) {
 294                 /* failed to enter */
 295                 return (ENOSR);
 296         }
 297 
 298         /*
 299          * TCP supports quick connect, so no need to do an implicit bind
 300          */
 301         error = tcp_do_connect(connp, sa, len, cr, curproc->p_pid);
 302         if (error == 0) {
 303                 *id = connp->conn_tcp->tcp_connid;
 304         } else if (error < 0) {
 305                 if (error == -TOUTSTATE) {
 306                         switch (connp->conn_tcp->tcp_state) {
 307                         case TCPS_SYN_SENT:
 308                                 error = EALREADY;
 309                                 break;
 310                         case TCPS_ESTABLISHED:
 311                                 error = EISCONN;
 312                                 break;
 313                         case TCPS_LISTEN:
 314                                 error = EOPNOTSUPP;
 315                                 break;
 316                         default:
 317                                 error = EINVAL;
 318                                 break;
 319                         }
 320                 } else {
 321                         error = proto_tlitosyserr(-error);
 322                 }
 323         }
 324 
 325         if (connp->conn_tcp->tcp_loopback) {
 326                 struct sock_proto_props sopp;
 327 
 328                 sopp.sopp_flags = SOCKOPT_LOOPBACK;
 329                 sopp.sopp_loopback = B_TRUE;
 330 
 331                 (*connp->conn_upcalls->su_set_proto_props)(
 332                     connp->conn_upper_handle, &sopp);
 333         }
 334         squeue_synch_exit(connp);
 335 
 336         return ((error == 0) ? EINPROGRESS : error);
 337 }
 338 
 339 /* ARGSUSED3 */
 340 int
 341 tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr,
 342     socklen_t *addrlenp, cred_t *cr)
 343 {
 344         conn_t  *connp = (conn_t *)proto_handle;
 345         tcp_t   *tcp = connp->conn_tcp;
 346 
 347         /* All Solaris components should pass a cred for this operation. */
 348         ASSERT(cr != NULL);
 349 
 350         ASSERT(tcp != NULL);
 351         if (tcp->tcp_state < TCPS_SYN_RCVD)
 352                 return (ENOTCONN);
 353 
 354         return (conn_getpeername(connp, addr, addrlenp));
 355 }
 356 
 357 /* ARGSUSED3 */
 358 int
 359 tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr,
 360     socklen_t *addrlenp, cred_t *cr)
 361 {
 362         conn_t  *connp = (conn_t *)proto_handle;
 363 
 364         /* All Solaris components should pass a cred for this operation. */
 365         ASSERT(cr != NULL);
 366 
 367         return (conn_getsockname(connp, addr, addrlenp));
 368 }
 369 
 370 /* returns UNIX error, the optlen is a value-result arg */
 371 static int
 372 tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
 373     void *optvalp, socklen_t *optlen, cred_t *cr)
 374 {
 375         conn_t          *connp = (conn_t *)proto_handle;
 376         int             error;
 377         t_uscalar_t     max_optbuf_len;
 378         void            *optvalp_buf;
 379         int             len;
 380 
 381         ASSERT(connp->conn_upper_handle != NULL);
 382 
 383         error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
 384             tcp_opt_obj.odb_opt_des_arr,
 385             tcp_opt_obj.odb_opt_arr_cnt,
 386             B_FALSE, B_TRUE, cr);
 387         if (error != 0) {
 388                 if (error < 0) {
 389                         error = proto_tlitosyserr(-error);
 390                 }
 391                 return (error);
 392         }
 393 
 394         optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
 395 
 396         error = squeue_synch_enter(connp, NULL);
 397         if (error == ENOMEM) {
 398                 kmem_free(optvalp_buf, max_optbuf_len);
 399                 return (ENOMEM);
 400         }
 401 
 402         len = tcp_opt_get(connp, level, option_name, optvalp_buf);
 403         squeue_synch_exit(connp);
 404 
 405         if (len == -1) {
 406                 kmem_free(optvalp_buf, max_optbuf_len);
 407                 return (EINVAL);
 408         }
 409 
 410         /*
 411          * update optlen and copy option value
 412          */
 413         t_uscalar_t size = MIN(len, *optlen);
 414 
 415         bcopy(optvalp_buf, optvalp, size);
 416         bcopy(&size, optlen, sizeof (size));
 417 
 418         kmem_free(optvalp_buf, max_optbuf_len);
 419         return (0);
 420 }
 421 
 422 static int
 423 tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
 424     const void *optvalp, socklen_t optlen, cred_t *cr)
 425 {
 426         conn_t          *connp = (conn_t *)proto_handle;
 427         int             error;
 428 
 429         ASSERT(connp->conn_upper_handle != NULL);
 430         /*
 431          * Entering the squeue synchronously can result in a context switch,
 432          * which can cause a rather sever performance degradation. So we try to
 433          * handle whatever options we can without entering the squeue.
 434          */
 435         if (level == IPPROTO_TCP) {
 436                 switch (option_name) {
 437                 case TCP_NODELAY:
 438                         if (optlen != sizeof (int32_t))
 439                                 return (EINVAL);
 440                         mutex_enter(&connp->conn_tcp->tcp_non_sq_lock);
 441                         connp->conn_tcp->tcp_naglim = *(int *)optvalp ? 1 :
 442                             connp->conn_tcp->tcp_mss;
 443                         mutex_exit(&connp->conn_tcp->tcp_non_sq_lock);
 444                         return (0);
 445                 default:
 446                         break;
 447                 }
 448         }
 449 
 450         error = squeue_synch_enter(connp, NULL);
 451         if (error == ENOMEM) {
 452                 return (ENOMEM);
 453         }
 454 
 455         error = proto_opt_check(level, option_name, optlen, NULL,
 456             tcp_opt_obj.odb_opt_des_arr,
 457             tcp_opt_obj.odb_opt_arr_cnt,
 458             B_TRUE, B_FALSE, cr);
 459 
 460         if (error != 0) {
 461                 if (error < 0) {
 462                         error = proto_tlitosyserr(-error);
 463                 }
 464                 squeue_synch_exit(connp);
 465                 return (error);
 466         }
 467 
 468         error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name,
 469             optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp,
 470             NULL, cr);
 471         squeue_synch_exit(connp);
 472 
 473         ASSERT(error >= 0);
 474 
 475         return (error);
 476 }
 477 
 478 /* ARGSUSED */
 479 static int
 480 tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
 481     cred_t *cr)
 482 {
 483         tcp_t           *tcp;
 484         uint32_t        msize;
 485         conn_t *connp = (conn_t *)proto_handle;
 486         int32_t         tcpstate;
 487 
 488         /* All Solaris components should pass a cred for this operation. */
 489         ASSERT(cr != NULL);
 490 
 491         ASSERT(connp->conn_ref >= 2);
 492         ASSERT(connp->conn_upper_handle != NULL);
 493 
 494         if (msg->msg_controllen != 0) {
 495                 freemsg(mp);
 496                 return (EOPNOTSUPP);
 497         }
 498 
 499         switch (DB_TYPE(mp)) {
 500         case M_DATA:
 501                 tcp = connp->conn_tcp;
 502                 ASSERT(tcp != NULL);
 503 
 504                 tcpstate = tcp->tcp_state;
 505                 if (tcpstate < TCPS_ESTABLISHED) {
 506                         freemsg(mp);
 507                         /*
 508                          * We return ENOTCONN if the endpoint is trying to
 509                          * connect or has never been connected, and EPIPE if it
 510                          * has been disconnected. The connection id helps us
 511                          * distinguish between the last two cases.
 512                          */
 513                         return ((tcpstate == TCPS_SYN_SENT) ? ENOTCONN :
 514                             ((tcp->tcp_connid > 0) ? EPIPE : ENOTCONN));
 515                 } else if (tcpstate > TCPS_CLOSE_WAIT) {
 516                         freemsg(mp);
 517                         return (EPIPE);
 518                 }
 519 
 520                 msize = msgdsize(mp);
 521 
 522                 mutex_enter(&tcp->tcp_non_sq_lock);
 523                 tcp->tcp_squeue_bytes += msize;
 524                 /*
 525                  * Squeue Flow Control
 526                  */
 527                 if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) {
 528                         tcp_setqfull(tcp);
 529                 }
 530                 mutex_exit(&tcp->tcp_non_sq_lock);
 531 
 532                 /*
 533                  * The application may pass in an address in the msghdr, but
 534                  * we ignore the address on connection-oriented sockets.
 535                  * Just like BSD this code does not generate an error for
 536                  * TCP (a CONNREQUIRED socket) when sending to an address
 537                  * passed in with sendto/sendmsg. Instead the data is
 538                  * delivered on the connection as if no address had been
 539                  * supplied.
 540                  */
 541                 CONN_INC_REF(connp);
 542 
 543                 if (msg->msg_flags & MSG_OOB) {
 544                         SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent,
 545                             connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
 546                 } else {
 547                         SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output,
 548                             connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
 549                 }
 550 
 551                 return (0);
 552 
 553         default:
 554                 ASSERT(0);
 555         }
 556 
 557         freemsg(mp);
 558         return (0);
 559 }
 560 
 561 /* ARGSUSED */
 562 static int
 563 tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
 564 {
 565         conn_t  *connp = (conn_t *)proto_handle;
 566         tcp_t   *tcp = connp->conn_tcp;
 567 
 568         ASSERT(connp->conn_upper_handle != NULL);
 569 
 570         /* All Solaris components should pass a cred for this operation. */
 571         ASSERT(cr != NULL);
 572 
 573         /*
 574          * X/Open requires that we check the connected state.
 575          */
 576         if (tcp->tcp_state < TCPS_SYN_SENT)
 577                 return (ENOTCONN);
 578 
 579         /* shutdown the send side */
 580         if (how != SHUT_RD) {
 581                 mblk_t *bp;
 582 
 583                 bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL);
 584                 CONN_INC_REF(connp);
 585                 SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output,
 586                     connp, NULL, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT);
 587 
 588                 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
 589                     SOCK_OPCTL_SHUT_SEND, 0);
 590         }
 591 
 592         /* shutdown the recv side */
 593         if (how != SHUT_WR)
 594                 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
 595                     SOCK_OPCTL_SHUT_RECV, 0);
 596 
 597         return (0);
 598 }
 599 
 600 static void
 601 tcp_clr_flowctrl(sock_lower_handle_t proto_handle)
 602 {
 603         conn_t  *connp = (conn_t *)proto_handle;
 604         tcp_t   *tcp = connp->conn_tcp;
 605         mblk_t *mp;
 606         int error;
 607 
 608         ASSERT(connp->conn_upper_handle != NULL);
 609 
 610         /*
 611          * If tcp->tcp_rsrv_mp == NULL, it means that tcp_clr_flowctrl()
 612          * is currently running.
 613          */
 614         mutex_enter(&tcp->tcp_rsrv_mp_lock);
 615         if ((mp = tcp->tcp_rsrv_mp) == NULL) {
 616                 mutex_exit(&tcp->tcp_rsrv_mp_lock);
 617                 return;
 618         }
 619         tcp->tcp_rsrv_mp = NULL;
 620         mutex_exit(&tcp->tcp_rsrv_mp_lock);
 621 
 622         error = squeue_synch_enter(connp, mp);
 623         ASSERT(error == 0);
 624 
 625         mutex_enter(&tcp->tcp_rsrv_mp_lock);
 626         tcp->tcp_rsrv_mp = mp;
 627         mutex_exit(&tcp->tcp_rsrv_mp_lock);
 628 
 629         if (tcp->tcp_fused) {
 630                 tcp_fuse_backenable(tcp);
 631         } else {
 632                 tcp->tcp_rwnd = connp->conn_rcvbuf;
 633                 /*
 634                  * Send back a window update immediately if TCP is above
 635                  * ESTABLISHED state and the increase of the rcv window
 636                  * that the other side knows is at least 1 MSS after flow
 637                  * control is lifted.
 638                  */
 639                 if (tcp->tcp_state >= TCPS_ESTABLISHED &&
 640                     tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
 641                         tcp_xmit_ctl(NULL, tcp,
 642                             (tcp->tcp_swnd == 0) ? tcp->tcp_suna :
 643                             tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
 644                 }
 645         }
 646 
 647         squeue_synch_exit(connp);
 648 }
 649 
 650 /* ARGSUSED */
 651 static int
 652 tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
 653     int mode, int32_t *rvalp, cred_t *cr)
 654 {
 655         conn_t          *connp = (conn_t *)proto_handle;
 656         int             error;
 657 
 658         ASSERT(connp->conn_upper_handle != NULL);
 659 
 660         /* All Solaris components should pass a cred for this operation. */
 661         ASSERT(cr != NULL);
 662 
 663         /*
 664          * If we don't have a helper stream then create one.
 665          * ip_create_helper_stream takes care of locking the conn_t,
 666          * so this check for NULL is just a performance optimization.
 667          */
 668         if (connp->conn_helper_info == NULL) {
 669                 tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps;
 670 
 671                 /*
 672                  * Create a helper stream for non-STREAMS socket.
 673                  */
 674                 error = ip_create_helper_stream(connp, tcps->tcps_ldi_ident);
 675                 if (error != 0) {
 676                         ip0dbg(("tcp_ioctl: create of IP helper stream "
 677                             "failed %d\n", error));
 678                         return (error);
 679                 }
 680         }
 681 
 682         switch (cmd) {
 683                 case ND_SET:
 684                 case ND_GET:
 685                 case _SIOCSOCKFALLBACK:
 686                 case TCP_IOC_ABORT_CONN:
 687                 case TI_GETPEERNAME:
 688                 case TI_GETMYNAME:
 689                         ip1dbg(("tcp_ioctl: cmd 0x%x on non streams socket",
 690                             cmd));
 691                         error = EINVAL;
 692                         break;
 693                 default:
 694                         /*
 695                          * If the conn is not closing, pass on to IP using
 696                          * helper stream. Bump the ioctlref to prevent tcp_close
 697                          * from closing the rq/wq out from underneath the ioctl
 698                          * if it ends up queued or aborted/interrupted.
 699                          */
 700                         mutex_enter(&connp->conn_lock);
 701                         if (connp->conn_state_flags & (CONN_CLOSING)) {
 702                                 mutex_exit(&connp->conn_lock);
 703                                 error = EINVAL;
 704                                 break;
 705                         }
 706                         CONN_INC_IOCTLREF_LOCKED(connp);
 707                         error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
 708                             cmd, arg, mode, cr, rvalp);
 709                         CONN_DEC_IOCTLREF(connp);
 710                         break;
 711         }
 712         return (error);
 713 }
 714 
 715 /* ARGSUSED */
 716 static int
 717 tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
 718 {
 719         conn_t *connp = (conn_t *)proto_handle;
 720 
 721         ASSERT(connp->conn_upper_handle != NULL);
 722 
 723         /* All Solaris components should pass a cred for this operation. */
 724         ASSERT(cr != NULL);
 725 
 726         tcp_close_common(connp, flags);
 727 
 728         ip_free_helper_stream(connp);
 729 
 730         /*
 731          * Drop IP's reference on the conn. This is the last reference
 732          * on the connp if the state was less than established. If the
 733          * connection has gone into timewait state, then we will have
 734          * one ref for the TCP and one more ref (total of two) for the
 735          * classifier connected hash list (a timewait connections stays
 736          * in connected hash till closed).
 737          *
 738          * We can't assert the references because there might be other
 739          * transient reference places because of some walkers or queued
 740          * packets in squeue for the timewait state.
 741          */
 742         CONN_DEC_REF(connp);
 743 
 744         /*
 745          * EINPROGRESS tells sockfs to wait for a 'closed' upcall before
 746          * freeing the socket.
 747          */
 748         return (EINPROGRESS);
 749 }
 750 
 751 /* ARGSUSED */
 752 sock_lower_handle_t
 753 tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
 754     uint_t *smodep, int *errorp, int flags, cred_t *credp)
 755 {
 756         conn_t          *connp;
 757         boolean_t       isv6 = family == AF_INET6;
 758 
 759         if (type != SOCK_STREAM || (family != AF_INET && family != AF_INET6) ||
 760             (proto != 0 && proto != IPPROTO_TCP)) {
 761                 *errorp = EPROTONOSUPPORT;
 762                 return (NULL);
 763         }
 764 
 765         connp = tcp_create_common(credp, isv6, B_TRUE, errorp);
 766         if (connp == NULL) {
 767                 return (NULL);
 768         }
 769 
 770         /*
 771          * Put the ref for TCP. Ref for IP was already put
 772          * by ipcl_conn_create. Also make the conn_t globally
 773          * visible to walkers.
 774          */
 775         mutex_enter(&connp->conn_lock);
 776         CONN_INC_REF_LOCKED(connp);
 777         ASSERT(connp->conn_ref == 2);
 778         connp->conn_state_flags &= ~CONN_INCIPIENT;
 779 
 780         connp->conn_flags |= IPCL_NONSTR;
 781         mutex_exit(&connp->conn_lock);
 782 
 783         ASSERT(errorp != NULL);
 784         *errorp = 0;
 785         *sock_downcalls = &sock_tcp_downcalls;
 786         *smodep = SM_CONNREQUIRED | SM_EXDATA | SM_ACCEPTSUPP |
 787             SM_SENDFILESUPP;
 788 
 789         return ((sock_lower_handle_t)connp);
 790 }
 791 
 792 /*
 793  * tcp_fallback
 794  *
 795  * A direct socket is falling back to using STREAMS. The queue
 796  * that is being passed down was created using tcp_open() with
 797  * the SO_FALLBACK flag set. As a result, the queue is not
 798  * associated with a conn, and the q_ptrs instead contain the
 799  * dev and minor area that should be used.
 800  *
 801  * The 'issocket' flag indicates whether the FireEngine
 802  * optimizations should be used. The common case would be that
 803  * optimizations are enabled, and they might be subsequently
 804  * disabled using the _SIOCSOCKFALLBACK ioctl.
 805  */
 806 
 807 /*
 808  * An active connection is falling back to TPI. Gather all the information
 809  * required by the STREAM head and TPI sonode and send it up.
 810  */
 811 static void
 812 tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
 813     boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb,
 814     sock_quiesce_arg_t *arg)
 815 {
 816         conn_t                  *connp = tcp->tcp_connp;
 817         struct stroptions       *stropt;
 818         struct T_capability_ack tca;
 819         struct sockaddr_in6     laddr, faddr;
 820         socklen_t               laddrlen, faddrlen;
 821         short                   opts;
 822         int                     error;
 823         mblk_t                  *mp, *mpnext;
 824 
 825         connp->conn_dev = (dev_t)RD(q)->q_ptr;
 826         connp->conn_minor_arena = WR(q)->q_ptr;
 827 
 828         RD(q)->q_ptr = WR(q)->q_ptr = connp;
 829 
 830         connp->conn_rq = RD(q);
 831         connp->conn_wq = WR(q);
 832 
 833         WR(q)->q_qinfo = &tcp_sock_winit;
 834 
 835         if (!issocket)
 836                 tcp_use_pure_tpi(tcp);
 837 
 838         /*
 839          * free the helper stream
 840          */
 841         ip_free_helper_stream(connp);
 842 
 843         /*
 844          * Notify the STREAM head about options
 845          */
 846         DB_TYPE(stropt_mp) = M_SETOPTS;
 847         stropt = (struct stroptions *)stropt_mp->b_rptr;
 848         stropt_mp->b_wptr += sizeof (struct stroptions);
 849         stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
 850 
 851         stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 :
 852             tcp->tcp_tcps->tcps_wroff_xtra);
 853         if (tcp->tcp_snd_sack_ok)
 854                 stropt->so_wroff += TCPOPT_MAX_SACK_LEN;
 855         stropt->so_hiwat = connp->conn_rcvbuf;
 856         stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
 857 
 858         putnext(RD(q), stropt_mp);
 859 
 860         /*
 861          * Collect the information needed to sync with the sonode
 862          */
 863         tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID);
 864 
 865         laddrlen = faddrlen = sizeof (sin6_t);
 866         (void) tcp_getsockname((sock_lower_handle_t)connp,
 867             (struct sockaddr *)&laddr, &laddrlen, CRED());
 868         error = tcp_getpeername((sock_lower_handle_t)connp,
 869             (struct sockaddr *)&faddr, &faddrlen, CRED());
 870         if (error != 0)
 871                 faddrlen = 0;
 872 
 873         opts = 0;
 874         if (connp->conn_oobinline)
 875                 opts |= SO_OOBINLINE;
 876         if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
 877                 opts |= SO_DONTROUTE;
 878 
 879         /*
 880          * Notify the socket that the protocol is now quiescent,
 881          * and it's therefore safe move data from the socket
 882          * to the stream head.
 883          */
 884         mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca,
 885             (struct sockaddr *)&laddr, laddrlen,
 886             (struct sockaddr *)&faddr, faddrlen, opts);
 887 
 888         while (mp != NULL) {
 889                 mpnext = mp->b_next;
 890                 tcp->tcp_rcv_list = mp->b_next;
 891                 mp->b_next = NULL;
 892                 putnext(q, mp);
 893                 mp = mpnext;
 894         }
 895         ASSERT(tcp->tcp_rcv_last_head == NULL);
 896         ASSERT(tcp->tcp_rcv_last_tail == NULL);
 897         ASSERT(tcp->tcp_rcv_cnt == 0);
 898 
 899         /*
 900          * All eagers in q0 are marked as being non-STREAM, so they will
 901          * make su_newconn upcalls when the handshake completes, which
 902          * will fail (resulting in the conn being closed). So we just blow
 903          * off everything in q0 instead of waiting for the inevitable.
 904          */
 905         if (tcp->tcp_conn_req_cnt_q0 != 0)
 906                 tcp_eager_cleanup(tcp, B_TRUE);
 907 }
 908 
 909 /*
 910  * An eager is falling back to TPI. All we have to do is send
 911  * up a T_CONN_IND.
 912  */
 913 static void
 914 tcp_fallback_eager(tcp_t *eager, boolean_t issocket,
 915     so_proto_quiesced_cb_t quiesced_cb, sock_quiesce_arg_t *arg)
 916 {
 917         conn_t *connp = eager->tcp_connp;
 918         tcp_t *listener = eager->tcp_listener;
 919         mblk_t *mp;
 920 
 921         ASSERT(listener != NULL);
 922 
 923         /*
 924          * Notify the socket that the protocol is now quiescent,
 925          * and it's therefore safe move data from the socket
 926          * to tcp's rcv queue.
 927          */
 928         mp = (*quiesced_cb)(connp->conn_upper_handle, arg, NULL, NULL, 0,
 929             NULL, 0, 0);
 930 
 931         if (mp != NULL) {
 932                 ASSERT(eager->tcp_rcv_cnt == 0);
 933 
 934                 eager->tcp_rcv_list = mp;
 935                 eager->tcp_rcv_cnt = msgdsize(mp);
 936                 while (mp->b_next != NULL) {
 937                         mp = mp->b_next;
 938                         eager->tcp_rcv_cnt += msgdsize(mp);
 939                 }
 940                 eager->tcp_rcv_last_head = mp;
 941                 while (mp->b_cont)
 942                         mp = mp->b_cont;
 943                 eager->tcp_rcv_last_tail = mp;
 944                 if (eager->tcp_rcv_cnt > eager->tcp_rwnd)
 945                         eager->tcp_rwnd = 0;
 946                 else
 947                         eager->tcp_rwnd -= eager->tcp_rcv_cnt;
 948         }
 949 
 950         if (!issocket)
 951                 eager->tcp_issocket = B_FALSE;
 952         /*
 953          * The stream for this eager does not yet exist, so mark it as
 954          * being detached.
 955          */
 956         eager->tcp_detached = B_TRUE;
 957         eager->tcp_hard_binding = B_TRUE;
 958         connp->conn_rq = listener->tcp_connp->conn_rq;
 959         connp->conn_wq = listener->tcp_connp->conn_wq;
 960 
 961         /* Send up the connection indication */
 962         mp = eager->tcp_conn.tcp_eager_conn_ind;
 963         ASSERT(mp != NULL);
 964         eager->tcp_conn.tcp_eager_conn_ind = NULL;
 965 
 966         /*
 967          * TLI/XTI applications will get confused by
 968          * sending eager as an option since it violates
 969          * the option semantics. So remove the eager as
 970          * option since TLI/XTI app doesn't need it anyway.
 971          */
 972         if (!issocket) {
 973                 struct T_conn_ind *conn_ind;
 974 
 975                 conn_ind = (struct T_conn_ind *)mp->b_rptr;
 976                 conn_ind->OPT_length = 0;
 977                 conn_ind->OPT_offset = 0;
 978         }
 979 
 980         /*
 981          * Sockfs guarantees that the listener will not be closed
 982          * during fallback. So we can safely use the listener's queue.
 983          */
 984         putnext(listener->tcp_connp->conn_rq, mp);
 985 }
 986 
 987 
 988 int
 989 tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
 990     boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb,
 991     sock_quiesce_arg_t *arg)
 992 {
 993         tcp_t                   *tcp;
 994         conn_t                  *connp = (conn_t *)proto_handle;
 995         int                     error;
 996         mblk_t                  *stropt_mp;
 997         mblk_t                  *ordrel_mp;
 998 
 999         tcp = connp->conn_tcp;
1000 
1001         stropt_mp = allocb_wait(sizeof (struct stroptions), BPRI_HI, STR_NOSIG,
1002             NULL);
1003 
1004         /* Pre-allocate the T_ordrel_ind mblk. */
1005         ASSERT(tcp->tcp_ordrel_mp == NULL);
1006         ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI,
1007             STR_NOSIG, NULL);
1008         ordrel_mp->b_datap->db_type = M_PROTO;
1009         ((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND;
1010         ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind);
1011 
1012         /*
1013          * Enter the squeue so that no new packets can come in
1014          */
1015         error = squeue_synch_enter(connp, NULL);
1016         if (error != 0) {
1017                 /* failed to enter, free all the pre-allocated messages. */
1018                 freeb(stropt_mp);
1019                 freeb(ordrel_mp);
1020                 return (ENOMEM);
1021         }
1022 
1023         /*
1024          * Both endpoints must be of the same type (either STREAMS or
1025          * non-STREAMS) for fusion to be enabled. So if we are fused,
1026          * we have to unfuse.
1027          */
1028         if (tcp->tcp_fused)
1029                 tcp_unfuse(tcp);
1030 
1031         if (tcp->tcp_listener != NULL) {
1032                 /* The eager will deal with opts when accept() is called */
1033                 freeb(stropt_mp);
1034                 tcp_fallback_eager(tcp, direct_sockfs, quiesced_cb, arg);
1035         } else {
1036                 tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs,
1037                     quiesced_cb, arg);
1038         }
1039 
1040         /*
1041          * No longer a direct socket
1042          *
1043          * Note that we intentionally leave the upper_handle and upcalls
1044          * intact, since eagers may still be using them.
1045          */
1046         connp->conn_flags &= ~IPCL_NONSTR;
1047         tcp->tcp_ordrel_mp = ordrel_mp;
1048 
1049         /*
1050          * There should be atleast two ref's (IP + TCP)
1051          */
1052         ASSERT(connp->conn_ref >= 2);
1053         squeue_synch_exit(connp);
1054 
1055         return (0);
1056 }
1057 
1058 /*
1059  * Notifies a non-STREAMS based listener about a new connection. This
1060  * function is executed on the *eager*'s squeue once the 3 way handshake
1061  * has completed. Note that the behavior differs from STREAMS, where the
1062  * T_CONN_IND is sent up by tcp_send_conn_ind() while on the *listener*'s
1063  * squeue.
1064  *
1065  * Returns B_TRUE if the notification succeeded and an upper handle was
1066  * obtained. `tcp' should be closed on failure.
1067  */
1068 boolean_t
1069 tcp_newconn_notify(tcp_t *tcp, ip_recv_attr_t *ira)
1070 {
1071         tcp_t *listener = tcp->tcp_listener;
1072         conn_t *lconnp = listener->tcp_connp;
1073         conn_t *econnp = tcp->tcp_connp;
1074         tcp_t *tail;
1075         ipaddr_t *addr_cache;
1076         sock_upper_handle_t upper;
1077         struct sock_proto_props sopp;
1078 
1079         mutex_enter(&listener->tcp_eager_lock);
1080         /*
1081          * Take the eager out, if it is in the list of droppable eagers
1082          * as we are here because the 3W handshake is over.
1083          */
1084         MAKE_UNDROPPABLE(tcp);
1085         /*
1086          * The eager already has an extra ref put in tcp_input_data
1087          * so that it stays till accept comes back even though it
1088          * might get into TCPS_CLOSED as a result of a TH_RST etc.
1089          */
1090         ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
1091         listener->tcp_conn_req_cnt_q0--;
1092         listener->tcp_conn_req_cnt_q++;
1093 
1094         /* Move from SYN_RCVD to ESTABLISHED list  */
1095         tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = tcp->tcp_eager_prev_q0;
1096         tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp->tcp_eager_next_q0;
1097         tcp->tcp_eager_prev_q0 = NULL;
1098         tcp->tcp_eager_next_q0 = NULL;
1099 
1100         /*
1101          * Insert at end of the queue because connections are accepted
1102          * in chronological order. Leaving the older connections at front
1103          * of the queue helps reducing search time.
1104          */
1105         tail = listener->tcp_eager_last_q;
1106         if (tail != NULL)
1107                 tail->tcp_eager_next_q = tcp;
1108         else
1109                 listener->tcp_eager_next_q = tcp;
1110         listener->tcp_eager_last_q = tcp;
1111         tcp->tcp_eager_next_q = NULL;
1112 
1113         /* we have timed out before */
1114         if (tcp->tcp_syn_rcvd_timeout != 0) {
1115                 tcp->tcp_syn_rcvd_timeout = 0;
1116                 listener->tcp_syn_rcvd_timeout--;
1117                 if (listener->tcp_syn_defense &&
1118                     listener->tcp_syn_rcvd_timeout <=
1119                     (listener->tcp_tcps->tcps_conn_req_max_q0 >> 5) &&
1120                     10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() -
1121                     listener->tcp_last_rcv_lbolt)) {
1122                         /*
1123                          * Turn off the defense mode if we
1124                          * believe the SYN attack is over.
1125                          */
1126                         listener->tcp_syn_defense = B_FALSE;
1127                         if (listener->tcp_ip_addr_cache) {
1128                                 kmem_free((void *)listener->tcp_ip_addr_cache,
1129                                     IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
1130                                 listener->tcp_ip_addr_cache = NULL;
1131                         }
1132                 }
1133         }
1134         addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
1135         if (addr_cache != NULL) {
1136                 /*
1137                  * We have finished a 3-way handshake with this
1138                  * remote host. This proves the IP addr is good.
1139                  * Cache it!
1140                  */
1141                 addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] =
1142                     tcp->tcp_connp->conn_faddr_v4;
1143         }
1144         mutex_exit(&listener->tcp_eager_lock);
1145 
1146         /*
1147          * Notify the ULP about the newconn. It is guaranteed that no
1148          * tcp_accept() call will be made for the eager if the
1149          * notification fails.
1150          */
1151         if ((upper = (*lconnp->conn_upcalls->su_newconn)
1152             (lconnp->conn_upper_handle, (sock_lower_handle_t)econnp,
1153             &sock_tcp_downcalls, ira->ira_cred, ira->ira_cpid,
1154             &econnp->conn_upcalls)) == NULL) {
1155                 return (B_FALSE);
1156         }
1157         econnp->conn_upper_handle = upper;
1158 
1159         tcp->tcp_detached = B_FALSE;
1160         tcp->tcp_hard_binding = B_FALSE;
1161         tcp->tcp_tconnind_started = B_TRUE;
1162 
1163         if (econnp->conn_keepalive) {
1164                 tcp->tcp_ka_last_intrvl = 0;
1165                 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
1166                     tcp->tcp_ka_interval);
1167         }
1168 
1169         /* Update the necessary parameters */
1170         tcp_get_proto_props(tcp, &sopp);
1171 
1172         (*econnp->conn_upcalls->su_set_proto_props)
1173             (econnp->conn_upper_handle, &sopp);
1174 
1175         return (B_TRUE);
1176 }