1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /* This file contains all TCP kernel socket related functions. */
  27 
  28 #include <sys/types.h>
  29 #include <sys/strlog.h>
  30 #include <sys/policy.h>
  31 #include <sys/sockio.h>
  32 #include <sys/strsubr.h>
  33 #include <sys/strsun.h>
  34 #include <sys/squeue_impl.h>
  35 #include <sys/squeue.h>
  36 #define _SUN_TPI_VERSION 2
  37 #include <sys/tihdr.h>
  38 #include <sys/timod.h>
  39 #include <sys/tpicommon.h>
  40 #include <sys/socketvar.h>
  41 
  42 #include <inet/common.h>
  43 #include <inet/proto_set.h>
  44 #include <inet/ip.h>
  45 #include <inet/tcp.h>
  46 #include <inet/tcp_impl.h>
  47 
  48 static void     tcp_activate(sock_lower_handle_t, sock_upper_handle_t,
  49                     sock_upcalls_t *, int, cred_t *);
  50 static int      tcp_accept(sock_lower_handle_t, sock_lower_handle_t,
  51                     sock_upper_handle_t, cred_t *);
  52 static int      tcp_bind(sock_lower_handle_t, struct sockaddr *,
  53                     socklen_t, cred_t *);
  54 static int      tcp_listen(sock_lower_handle_t, int, cred_t *);
  55 static int      tcp_connect(sock_lower_handle_t, const struct sockaddr *,
  56                     socklen_t, sock_connid_t *, cred_t *);
  57 static int      tcp_getpeername(sock_lower_handle_t, struct sockaddr *,
  58                     socklen_t *, cred_t *);
  59 static int      tcp_getsockname(sock_lower_handle_t, struct sockaddr *,
  60                     socklen_t *, cred_t *);
  61 static int      tcp_getsockopt(sock_lower_handle_t, int, int, void *,
  62                     socklen_t *, cred_t *);
  63 static int      tcp_setsockopt(sock_lower_handle_t, int, int, const void *,
  64                     socklen_t, cred_t *);
  65 static int      tcp_sendmsg(sock_lower_handle_t, mblk_t *, struct nmsghdr *,
  66                     cred_t *);
  67 static int      tcp_shutdown(sock_lower_handle_t, int, cred_t *);
  68 static void     tcp_clr_flowctrl(sock_lower_handle_t);
  69 static int      tcp_ioctl(sock_lower_handle_t, int, intptr_t, int, int32_t *,
  70                     cred_t *);
  71 static int      tcp_close(sock_lower_handle_t, int, cred_t *);
  72 
  73 sock_downcalls_t sock_tcp_downcalls = {
  74         tcp_activate,
  75         tcp_accept,
  76         tcp_bind,
  77         tcp_listen,
  78         tcp_connect,
  79         tcp_getpeername,
  80         tcp_getsockname,
  81         tcp_getsockopt,
  82         tcp_setsockopt,
  83         tcp_sendmsg,
  84         NULL,
  85         NULL,
  86         NULL,
  87         tcp_shutdown,
  88         tcp_clr_flowctrl,
  89         tcp_ioctl,
  90         tcp_close,
  91 };
  92 
  93 /* ARGSUSED */
  94 static void
  95 tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle,
  96     sock_upcalls_t *sock_upcalls, int flags, cred_t *cr)
  97 {
  98         conn_t *connp = (conn_t *)proto_handle;
  99         struct sock_proto_props sopp;
 100         extern struct module_info tcp_rinfo;
 101 
 102         ASSERT(connp->conn_upper_handle == NULL);
 103 
 104         /* All Solaris components should pass a cred for this operation. */
 105         ASSERT(cr != NULL);
 106 
 107         sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
 108             SOCKOPT_MAXPSZ | SOCKOPT_MAXBLK | SOCKOPT_RCVTIMER |
 109             SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ;
 110 
 111         sopp.sopp_rxhiwat = SOCKET_RECVHIWATER;
 112         sopp.sopp_rxlowat = SOCKET_RECVLOWATER;
 113         sopp.sopp_maxpsz = INFPSZ;
 114         sopp.sopp_maxblk = INFPSZ;
 115         sopp.sopp_rcvtimer = SOCKET_TIMER_INTERVAL;
 116         sopp.sopp_rcvthresh = SOCKET_RECVHIWATER >> 3;
 117         sopp.sopp_maxaddrlen = sizeof (sin6_t);
 118         sopp.sopp_minpsz = (tcp_rinfo.mi_minpsz == 1) ? 0 :
 119             tcp_rinfo.mi_minpsz;
 120 
 121         connp->conn_upcalls = sock_upcalls;
 122         connp->conn_upper_handle = sock_handle;
 123 
 124         ASSERT(connp->conn_rcvbuf != 0 &&
 125             connp->conn_rcvbuf == connp->conn_tcp->tcp_rwnd);
 126         (*sock_upcalls->su_set_proto_props)(sock_handle, &sopp);
 127 }
 128 
 129 /*ARGSUSED*/
 130 static int
 131 tcp_accept(sock_lower_handle_t lproto_handle,
 132     sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
 133     cred_t *cr)
 134 {
 135         conn_t *lconnp, *econnp;
 136         tcp_t *listener, *eager;
 137 
 138         /*
 139          * KSSL can move a socket from one listener to another, in which
 140          * case `lproto_handle' points to the new listener. To ensure that
 141          * the original listener is used the information is obtained from
 142          * the eager.
 143          */
 144         econnp = (conn_t *)eproto_handle;
 145         eager = econnp->conn_tcp;
 146         ASSERT(IPCL_IS_NONSTR(econnp));
 147         ASSERT(eager->tcp_listener != NULL);
 148         listener = eager->tcp_listener;
 149         lconnp = (conn_t *)listener->tcp_connp;
 150         ASSERT(listener->tcp_state == TCPS_LISTEN);
 151         ASSERT(lconnp->conn_upper_handle != NULL);
 152 
 153         /*
 154          * It is possible for the accept thread to race with the thread that
 155          * made the su_newconn upcall in tcp_newconn_notify. Both
 156          * tcp_newconn_notify and tcp_accept require that conn_upper_handle
 157          * and conn_upcalls be set before returning, so they both write to
 158          * them. However, we're guaranteed that the value written is the same
 159          * for both threads.
 160          */
 161         ASSERT(econnp->conn_upper_handle == NULL ||
 162             econnp->conn_upper_handle == sock_handle);
 163         ASSERT(econnp->conn_upcalls == NULL ||
 164             econnp->conn_upcalls == lconnp->conn_upcalls);
 165         econnp->conn_upper_handle = sock_handle;
 166         econnp->conn_upcalls = lconnp->conn_upcalls;
 167 
 168         ASSERT(econnp->conn_netstack ==
 169             listener->tcp_connp->conn_netstack);
 170         ASSERT(eager->tcp_tcps == listener->tcp_tcps);
 171 
 172         /*
 173          * We should have a minimum of 2 references on the conn at this
 174          * point. One for TCP and one for the newconn notification
 175          * (which is now taken over by IP). In the normal case we would
 176          * also have another reference (making a total of 3) for the conn
 177          * being in the classifier hash list. However the eager could have
 178          * received an RST subsequently and tcp_closei_local could have
 179          * removed the eager from the classifier hash list, hence we can't
 180          * assert that reference.
 181          */
 182         ASSERT(econnp->conn_ref >= 2);
 183 
 184         mutex_enter(&listener->tcp_eager_lock);
 185         /*
 186          * Non-STREAMS listeners never defer the notification of new
 187          * connections.
 188          */
 189         ASSERT(!listener->tcp_eager_prev_q0->tcp_conn_def_q0);
 190         tcp_eager_unlink(eager);
 191         mutex_exit(&listener->tcp_eager_lock);
 192         CONN_DEC_REF(listener->tcp_connp);
 193 
 194         return ((eager->tcp_state < TCPS_ESTABLISHED) ? ECONNABORTED : 0);
 195 }
 196 
 197 static int
 198 tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
 199     socklen_t len, cred_t *cr)
 200 {
 201         int             error;
 202         conn_t          *connp = (conn_t *)proto_handle;
 203 
 204         /* All Solaris components should pass a cred for this operation. */
 205         ASSERT(cr != NULL);
 206         ASSERT(connp->conn_upper_handle != NULL);
 207 
 208         error = squeue_synch_enter(connp, NULL);
 209         if (error != 0) {
 210                 /* failed to enter */
 211                 return (ENOSR);
 212         }
 213 
 214         /* binding to a NULL address really means unbind */
 215         if (sa == NULL) {
 216                 if (connp->conn_tcp->tcp_state < TCPS_LISTEN)
 217                         error = tcp_do_unbind(connp);
 218                 else
 219                         error = EINVAL;
 220         } else {
 221                 error = tcp_do_bind(connp, sa, len, cr, B_TRUE);
 222         }
 223 
 224         squeue_synch_exit(connp);
 225 
 226         if (error < 0) {
 227                 if (error == -TOUTSTATE)
 228                         error = EINVAL;
 229                 else
 230                         error = proto_tlitosyserr(-error);
 231         }
 232 
 233         return (error);
 234 }
 235 
 236 /* ARGSUSED */
 237 static int
 238 tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
 239 {
 240         conn_t  *connp = (conn_t *)proto_handle;
 241         tcp_t   *tcp = connp->conn_tcp;
 242         int     error;
 243 
 244         ASSERT(connp->conn_upper_handle != NULL);
 245 
 246         /* All Solaris components should pass a cred for this operation. */
 247         ASSERT(cr != NULL);
 248 
 249         error = squeue_synch_enter(connp, NULL);
 250         if (error != 0) {
 251                 /* failed to enter */
 252                 return (ENOBUFS);
 253         }
 254 
 255         error = tcp_do_listen(connp, NULL, 0, backlog, cr, B_FALSE);
 256         if (error == 0) {
 257                 /*
 258                  * sockfs needs to know what's the maximum number of socket
 259                  * that can be queued on the listener.
 260                  */
 261                 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
 262                     SOCK_OPCTL_ENAB_ACCEPT,
 263                     (uintptr_t)(tcp->tcp_conn_req_max +
 264                     tcp->tcp_tcps->tcps_conn_req_max_q0));
 265         } else if (error < 0) {
 266                 if (error == -TOUTSTATE)
 267                         error = EINVAL;
 268                 else
 269                         error = proto_tlitosyserr(-error);
 270         }
 271         squeue_synch_exit(connp);
 272         return (error);
 273 }
 274 
 275 static int
 276 tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
 277     socklen_t len, sock_connid_t *id, cred_t *cr)
 278 {
 279         conn_t          *connp = (conn_t *)proto_handle;
 280         int             error;
 281 
 282         ASSERT(connp->conn_upper_handle != NULL);
 283 
 284         /* All Solaris components should pass a cred for this operation. */
 285         ASSERT(cr != NULL);
 286 
 287         error = proto_verify_ip_addr(connp->conn_family, sa, len);
 288         if (error != 0) {
 289                 return (error);
 290         }
 291 
 292         error = squeue_synch_enter(connp, NULL);
 293         if (error != 0) {
 294                 /* failed to enter */
 295                 return (ENOSR);
 296         }
 297 
 298         /*
 299          * TCP supports quick connect, so no need to do an implicit bind
 300          */
 301         error = tcp_do_connect(connp, sa, len, cr, curproc->p_pid);
 302         if (error == 0) {
 303                 *id = connp->conn_tcp->tcp_connid;
 304         } else if (error < 0) {
 305                 if (error == -TOUTSTATE) {
 306                         switch (connp->conn_tcp->tcp_state) {
 307                         case TCPS_SYN_SENT:
 308                                 error = EALREADY;
 309                                 break;
 310                         case TCPS_ESTABLISHED:
 311                                 error = EISCONN;
 312                                 break;
 313                         case TCPS_LISTEN:
 314                                 error = EOPNOTSUPP;
 315                                 break;
 316                         default:
 317                                 error = EINVAL;
 318                                 break;
 319                         }
 320                 } else {
 321                         error = proto_tlitosyserr(-error);
 322                 }
 323         }
 324 
 325         if (connp->conn_tcp->tcp_loopback) {
 326                 struct sock_proto_props sopp;
 327 
 328                 sopp.sopp_flags = SOCKOPT_LOOPBACK;
 329                 sopp.sopp_loopback = B_TRUE;
 330 
 331                 (*connp->conn_upcalls->su_set_proto_props)(
 332                     connp->conn_upper_handle, &sopp);
 333         }
 334 done:
 335         squeue_synch_exit(connp);
 336 
 337         return ((error == 0) ? EINPROGRESS : error);
 338 }
 339 
 340 /* ARGSUSED3 */
 341 int
 342 tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr,
 343     socklen_t *addrlenp, cred_t *cr)
 344 {
 345         conn_t  *connp = (conn_t *)proto_handle;
 346         tcp_t   *tcp = connp->conn_tcp;
 347 
 348         /* All Solaris components should pass a cred for this operation. */
 349         ASSERT(cr != NULL);
 350 
 351         ASSERT(tcp != NULL);
 352         if (tcp->tcp_state < TCPS_SYN_RCVD)
 353                 return (ENOTCONN);
 354 
 355         return (conn_getpeername(connp, addr, addrlenp));
 356 }
 357 
 358 /* ARGSUSED3 */
 359 int
 360 tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr,
 361     socklen_t *addrlenp, cred_t *cr)
 362 {
 363         conn_t  *connp = (conn_t *)proto_handle;
 364 
 365         /* All Solaris components should pass a cred for this operation. */
 366         ASSERT(cr != NULL);
 367 
 368         return (conn_getsockname(connp, addr, addrlenp));
 369 }
 370 
 371 /* returns UNIX error, the optlen is a value-result arg */
 372 static int
 373 tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
 374     void *optvalp, socklen_t *optlen, cred_t *cr)
 375 {
 376         conn_t          *connp = (conn_t *)proto_handle;
 377         int             error;
 378         t_uscalar_t     max_optbuf_len;
 379         void            *optvalp_buf;
 380         int             len;
 381 
 382         ASSERT(connp->conn_upper_handle != NULL);
 383 
 384         error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
 385             tcp_opt_obj.odb_opt_des_arr,
 386             tcp_opt_obj.odb_opt_arr_cnt,
 387             B_FALSE, B_TRUE, cr);
 388         if (error != 0) {
 389                 if (error < 0) {
 390                         error = proto_tlitosyserr(-error);
 391                 }
 392                 return (error);
 393         }
 394 
 395         optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
 396 
 397         error = squeue_synch_enter(connp, NULL);
 398         if (error == ENOMEM) {
 399                 kmem_free(optvalp_buf, max_optbuf_len);
 400                 return (ENOMEM);
 401         }
 402 
 403         len = tcp_opt_get(connp, level, option_name, optvalp_buf);
 404         squeue_synch_exit(connp);
 405 
 406         if (len == -1) {
 407                 kmem_free(optvalp_buf, max_optbuf_len);
 408                 return (EINVAL);
 409         }
 410 
 411         /*
 412          * update optlen and copy option value
 413          */
 414         t_uscalar_t size = MIN(len, *optlen);
 415 
 416         bcopy(optvalp_buf, optvalp, size);
 417         bcopy(&size, optlen, sizeof (size));
 418 
 419         kmem_free(optvalp_buf, max_optbuf_len);
 420         return (0);
 421 }
 422 
 423 static int
 424 tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
 425     const void *optvalp, socklen_t optlen, cred_t *cr)
 426 {
 427         conn_t          *connp = (conn_t *)proto_handle;
 428         int             error;
 429 
 430         ASSERT(connp->conn_upper_handle != NULL);
 431         /*
 432          * Entering the squeue synchronously can result in a context switch,
 433          * which can cause a rather sever performance degradation. So we try to
 434          * handle whatever options we can without entering the squeue.
 435          */
 436         if (level == IPPROTO_TCP) {
 437                 switch (option_name) {
 438                 case TCP_NODELAY:
 439                         if (optlen != sizeof (int32_t))
 440                                 return (EINVAL);
 441                         mutex_enter(&connp->conn_tcp->tcp_non_sq_lock);
 442                         connp->conn_tcp->tcp_naglim = *(int *)optvalp ? 1 :
 443                             connp->conn_tcp->tcp_mss;
 444                         mutex_exit(&connp->conn_tcp->tcp_non_sq_lock);
 445                         return (0);
 446                 default:
 447                         break;
 448                 }
 449         }
 450 
 451         error = squeue_synch_enter(connp, NULL);
 452         if (error == ENOMEM) {
 453                 return (ENOMEM);
 454         }
 455 
 456         error = proto_opt_check(level, option_name, optlen, NULL,
 457             tcp_opt_obj.odb_opt_des_arr,
 458             tcp_opt_obj.odb_opt_arr_cnt,
 459             B_TRUE, B_FALSE, cr);
 460 
 461         if (error != 0) {
 462                 if (error < 0) {
 463                         error = proto_tlitosyserr(-error);
 464                 }
 465                 squeue_synch_exit(connp);
 466                 return (error);
 467         }
 468 
 469         error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name,
 470             optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp,
 471             NULL, cr);
 472         squeue_synch_exit(connp);
 473 
 474         ASSERT(error >= 0);
 475 
 476         return (error);
 477 }
 478 
 479 /* ARGSUSED */
 480 static int
 481 tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
 482     cred_t *cr)
 483 {
 484         tcp_t           *tcp;
 485         uint32_t        msize;
 486         conn_t *connp = (conn_t *)proto_handle;
 487         int32_t         tcpstate;
 488 
 489         /* All Solaris components should pass a cred for this operation. */
 490         ASSERT(cr != NULL);
 491 
 492         ASSERT(connp->conn_ref >= 2);
 493         ASSERT(connp->conn_upper_handle != NULL);
 494 
 495         if (msg->msg_controllen != 0) {
 496                 freemsg(mp);
 497                 return (EOPNOTSUPP);
 498         }
 499 
 500         switch (DB_TYPE(mp)) {
 501         case M_DATA:
 502                 tcp = connp->conn_tcp;
 503                 ASSERT(tcp != NULL);
 504 
 505                 tcpstate = tcp->tcp_state;
 506                 if (tcpstate < TCPS_ESTABLISHED) {
 507                         freemsg(mp);
 508                         /*
 509                          * We return ENOTCONN if the endpoint is trying to
 510                          * connect or has never been connected, and EPIPE if it
 511                          * has been disconnected. The connection id helps us
 512                          * distinguish between the last two cases.
 513                          */
 514                         return ((tcpstate == TCPS_SYN_SENT) ? ENOTCONN :
 515                             ((tcp->tcp_connid > 0) ? EPIPE : ENOTCONN));
 516                 } else if (tcpstate > TCPS_CLOSE_WAIT) {
 517                         freemsg(mp);
 518                         return (EPIPE);
 519                 }
 520 
 521                 msize = msgdsize(mp);
 522 
 523                 mutex_enter(&tcp->tcp_non_sq_lock);
 524                 tcp->tcp_squeue_bytes += msize;
 525                 /*
 526                  * Squeue Flow Control
 527                  */
 528                 if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) {
 529                         tcp_setqfull(tcp);
 530                 }
 531                 mutex_exit(&tcp->tcp_non_sq_lock);
 532 
 533                 /*
 534                  * The application may pass in an address in the msghdr, but
 535                  * we ignore the address on connection-oriented sockets.
 536                  * Just like BSD this code does not generate an error for
 537                  * TCP (a CONNREQUIRED socket) when sending to an address
 538                  * passed in with sendto/sendmsg. Instead the data is
 539                  * delivered on the connection as if no address had been
 540                  * supplied.
 541                  */
 542                 CONN_INC_REF(connp);
 543 
 544                 if (msg->msg_flags & MSG_OOB) {
 545                         SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent,
 546                             connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
 547                 } else {
 548                         SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output,
 549                             connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
 550                 }
 551 
 552                 return (0);
 553 
 554         default:
 555                 ASSERT(0);
 556         }
 557 
 558         freemsg(mp);
 559         return (0);
 560 }
 561 
 562 /* ARGSUSED */
 563 static int
 564 tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
 565 {
 566         conn_t  *connp = (conn_t *)proto_handle;
 567         tcp_t   *tcp = connp->conn_tcp;
 568 
 569         ASSERT(connp->conn_upper_handle != NULL);
 570 
 571         /* All Solaris components should pass a cred for this operation. */
 572         ASSERT(cr != NULL);
 573 
 574         /*
 575          * X/Open requires that we check the connected state.
 576          */
 577         if (tcp->tcp_state < TCPS_SYN_SENT)
 578                 return (ENOTCONN);
 579 
 580         /* shutdown the send side */
 581         if (how != SHUT_RD) {
 582                 mblk_t *bp;
 583 
 584                 bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL);
 585                 CONN_INC_REF(connp);
 586                 SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output,
 587                     connp, NULL, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT);
 588 
 589                 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
 590                     SOCK_OPCTL_SHUT_SEND, 0);
 591         }
 592 
 593         /* shutdown the recv side */
 594         if (how != SHUT_WR)
 595                 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
 596                     SOCK_OPCTL_SHUT_RECV, 0);
 597 
 598         return (0);
 599 }
 600 
 601 static void
 602 tcp_clr_flowctrl(sock_lower_handle_t proto_handle)
 603 {
 604         conn_t  *connp = (conn_t *)proto_handle;
 605         tcp_t   *tcp = connp->conn_tcp;
 606         mblk_t *mp;
 607         int error;
 608 
 609         ASSERT(connp->conn_upper_handle != NULL);
 610 
 611         /*
 612          * If tcp->tcp_rsrv_mp == NULL, it means that tcp_clr_flowctrl()
 613          * is currently running.
 614          */
 615         mutex_enter(&tcp->tcp_rsrv_mp_lock);
 616         if ((mp = tcp->tcp_rsrv_mp) == NULL) {
 617                 mutex_exit(&tcp->tcp_rsrv_mp_lock);
 618                 return;
 619         }
 620         tcp->tcp_rsrv_mp = NULL;
 621         mutex_exit(&tcp->tcp_rsrv_mp_lock);
 622 
 623         error = squeue_synch_enter(connp, mp);
 624         ASSERT(error == 0);
 625 
 626         mutex_enter(&tcp->tcp_rsrv_mp_lock);
 627         tcp->tcp_rsrv_mp = mp;
 628         mutex_exit(&tcp->tcp_rsrv_mp_lock);
 629 
 630         if (tcp->tcp_fused) {
 631                 tcp_fuse_backenable(tcp);
 632         } else {
 633                 tcp->tcp_rwnd = connp->conn_rcvbuf;
 634                 /*
 635                  * Send back a window update immediately if TCP is above
 636                  * ESTABLISHED state and the increase of the rcv window
 637                  * that the other side knows is at least 1 MSS after flow
 638                  * control is lifted.
 639                  */
 640                 if (tcp->tcp_state >= TCPS_ESTABLISHED &&
 641                     tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
 642                         tcp_xmit_ctl(NULL, tcp,
 643                             (tcp->tcp_swnd == 0) ? tcp->tcp_suna :
 644                             tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
 645                 }
 646         }
 647 
 648         squeue_synch_exit(connp);
 649 }
 650 
 651 /* ARGSUSED */
 652 static int
 653 tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
 654     int mode, int32_t *rvalp, cred_t *cr)
 655 {
 656         conn_t          *connp = (conn_t *)proto_handle;
 657         int             error;
 658 
 659         ASSERT(connp->conn_upper_handle != NULL);
 660 
 661         /* All Solaris components should pass a cred for this operation. */
 662         ASSERT(cr != NULL);
 663 
 664         /*
 665          * If we don't have a helper stream then create one.
 666          * ip_create_helper_stream takes care of locking the conn_t,
 667          * so this check for NULL is just a performance optimization.
 668          */
 669         if (connp->conn_helper_info == NULL) {
 670                 tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps;
 671 
 672                 /*
 673                  * Create a helper stream for non-STREAMS socket.
 674                  */
 675                 error = ip_create_helper_stream(connp, tcps->tcps_ldi_ident);
 676                 if (error != 0) {
 677                         ip0dbg(("tcp_ioctl: create of IP helper stream "
 678                             "failed %d\n", error));
 679                         return (error);
 680                 }
 681         }
 682 
 683         switch (cmd) {
 684                 case ND_SET:
 685                 case ND_GET:
 686                 case _SIOCSOCKFALLBACK:
 687                 case TCP_IOC_ABORT_CONN:
 688                 case TI_GETPEERNAME:
 689                 case TI_GETMYNAME:
 690                         ip1dbg(("tcp_ioctl: cmd 0x%x on non streams socket",
 691                             cmd));
 692                         error = EINVAL;
 693                         break;
 694                 default:
 695                         /*
 696                          * If the conn is not closing, pass on to IP using
 697                          * helper stream. Bump the ioctlref to prevent tcp_close
 698                          * from closing the rq/wq out from underneath the ioctl
 699                          * if it ends up queued or aborted/interrupted.
 700                          */
 701                         mutex_enter(&connp->conn_lock);
 702                         if (connp->conn_state_flags & (CONN_CLOSING)) {
 703                                 mutex_exit(&connp->conn_lock);
 704                                 error = EINVAL;
 705                                 break;
 706                         }
 707                         CONN_INC_IOCTLREF_LOCKED(connp);
 708                         error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
 709                             cmd, arg, mode, cr, rvalp);
 710                         CONN_DEC_IOCTLREF(connp);
 711                         break;
 712         }
 713         return (error);
 714 }
 715 
 716 /* ARGSUSED */
 717 static int
 718 tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
 719 {
 720         conn_t *connp = (conn_t *)proto_handle;
 721 
 722         ASSERT(connp->conn_upper_handle != NULL);
 723 
 724         /* All Solaris components should pass a cred for this operation. */
 725         ASSERT(cr != NULL);
 726 
 727         tcp_close_common(connp, flags);
 728 
 729         ip_free_helper_stream(connp);
 730 
 731         /*
 732          * Drop IP's reference on the conn. This is the last reference
 733          * on the connp if the state was less than established. If the
 734          * connection has gone into timewait state, then we will have
 735          * one ref for the TCP and one more ref (total of two) for the
 736          * classifier connected hash list (a timewait connections stays
 737          * in connected hash till closed).
 738          *
 739          * We can't assert the references because there might be other
 740          * transient reference places because of some walkers or queued
 741          * packets in squeue for the timewait state.
 742          */
 743         CONN_DEC_REF(connp);
 744 
 745         /*
 746          * EINPROGRESS tells sockfs to wait for a 'closed' upcall before
 747          * freeing the socket.
 748          */
 749         return (EINPROGRESS);
 750 }
 751 
 752 /* ARGSUSED */
 753 sock_lower_handle_t
 754 tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
 755     uint_t *smodep, int *errorp, int flags, cred_t *credp)
 756 {
 757         conn_t          *connp;
 758         boolean_t       isv6 = family == AF_INET6;
 759 
 760         if (type != SOCK_STREAM || (family != AF_INET && family != AF_INET6) ||
 761             (proto != 0 && proto != IPPROTO_TCP)) {
 762                 *errorp = EPROTONOSUPPORT;
 763                 return (NULL);
 764         }
 765 
 766         connp = tcp_create_common(credp, isv6, B_TRUE, errorp);
 767         if (connp == NULL) {
 768                 return (NULL);
 769         }
 770 
 771         /*
 772          * Put the ref for TCP. Ref for IP was already put
 773          * by ipcl_conn_create. Also make the conn_t globally
 774          * visible to walkers.
 775          */
 776         mutex_enter(&connp->conn_lock);
 777         CONN_INC_REF_LOCKED(connp);
 778         ASSERT(connp->conn_ref == 2);
 779         connp->conn_state_flags &= ~CONN_INCIPIENT;
 780 
 781         connp->conn_flags |= IPCL_NONSTR;
 782         mutex_exit(&connp->conn_lock);
 783 
 784         ASSERT(errorp != NULL);
 785         *errorp = 0;
 786         *sock_downcalls = &sock_tcp_downcalls;
 787         *smodep = SM_CONNREQUIRED | SM_EXDATA | SM_ACCEPTSUPP |
 788             SM_SENDFILESUPP;
 789 
 790         return ((sock_lower_handle_t)connp);
 791 }
 792 
 793 /*
 794  * tcp_fallback
 795  *
 796  * A direct socket is falling back to using STREAMS. The queue
 797  * that is being passed down was created using tcp_open() with
 798  * the SO_FALLBACK flag set. As a result, the queue is not
 * associated with a conn, and the q_ptrs instead contain the
 * dev and minor arena that should be used.
 801  *
 802  * The 'issocket' flag indicates whether the FireEngine
 803  * optimizations should be used. The common case would be that
 804  * optimizations are enabled, and they might be subsequently
 805  * disabled using the _SIOCSOCKFALLBACK ioctl.
 806  */
 807 
 808 /*
 809  * An active connection is falling back to TPI. Gather all the information
 810  * required by the STREAM head and TPI sonode and send it up.
 811  */
 812 static void
 813 tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
 814     boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb,
 815     sock_quiesce_arg_t *arg)
 816 {
 817         conn_t                  *connp = tcp->tcp_connp;
 818         struct stroptions       *stropt;
 819         struct T_capability_ack tca;
 820         struct sockaddr_in6     laddr, faddr;
 821         socklen_t               laddrlen, faddrlen;
 822         short                   opts;
 823         int                     error;
 824         mblk_t                  *mp, *mpnext;
 825 
 826         connp->conn_dev = (dev_t)RD(q)->q_ptr;
 827         connp->conn_minor_arena = WR(q)->q_ptr;
 828 
 829         RD(q)->q_ptr = WR(q)->q_ptr = connp;
 830 
 831         connp->conn_rq = RD(q);
 832         connp->conn_wq = WR(q);
 833 
 834         WR(q)->q_qinfo = &tcp_sock_winit;
 835 
 836         if (!issocket)
 837                 tcp_use_pure_tpi(tcp);
 838 
 839         /*
 840          * free the helper stream
 841          */
 842         ip_free_helper_stream(connp);
 843 
 844         /*
 845          * Notify the STREAM head about options
 846          */
 847         DB_TYPE(stropt_mp) = M_SETOPTS;
 848         stropt = (struct stroptions *)stropt_mp->b_rptr;
 849         stropt_mp->b_wptr += sizeof (struct stroptions);
 850         stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
 851 
 852         stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 :
 853             tcp->tcp_tcps->tcps_wroff_xtra);
 854         if (tcp->tcp_snd_sack_ok)
 855                 stropt->so_wroff += TCPOPT_MAX_SACK_LEN;
 856         stropt->so_hiwat = connp->conn_rcvbuf;
 857         stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
 858 
 859         putnext(RD(q), stropt_mp);
 860 
 861         /*
 862          * Collect the information needed to sync with the sonode
 863          */
 864         tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID);
 865 
 866         laddrlen = faddrlen = sizeof (sin6_t);
 867         (void) tcp_getsockname((sock_lower_handle_t)connp,
 868             (struct sockaddr *)&laddr, &laddrlen, CRED());
 869         error = tcp_getpeername((sock_lower_handle_t)connp,
 870             (struct sockaddr *)&faddr, &faddrlen, CRED());
 871         if (error != 0)
 872                 faddrlen = 0;
 873 
 874         opts = 0;
 875         if (connp->conn_oobinline)
 876                 opts |= SO_OOBINLINE;
 877         if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
 878                 opts |= SO_DONTROUTE;
 879 
 880         /*
 881          * Notify the socket that the protocol is now quiescent,
 882          * and it's therefore safe move data from the socket
 883          * to the stream head.
 884          */
 885         mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca,
 886             (struct sockaddr *)&laddr, laddrlen,
 887             (struct sockaddr *)&faddr, faddrlen, opts);
 888 
 889         while (mp != NULL) {
 890                 mpnext = mp->b_next;
 891                 tcp->tcp_rcv_list = mp->b_next;
 892                 mp->b_next = NULL;
 893                 putnext(q, mp);
 894                 mp = mpnext;
 895         }
 896         ASSERT(tcp->tcp_rcv_last_head == NULL);
 897         ASSERT(tcp->tcp_rcv_last_tail == NULL);
 898         ASSERT(tcp->tcp_rcv_cnt == 0);
 899 
 900         /*
 901          * All eagers in q0 are marked as being non-STREAM, so they will
 902          * make su_newconn upcalls when the handshake completes, which
 903          * will fail (resulting in the conn being closed). So we just blow
 904          * off everything in q0 instead of waiting for the inevitable.
 905          */
 906         if (tcp->tcp_conn_req_cnt_q0 != 0)
 907                 tcp_eager_cleanup(tcp, B_TRUE);
 908 }
 909 
 910 /*
 911  * An eager is falling back to TPI. All we have to do is send
 912  * up a T_CONN_IND.
 913  */
 914 static void
 915 tcp_fallback_eager(tcp_t *eager, boolean_t issocket,
 916     so_proto_quiesced_cb_t quiesced_cb, sock_quiesce_arg_t *arg)
 917 {
 918         conn_t *connp = eager->tcp_connp;
 919         tcp_t *listener = eager->tcp_listener;
 920         mblk_t *mp;
 921 
 922         ASSERT(listener != NULL);
 923 
 924         /*
 925          * Notify the socket that the protocol is now quiescent,
 926          * and it's therefore safe move data from the socket
 927          * to tcp's rcv queue.
 928          */
 929         mp = (*quiesced_cb)(connp->conn_upper_handle, arg, NULL, NULL, 0,
 930             NULL, 0, 0);
 931 
 932         if (mp != NULL) {
 933                 ASSERT(eager->tcp_rcv_cnt == 0);
 934 
 935                 eager->tcp_rcv_list = mp;
 936                 eager->tcp_rcv_cnt = msgdsize(mp);
 937                 while (mp->b_next != NULL) {
 938                         mp = mp->b_next;
 939                         eager->tcp_rcv_cnt += msgdsize(mp);
 940                 }
 941                 eager->tcp_rcv_last_head = mp;
 942                 while (mp->b_cont)
 943                         mp = mp->b_cont;
 944                 eager->tcp_rcv_last_tail = mp;
 945                 if (eager->tcp_rcv_cnt > eager->tcp_rwnd)
 946                         eager->tcp_rwnd = 0;
 947                 else
 948                         eager->tcp_rwnd -= eager->tcp_rcv_cnt;
 949         }
 950 
 951         if (!issocket)
 952                 eager->tcp_issocket = B_FALSE;
 953         /*
 954          * The stream for this eager does not yet exist, so mark it as
 955          * being detached.
 956          */
 957         eager->tcp_detached = B_TRUE;
 958         eager->tcp_hard_binding = B_TRUE;
 959         connp->conn_rq = listener->tcp_connp->conn_rq;
 960         connp->conn_wq = listener->tcp_connp->conn_wq;
 961 
 962         /* Send up the connection indication */
 963         mp = eager->tcp_conn.tcp_eager_conn_ind;
 964         ASSERT(mp != NULL);
 965         eager->tcp_conn.tcp_eager_conn_ind = NULL;
 966 
 967         /*
 968          * TLI/XTI applications will get confused by
 969          * sending eager as an option since it violates
 970          * the option semantics. So remove the eager as
 971          * option since TLI/XTI app doesn't need it anyway.
 972          */
 973         if (!issocket) {
 974                 struct T_conn_ind *conn_ind;
 975 
 976                 conn_ind = (struct T_conn_ind *)mp->b_rptr;
 977                 conn_ind->OPT_length = 0;
 978                 conn_ind->OPT_offset = 0;
 979         }
 980 
 981         /*
 982          * Sockfs guarantees that the listener will not be closed
 983          * during fallback. So we can safely use the listener's queue.
 984          */
 985         putnext(listener->tcp_connp->conn_rq, mp);
 986 }
 987 
 988 
 989 int
 990 tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
 991     boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb,
 992     sock_quiesce_arg_t *arg)
 993 {
 994         tcp_t                   *tcp;
 995         conn_t                  *connp = (conn_t *)proto_handle;
 996         int                     error;
 997         mblk_t                  *stropt_mp;
 998         mblk_t                  *ordrel_mp;
 999 
1000         tcp = connp->conn_tcp;
1001 
1002         stropt_mp = allocb_wait(sizeof (struct stroptions), BPRI_HI, STR_NOSIG,
1003             NULL);
1004 
1005         /* Pre-allocate the T_ordrel_ind mblk. */
1006         ASSERT(tcp->tcp_ordrel_mp == NULL);
1007         ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI,
1008             STR_NOSIG, NULL);
1009         ordrel_mp->b_datap->db_type = M_PROTO;
1010         ((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND;
1011         ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind);
1012 
1013         /*
1014          * Enter the squeue so that no new packets can come in
1015          */
1016         error = squeue_synch_enter(connp, NULL);
1017         if (error != 0) {
1018                 /* failed to enter, free all the pre-allocated messages. */
1019                 freeb(stropt_mp);
1020                 freeb(ordrel_mp);
1021                 return (ENOMEM);
1022         }
1023 
1024         /*
1025          * Both endpoints must be of the same type (either STREAMS or
1026          * non-STREAMS) for fusion to be enabled. So if we are fused,
1027          * we have to unfuse.
1028          */
1029         if (tcp->tcp_fused)
1030                 tcp_unfuse(tcp);
1031 
1032         if (tcp->tcp_listener != NULL) {
1033                 /* The eager will deal with opts when accept() is called */
1034                 freeb(stropt_mp);
1035                 tcp_fallback_eager(tcp, direct_sockfs, quiesced_cb, arg);
1036         } else {
1037                 tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs,
1038                     quiesced_cb, arg);
1039         }
1040 
1041         /*
1042          * No longer a direct socket
1043          *
1044          * Note that we intentionally leave the upper_handle and upcalls
1045          * intact, since eagers may still be using them.
1046          */
1047         connp->conn_flags &= ~IPCL_NONSTR;
1048         tcp->tcp_ordrel_mp = ordrel_mp;
1049 
1050         /*
1051          * There should be atleast two ref's (IP + TCP)
1052          */
1053         ASSERT(connp->conn_ref >= 2);
1054         squeue_synch_exit(connp);
1055 
1056         return (0);
1057 }
1058 
1059 /*
1060  * Notifies a non-STREAMS based listener about a new connection. This
1061  * function is executed on the *eager*'s squeue once the 3 way handshake
1062  * has completed. Note that the behavior differs from STREAMS, where the
1063  * T_CONN_IND is sent up by tcp_send_conn_ind() while on the *listener*'s
1064  * squeue.
1065  *
1066  * Returns B_TRUE if the notification succeeded and an upper handle was
1067  * obtained. `tcp' should be closed on failure.
1068  */
1069 boolean_t
1070 tcp_newconn_notify(tcp_t *tcp, ip_recv_attr_t *ira)
1071 {
1072         tcp_t *listener = tcp->tcp_listener;
1073         conn_t *lconnp = listener->tcp_connp;
1074         conn_t *econnp = tcp->tcp_connp;
1075         tcp_t *tail;
1076         ipaddr_t *addr_cache;
1077         sock_upper_handle_t upper;
1078         struct sock_proto_props sopp;
1079 
1080         mutex_enter(&listener->tcp_eager_lock);
1081         /*
1082          * Take the eager out, if it is in the list of droppable eagers
1083          * as we are here because the 3W handshake is over.
1084          */
1085         MAKE_UNDROPPABLE(tcp);
1086         /*
1087          * The eager already has an extra ref put in tcp_input_data
1088          * so that it stays till accept comes back even though it
1089          * might get into TCPS_CLOSED as a result of a TH_RST etc.
1090          */
1091         ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
1092         listener->tcp_conn_req_cnt_q0--;
1093         listener->tcp_conn_req_cnt_q++;
1094 
1095         /* Move from SYN_RCVD to ESTABLISHED list  */
1096         tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = tcp->tcp_eager_prev_q0;
1097         tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp->tcp_eager_next_q0;
1098         tcp->tcp_eager_prev_q0 = NULL;
1099         tcp->tcp_eager_next_q0 = NULL;
1100 
1101         /*
1102          * Insert at end of the queue because connections are accepted
1103          * in chronological order. Leaving the older connections at front
1104          * of the queue helps reducing search time.
1105          */
1106         tail = listener->tcp_eager_last_q;
1107         if (tail != NULL)
1108                 tail->tcp_eager_next_q = tcp;
1109         else
1110                 listener->tcp_eager_next_q = tcp;
1111         listener->tcp_eager_last_q = tcp;
1112         tcp->tcp_eager_next_q = NULL;
1113 
1114         /* we have timed out before */
1115         if (tcp->tcp_syn_rcvd_timeout != 0) {
1116                 tcp->tcp_syn_rcvd_timeout = 0;
1117                 listener->tcp_syn_rcvd_timeout--;
1118                 if (listener->tcp_syn_defense &&
1119                     listener->tcp_syn_rcvd_timeout <=
1120                     (listener->tcp_tcps->tcps_conn_req_max_q0 >> 5) &&
1121                     10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() -
1122                     listener->tcp_last_rcv_lbolt)) {
1123                         /*
1124                          * Turn off the defense mode if we
1125                          * believe the SYN attack is over.
1126                          */
1127                         listener->tcp_syn_defense = B_FALSE;
1128                         if (listener->tcp_ip_addr_cache) {
1129                                 kmem_free((void *)listener->tcp_ip_addr_cache,
1130                                     IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
1131                                 listener->tcp_ip_addr_cache = NULL;
1132                         }
1133                 }
1134         }
1135         addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
1136         if (addr_cache != NULL) {
1137                 /*
1138                  * We have finished a 3-way handshake with this
1139                  * remote host. This proves the IP addr is good.
1140                  * Cache it!
1141                  */
1142                 addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] =
1143                     tcp->tcp_connp->conn_faddr_v4;
1144         }
1145         mutex_exit(&listener->tcp_eager_lock);
1146 
1147         /*
1148          * Notify the ULP about the newconn. It is guaranteed that no
1149          * tcp_accept() call will be made for the eager if the
1150          * notification fails.
1151          */
1152         if ((upper = (*lconnp->conn_upcalls->su_newconn)
1153             (lconnp->conn_upper_handle, (sock_lower_handle_t)econnp,
1154             &sock_tcp_downcalls, ira->ira_cred, ira->ira_cpid,
1155             &econnp->conn_upcalls)) == NULL) {
1156                 return (B_FALSE);
1157         }
1158         econnp->conn_upper_handle = upper;
1159 
1160         tcp->tcp_detached = B_FALSE;
1161         tcp->tcp_hard_binding = B_FALSE;
1162         tcp->tcp_tconnind_started = B_TRUE;
1163 
1164         if (econnp->conn_keepalive) {
1165                 tcp->tcp_ka_last_intrvl = 0;
1166                 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
1167                     tcp->tcp_ka_interval);
1168         }
1169 
1170         /* Update the necessary parameters */
1171         tcp_get_proto_props(tcp, &sopp);
1172 
1173         (*econnp->conn_upcalls->su_set_proto_props)
1174             (econnp->conn_upper_handle, &sopp);
1175 
1176         return (B_TRUE);
1177 }