1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /* This file contains all TCP kernel socket related functions. */
  27 
  28 #include <sys/types.h>
  29 #include <sys/strlog.h>
  30 #include <sys/policy.h>
  31 #include <sys/sockio.h>
  32 #include <sys/strsubr.h>
  33 #include <sys/strsun.h>
  34 #include <sys/squeue_impl.h>
  35 #include <sys/squeue.h>
  36 #define _SUN_TPI_VERSION 2
  37 #include <sys/tihdr.h>
  38 #include <sys/timod.h>
  39 #include <sys/tpicommon.h>
  40 #include <sys/socketvar.h>
  41 
  42 #include <inet/common.h>
  43 #include <inet/proto_set.h>
  44 #include <inet/ip.h>
  45 #include <inet/tcp.h>
  46 #include <inet/tcp_impl.h>
  47 
  48 static void     tcp_activate(sock_lower_handle_t, sock_upper_handle_t,
  49                     sock_upcalls_t *, int, cred_t *);
  50 static int      tcp_accept(sock_lower_handle_t, sock_lower_handle_t,
  51                     sock_upper_handle_t, cred_t *);
  52 static int      tcp_bind(sock_lower_handle_t, struct sockaddr *,
  53                     socklen_t, cred_t *);
  54 static int      tcp_listen(sock_lower_handle_t, int, cred_t *);
  55 static int      tcp_connect(sock_lower_handle_t, const struct sockaddr *,
  56                     socklen_t, sock_connid_t *, cred_t *);
  57 static int      tcp_getpeername(sock_lower_handle_t, struct sockaddr *,
  58                     socklen_t *, cred_t *);
  59 static int      tcp_getsockname(sock_lower_handle_t, struct sockaddr *,
  60                     socklen_t *, cred_t *);
  61 static int      tcp_getsockopt(sock_lower_handle_t, int, int, void *,
  62                     socklen_t *, cred_t *);
  63 static int      tcp_setsockopt(sock_lower_handle_t, int, int, const void *,
  64                     socklen_t, cred_t *);
  65 static int      tcp_sendmsg(sock_lower_handle_t, mblk_t *, struct nmsghdr *,
  66                     cred_t *);
  67 static int      tcp_shutdown(sock_lower_handle_t, int, cred_t *);
  68 static void     tcp_clr_flowctrl(sock_lower_handle_t);
  69 static int      tcp_ioctl(sock_lower_handle_t, int, intptr_t, int, int32_t *,
  70                     cred_t *);
  71 static int      tcp_close(sock_lower_handle_t, int, cred_t *);
  72 
  73 sock_downcalls_t sock_tcp_downcalls = {
  74         tcp_activate,
  75         tcp_accept,
  76         tcp_bind,
  77         tcp_listen,
  78         tcp_connect,
  79         tcp_getpeername,
  80         tcp_getsockname,
  81         tcp_getsockopt,
  82         tcp_setsockopt,
  83         tcp_sendmsg,
  84         NULL,
  85         NULL,
  86         NULL,
  87         tcp_shutdown,
  88         tcp_clr_flowctrl,
  89         tcp_ioctl,
  90         tcp_close,
  91 };
  92 
  93 /* ARGSUSED */
  94 static void
  95 tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle,
  96     sock_upcalls_t *sock_upcalls, int flags, cred_t *cr)
  97 {
  98         conn_t *connp = (conn_t *)proto_handle;
  99         struct sock_proto_props sopp;
 100         extern struct module_info tcp_rinfo;
 101 
 102         ASSERT(connp->conn_upper_handle == NULL);
 103 
 104         /* All Solaris components should pass a cred for this operation. */
 105         ASSERT(cr != NULL);
 106 
 107         sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
 108             SOCKOPT_MAXPSZ | SOCKOPT_MAXBLK | SOCKOPT_RCVTIMER |
 109             SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ;
 110 
 111         sopp.sopp_rxhiwat = SOCKET_RECVHIWATER;
 112         sopp.sopp_rxlowat = SOCKET_RECVLOWATER;
 113         sopp.sopp_maxpsz = INFPSZ;
 114         sopp.sopp_maxblk = INFPSZ;
 115         sopp.sopp_rcvtimer = SOCKET_TIMER_INTERVAL;
 116         sopp.sopp_rcvthresh = SOCKET_RECVHIWATER >> 3;
 117         sopp.sopp_maxaddrlen = sizeof (sin6_t);
 118         sopp.sopp_minpsz = (tcp_rinfo.mi_minpsz == 1) ? 0 :
 119             tcp_rinfo.mi_minpsz;
 120 
 121         connp->conn_upcalls = sock_upcalls;
 122         connp->conn_upper_handle = sock_handle;
 123 
 124         ASSERT(connp->conn_rcvbuf != 0 &&
 125             connp->conn_rcvbuf == connp->conn_tcp->tcp_rwnd);
 126         (*sock_upcalls->su_set_proto_props)(sock_handle, &sopp);
 127 }
 128 
 129 /*ARGSUSED*/
 130 static int
 131 tcp_accept(sock_lower_handle_t lproto_handle,
 132     sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
 133     cred_t *cr)
 134 {
 135         conn_t *lconnp, *econnp;
 136         tcp_t *listener, *eager;
 137 
 138         /* All Solaris components should pass a cred for this operation. */
 139         ASSERT(cr != NULL);
 140 
 141         /*
 142          * KSSL can move a socket from one listener to another, in which
 143          * case `lproto_handle' points to the new listener. To ensure that
 144          * the original listener is used the information is obtained from
 145          * the eager.
 146          */
 147         econnp = (conn_t *)eproto_handle;
 148         eager = econnp->conn_tcp;
 149         ASSERT(IPCL_IS_NONSTR(econnp));
 150         ASSERT(eager->tcp_listener != NULL);
 151         listener = eager->tcp_listener;
 152         lconnp = (conn_t *)listener->tcp_connp;
 153         ASSERT(listener->tcp_state == TCPS_LISTEN);
 154         ASSERT(lconnp->conn_upper_handle != NULL);
 155 
 156         /*
 157          * It is possible for the accept thread to race with the thread that
 158          * made the su_newconn upcall in tcp_newconn_notify. Both
 159          * tcp_newconn_notify and tcp_accept require that conn_upper_handle
 160          * and conn_upcalls be set before returning, so they both write to
 161          * them. However, we're guaranteed that the value written is the same
 162          * for both threads.
 163          */
 164         ASSERT(econnp->conn_upper_handle == NULL ||
 165             econnp->conn_upper_handle == sock_handle);
 166         ASSERT(econnp->conn_upcalls == NULL ||
 167             econnp->conn_upcalls == lconnp->conn_upcalls);
 168         econnp->conn_upper_handle = sock_handle;
 169         econnp->conn_upcalls = lconnp->conn_upcalls;
 170 
 171         ASSERT(econnp->conn_netstack ==
 172             listener->tcp_connp->conn_netstack);
 173         ASSERT(eager->tcp_tcps == listener->tcp_tcps);
 174 
 175         /*
 176          * We should have a minimum of 2 references on the conn at this
 177          * point. One for TCP and one for the newconn notification
 178          * (which is now taken over by IP). In the normal case we would
 179          * also have another reference (making a total of 3) for the conn
 180          * being in the classifier hash list. However the eager could have
 181          * received an RST subsequently and tcp_closei_local could have
 182          * removed the eager from the classifier hash list, hence we can't
 183          * assert that reference.
 184          */
 185         ASSERT(econnp->conn_ref >= 2);
 186 
 187         mutex_enter(&listener->tcp_eager_lock);
 188         /*
 189          * Non-STREAMS listeners never defer the notification of new
 190          * connections.
 191          */
 192         ASSERT(!listener->tcp_eager_prev_q0->tcp_conn_def_q0);
 193         tcp_eager_unlink(eager);
 194         mutex_exit(&listener->tcp_eager_lock);
 195         CONN_DEC_REF(listener->tcp_connp);
 196 
 197         return ((eager->tcp_state < TCPS_ESTABLISHED) ? ECONNABORTED : 0);
 198 }
 199 
 200 static int
 201 tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
 202     socklen_t len, cred_t *cr)
 203 {
 204         int             error;
 205         conn_t          *connp = (conn_t *)proto_handle;
 206 
 207         /* All Solaris components should pass a cred for this operation. */
 208         ASSERT(cr != NULL);
 209         ASSERT(connp->conn_upper_handle != NULL);
 210 
 211         error = squeue_synch_enter(connp, NULL);
 212         if (error != 0) {
 213                 /* failed to enter */
 214                 return (ENOSR);
 215         }
 216 
 217         /* binding to a NULL address really means unbind */
 218         if (sa == NULL) {
 219                 if (connp->conn_tcp->tcp_state < TCPS_LISTEN)
 220                         error = tcp_do_unbind(connp);
 221                 else
 222                         error = EINVAL;
 223         } else {
 224                 error = tcp_do_bind(connp, sa, len, cr, B_TRUE);
 225         }
 226 
 227         squeue_synch_exit(connp);
 228 
 229         if (error < 0) {
 230                 if (error == -TOUTSTATE)
 231                         error = EINVAL;
 232                 else
 233                         error = proto_tlitosyserr(-error);
 234         }
 235 
 236         return (error);
 237 }
 238 
 239 /* ARGSUSED */
 240 static int
 241 tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
 242 {
 243         conn_t  *connp = (conn_t *)proto_handle;
 244         tcp_t   *tcp = connp->conn_tcp;
 245         int     error;
 246 
 247         ASSERT(connp->conn_upper_handle != NULL);
 248 
 249         /* All Solaris components should pass a cred for this operation. */
 250         ASSERT(cr != NULL);
 251 
 252         error = squeue_synch_enter(connp, NULL);
 253         if (error != 0) {
 254                 /* failed to enter */
 255                 return (ENOBUFS);
 256         }
 257 
 258         error = tcp_do_listen(connp, NULL, 0, backlog, cr, B_FALSE);
 259         if (error == 0) {
 260                 /*
 261                  * sockfs needs to know what's the maximum number of socket
 262                  * that can be queued on the listener.
 263                  */
 264                 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
 265                     SOCK_OPCTL_ENAB_ACCEPT,
 266                     (uintptr_t)(tcp->tcp_conn_req_max +
 267                     tcp->tcp_tcps->tcps_conn_req_max_q0));
 268         } else if (error < 0) {
 269                 if (error == -TOUTSTATE)
 270                         error = EINVAL;
 271                 else
 272                         error = proto_tlitosyserr(-error);
 273         }
 274         squeue_synch_exit(connp);
 275         return (error);
 276 }
 277 
 278 static int
 279 tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
 280     socklen_t len, sock_connid_t *id, cred_t *cr)
 281 {
 282         conn_t          *connp = (conn_t *)proto_handle;
 283         int             error;
 284 
 285         ASSERT(connp->conn_upper_handle != NULL);
 286 
 287         /* All Solaris components should pass a cred for this operation. */
 288         ASSERT(cr != NULL);
 289 
 290         error = proto_verify_ip_addr(connp->conn_family, sa, len);
 291         if (error != 0) {
 292                 return (error);
 293         }
 294 
 295         error = squeue_synch_enter(connp, NULL);
 296         if (error != 0) {
 297                 /* failed to enter */
 298                 return (ENOSR);
 299         }
 300 
 301         /*
 302          * TCP supports quick connect, so no need to do an implicit bind
 303          */
 304         error = tcp_do_connect(connp, sa, len, cr, curproc->p_pid);
 305         if (error == 0) {
 306                 *id = connp->conn_tcp->tcp_connid;
 307         } else if (error < 0) {
 308                 if (error == -TOUTSTATE) {
 309                         switch (connp->conn_tcp->tcp_state) {
 310                         case TCPS_SYN_SENT:
 311                                 error = EALREADY;
 312                                 break;
 313                         case TCPS_ESTABLISHED:
 314                                 error = EISCONN;
 315                                 break;
 316                         case TCPS_LISTEN:
 317                                 error = EOPNOTSUPP;
 318                                 break;
 319                         default:
 320                                 error = EINVAL;
 321                                 break;
 322                         }
 323                 } else {
 324                         error = proto_tlitosyserr(-error);
 325                 }
 326         }
 327 
 328         if (connp->conn_tcp->tcp_loopback) {
 329                 struct sock_proto_props sopp;
 330 
 331                 sopp.sopp_flags = SOCKOPT_LOOPBACK;
 332                 sopp.sopp_loopback = B_TRUE;
 333 
 334                 (*connp->conn_upcalls->su_set_proto_props)(
 335                     connp->conn_upper_handle, &sopp);
 336         }
 337 done:
 338         squeue_synch_exit(connp);
 339 
 340         return ((error == 0) ? EINPROGRESS : error);
 341 }
 342 
 343 /* ARGSUSED3 */
 344 static int
 345 tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr,
 346     socklen_t *addrlenp, cred_t *cr)
 347 {
 348         conn_t  *connp = (conn_t *)proto_handle;
 349         tcp_t   *tcp = connp->conn_tcp;
 350 
 351         /* All Solaris components should pass a cred for this operation. */
 352         ASSERT(cr != NULL);
 353 
 354         ASSERT(tcp != NULL);
 355         if (tcp->tcp_state < TCPS_SYN_RCVD)
 356                 return (ENOTCONN);
 357 
 358         return (conn_getpeername(connp, addr, addrlenp));
 359 }
 360 
 361 /* ARGSUSED3 */
 362 static int
 363 tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr,
 364     socklen_t *addrlenp, cred_t *cr)
 365 {
 366         conn_t  *connp = (conn_t *)proto_handle;
 367 
 368         /* All Solaris components should pass a cred for this operation. */
 369         ASSERT(cr != NULL);
 370 
 371         return (conn_getsockname(connp, addr, addrlenp));
 372 }
 373 
 374 /* returns UNIX error, the optlen is a value-result arg */
 375 static int
 376 tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
 377     void *optvalp, socklen_t *optlen, cred_t *cr)
 378 {
 379         conn_t          *connp = (conn_t *)proto_handle;
 380         int             error;
 381         t_uscalar_t     max_optbuf_len;
 382         void            *optvalp_buf;
 383         int             len;
 384 
 385         ASSERT(connp->conn_upper_handle != NULL);
 386 
 387         /* All Solaris components should pass a cred for this operation. */
 388         ASSERT(cr != NULL);
 389 
 390         error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
 391             tcp_opt_obj.odb_opt_des_arr,
 392             tcp_opt_obj.odb_opt_arr_cnt,
 393             B_FALSE, B_TRUE, cr);
 394         if (error != 0) {
 395                 if (error < 0) {
 396                         error = proto_tlitosyserr(-error);
 397                 }
 398                 return (error);
 399         }
 400 
 401         optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
 402 
 403         error = squeue_synch_enter(connp, NULL);
 404         if (error == ENOMEM) {
 405                 kmem_free(optvalp_buf, max_optbuf_len);
 406                 return (ENOMEM);
 407         }
 408 
 409         len = tcp_opt_get(connp, level, option_name, optvalp_buf);
 410         squeue_synch_exit(connp);
 411 
 412         if (len == -1) {
 413                 kmem_free(optvalp_buf, max_optbuf_len);
 414                 return (EINVAL);
 415         }
 416 
 417         /*
 418          * update optlen and copy option value
 419          */
 420         t_uscalar_t size = MIN(len, *optlen);
 421 
 422         bcopy(optvalp_buf, optvalp, size);
 423         bcopy(&size, optlen, sizeof (size));
 424 
 425         kmem_free(optvalp_buf, max_optbuf_len);
 426         return (0);
 427 }
 428 
 429 static int
 430 tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
 431     const void *optvalp, socklen_t optlen, cred_t *cr)
 432 {
 433         conn_t          *connp = (conn_t *)proto_handle;
 434         int             error;
 435 
 436         ASSERT(connp->conn_upper_handle != NULL);
 437 
 438         /* All Solaris components should pass a cred for this operation. */
 439         ASSERT(cr != NULL);
 440 
 441         /*
 442          * Entering the squeue synchronously can result in a context switch,
 443          * which can cause a rather sever performance degradation. So we try to
 444          * handle whatever options we can without entering the squeue.
 445          */
 446         if (level == IPPROTO_TCP) {
 447                 switch (option_name) {
 448                 case TCP_NODELAY:
 449                         if (optlen != sizeof (int32_t))
 450                                 return (EINVAL);
 451                         mutex_enter(&connp->conn_tcp->tcp_non_sq_lock);
 452                         connp->conn_tcp->tcp_naglim = *(int *)optvalp ? 1 :
 453                             connp->conn_tcp->tcp_mss;
 454                         mutex_exit(&connp->conn_tcp->tcp_non_sq_lock);
 455                         return (0);
 456                 default:
 457                         break;
 458                 }
 459         }
 460 
 461         error = squeue_synch_enter(connp, NULL);
 462         if (error == ENOMEM) {
 463                 return (ENOMEM);
 464         }
 465 
 466         error = proto_opt_check(level, option_name, optlen, NULL,
 467             tcp_opt_obj.odb_opt_des_arr,
 468             tcp_opt_obj.odb_opt_arr_cnt,
 469             B_TRUE, B_FALSE, cr);
 470 
 471         if (error != 0) {
 472                 if (error < 0) {
 473                         error = proto_tlitosyserr(-error);
 474                 }
 475                 squeue_synch_exit(connp);
 476                 return (error);
 477         }
 478 
 479         error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name,
 480             optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp,
 481             NULL, cr);
 482         squeue_synch_exit(connp);
 483 
 484         ASSERT(error >= 0);
 485 
 486         return (error);
 487 }
 488 
 489 /* ARGSUSED */
 490 static int
 491 tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
 492     cred_t *cr)
 493 {
 494         tcp_t           *tcp;
 495         uint32_t        msize;
 496         conn_t *connp = (conn_t *)proto_handle;
 497         int32_t         tcpstate;
 498 
 499         /* All Solaris components should pass a cred for this operation. */
 500         ASSERT(cr != NULL);
 501 
 502         ASSERT(connp->conn_ref >= 2);
 503         ASSERT(connp->conn_upper_handle != NULL);
 504 
 505         if (msg->msg_controllen != 0) {
 506                 freemsg(mp);
 507                 return (EOPNOTSUPP);
 508         }
 509 
 510         switch (DB_TYPE(mp)) {
 511         case M_DATA:
 512                 tcp = connp->conn_tcp;
 513                 ASSERT(tcp != NULL);
 514 
 515                 tcpstate = tcp->tcp_state;
 516                 if (tcpstate < TCPS_ESTABLISHED) {
 517                         freemsg(mp);
 518                         /*
 519                          * We return ENOTCONN if the endpoint is trying to
 520                          * connect or has never been connected, and EPIPE if it
 521                          * has been disconnected. The connection id helps us
 522                          * distinguish between the last two cases.
 523                          */
 524                         return ((tcpstate == TCPS_SYN_SENT) ? ENOTCONN :
 525                             ((tcp->tcp_connid > 0) ? EPIPE : ENOTCONN));
 526                 } else if (tcpstate > TCPS_CLOSE_WAIT) {
 527                         freemsg(mp);
 528                         return (EPIPE);
 529                 }
 530 
 531                 msize = msgdsize(mp);
 532 
 533                 mutex_enter(&tcp->tcp_non_sq_lock);
 534                 tcp->tcp_squeue_bytes += msize;
 535                 /*
 536                  * Squeue Flow Control
 537                  */
 538                 if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) {
 539                         tcp_setqfull(tcp);
 540                 }
 541                 mutex_exit(&tcp->tcp_non_sq_lock);
 542 
 543                 /*
 544                  * The application may pass in an address in the msghdr, but
 545                  * we ignore the address on connection-oriented sockets.
 546                  * Just like BSD this code does not generate an error for
 547                  * TCP (a CONNREQUIRED socket) when sending to an address
 548                  * passed in with sendto/sendmsg. Instead the data is
 549                  * delivered on the connection as if no address had been
 550                  * supplied.
 551                  */
 552                 CONN_INC_REF(connp);
 553 
 554                 if (msg->msg_flags & MSG_OOB) {
 555                         SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent,
 556                             connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
 557                 } else {
 558                         SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output,
 559                             connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
 560                 }
 561 
 562                 return (0);
 563 
 564         default:
 565                 ASSERT(0);
 566         }
 567 
 568         freemsg(mp);
 569         return (0);
 570 }
 571 
 572 /* ARGSUSED */
 573 static int
 574 tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
 575 {
 576         conn_t  *connp = (conn_t *)proto_handle;
 577         tcp_t   *tcp = connp->conn_tcp;
 578 
 579         ASSERT(connp->conn_upper_handle != NULL);
 580 
 581         /* All Solaris components should pass a cred for this operation. */
 582         ASSERT(cr != NULL);
 583 
 584         /*
 585          * X/Open requires that we check the connected state.
 586          */
 587         if (tcp->tcp_state < TCPS_SYN_SENT)
 588                 return (ENOTCONN);
 589 
 590         /* shutdown the send side */
 591         if (how != SHUT_RD) {
 592                 mblk_t *bp;
 593 
 594                 bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL);
 595                 CONN_INC_REF(connp);
 596                 SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output,
 597                     connp, NULL, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT);
 598 
 599                 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
 600                     SOCK_OPCTL_SHUT_SEND, 0);
 601         }
 602 
 603         /* shutdown the recv side */
 604         if (how != SHUT_WR)
 605                 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
 606                     SOCK_OPCTL_SHUT_RECV, 0);
 607 
 608         return (0);
 609 }
 610 
 611 static void
 612 tcp_clr_flowctrl(sock_lower_handle_t proto_handle)
 613 {
 614         conn_t  *connp = (conn_t *)proto_handle;
 615         tcp_t   *tcp = connp->conn_tcp;
 616         mblk_t *mp;
 617         int error;
 618 
 619         ASSERT(connp->conn_upper_handle != NULL);
 620 
 621         /*
 622          * If tcp->tcp_rsrv_mp == NULL, it means that tcp_clr_flowctrl()
 623          * is currently running.
 624          */
 625         mutex_enter(&tcp->tcp_rsrv_mp_lock);
 626         if ((mp = tcp->tcp_rsrv_mp) == NULL) {
 627                 mutex_exit(&tcp->tcp_rsrv_mp_lock);
 628                 return;
 629         }
 630         tcp->tcp_rsrv_mp = NULL;
 631         mutex_exit(&tcp->tcp_rsrv_mp_lock);
 632 
 633         error = squeue_synch_enter(connp, mp);
 634         ASSERT(error == 0);
 635 
 636         mutex_enter(&tcp->tcp_rsrv_mp_lock);
 637         tcp->tcp_rsrv_mp = mp;
 638         mutex_exit(&tcp->tcp_rsrv_mp_lock);
 639 
 640         if (tcp->tcp_fused) {
 641                 tcp_fuse_backenable(tcp);
 642         } else {
 643                 tcp->tcp_rwnd = connp->conn_rcvbuf;
 644                 /*
 645                  * Send back a window update immediately if TCP is above
 646                  * ESTABLISHED state and the increase of the rcv window
 647                  * that the other side knows is at least 1 MSS after flow
 648                  * control is lifted.
 649                  */
 650                 if (tcp->tcp_state >= TCPS_ESTABLISHED &&
 651                     tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
 652                         tcp_xmit_ctl(NULL, tcp,
 653                             (tcp->tcp_swnd == 0) ? tcp->tcp_suna :
 654                             tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
 655                 }
 656         }
 657 
 658         squeue_synch_exit(connp);
 659 }
 660 
 661 /* ARGSUSED */
 662 static int
 663 tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
 664     int mode, int32_t *rvalp, cred_t *cr)
 665 {
 666         conn_t          *connp = (conn_t *)proto_handle;
 667         int             error;
 668 
 669         ASSERT(connp->conn_upper_handle != NULL);
 670 
 671         /* All Solaris components should pass a cred for this operation. */
 672         ASSERT(cr != NULL);
 673 
 674         /*
 675          * If we don't have a helper stream then create one.
 676          * ip_create_helper_stream takes care of locking the conn_t,
 677          * so this check for NULL is just a performance optimization.
 678          */
 679         if (connp->conn_helper_info == NULL) {
 680                 tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps;
 681 
 682                 /*
 683                  * Create a helper stream for non-STREAMS socket.
 684                  */
 685                 error = ip_create_helper_stream(connp, tcps->tcps_ldi_ident);
 686                 if (error != 0) {
 687                         ip0dbg(("tcp_ioctl: create of IP helper stream "
 688                             "failed %d\n", error));
 689                         return (error);
 690                 }
 691         }
 692 
 693         switch (cmd) {
 694                 case ND_SET:
 695                 case ND_GET:
 696                 case _SIOCSOCKFALLBACK:
 697                 case TCP_IOC_ABORT_CONN:
 698                 case TI_GETPEERNAME:
 699                 case TI_GETMYNAME:
 700                         ip1dbg(("tcp_ioctl: cmd 0x%x on non streams socket",
 701                             cmd));
 702                         error = EINVAL;
 703                         break;
 704                 default:
 705                         /*
 706                          * If the conn is not closing, pass on to IP using
 707                          * helper stream. Bump the ioctlref to prevent tcp_close
 708                          * from closing the rq/wq out from underneath the ioctl
 709                          * if it ends up queued or aborted/interrupted.
 710                          */
 711                         mutex_enter(&connp->conn_lock);
 712                         if (connp->conn_state_flags & (CONN_CLOSING)) {
 713                                 mutex_exit(&connp->conn_lock);
 714                                 error = EINVAL;
 715                                 break;
 716                         }
 717                         CONN_INC_IOCTLREF_LOCKED(connp);
 718                         error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
 719                             cmd, arg, mode, cr, rvalp);
 720                         CONN_DEC_IOCTLREF(connp);
 721                         break;
 722         }
 723         return (error);
 724 }
 725 
 726 /* ARGSUSED */
 727 static int
 728 tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
 729 {
 730         conn_t *connp = (conn_t *)proto_handle;
 731 
 732         ASSERT(connp->conn_upper_handle != NULL);
 733 
 734         /* All Solaris components should pass a cred for this operation. */
 735         ASSERT(cr != NULL);
 736 
 737         tcp_close_common(connp, flags);
 738 
 739         ip_free_helper_stream(connp);
 740 
 741         /*
 742          * Drop IP's reference on the conn. This is the last reference
 743          * on the connp if the state was less than established. If the
 744          * connection has gone into timewait state, then we will have
 745          * one ref for the TCP and one more ref (total of two) for the
 746          * classifier connected hash list (a timewait connections stays
 747          * in connected hash till closed).
 748          *
 749          * We can't assert the references because there might be other
 750          * transient reference places because of some walkers or queued
 751          * packets in squeue for the timewait state.
 752          */
 753         CONN_DEC_REF(connp);
 754 
 755         /*
 756          * EINPROGRESS tells sockfs to wait for a 'closed' upcall before
 757          * freeing the socket.
 758          */
 759         return (EINPROGRESS);
 760 }
 761 
 762 /* ARGSUSED */
 763 sock_lower_handle_t
 764 tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
 765     uint_t *smodep, int *errorp, int flags, cred_t *credp)
 766 {
 767         conn_t          *connp;
 768         boolean_t       isv6 = family == AF_INET6;
 769 
 770         if (type != SOCK_STREAM || (family != AF_INET && family != AF_INET6) ||
 771             (proto != 0 && proto != IPPROTO_TCP)) {
 772                 *errorp = EPROTONOSUPPORT;
 773                 return (NULL);
 774         }
 775 
 776         connp = tcp_create_common(credp, isv6, B_TRUE, errorp);
 777         if (connp == NULL) {
 778                 return (NULL);
 779         }
 780 
 781         /*
 782          * Put the ref for TCP. Ref for IP was already put
 783          * by ipcl_conn_create. Also make the conn_t globally
 784          * visible to walkers
 785          */
 786         mutex_enter(&connp->conn_lock);
 787         CONN_INC_REF_LOCKED(connp);
 788         ASSERT(connp->conn_ref == 2);
 789         connp->conn_state_flags &= ~CONN_INCIPIENT;
 790 
 791         connp->conn_flags |= IPCL_NONSTR;
 792         mutex_exit(&connp->conn_lock);
 793 
 794         ASSERT(errorp != NULL);
 795         *errorp = 0;
 796         *sock_downcalls = &sock_tcp_downcalls;
 797         *smodep = SM_CONNREQUIRED | SM_EXDATA | SM_ACCEPTSUPP |
 798             SM_SENDFILESUPP;
 799 
 800         return ((sock_lower_handle_t)connp);
 801 }
 802 
 803 /*
 804  * tcp_fallback
 805  *
 806  * A direct socket is falling back to using STREAMS. The queue
 807  * that is being passed down was created using tcp_open() with
 808  * the SO_FALLBACK flag set. As a result, the queue is not
 809  * associated with a conn, and the q_ptrs instead contain the
 810  * dev and minor area that should be used.
 811  *
 812  * The 'issocket' flag indicates whether the FireEngine
 813  * optimizations should be used. The common case would be that
 814  * optimizations are enabled, and they might be subsequently
 815  * disabled using the _SIOCSOCKFALLBACK ioctl.
 816  */
 817 
 818 /*
 819  * An active connection is falling back to TPI. Gather all the information
 820  * required by the STREAM head and TPI sonode and send it up.
 821  */
 822 static void
 823 tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
 824     boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb,
 825     sock_quiesce_arg_t *arg)
 826 {
 827         conn_t                  *connp = tcp->tcp_connp;
 828         struct stroptions       *stropt;
 829         struct T_capability_ack tca;
 830         struct sockaddr_in6     laddr, faddr;
 831         socklen_t               laddrlen, faddrlen;
 832         short                   opts;
 833         int                     error;
 834         mblk_t                  *mp, *mpnext;
 835 
 836         connp->conn_dev = (dev_t)RD(q)->q_ptr;
 837         connp->conn_minor_arena = WR(q)->q_ptr;
 838 
 839         RD(q)->q_ptr = WR(q)->q_ptr = connp;
 840 
 841         connp->conn_rq = RD(q);
 842         connp->conn_wq = WR(q);
 843 
 844         WR(q)->q_qinfo = &tcp_sock_winit;
 845 
 846         if (!issocket)
 847                 tcp_use_pure_tpi(tcp);
 848 
 849         /*
 850          * free the helper stream
 851          */
 852         ip_free_helper_stream(connp);
 853 
 854         /*
 855          * Notify the STREAM head about options
 856          */
 857         DB_TYPE(stropt_mp) = M_SETOPTS;
 858         stropt = (struct stroptions *)stropt_mp->b_rptr;
 859         stropt_mp->b_wptr += sizeof (struct stroptions);
 860         stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
 861 
 862         stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 :
 863             tcp->tcp_tcps->tcps_wroff_xtra);
 864         if (tcp->tcp_snd_sack_ok)
 865                 stropt->so_wroff += TCPOPT_MAX_SACK_LEN;
 866         stropt->so_hiwat = connp->conn_rcvbuf;
 867         stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
 868 
 869         putnext(RD(q), stropt_mp);
 870 
 871         /*
 872          * Collect the information needed to sync with the sonode
 873          */
 874         tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID);
 875 
 876         laddrlen = faddrlen = sizeof (sin6_t);
 877         (void) tcp_getsockname((sock_lower_handle_t)connp,
 878             (struct sockaddr *)&laddr, &laddrlen, CRED());
 879         error = tcp_getpeername((sock_lower_handle_t)connp,
 880             (struct sockaddr *)&faddr, &faddrlen, CRED());
 881         if (error != 0)
 882                 faddrlen = 0;
 883 
 884         opts = 0;
 885         if (connp->conn_oobinline)
 886                 opts |= SO_OOBINLINE;
 887         if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
 888                 opts |= SO_DONTROUTE;
 889 
 890         /*
 891          * Notify the socket that the protocol is now quiescent,
 892          * and it's therefore safe move data from the socket
 893          * to the stream head.
 894          */
 895         mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca,
 896             (struct sockaddr *)&laddr, laddrlen,
 897             (struct sockaddr *)&faddr, faddrlen, opts);
 898 
 899         while (mp != NULL) {
 900                 mpnext = mp->b_next;
 901                 tcp->tcp_rcv_list = mp->b_next;
 902                 mp->b_next = NULL;
 903                 putnext(q, mp);
 904                 mp = mpnext;
 905         }
 906         ASSERT(tcp->tcp_rcv_last_head == NULL);
 907         ASSERT(tcp->tcp_rcv_last_tail == NULL);
 908         ASSERT(tcp->tcp_rcv_cnt == 0);
 909 
 910         /*
 911          * All eagers in q0 are marked as being non-STREAM, so they will
 912          * make su_newconn upcalls when the handshake completes, which
 913          * will fail (resulting in the conn being closed). So we just blow
 914          * off everything in q0 instead of waiting for the inevitable.
 915          */
 916         if (tcp->tcp_conn_req_cnt_q0 != 0)
 917                 tcp_eager_cleanup(tcp, B_TRUE);
 918 }
 919 
 920 /*
 921  * An eager is falling back to TPI. All we have to do is send
 922  * up a T_CONN_IND.
 923  */
 924 static void
 925 tcp_fallback_eager(tcp_t *eager, boolean_t issocket,
 926     so_proto_quiesced_cb_t quiesced_cb, sock_quiesce_arg_t *arg)
 927 {
 928         conn_t *connp = eager->tcp_connp;
 929         tcp_t *listener = eager->tcp_listener;
 930         mblk_t *mp;
 931 
 932         ASSERT(listener != NULL);
 933 
 934         /*
 935          * Notify the socket that the protocol is now quiescent,
 936          * and it's therefore safe move data from the socket
 937          * to tcp's rcv queue.
 938          */
 939         mp = (*quiesced_cb)(connp->conn_upper_handle, arg, NULL, NULL, 0,
 940             NULL, 0, 0);
 941 
 942         if (mp != NULL) {
 943                 ASSERT(eager->tcp_rcv_cnt == 0);
 944 
 945                 eager->tcp_rcv_list = mp;
 946                 eager->tcp_rcv_cnt = msgdsize(mp);
 947                 while (mp->b_next != NULL) {
 948                         mp = mp->b_next;
 949                         eager->tcp_rcv_cnt += msgdsize(mp);
 950                 }
 951                 eager->tcp_rcv_last_head = mp;
 952                 while (mp->b_cont)
 953                         mp = mp->b_cont;
 954                 eager->tcp_rcv_last_tail = mp;
 955                 if (eager->tcp_rcv_cnt > eager->tcp_rwnd)
 956                         eager->tcp_rwnd = 0;
 957                 else
 958                         eager->tcp_rwnd -= eager->tcp_rcv_cnt;
 959         }
 960 
 961         if (!issocket)
 962                 eager->tcp_issocket = B_FALSE;
 963         /*
 964          * The stream for this eager does not yet exist, so mark it as
 965          * being detached.
 966          */
 967         eager->tcp_detached = B_TRUE;
 968         eager->tcp_hard_binding = B_TRUE;
 969         connp->conn_rq = listener->tcp_connp->conn_rq;
 970         connp->conn_wq = listener->tcp_connp->conn_wq;
 971 
 972         /* Send up the connection indication */
 973         mp = eager->tcp_conn.tcp_eager_conn_ind;
 974         ASSERT(mp != NULL);
 975         eager->tcp_conn.tcp_eager_conn_ind = NULL;
 976 
 977         /*
 978          * TLI/XTI applications will get confused by
 979          * sending eager as an option since it violates
 980          * the option semantics. So remove the eager as
 981          * option since TLI/XTI app doesn't need it anyway.
 982          */
 983         if (!issocket) {
 984                 struct T_conn_ind *conn_ind;
 985 
 986                 conn_ind = (struct T_conn_ind *)mp->b_rptr;
 987                 conn_ind->OPT_length = 0;
 988                 conn_ind->OPT_offset = 0;
 989         }
 990 
 991         /*
 992          * Sockfs guarantees that the listener will not be closed
 993          * during fallback. So we can safely use the listener's queue.
 994          */
 995         putnext(listener->tcp_connp->conn_rq, mp);
 996 }
 997 
 998 
 999 int
1000 tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
1001     boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb,
1002     sock_quiesce_arg_t *arg)
1003 {
1004         tcp_t                   *tcp;
1005         conn_t                  *connp = (conn_t *)proto_handle;
1006         int                     error;
1007         mblk_t                  *stropt_mp;
1008         mblk_t                  *ordrel_mp;
1009 
1010         tcp = connp->conn_tcp;
1011 
1012         stropt_mp = allocb_wait(sizeof (struct stroptions), BPRI_HI, STR_NOSIG,
1013             NULL);
1014 
1015         /* Pre-allocate the T_ordrel_ind mblk. */
1016         ASSERT(tcp->tcp_ordrel_mp == NULL);
1017         ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI,
1018             STR_NOSIG, NULL);
1019         ordrel_mp->b_datap->db_type = M_PROTO;
1020         ((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND;
1021         ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind);
1022 
1023         /*
1024          * Enter the squeue so that no new packets can come in
1025          */
1026         error = squeue_synch_enter(connp, NULL);
1027         if (error != 0) {
1028                 /* failed to enter, free all the pre-allocated messages. */
1029                 freeb(stropt_mp);
1030                 freeb(ordrel_mp);
1031                 return (ENOMEM);
1032         }
1033 
1034         /*
1035          * Both endpoints must be of the same type (either STREAMS or
1036          * non-STREAMS) for fusion to be enabled. So if we are fused,
1037          * we have to unfuse.
1038          */
1039         if (tcp->tcp_fused)
1040                 tcp_unfuse(tcp);
1041 
1042         if (tcp->tcp_listener != NULL) {
1043                 /* The eager will deal with opts when accept() is called */
1044                 freeb(stropt_mp);
1045                 tcp_fallback_eager(tcp, direct_sockfs, quiesced_cb, arg);
1046         } else {
1047                 tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs,
1048                     quiesced_cb, arg);
1049         }
1050 
1051         /*
1052          * No longer a direct socket
1053          *
1054          * Note that we intentionally leave the upper_handle and upcalls
1055          * intact, since eagers may still be using them.
1056          */
1057         connp->conn_flags &= ~IPCL_NONSTR;
1058         tcp->tcp_ordrel_mp = ordrel_mp;
1059 
1060         /*
1061          * There should be atleast two ref's (IP + TCP)
1062          */
1063         ASSERT(connp->conn_ref >= 2);
1064         squeue_synch_exit(connp);
1065 
1066         return (0);
1067 }
1068 
1069 /*
1070  * Notifies a non-STREAMS based listener about a new connection. This
1071  * function is executed on the *eager*'s squeue once the 3 way handshake
1072  * has completed. Note that the behavior differs from STREAMS, where the
1073  * T_CONN_IND is sent up by tcp_send_conn_ind() while on the *listener*'s
1074  * squeue.
1075  *
1076  * Returns B_TRUE if the notification succeeded and an upper handle was
1077  * obtained. `tcp' should be closed on failure.
1078  */
1079 boolean_t
1080 tcp_newconn_notify(tcp_t *tcp, ip_recv_attr_t *ira)
1081 {
1082         tcp_t *listener = tcp->tcp_listener;
1083         conn_t *lconnp = listener->tcp_connp;
1084         conn_t *econnp = tcp->tcp_connp;
1085         tcp_t *tail;
1086         ipaddr_t *addr_cache;
1087         sock_upper_handle_t upper;
1088         struct sock_proto_props sopp;
1089 
1090         mutex_enter(&listener->tcp_eager_lock);
1091         /*
1092          * Take the eager out, if it is in the list of droppable eagers
1093          * as we are here because the 3W handshake is over.
1094          */
1095         MAKE_UNDROPPABLE(tcp);
1096         /*
1097          * The eager already has an extra ref put in tcp_input_data
1098          * so that it stays till accept comes back even though it
1099          * might get into TCPS_CLOSED as a result of a TH_RST etc.
1100          */
1101         ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
1102         listener->tcp_conn_req_cnt_q0--;
1103         listener->tcp_conn_req_cnt_q++;
1104 
1105         /* Move from SYN_RCVD to ESTABLISHED list  */
1106         tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = tcp->tcp_eager_prev_q0;
1107         tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp->tcp_eager_next_q0;
1108         tcp->tcp_eager_prev_q0 = NULL;
1109         tcp->tcp_eager_next_q0 = NULL;
1110 
1111         /*
1112          * Insert at end of the queue because connections are accepted
1113          * in chronological order. Leaving the older connections at front
1114          * of the queue helps reducing search time.
1115          */
1116         tail = listener->tcp_eager_last_q;
1117         if (tail != NULL)
1118                 tail->tcp_eager_next_q = tcp;
1119         else
1120                 listener->tcp_eager_next_q = tcp;
1121         listener->tcp_eager_last_q = tcp;
1122         tcp->tcp_eager_next_q = NULL;
1123 
1124         /* we have timed out before */
1125         if (tcp->tcp_syn_rcvd_timeout != 0) {
1126                 tcp->tcp_syn_rcvd_timeout = 0;
1127                 listener->tcp_syn_rcvd_timeout--;
1128                 if (listener->tcp_syn_defense &&
1129                     listener->tcp_syn_rcvd_timeout <=
1130                     (listener->tcp_tcps->tcps_conn_req_max_q0 >> 5) &&
1131                     10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() -
1132                     listener->tcp_last_rcv_lbolt)) {
1133                         /*
1134                          * Turn off the defense mode if we
1135                          * believe the SYN attack is over.
1136                          */
1137                         listener->tcp_syn_defense = B_FALSE;
1138                         if (listener->tcp_ip_addr_cache) {
1139                                 kmem_free((void *)listener->tcp_ip_addr_cache,
1140                                     IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
1141                                 listener->tcp_ip_addr_cache = NULL;
1142                         }
1143                 }
1144         }
1145         addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
1146         if (addr_cache != NULL) {
1147                 /*
1148                  * We have finished a 3-way handshake with this
1149                  * remote host. This proves the IP addr is good.
1150                  * Cache it!
1151                  */
1152                 addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] =
1153                     tcp->tcp_connp->conn_faddr_v4;
1154         }
1155         mutex_exit(&listener->tcp_eager_lock);
1156 
1157         /*
1158          * Notify the ULP about the newconn. It is guaranteed that no
1159          * tcp_accept() call will be made for the eager if the
1160          * notification fails.
1161          */
1162         if ((upper = (*lconnp->conn_upcalls->su_newconn)
1163             (lconnp->conn_upper_handle, (sock_lower_handle_t)econnp,
1164             &sock_tcp_downcalls, ira->ira_cred, ira->ira_cpid,
1165             &econnp->conn_upcalls)) == NULL) {
1166                 return (B_FALSE);
1167         }
1168         econnp->conn_upper_handle = upper;
1169 
1170         tcp->tcp_detached = B_FALSE;
1171         tcp->tcp_hard_binding = B_FALSE;
1172         tcp->tcp_tconnind_started = B_TRUE;
1173 
1174         if (econnp->conn_keepalive) {
1175                 tcp->tcp_ka_last_intrvl = 0;
1176                 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
1177                     tcp->tcp_ka_interval);
1178         }
1179 
1180         /* Update the necessary parameters */
1181         tcp_get_proto_props(tcp, &sopp);
1182 
1183         (*econnp->conn_upcalls->su_set_proto_props)
1184             (econnp->conn_upper_handle, &sopp);
1185 
1186         return (B_TRUE);
1187 }