1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /* This file contains all TCP kernel socket related functions. */
  27 
  28 #include <sys/types.h>
  29 #include <sys/strlog.h>
  30 #include <sys/policy.h>
  31 #include <sys/sockio.h>
  32 #include <sys/strsubr.h>
  33 #include <sys/strsun.h>
  34 #include <sys/squeue_impl.h>
  35 #include <sys/squeue.h>
  36 #define _SUN_TPI_VERSION 2
  37 #include <sys/tihdr.h>
  38 #include <sys/timod.h>
  39 #include <sys/tpicommon.h>
  40 #include <sys/socketvar.h>
  41 
  42 #include <inet/common.h>
  43 #include <inet/proto_set.h>
  44 #include <inet/ip.h>
  45 #include <inet/tcp.h>
  46 #include <inet/tcp_impl.h>
  47 
  48 static void     tcp_activate(sock_lower_handle_t, sock_upper_handle_t,
  49                     sock_upcalls_t *, int, cred_t *);
  50 static int      tcp_accept(sock_lower_handle_t, sock_lower_handle_t,
  51                     sock_upper_handle_t, cred_t *);
  52 static int      tcp_bind(sock_lower_handle_t, struct sockaddr *,
  53                     socklen_t, cred_t *);
  54 static int      tcp_listen(sock_lower_handle_t, int, cred_t *);
  55 static int      tcp_connect(sock_lower_handle_t, const struct sockaddr *,
  56                     socklen_t, sock_connid_t *, cred_t *);
  57 static int      tcp_getsockopt(sock_lower_handle_t, int, int, void *,
  58                     socklen_t *, cred_t *);
  59 static int      tcp_setsockopt(sock_lower_handle_t, int, int, const void *,
  60                     socklen_t, cred_t *);
  61 static int      tcp_sendmsg(sock_lower_handle_t, mblk_t *, struct nmsghdr *,
  62                     cred_t *cr);
  63 static int      tcp_shutdown(sock_lower_handle_t, int, cred_t *);
  64 static void     tcp_clr_flowctrl(sock_lower_handle_t);
  65 static int      tcp_ioctl(sock_lower_handle_t, int, intptr_t, int, int32_t *,
  66                     cred_t *);
  67 static int      tcp_close(sock_lower_handle_t, int, cred_t *);
  68 
/*
 * Downcall vector for TCP sockets. tcp_create() hands a pointer to this
 * table back to its caller through the sock_downcalls out-parameter.
 * The initializer is positional, so the entry order must match the member
 * order of sock_downcalls_t.
 */
sock_downcalls_t sock_tcp_downcalls = {
        tcp_activate,
        tcp_accept,
        tcp_bind,
        tcp_listen,
        tcp_connect,
        tcp_getpeername,
        tcp_getsockname,
        tcp_getsockopt,
        tcp_setsockopt,
        tcp_sendmsg,
        /*
         * NOTE(review): the next three slots are left NULL; presumably they
         * are optional downcalls that TCP does not implement -- confirm
         * against the definition of sock_downcalls_t.
         */
        NULL,
        NULL,
        NULL,
        tcp_shutdown,
        tcp_clr_flowctrl,
        tcp_ioctl,
        tcp_close,
};
  88 
  89 /* ARGSUSED */
  90 static void
  91 tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle,
  92     sock_upcalls_t *sock_upcalls, int flags, cred_t *cr)
  93 {
  94         conn_t *connp = (conn_t *)proto_handle;
  95         struct sock_proto_props sopp;
  96         extern struct module_info tcp_rinfo;
  97 
  98         ASSERT(connp->conn_upper_handle == NULL);
  99 
 100         /* All Solaris components should pass a cred for this operation. */
 101         ASSERT(cr != NULL);
 102 
 103         sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
 104             SOCKOPT_MAXPSZ | SOCKOPT_MAXBLK | SOCKOPT_RCVTIMER |
 105             SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ;
 106 
 107         sopp.sopp_rxhiwat = SOCKET_RECVHIWATER;
 108         sopp.sopp_rxlowat = SOCKET_RECVLOWATER;
 109         sopp.sopp_maxpsz = INFPSZ;
 110         sopp.sopp_maxblk = INFPSZ;
 111         sopp.sopp_rcvtimer = SOCKET_TIMER_INTERVAL;
 112         sopp.sopp_rcvthresh = SOCKET_RECVHIWATER >> 3;
 113         sopp.sopp_maxaddrlen = sizeof (sin6_t);
 114         sopp.sopp_minpsz = (tcp_rinfo.mi_minpsz == 1) ? 0 :
 115             tcp_rinfo.mi_minpsz;
 116 
 117         connp->conn_upcalls = sock_upcalls;
 118         connp->conn_upper_handle = sock_handle;
 119 
 120         ASSERT(connp->conn_rcvbuf != 0 &&
 121             connp->conn_rcvbuf == connp->conn_tcp->tcp_rwnd);
 122         (*sock_upcalls->su_set_proto_props)(sock_handle, &sopp);
 123 }
 124 
/*
 * Accept downcall: finish handing the eager connection `eproto_handle' to
 * the socket identified by `sock_handle'.
 *
 * The listener is derived from the eager rather than from `lproto_handle'
 * (see the KSSL note below), so `lproto_handle' and `cr' are unused. The
 * eager gets its upper handle and the listener's upcall vector, is unlinked
 * from the listener's eager list under tcp_eager_lock, and one reference on
 * the listener's conn is dropped.
 *
 * Returns ECONNABORTED if the eager has not reached TCPS_ESTABLISHED,
 * otherwise 0.
 */
/*ARGSUSED*/
static int
tcp_accept(sock_lower_handle_t lproto_handle,
    sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
    cred_t *cr)
{
        conn_t *lconnp, *econnp;
        tcp_t *listener, *eager;

        /*
         * KSSL can move a socket from one listener to another, in which
         * case `lproto_handle' points to the new listener. To ensure that
         * the original listener is used the information is obtained from
         * the eager.
         */
        econnp = (conn_t *)eproto_handle;
        eager = econnp->conn_tcp;
        ASSERT(IPCL_IS_NONSTR(econnp));
        ASSERT(eager->tcp_listener != NULL);
        listener = eager->tcp_listener;
        lconnp = (conn_t *)listener->tcp_connp;
        ASSERT(listener->tcp_state == TCPS_LISTEN);
        ASSERT(lconnp->conn_upper_handle != NULL);

        /*
         * It is possible for the accept thread to race with the thread that
         * made the su_newconn upcall in tcp_newconn_notify. Both
         * tcp_newconn_notify and tcp_accept require that conn_upper_handle
         * and conn_upcalls be set before returning, so they both write to
         * them. However, we're guaranteed that the value written is the same
         * for both threads.
         */
        ASSERT(econnp->conn_upper_handle == NULL ||
            econnp->conn_upper_handle == sock_handle);
        ASSERT(econnp->conn_upcalls == NULL ||
            econnp->conn_upcalls == lconnp->conn_upcalls);
        econnp->conn_upper_handle = sock_handle;
        econnp->conn_upcalls = lconnp->conn_upcalls;

        /* Eager and listener must belong to the same stack instance. */
        ASSERT(econnp->conn_netstack ==
            listener->tcp_connp->conn_netstack);
        ASSERT(eager->tcp_tcps == listener->tcp_tcps);

        /*
         * We should have a minimum of 2 references on the conn at this
         * point. One for TCP and one for the newconn notification
         * (which is now taken over by IP). In the normal case we would
         * also have another reference (making a total of 3) for the conn
         * being in the classifier hash list. However the eager could have
         * received an RST subsequently and tcp_closei_local could have
         * removed the eager from the classifier hash list, hence we can't
         * assert that reference.
         */
        ASSERT(econnp->conn_ref >= 2);

        mutex_enter(&listener->tcp_eager_lock);
        /*
         * Non-STREAMS listeners never defer the notification of new
         * connections.
         */
        ASSERT(!listener->tcp_eager_prev_q0->tcp_conn_def_q0);
        tcp_eager_unlink(eager);
        mutex_exit(&listener->tcp_eager_lock);
        /*
         * NOTE(review): presumably this releases the listener reference
         * that was held on behalf of the eager -- confirm against the code
         * that takes it.
         */
        CONN_DEC_REF(listener->tcp_connp);

        /* An eager that never became established is reported as aborted. */
        return ((eager->tcp_state < TCPS_ESTABLISHED) ? ECONNABORTED : 0);
}
 192 
 193 static int
 194 tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
 195     socklen_t len, cred_t *cr)
 196 {
 197         int             error;
 198         conn_t          *connp = (conn_t *)proto_handle;
 199 
 200         /* All Solaris components should pass a cred for this operation. */
 201         ASSERT(cr != NULL);
 202         ASSERT(connp->conn_upper_handle != NULL);
 203 
 204         error = squeue_synch_enter(connp, NULL);
 205         if (error != 0) {
 206                 /* failed to enter */
 207                 return (ENOSR);
 208         }
 209 
 210         /* binding to a NULL address really means unbind */
 211         if (sa == NULL) {
 212                 if (connp->conn_tcp->tcp_state < TCPS_LISTEN)
 213                         error = tcp_do_unbind(connp);
 214                 else
 215                         error = EINVAL;
 216         } else {
 217                 error = tcp_do_bind(connp, sa, len, cr, B_TRUE);
 218         }
 219 
 220         squeue_synch_exit(connp);
 221 
 222         if (error < 0) {
 223                 if (error == -TOUTSTATE)
 224                         error = EINVAL;
 225                 else
 226                         error = proto_tlitosyserr(-error);
 227         }
 228 
 229         return (error);
 230 }
 231 
 232 /* ARGSUSED */
 233 static int
 234 tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
 235 {
 236         conn_t  *connp = (conn_t *)proto_handle;
 237         tcp_t   *tcp = connp->conn_tcp;
 238         int     error;
 239 
 240         ASSERT(connp->conn_upper_handle != NULL);
 241 
 242         /* All Solaris components should pass a cred for this operation. */
 243         ASSERT(cr != NULL);
 244 
 245         error = squeue_synch_enter(connp, NULL);
 246         if (error != 0) {
 247                 /* failed to enter */
 248                 return (ENOBUFS);
 249         }
 250 
 251         error = tcp_do_listen(connp, NULL, 0, backlog, cr, B_FALSE);
 252         if (error == 0) {
 253                 /*
 254                  * sockfs needs to know what's the maximum number of socket
 255                  * that can be queued on the listener.
 256                  */
 257                 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
 258                     SOCK_OPCTL_ENAB_ACCEPT,
 259                     (uintptr_t)(tcp->tcp_conn_req_max +
 260                     tcp->tcp_tcps->tcps_conn_req_max_q0));
 261         } else if (error < 0) {
 262                 if (error == -TOUTSTATE)
 263                         error = EINVAL;
 264                 else
 265                         error = proto_tlitosyserr(-error);
 266         }
 267         squeue_synch_exit(connp);
 268         return (error);
 269 }
 270 
 271 static int
 272 tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
 273     socklen_t len, sock_connid_t *id, cred_t *cr)
 274 {
 275         conn_t          *connp = (conn_t *)proto_handle;
 276         int             error;
 277 
 278         ASSERT(connp->conn_upper_handle != NULL);
 279 
 280         /* All Solaris components should pass a cred for this operation. */
 281         ASSERT(cr != NULL);
 282 
 283         error = proto_verify_ip_addr(connp->conn_family, sa, len);
 284         if (error != 0) {
 285                 return (error);
 286         }
 287 
 288         error = squeue_synch_enter(connp, NULL);
 289         if (error != 0) {
 290                 /* failed to enter */
 291                 return (ENOSR);
 292         }
 293 
 294         /*
 295          * TCP supports quick connect, so no need to do an implicit bind
 296          */
 297         error = tcp_do_connect(connp, sa, len, cr, curproc->p_pid);
 298         if (error == 0) {
 299                 *id = connp->conn_tcp->tcp_connid;
 300         } else if (error < 0) {
 301                 if (error == -TOUTSTATE) {
 302                         switch (connp->conn_tcp->tcp_state) {
 303                         case TCPS_SYN_SENT:
 304                                 error = EALREADY;
 305                                 break;
 306                         case TCPS_ESTABLISHED:
 307                                 error = EISCONN;
 308                                 break;
 309                         case TCPS_LISTEN:
 310                                 error = EOPNOTSUPP;
 311                                 break;
 312                         default:
 313                                 error = EINVAL;
 314                                 break;
 315                         }
 316                 } else {
 317                         error = proto_tlitosyserr(-error);
 318                 }
 319         }
 320 
 321         if (connp->conn_tcp->tcp_loopback) {
 322                 struct sock_proto_props sopp;
 323 
 324                 sopp.sopp_flags = SOCKOPT_LOOPBACK;
 325                 sopp.sopp_loopback = B_TRUE;
 326 
 327                 (*connp->conn_upcalls->su_set_proto_props)(
 328                     connp->conn_upper_handle, &sopp);
 329         }
 330 done:
 331         squeue_synch_exit(connp);
 332 
 333         return ((error == 0) ? EINPROGRESS : error);
 334 }
 335 
 336 /* ARGSUSED3 */
 337 int
 338 tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr,
 339     socklen_t *addrlenp, cred_t *cr)
 340 {
 341         conn_t  *connp = (conn_t *)proto_handle;
 342         tcp_t   *tcp = connp->conn_tcp;
 343 
 344         /* All Solaris components should pass a cred for this operation. */
 345         ASSERT(cr != NULL);
 346 
 347         ASSERT(tcp != NULL);
 348         if (tcp->tcp_state < TCPS_SYN_RCVD)
 349                 return (ENOTCONN);
 350 
 351         return (conn_getpeername(connp, addr, addrlenp));
 352 }
 353 
 354 /* ARGSUSED3 */
 355 int
 356 tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr,
 357     socklen_t *addrlenp, cred_t *cr)
 358 {
 359         conn_t  *connp = (conn_t *)proto_handle;
 360 
 361         /* All Solaris components should pass a cred for this operation. */
 362         ASSERT(cr != NULL);
 363 
 364         return (conn_getsockname(connp, addr, addrlenp));
 365 }
 366 
 367 /* returns UNIX error, the optlen is a value-result arg */
 368 static int
 369 tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
 370     void *optvalp, socklen_t *optlen, cred_t *cr)
 371 {
 372         conn_t          *connp = (conn_t *)proto_handle;
 373         int             error;
 374         t_uscalar_t     max_optbuf_len;
 375         void            *optvalp_buf;
 376         int             len;
 377 
 378         ASSERT(connp->conn_upper_handle != NULL);
 379 
 380         error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
 381             tcp_opt_obj.odb_opt_des_arr,
 382             tcp_opt_obj.odb_opt_arr_cnt,
 383             B_FALSE, B_TRUE, cr);
 384         if (error != 0) {
 385                 if (error < 0) {
 386                         error = proto_tlitosyserr(-error);
 387                 }
 388                 return (error);
 389         }
 390 
 391         optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
 392 
 393         error = squeue_synch_enter(connp, NULL);
 394         if (error == ENOMEM) {
 395                 kmem_free(optvalp_buf, max_optbuf_len);
 396                 return (ENOMEM);
 397         }
 398 
 399         len = tcp_opt_get(connp, level, option_name, optvalp_buf);
 400         squeue_synch_exit(connp);
 401 
 402         if (len == -1) {
 403                 kmem_free(optvalp_buf, max_optbuf_len);
 404                 return (EINVAL);
 405         }
 406 
 407         /*
 408          * update optlen and copy option value
 409          */
 410         t_uscalar_t size = MIN(len, *optlen);
 411 
 412         bcopy(optvalp_buf, optvalp, size);
 413         bcopy(&size, optlen, sizeof (size));
 414 
 415         kmem_free(optvalp_buf, max_optbuf_len);
 416         return (0);
 417 }
 418 
 419 static int
 420 tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
 421     const void *optvalp, socklen_t optlen, cred_t *cr)
 422 {
 423         conn_t          *connp = (conn_t *)proto_handle;
 424         int             error;
 425 
 426         ASSERT(connp->conn_upper_handle != NULL);
 427         /*
 428          * Entering the squeue synchronously can result in a context switch,
 429          * which can cause a rather sever performance degradation. So we try to
 430          * handle whatever options we can without entering the squeue.
 431          */
 432         if (level == IPPROTO_TCP) {
 433                 switch (option_name) {
 434                 case TCP_NODELAY:
 435                         if (optlen != sizeof (int32_t))
 436                                 return (EINVAL);
 437                         mutex_enter(&connp->conn_tcp->tcp_non_sq_lock);
 438                         connp->conn_tcp->tcp_naglim = *(int *)optvalp ? 1 :
 439                             connp->conn_tcp->tcp_mss;
 440                         mutex_exit(&connp->conn_tcp->tcp_non_sq_lock);
 441                         return (0);
 442                 default:
 443                         break;
 444                 }
 445         }
 446 
 447         error = squeue_synch_enter(connp, NULL);
 448         if (error == ENOMEM) {
 449                 return (ENOMEM);
 450         }
 451 
 452         error = proto_opt_check(level, option_name, optlen, NULL,
 453             tcp_opt_obj.odb_opt_des_arr,
 454             tcp_opt_obj.odb_opt_arr_cnt,
 455             B_TRUE, B_FALSE, cr);
 456 
 457         if (error != 0) {
 458                 if (error < 0) {
 459                         error = proto_tlitosyserr(-error);
 460                 }
 461                 squeue_synch_exit(connp);
 462                 return (error);
 463         }
 464 
 465         error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name,
 466             optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp,
 467             NULL, cr);
 468         squeue_synch_exit(connp);
 469 
 470         ASSERT(error >= 0);
 471 
 472         return (error);
 473 }
 474 
 475 /* ARGSUSED */
 476 static int
 477 tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
 478     cred_t *cr)
 479 {
 480         tcp_t           *tcp;
 481         uint32_t        msize;
 482         conn_t *connp = (conn_t *)proto_handle;
 483         int32_t         tcpstate;
 484 
 485         /* All Solaris components should pass a cred for this operation. */
 486         ASSERT(cr != NULL);
 487 
 488         ASSERT(connp->conn_ref >= 2);
 489         ASSERT(connp->conn_upper_handle != NULL);
 490 
 491         if (msg->msg_controllen != 0) {
 492                 freemsg(mp);
 493                 return (EOPNOTSUPP);
 494         }
 495 
 496         switch (DB_TYPE(mp)) {
 497         case M_DATA:
 498                 tcp = connp->conn_tcp;
 499                 ASSERT(tcp != NULL);
 500 
 501                 tcpstate = tcp->tcp_state;
 502                 if (tcpstate < TCPS_ESTABLISHED) {
 503                         freemsg(mp);
 504                         /*
 505                          * We return ENOTCONN if the endpoint is trying to
 506                          * connect or has never been connected, and EPIPE if it
 507                          * has been disconnected. The connection id helps us
 508                          * distinguish between the last two cases.
 509                          */
 510                         return ((tcpstate == TCPS_SYN_SENT) ? ENOTCONN :
 511                             ((tcp->tcp_connid > 0) ? EPIPE : ENOTCONN));
 512                 } else if (tcpstate > TCPS_CLOSE_WAIT) {
 513                         freemsg(mp);
 514                         return (EPIPE);
 515                 }
 516 
 517                 msize = msgdsize(mp);
 518 
 519                 mutex_enter(&tcp->tcp_non_sq_lock);
 520                 tcp->tcp_squeue_bytes += msize;
 521                 /*
 522                  * Squeue Flow Control
 523                  */
 524                 if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) {
 525                         tcp_setqfull(tcp);
 526                 }
 527                 mutex_exit(&tcp->tcp_non_sq_lock);
 528 
 529                 /*
 530                  * The application may pass in an address in the msghdr, but
 531                  * we ignore the address on connection-oriented sockets.
 532                  * Just like BSD this code does not generate an error for
 533                  * TCP (a CONNREQUIRED socket) when sending to an address
 534                  * passed in with sendto/sendmsg. Instead the data is
 535                  * delivered on the connection as if no address had been
 536                  * supplied.
 537                  */
 538                 CONN_INC_REF(connp);
 539 
 540                 if (msg->msg_flags & MSG_OOB) {
 541                         SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent,
 542                             connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
 543                 } else {
 544                         SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output,
 545                             connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
 546                 }
 547 
 548                 return (0);
 549 
 550         default:
 551                 ASSERT(0);
 552         }
 553 
 554         freemsg(mp);
 555         return (0);
 556 }
 557 
 558 /* ARGSUSED */
 559 static int
 560 tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
 561 {
 562         conn_t  *connp = (conn_t *)proto_handle;
 563         tcp_t   *tcp = connp->conn_tcp;
 564 
 565         ASSERT(connp->conn_upper_handle != NULL);
 566 
 567         /* All Solaris components should pass a cred for this operation. */
 568         ASSERT(cr != NULL);
 569 
 570         /*
 571          * X/Open requires that we check the connected state.
 572          */
 573         if (tcp->tcp_state < TCPS_SYN_SENT)
 574                 return (ENOTCONN);
 575 
 576         /* shutdown the send side */
 577         if (how != SHUT_RD) {
 578                 mblk_t *bp;
 579 
 580                 bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL);
 581                 CONN_INC_REF(connp);
 582                 SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output,
 583                     connp, NULL, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT);
 584 
 585                 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
 586                     SOCK_OPCTL_SHUT_SEND, 0);
 587         }
 588 
 589         /* shutdown the recv side */
 590         if (how != SHUT_WR)
 591                 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
 592                     SOCK_OPCTL_SHUT_RECV, 0);
 593 
 594         return (0);
 595 }
 596 
/*
 * Clear-flow-control downcall.
 *
 * The reserved mblk tcp_rsrv_mp serves two purposes here: taking it (under
 * tcp_rsrv_mp_lock) marks this function as running, so a concurrent call
 * returns immediately, and it is handed to squeue_synch_enter() as the
 * message used to enter the squeue. Once inside the squeue the mblk is put
 * back.
 *
 * Inside the squeue a fused endpoint is back-enabled through
 * tcp_fuse_backenable(); otherwise the receive window is reset to the
 * conn's receive buffer size and, if warranted, an ACK carrying the window
 * update is sent.
 */
static void
tcp_clr_flowctrl(sock_lower_handle_t proto_handle)
{
        conn_t  *connp = (conn_t *)proto_handle;
        tcp_t   *tcp = connp->conn_tcp;
        mblk_t *mp;
        int error;

        ASSERT(connp->conn_upper_handle != NULL);

        /*
         * If tcp->tcp_rsrv_mp == NULL, it means that tcp_clr_flowctrl()
         * is currently running.
         */
        mutex_enter(&tcp->tcp_rsrv_mp_lock);
        if ((mp = tcp->tcp_rsrv_mp) == NULL) {
                mutex_exit(&tcp->tcp_rsrv_mp_lock);
                return;
        }
        tcp->tcp_rsrv_mp = NULL;
        mutex_exit(&tcp->tcp_rsrv_mp_lock);

        /*
         * Enter the squeue using the reserved mblk. `error' is only
         * examined by the ASSERT below.
         */
        error = squeue_synch_enter(connp, mp);
        ASSERT(error == 0);

        /* Put the reserved mblk back so that later calls can run. */
        mutex_enter(&tcp->tcp_rsrv_mp_lock);
        tcp->tcp_rsrv_mp = mp;
        mutex_exit(&tcp->tcp_rsrv_mp_lock);

        if (tcp->tcp_fused) {
                tcp_fuse_backenable(tcp);
        } else {
                tcp->tcp_rwnd = connp->conn_rcvbuf;
                /*
                 * Send back a window update immediately if TCP is above
                 * ESTABLISHED state and the increase of the rcv window
                 * that the other side knows is at least 1 MSS after flow
                 * control is lifted.
                 */
                if (tcp->tcp_state >= TCPS_ESTABLISHED &&
                    tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
                        /*
                         * The sequence number used is tcp_suna when the
                         * send window is zero, tcp_snxt otherwise.
                         */
                        tcp_xmit_ctl(NULL, tcp,
                            (tcp->tcp_swnd == 0) ? tcp->tcp_suna :
                            tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
                }
        }

        squeue_synch_exit(connp);
}
 646 
 647 /* ARGSUSED */
 648 static int
 649 tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
 650     int mode, int32_t *rvalp, cred_t *cr)
 651 {
 652         conn_t          *connp = (conn_t *)proto_handle;
 653         int             error;
 654 
 655         ASSERT(connp->conn_upper_handle != NULL);
 656 
 657         /* All Solaris components should pass a cred for this operation. */
 658         ASSERT(cr != NULL);
 659 
 660         /*
 661          * If we don't have a helper stream then create one.
 662          * ip_create_helper_stream takes care of locking the conn_t,
 663          * so this check for NULL is just a performance optimization.
 664          */
 665         if (connp->conn_helper_info == NULL) {
 666                 tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps;
 667 
 668                 /*
 669                  * Create a helper stream for non-STREAMS socket.
 670                  */
 671                 error = ip_create_helper_stream(connp, tcps->tcps_ldi_ident);
 672                 if (error != 0) {
 673                         ip0dbg(("tcp_ioctl: create of IP helper stream "
 674                             "failed %d\n", error));
 675                         return (error);
 676                 }
 677         }
 678 
 679         switch (cmd) {
 680                 case ND_SET:
 681                 case ND_GET:
 682                 case _SIOCSOCKFALLBACK:
 683                 case TCP_IOC_ABORT_CONN:
 684                 case TI_GETPEERNAME:
 685                 case TI_GETMYNAME:
 686                         ip1dbg(("tcp_ioctl: cmd 0x%x on non streams socket",
 687                             cmd));
 688                         error = EINVAL;
 689                         break;
 690                 default:
 691                         /*
 692                          * If the conn is not closing, pass on to IP using
 693                          * helper stream. Bump the ioctlref to prevent tcp_close
 694                          * from closing the rq/wq out from underneath the ioctl
 695                          * if it ends up queued or aborted/interrupted.
 696                          */
 697                         mutex_enter(&connp->conn_lock);
 698                         if (connp->conn_state_flags & (CONN_CLOSING)) {
 699                                 mutex_exit(&connp->conn_lock);
 700                                 error = EINVAL;
 701                                 break;
 702                         }
 703                         CONN_INC_IOCTLREF_LOCKED(connp);
 704                         error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
 705                             cmd, arg, mode, cr, rvalp);
 706                         CONN_DEC_IOCTLREF(connp);
 707                         break;
 708         }
 709         return (error);
 710 }
 711 
 712 /* ARGSUSED */
 713 static int
 714 tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
 715 {
 716         conn_t *connp = (conn_t *)proto_handle;
 717 
 718         ASSERT(connp->conn_upper_handle != NULL);
 719 
 720         /* All Solaris components should pass a cred for this operation. */
 721         ASSERT(cr != NULL);
 722 
 723         tcp_close_common(connp, flags);
 724 
 725         ip_free_helper_stream(connp);
 726 
 727         /*
 728          * Drop IP's reference on the conn. This is the last reference
 729          * on the connp if the state was less than established. If the
 730          * connection has gone into timewait state, then we will have
 731          * one ref for the TCP and one more ref (total of two) for the
 732          * classifier connected hash list (a timewait connections stays
 733          * in connected hash till closed).
 734          *
 735          * We can't assert the references because there might be other
 736          * transient reference places because of some walkers or queued
 737          * packets in squeue for the timewait state.
 738          */
 739         CONN_DEC_REF(connp);
 740 
 741         /*
 742          * EINPROGRESS tells sockfs to wait for a 'closed' upcall before
 743          * freeing the socket.
 744          */
 745         return (EINPROGRESS);
 746 }
 747 
 748 /* ARGSUSED */
 749 sock_lower_handle_t
 750 tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
 751     uint_t *smodep, int *errorp, int flags, cred_t *credp)
 752 {
 753         conn_t          *connp;
 754         boolean_t       isv6 = family == AF_INET6;
 755         if (type != SOCK_STREAM || (family != AF_INET && family != AF_INET6) ||
 756             (proto != 0 && proto != IPPROTO_TCP)) {
 757                 *errorp = EPROTONOSUPPORT;
 758                 return (NULL);
 759         }
 760 
 761         connp = tcp_create_common(credp, isv6, B_TRUE, errorp);
 762         if (connp == NULL) {
 763                 return (NULL);
 764         }
 765 
 766         /*
 767          * Put the ref for TCP. Ref for IP was already put
 768          * by ipcl_conn_create. Also Make the conn_t globally
 769          * visible to walkers
 770          */
 771         mutex_enter(&connp->conn_lock);
 772         CONN_INC_REF_LOCKED(connp);
 773         ASSERT(connp->conn_ref == 2);
 774         connp->conn_state_flags &= ~CONN_INCIPIENT;
 775 
 776         connp->conn_flags |= IPCL_NONSTR;
 777         mutex_exit(&connp->conn_lock);
 778 
 779         ASSERT(errorp != NULL);
 780         *errorp = 0;
 781         *sock_downcalls = &sock_tcp_downcalls;
 782         *smodep = SM_CONNREQUIRED | SM_EXDATA | SM_ACCEPTSUPP |
 783             SM_SENDFILESUPP;
 784 
 785         return ((sock_lower_handle_t)connp);
 786 }
 787 
 788 /*
 789  * tcp_fallback
 790  *
 791  * A direct socket is falling back to using STREAMS. The queue
 792  * that is being passed down was created using tcp_open() with
 793  * the SO_FALLBACK flag set. As a result, the queue is not
 794  * associated with a conn, and the q_ptrs instead contain the
 795  * dev and minor area that should be used.
 796  *
 797  * The 'issocket' flag indicates whether the FireEngine
 798  * optimizations should be used. The common case would be that
 799  * optimizations are enabled, and they might be subsequently
 800  * disabled using the _SIOCSOCKFALLBACK ioctl.
 801  */
 802 
 803 /*
 804  * An active connection is falling back to TPI. Gather all the information
 805  * required by the STREAM head and TPI sonode and send it up.
 806  */
 807 static void
 808 tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
 809     boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb,
 810     sock_quiesce_arg_t *arg)
 811 {
 812         conn_t                  *connp = tcp->tcp_connp;
 813         struct stroptions       *stropt;
 814         struct T_capability_ack tca;
 815         struct sockaddr_in6     laddr, faddr;
 816         socklen_t               laddrlen, faddrlen;
 817         short                   opts;
 818         int                     error;
 819         mblk_t                  *mp, *mpnext;
 820 
 821         connp->conn_dev = (dev_t)RD(q)->q_ptr;
 822         connp->conn_minor_arena = WR(q)->q_ptr;
 823 
 824         RD(q)->q_ptr = WR(q)->q_ptr = connp;
 825 
 826         connp->conn_rq = RD(q);
 827         connp->conn_wq = WR(q);
 828 
 829         WR(q)->q_qinfo = &tcp_sock_winit;
 830 
 831         if (!issocket)
 832                 tcp_use_pure_tpi(tcp);
 833 
 834         /*
 835          * free the helper stream
 836          */
 837         ip_free_helper_stream(connp);
 838 
 839         /*
 840          * Notify the STREAM head about options
 841          */
 842         DB_TYPE(stropt_mp) = M_SETOPTS;
 843         stropt = (struct stroptions *)stropt_mp->b_rptr;
 844         stropt_mp->b_wptr += sizeof (struct stroptions);
 845         stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
 846 
 847         stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 :
 848             tcp->tcp_tcps->tcps_wroff_xtra);
 849         if (tcp->tcp_snd_sack_ok)
 850                 stropt->so_wroff += TCPOPT_MAX_SACK_LEN;
 851         stropt->so_hiwat = connp->conn_rcvbuf;
 852         stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
 853 
 854         putnext(RD(q), stropt_mp);
 855 
 856         /*
 857          * Collect the information needed to sync with the sonode
 858          */
 859         tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID);
 860 
 861         laddrlen = faddrlen = sizeof (sin6_t);
 862         (void) tcp_getsockname((sock_lower_handle_t)connp,
 863             (struct sockaddr *)&laddr, &laddrlen, CRED());
 864         error = tcp_getpeername((sock_lower_handle_t)connp,
 865             (struct sockaddr *)&faddr, &faddrlen, CRED());
 866         if (error != 0)
 867                 faddrlen = 0;
 868 
 869         opts = 0;
 870         if (connp->conn_oobinline)
 871                 opts |= SO_OOBINLINE;
 872         if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
 873                 opts |= SO_DONTROUTE;
 874 
 875         /*
 876          * Notify the socket that the protocol is now quiescent,
 877          * and it's therefore safe move data from the socket
 878          * to the stream head.
 879          */
 880         mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca,
 881             (struct sockaddr *)&laddr, laddrlen,
 882             (struct sockaddr *)&faddr, faddrlen, opts);
 883 
 884         while (mp != NULL) {
 885                 mpnext = mp->b_next;
 886                 tcp->tcp_rcv_list = mp->b_next;
 887                 mp->b_next = NULL;
 888                 putnext(q, mp);
 889                 mp = mpnext;
 890         }
 891         ASSERT(tcp->tcp_rcv_last_head == NULL);
 892         ASSERT(tcp->tcp_rcv_last_tail == NULL);
 893         ASSERT(tcp->tcp_rcv_cnt == 0);
 894 
 895         /*
 896          * All eagers in q0 are marked as being non-STREAM, so they will
 897          * make su_newconn upcalls when the handshake completes, which
 898          * will fail (resulting in the conn being closed). So we just blow
 899          * off everything in q0 instead of waiting for the inevitable.
 900          */
 901         if (tcp->tcp_conn_req_cnt_q0 != 0)
 902                 tcp_eager_cleanup(tcp, B_TRUE);
 903 }
 904 
 905 /*
 906  * An eager is falling back to TPI. All we have to do is send
 907  * up a T_CONN_IND.
 908  */
 909 static void
 910 tcp_fallback_eager(tcp_t *eager, boolean_t issocket,
 911     so_proto_quiesced_cb_t quiesced_cb, sock_quiesce_arg_t *arg)
 912 {
 913         conn_t *connp = eager->tcp_connp;
 914         tcp_t *listener = eager->tcp_listener;
 915         mblk_t *mp;
 916 
 917         ASSERT(listener != NULL);
 918 
 919         /*
 920          * Notify the socket that the protocol is now quiescent,
 921          * and it's therefore safe move data from the socket
 922          * to tcp's rcv queue.
 923          */
 924         mp = (*quiesced_cb)(connp->conn_upper_handle, arg, NULL, NULL, 0,
 925             NULL, 0, 0);
 926 
 927         if (mp != NULL) {
 928                 ASSERT(eager->tcp_rcv_cnt == 0);
 929 
 930                 eager->tcp_rcv_list = mp;
 931                 eager->tcp_rcv_cnt = msgdsize(mp);
 932                 while (mp->b_next != NULL) {
 933                         mp = mp->b_next;
 934                         eager->tcp_rcv_cnt += msgdsize(mp);
 935                 }
 936                 eager->tcp_rcv_last_head = mp;
 937                 while (mp->b_cont)
 938                         mp = mp->b_cont;
 939                 eager->tcp_rcv_last_tail = mp;
 940                 if (eager->tcp_rcv_cnt > eager->tcp_rwnd)
 941                         eager->tcp_rwnd = 0;
 942                 else
 943                         eager->tcp_rwnd -= eager->tcp_rcv_cnt;
 944         }
 945 
 946         if (!issocket)
 947                 eager->tcp_issocket = B_FALSE;
 948         /*
 949          * The stream for this eager does not yet exist, so mark it as
 950          * being detached.
 951          */
 952         eager->tcp_detached = B_TRUE;
 953         eager->tcp_hard_binding = B_TRUE;
 954         connp->conn_rq = listener->tcp_connp->conn_rq;
 955         connp->conn_wq = listener->tcp_connp->conn_wq;
 956 
 957         /* Send up the connection indication */
 958         mp = eager->tcp_conn.tcp_eager_conn_ind;
 959         ASSERT(mp != NULL);
 960         eager->tcp_conn.tcp_eager_conn_ind = NULL;
 961 
 962         /*
 963          * TLI/XTI applications will get confused by
 964          * sending eager as an option since it violates
 965          * the option semantics. So remove the eager as
 966          * option since TLI/XTI app doesn't need it anyway.
 967          */
 968         if (!issocket) {
 969                 struct T_conn_ind *conn_ind;
 970 
 971                 conn_ind = (struct T_conn_ind *)mp->b_rptr;
 972                 conn_ind->OPT_length = 0;
 973                 conn_ind->OPT_offset = 0;
 974         }
 975 
 976         /*
 977          * Sockfs guarantees that the listener will not be closed
 978          * during fallback. So we can safely use the listener's queue.
 979          */
 980         putnext(listener->tcp_connp->conn_rq, mp);
 981 }
 982 
 983 
 984 int
 985 tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
 986     boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb,
 987     sock_quiesce_arg_t *arg)
 988 {
 989         tcp_t                   *tcp;
 990         conn_t                  *connp = (conn_t *)proto_handle;
 991         int                     error;
 992         mblk_t                  *stropt_mp;
 993         mblk_t                  *ordrel_mp;
 994 
 995         tcp = connp->conn_tcp;
 996 
 997         stropt_mp = allocb_wait(sizeof (struct stroptions), BPRI_HI, STR_NOSIG,
 998             NULL);
 999 
1000         /* Pre-allocate the T_ordrel_ind mblk. */
1001         ASSERT(tcp->tcp_ordrel_mp == NULL);
1002         ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI,
1003             STR_NOSIG, NULL);
1004         ordrel_mp->b_datap->db_type = M_PROTO;
1005         ((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND;
1006         ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind);
1007 
1008         /*
1009          * Enter the squeue so that no new packets can come in
1010          */
1011         error = squeue_synch_enter(connp, NULL);
1012         if (error != 0) {
1013                 /* failed to enter, free all the pre-allocated messages. */
1014                 freeb(stropt_mp);
1015                 freeb(ordrel_mp);
1016                 return (ENOMEM);
1017         }
1018 
1019         /*
1020          * Both endpoints must be of the same type (either STREAMS or
1021          * non-STREAMS) for fusion to be enabled. So if we are fused,
1022          * we have to unfuse.
1023          */
1024         if (tcp->tcp_fused)
1025                 tcp_unfuse(tcp);
1026 
1027         if (tcp->tcp_listener != NULL) {
1028                 /* The eager will deal with opts when accept() is called */
1029                 freeb(stropt_mp);
1030                 tcp_fallback_eager(tcp, direct_sockfs, quiesced_cb, arg);
1031         } else {
1032                 tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs,
1033                     quiesced_cb, arg);
1034         }
1035 
1036         /*
1037          * No longer a direct socket
1038          *
1039          * Note that we intentionally leave the upper_handle and upcalls
1040          * intact, since eagers may still be using them.
1041          */
1042         connp->conn_flags &= ~IPCL_NONSTR;
1043         tcp->tcp_ordrel_mp = ordrel_mp;
1044 
1045         /*
1046          * There should be atleast two ref's (IP + TCP)
1047          */
1048         ASSERT(connp->conn_ref >= 2);
1049         squeue_synch_exit(connp);
1050 
1051         return (0);
1052 }
1053 
1054 /*
1055  * Notifies a non-STREAMS based listener about a new connection. This
1056  * function is executed on the *eager*'s squeue once the 3 way handshake
1057  * has completed. Note that the behavior differs from STREAMS, where the
1058  * T_CONN_IND is sent up by tcp_send_conn_ind() while on the *listener*'s
1059  * squeue.
1060  *
1061  * Returns B_TRUE if the notification succeeded and an upper handle was
1062  * obtained. `tcp' should be closed on failure.
1063  */
1064 boolean_t
1065 tcp_newconn_notify(tcp_t *tcp, ip_recv_attr_t *ira)
1066 {
1067         tcp_t *listener = tcp->tcp_listener;
1068         conn_t *lconnp = listener->tcp_connp;
1069         conn_t *econnp = tcp->tcp_connp;
1070         tcp_t *tail;
1071         ipaddr_t *addr_cache;
1072         sock_upper_handle_t upper;
1073         struct sock_proto_props sopp;
1074 
1075         mutex_enter(&listener->tcp_eager_lock);
1076         /*
1077          * Take the eager out, if it is in the list of droppable eagers
1078          * as we are here because the 3W handshake is over.
1079          */
1080         MAKE_UNDROPPABLE(tcp);
1081         /*
1082          * The eager already has an extra ref put in tcp_input_data
1083          * so that it stays till accept comes back even though it
1084          * might get into TCPS_CLOSED as a result of a TH_RST etc.
1085          */
1086         ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
1087         listener->tcp_conn_req_cnt_q0--;
1088         listener->tcp_conn_req_cnt_q++;
1089 
1090         /* Move from SYN_RCVD to ESTABLISHED list  */
1091         tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = tcp->tcp_eager_prev_q0;
1092         tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp->tcp_eager_next_q0;
1093         tcp->tcp_eager_prev_q0 = NULL;
1094         tcp->tcp_eager_next_q0 = NULL;
1095 
1096         /*
1097          * Insert at end of the queue because connections are accepted
1098          * in chronological order. Leaving the older connections at front
1099          * of the queue helps reducing search time.
1100          */
1101         tail = listener->tcp_eager_last_q;
1102         if (tail != NULL)
1103                 tail->tcp_eager_next_q = tcp;
1104         else
1105                 listener->tcp_eager_next_q = tcp;
1106         listener->tcp_eager_last_q = tcp;
1107         tcp->tcp_eager_next_q = NULL;
1108 
1109         /* we have timed out before */
1110         if (tcp->tcp_syn_rcvd_timeout != 0) {
1111                 tcp->tcp_syn_rcvd_timeout = 0;
1112                 listener->tcp_syn_rcvd_timeout--;
1113                 if (listener->tcp_syn_defense &&
1114                     listener->tcp_syn_rcvd_timeout <=
1115                     (listener->tcp_tcps->tcps_conn_req_max_q0 >> 5) &&
1116                     10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() -
1117                     listener->tcp_last_rcv_lbolt)) {
1118                         /*
1119                          * Turn off the defense mode if we
1120                          * believe the SYN attack is over.
1121                          */
1122                         listener->tcp_syn_defense = B_FALSE;
1123                         if (listener->tcp_ip_addr_cache) {
1124                                 kmem_free((void *)listener->tcp_ip_addr_cache,
1125                                     IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
1126                                 listener->tcp_ip_addr_cache = NULL;
1127                         }
1128                 }
1129         }
1130         addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
1131         if (addr_cache != NULL) {
1132                 /*
1133                  * We have finished a 3-way handshake with this
1134                  * remote host. This proves the IP addr is good.
1135                  * Cache it!
1136                  */
1137                 addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] =
1138                     tcp->tcp_connp->conn_faddr_v4;
1139         }
1140         mutex_exit(&listener->tcp_eager_lock);
1141 
1142         /*
1143          * Notify the ULP about the newconn. It is guaranteed that no
1144          * tcp_accept() call will be made for the eager if the
1145          * notification fails.
1146          */
1147         if ((upper = (*lconnp->conn_upcalls->su_newconn)
1148             (lconnp->conn_upper_handle, (sock_lower_handle_t)econnp,
1149             &sock_tcp_downcalls, ira->ira_cred, ira->ira_cpid,
1150             &econnp->conn_upcalls)) == NULL) {
1151                 return (B_FALSE);
1152         }
1153         econnp->conn_upper_handle = upper;
1154 
1155         tcp->tcp_detached = B_FALSE;
1156         tcp->tcp_hard_binding = B_FALSE;
1157         tcp->tcp_tconnind_started = B_TRUE;
1158 
1159         if (econnp->conn_keepalive) {
1160                 tcp->tcp_ka_last_intrvl = 0;
1161                 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
1162                     tcp->tcp_ka_interval);
1163         }
1164 
1165         /* Update the necessary parameters */
1166         tcp_get_proto_props(tcp, &sopp);
1167 
1168         (*econnp->conn_upcalls->su_set_proto_props)
1169             (econnp->conn_upper_handle, &sopp);
1170 
1171         return (B_TRUE);
1172 }