1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 #include <sys/types.h>
  27 #include <sys/param.h>
  28 #include <sys/systm.h>
  29 #include <sys/sysmacros.h>
  30 #include <sys/debug.h>
  31 #include <sys/cmn_err.h>
  32 #include <sys/vfs.h>
  33 #include <sys/policy.h>
  34 #include <sys/modctl.h>
  35 
  36 #include <sys/sunddi.h>
  37 
  38 #include <sys/strsun.h>
  39 #include <sys/stropts.h>
  40 #include <sys/strsubr.h>
  41 #include <sys/socket.h>
  42 #include <sys/socketvar.h>
  43 #include <sys/uio.h>
  44 
  45 #include <inet/ipclassifier.h>
  46 #include <fs/sockfs/sockcommon.h>
  47 #include <fs/sockfs/sockfilter_impl.h>
  48 #include <fs/sockfs/nl7c.h>
  49 #include <fs/sockfs/socktpi.h>
  50 #include <fs/sockfs/sodirect.h>
  51 #include <inet/ip.h>
  52 
  53 extern int xnet_skip_checks, xnet_check_print, xnet_truncate_print;
  54 
  55 /*
  56  * Common socket access functions.
  57  *
  58  * Instead of accessing the sonode switch directly (i.e., SOP_xxx()),
  59  * the socket_xxx() function should be used.
  60  */
  61 
  62 /*
  63  * Try to create a new sonode of the requested <family, type, protocol>.
  64  */
  65 /* ARGSUSED */
  66 struct sonode *
  67 socket_create(int family, int type, int protocol, char *devpath, char *mod,
  68     int flags, int version, struct cred *cr, int *errorp)
  69 {
  70         struct sonode *so;
  71         struct sockparams *sp = NULL;
  72         int saved_error;
  73 
  74         /*
  75          * Look for a sockparams entry that match the given criteria.
  76          * solookup() returns with the entry held.
  77          */
  78         *errorp = solookup(family, type, protocol, &sp);
  79         saved_error = *errorp;
  80         if (sp == NULL) {
  81                 int kmflags = (flags == SOCKET_SLEEP) ? KM_SLEEP : KM_NOSLEEP;
  82                 /*
  83                  * There is no matching sockparams entry. An ephemeral entry is
  84                  * created if the caller specifies a device or a socket module.
  85                  */
  86                 if (devpath != NULL) {
  87                         saved_error = 0;
  88                         sp = sockparams_hold_ephemeral_bydev(family, type,
  89                             protocol, devpath, kmflags, errorp);
  90                 } else if (mod != NULL) {
  91                         saved_error = 0;
  92                         sp = sockparams_hold_ephemeral_bymod(family, type,
  93                             protocol, mod, kmflags, errorp);
  94                 } else {
  95                         *errorp = solookup(family, type, 0, &sp);
  96                 }
  97 
  98                 if (sp == NULL) {
  99                         if (saved_error && (*errorp == EPROTONOSUPPORT ||
 100                             *errorp == EPROTOTYPE || *errorp == ENOPROTOOPT))
 101                                 *errorp = saved_error;
 102                         return (NULL);
 103                 }
 104         }
 105 
 106         ASSERT(sp->sp_smod_info != NULL);
 107         ASSERT(flags == SOCKET_SLEEP || flags == SOCKET_NOSLEEP);
 108         sp->sp_stats.sps_ncreate.value.ui64++;
 109         so = sp->sp_smod_info->smod_sock_create_func(sp, family, type,
 110             protocol, version, flags, errorp, cr);
 111         if (so == NULL) {
 112                 SOCKPARAMS_DEC_REF(sp);
 113         } else {
 114                 if ((*errorp = SOP_INIT(so, NULL, cr, flags)) == 0) {
 115                         /* Cannot fail, only bumps so_count */
 116                         (void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
 117                 } else {
 118                         if (saved_error && (*errorp == EPROTONOSUPPORT ||
 119                             *errorp == EPROTOTYPE || *errorp == ENOPROTOOPT))
 120                                 *errorp = saved_error;
 121                         socket_destroy(so);
 122                         so = NULL;
 123                 }
 124         }
 125         return (so);
 126 }
 127 
 128 struct sonode *
 129 socket_newconn(struct sonode *parent, sock_lower_handle_t lh,
 130     sock_downcalls_t *dc, int flags, int *errorp)
 131 {
 132         struct sonode *so;
 133         struct sockparams *sp;
 134         struct cred *cr;
 135 
 136         if ((cr = CRED()) == NULL)
 137                 cr = kcred;
 138 
 139         sp = parent->so_sockparams;
 140         ASSERT(sp != NULL);
 141 
 142         sp->sp_stats.sps_ncreate.value.ui64++;
 143         so = sp->sp_smod_info->smod_sock_create_func(sp, parent->so_family,
 144             parent->so_type, parent->so_protocol, parent->so_version, flags,
 145             errorp, cr);
 146         if (so != NULL) {
 147                 SOCKPARAMS_INC_REF(sp);
 148 
 149                 so->so_proto_handle = lh;
 150                 so->so_downcalls = dc;
 151                 /*
 152                  * This function may be called in interrupt context, and CRED()
 153                  * will be NULL. In this case, pass in kcred.
 154                  */
 155                 if ((*errorp = SOP_INIT(so, parent, cr, flags)) == 0) {
 156                         /* Cannot fail, only bumps so_count */
 157                         (void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
 158                 } else  {
 159                         socket_destroy(so);
 160                         so = NULL;
 161                 }
 162         }
 163 
 164         return (so);
 165 }
 166 
 167 /*
 168  * Bind local endpoint.
 169  */
 170 int
 171 socket_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
 172     int flags, cred_t *cr)
 173 {
 174         return (SOP_BIND(so, name, namelen, flags, cr));
 175 }
 176 
 177 /*
 178  * Turn socket into a listen socket.
 179  */
 180 int
 181 socket_listen(struct sonode *so, int backlog, cred_t *cr)
 182 {
 183         if (backlog < 0) {
 184                 backlog = 0;
 185         }
 186 
 187         /*
 188          * Use the same qlimit as in BSD. BSD checks the qlimit
 189          * before queuing the next connection implying that a
 190          * listen(sock, 0) allows one connection to be queued.
 191          * BSD also uses 1.5 times the requested backlog.
 192          *
 193          * XNS Issue 4 required a strict interpretation of the backlog.
 194          * This has been waived subsequently for Issue 4 and the change
 195          * incorporated in XNS Issue 5. So we aren't required to do
 196          * anything special for XPG apps.
 197          */
 198         if (backlog >= (INT_MAX - 1) / 3)
 199                 backlog = INT_MAX;
 200         else
 201                 backlog = backlog * 3 / 2 + 1;
 202 
 203         return (SOP_LISTEN(so, backlog, cr));
 204 }
 205 
 206 /*
 207  * Accept incoming connection.
 208  */
 209 int
 210 socket_accept(struct sonode *lso, int fflag, cred_t *cr, struct sonode **nsop)
 211 {
 212         return (SOP_ACCEPT(lso, fflag, cr, nsop));
 213 }
 214 
 215 /*
 216  * Active open.
 217  */
 218 int
 219 socket_connect(struct sonode *so, struct sockaddr *name,
 220     socklen_t namelen, int fflag, int flags, cred_t *cr)
 221 {
 222         int error;
 223 
 224         /*
 225          * Handle a connect to a name parameter of type AF_UNSPEC like a
 226          * connect to a null address. This is the portable method to
 227          * unconnect a socket.
 228          */
 229         if ((namelen >= sizeof (sa_family_t)) &&
 230             (name->sa_family == AF_UNSPEC)) {
 231                 name = NULL;
 232                 namelen = 0;
 233         }
 234 
 235         error = SOP_CONNECT(so, name, namelen, fflag, flags, cr);
 236 
 237         if (error == EHOSTUNREACH && flags & _SOCONNECT_XPG4_2) {
 238                 /*
 239                  * X/Open specification contains a requirement that
 240                  * ENETUNREACH be returned but does not require
 241                  * EHOSTUNREACH. In order to keep the test suite
 242                  * happy we mess with the errno here.
 243                  */
 244                 error = ENETUNREACH;
 245         }
 246 
 247         return (error);
 248 }
 249 
 250 /*
 251  * Get address of remote node.
 252  */
 253 int
 254 socket_getpeername(struct sonode *so, struct sockaddr *addr,
 255     socklen_t *addrlen, boolean_t accept, cred_t *cr)
 256 {
 257         ASSERT(*addrlen > 0);
 258         return (SOP_GETPEERNAME(so, addr, addrlen, accept, cr));
 259 
 260 }
 261 
 262 /*
 263  * Get local address.
 264  */
 265 int
 266 socket_getsockname(struct sonode *so, struct sockaddr *addr,
 267     socklen_t *addrlen, cred_t *cr)
 268 {
 269         return (SOP_GETSOCKNAME(so, addr, addrlen, cr));
 270 
 271 }
 272 
 273 /*
 274  * Called from shutdown().
 275  */
 276 int
 277 socket_shutdown(struct sonode *so, int how, cred_t *cr)
 278 {
 279         return (SOP_SHUTDOWN(so, how, cr));
 280 }
 281 
 282 /*
 283  * Get socket options.
 284  */
 285 /*ARGSUSED*/
 286 int
 287 socket_getsockopt(struct sonode *so, int level, int option_name,
 288     void *optval, socklen_t *optlenp, int flags, cred_t *cr)
 289 {
 290         return (SOP_GETSOCKOPT(so, level, option_name, optval,
 291             optlenp, flags, cr));
 292 }
 293 
 294 /*
 295  * Set socket options
 296  */
 297 int
 298 socket_setsockopt(struct sonode *so, int level, int option_name,
 299     const void *optval, t_uscalar_t optlen, cred_t *cr)
 300 {
 301         int val = 1;
 302         /* Caller allocates aligned optval, or passes null */
 303         ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0);
 304         /* If optval is null optlen is 0, and vice-versa */
 305         ASSERT(optval != NULL || optlen == 0);
 306         ASSERT(optlen != 0 || optval == NULL);
 307 
 308         if (optval == NULL && optlen == 0)
 309                 optval = &val;
 310 
 311         return (SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr));
 312 }
 313 
 314 int
 315 socket_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
 316     cred_t *cr)
 317 {
 318         int error = 0;
 319         ssize_t orig_resid = uiop->uio_resid;
 320 
 321         /*
 322          * Do not bypass the cache if we are doing a local (AF_UNIX) write.
 323          */
 324         if (so->so_family == AF_UNIX)
 325                 uiop->uio_extflg |= UIO_COPY_CACHED;
 326         else
 327                 uiop->uio_extflg &= ~UIO_COPY_CACHED;
 328 
 329         error = SOP_SENDMSG(so, msg, uiop, cr);
 330         switch (error) {
 331         default:
 332                 break;
 333         case EINTR:
 334         case ENOMEM:
 335         /* EAGAIN is EWOULDBLOCK */
 336         case EWOULDBLOCK:
 337                 /* We did a partial send */
 338                 if (uiop->uio_resid != orig_resid)
 339                         error = 0;
 340                 break;
 341         case EPIPE:
 342                 if ((so->so_mode & SM_KERNEL) == 0)
 343                         tsignal(curthread, SIGPIPE);
 344                 break;
 345         }
 346 
 347         return (error);
 348 }
 349 
 350 int
 351 socket_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
 352     struct cred *cr, mblk_t **mpp)
 353 {
 354         int error = 0;
 355 
 356         error = SOP_SENDMBLK(so, msg, fflag, cr, mpp);
 357         if (error == EPIPE) {
 358                 tsignal(curthread, SIGPIPE);
 359         }
 360         return (error);
 361 }
 362 
 363 int
 364 socket_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
 365     cred_t *cr)
 366 {
 367         int error;
 368         ssize_t orig_resid = uiop->uio_resid;
 369 
 370         /*
 371          * Do not bypass the cache when reading data, as the application
 372          * is likely to access the data shortly.
 373          */
 374         uiop->uio_extflg |= UIO_COPY_CACHED;
 375 
 376         error = SOP_RECVMSG(so, msg, uiop, cr);
 377 
 378         switch (error) {
 379         case EINTR:
 380         /* EAGAIN is EWOULDBLOCK */
 381         case EWOULDBLOCK:
 382                 /* We did a partial read */
 383                 if (uiop->uio_resid != orig_resid)
 384                         error = 0;
 385                 break;
 386         default:
 387                 break;
 388         }
 389         return (error);
 390 }
 391 
 392 int
 393 socket_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
 394     struct cred *cr, int32_t *rvalp)
 395 {
 396         return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
 397 }
 398 
 399 int
 400 socket_poll(struct sonode *so, short events, int anyyet, short *reventsp,
 401     struct pollhead **phpp)
 402 {
 403         return (SOP_POLL(so, events, anyyet, reventsp, phpp));
 404 }
 405 
 406 int
 407 socket_close(struct sonode *so, int flag, struct cred *cr)
 408 {
 409         return (VOP_CLOSE(SOTOV(so), flag, 1, 0, cr, NULL));
 410 }
 411 
 412 int
 413 socket_close_internal(struct sonode *so, int flag, cred_t *cr)
 414 {
 415         ASSERT(so->so_count == 0);
 416 
 417         return (SOP_CLOSE(so, flag, cr));
 418 }
 419 
 420 void
 421 socket_destroy(struct sonode *so)
 422 {
 423         vn_invalid(SOTOV(so));
 424         VN_RELE(SOTOV(so));
 425 }
 426 
 427 /* ARGSUSED */
 428 void
 429 socket_destroy_internal(struct sonode *so, cred_t *cr)
 430 {
 431         struct sockparams *sp = so->so_sockparams;
 432         ASSERT(so->so_count == 0 && sp != NULL);
 433 
 434         sp->sp_smod_info->smod_sock_destroy_func(so);
 435 
 436         SOCKPARAMS_DEC_REF(sp);
 437 }
 438 
/*
 * TODO: Once the common vnode ops are available, the vnops argument
 * should be removed.
 */
 443 /*ARGSUSED*/
 444 int
 445 sonode_constructor(void *buf, void *cdrarg, int kmflags)
 446 {
 447         struct sonode *so = buf;
 448         struct vnode *vp;
 449 
 450         vp = so->so_vnode = vn_alloc(kmflags);
 451         if (vp == NULL) {
 452                 return (-1);
 453         }
 454         vp->v_data = so;
 455         vn_setops(vp, socket_vnodeops);
 456 
 457         so->so_priv          = NULL;
 458         so->so_oobmsg                = NULL;
 459 
 460         so->so_proto_handle  = NULL;
 461 
 462         so->so_peercred      = NULL;
 463 
 464         so->so_rcv_queued    = 0;
 465         so->so_rcv_q_head    = NULL;
 466         so->so_rcv_q_last_head       = NULL;
 467         so->so_rcv_head              = NULL;
 468         so->so_rcv_last_head = NULL;
 469         so->so_rcv_wanted    = 0;
 470         so->so_rcv_timer_interval = SOCKET_NO_RCVTIMER;
 471         so->so_rcv_timer_tid = 0;
 472         so->so_rcv_thresh    = 0;
 473 
 474         list_create(&so->so_acceptq_list, sizeof (struct sonode),
 475             offsetof(struct sonode, so_acceptq_node));
 476         list_create(&so->so_acceptq_defer, sizeof (struct sonode),
 477             offsetof(struct sonode, so_acceptq_node));
 478         list_link_init(&so->so_acceptq_node);
 479         so->so_acceptq_len   = 0;
 480         so->so_backlog               = 0;
 481         so->so_listener              = NULL;
 482 
 483         so->so_snd_qfull     = B_FALSE;
 484 
 485         so->so_filter_active = 0;
 486         so->so_filter_tx     = 0;
 487         so->so_filter_defertime = 0;
 488         so->so_filter_top    = NULL;
 489         so->so_filter_bottom = NULL;
 490 
 491         mutex_init(&so->so_lock, NULL, MUTEX_DEFAULT, NULL);
 492         mutex_init(&so->so_acceptq_lock, NULL, MUTEX_DEFAULT, NULL);
 493         rw_init(&so->so_fallback_rwlock, NULL, RW_DEFAULT, NULL);
 494         cv_init(&so->so_state_cv, NULL, CV_DEFAULT, NULL);
 495         cv_init(&so->so_single_cv, NULL, CV_DEFAULT, NULL);
 496         cv_init(&so->so_read_cv, NULL, CV_DEFAULT, NULL);
 497 
 498         cv_init(&so->so_acceptq_cv, NULL, CV_DEFAULT, NULL);
 499         cv_init(&so->so_snd_cv, NULL, CV_DEFAULT, NULL);
 500         cv_init(&so->so_rcv_cv, NULL, CV_DEFAULT, NULL);
 501         cv_init(&so->so_copy_cv, NULL, CV_DEFAULT, NULL);
 502         cv_init(&so->so_closing_cv, NULL, CV_DEFAULT, NULL);
 503 
 504         return (0);
 505 }
 506 
 507 /*ARGSUSED*/
 508 void
 509 sonode_destructor(void *buf, void *cdrarg)
 510 {
 511         struct sonode *so = buf;
 512         struct vnode *vp = SOTOV(so);
 513 
 514         ASSERT(so->so_priv == NULL);
 515         ASSERT(so->so_peercred == NULL);
 516 
 517         ASSERT(so->so_oobmsg == NULL);
 518 
 519         ASSERT(so->so_rcv_q_head == NULL);
 520 
 521         list_destroy(&so->so_acceptq_list);
 522         list_destroy(&so->so_acceptq_defer);
 523         ASSERT(!list_link_active(&so->so_acceptq_node));
 524         ASSERT(so->so_listener == NULL);
 525 
 526         ASSERT(so->so_filter_active == 0);
 527         ASSERT(so->so_filter_tx == 0);
 528         ASSERT(so->so_filter_top == NULL);
 529         ASSERT(so->so_filter_bottom == NULL);
 530 
 531         ASSERT(vp->v_data == so);
 532         ASSERT(vn_matchops(vp, socket_vnodeops));
 533 
 534         vn_free(vp);
 535 
 536         mutex_destroy(&so->so_lock);
 537         mutex_destroy(&so->so_acceptq_lock);
 538         rw_destroy(&so->so_fallback_rwlock);
 539 
 540         cv_destroy(&so->so_state_cv);
 541         cv_destroy(&so->so_single_cv);
 542         cv_destroy(&so->so_read_cv);
 543         cv_destroy(&so->so_acceptq_cv);
 544         cv_destroy(&so->so_snd_cv);
 545         cv_destroy(&so->so_rcv_cv);
 546         cv_destroy(&so->so_closing_cv);
 547 }
 548 
 549 void
 550 sonode_init(struct sonode *so, struct sockparams *sp, int family,
 551     int type, int protocol, sonodeops_t *sops)
 552 {
 553         vnode_t *vp;
 554 
 555         vp = SOTOV(so);
 556 
 557         so->so_flag  = 0;
 558 
 559         so->so_state = 0;
 560         so->so_mode  = 0;
 561 
 562         so->so_count = 0;
 563 
 564         so->so_family        = family;
 565         so->so_type  = type;
 566         so->so_protocol      = protocol;
 567 
 568         SOCK_CONNID_INIT(so->so_proto_connid);
 569 
 570         so->so_options       = 0;
 571         so->so_linger.l_onoff   = 0;
 572         so->so_linger.l_linger = 0;
 573         so->so_sndbuf        = 0;
 574         so->so_error = 0;
 575         so->so_rcvtimeo      = 0;
 576         so->so_sndtimeo = 0;
 577         so->so_xpg_rcvbuf = 0;
 578 
 579         ASSERT(so->so_oobmsg == NULL);
 580         so->so_oobmark       = 0;
 581         so->so_pgrp  = 0;
 582 
 583         ASSERT(so->so_peercred == NULL);
 584 
 585         so->so_zoneid = getzoneid();
 586 
 587         so->so_sockparams = sp;
 588 
 589         so->so_ops = sops;
 590 
 591         so->so_not_str = (sops != &sotpi_sonodeops);
 592 
 593         so->so_proto_handle = NULL;
 594 
 595         so->so_downcalls = NULL;
 596 
 597         so->so_copyflag = 0;
 598 
 599         vn_reinit(vp);
 600         vp->v_vfsp   = rootvfs;
 601         vp->v_type   = VSOCK;
 602         vp->v_rdev   = sockdev;
 603 
 604         so->so_snd_qfull = B_FALSE;
 605         so->so_minpsz = 0;
 606 
 607         so->so_rcv_wakeup = B_FALSE;
 608         so->so_snd_wakeup = B_FALSE;
 609         so->so_flowctrld = B_FALSE;
 610 
 611         so->so_pollev = 0;
 612         bzero(&so->so_poll_list, sizeof (so->so_poll_list));
 613         bzero(&so->so_proto_props, sizeof (struct sock_proto_props));
 614 
 615         bzero(&(so->so_ksock_callbacks), sizeof (ksocket_callbacks_t));
 616         so->so_ksock_cb_arg = NULL;
 617 
 618         so->so_max_addr_len = sizeof (struct sockaddr_storage);
 619 
 620         so->so_direct = NULL;
 621 
 622         vn_exists(vp);
 623 }
 624 
 625 void
 626 sonode_fini(struct sonode *so)
 627 {
 628         vnode_t *vp;
 629 
 630         ASSERT(so->so_count == 0);
 631 
 632         if (so->so_rcv_timer_tid) {
 633                 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
 634                 (void) untimeout(so->so_rcv_timer_tid);
 635                 so->so_rcv_timer_tid = 0;
 636         }
 637 
 638         if (so->so_poll_list.ph_list != NULL) {
 639                 pollwakeup(&so->so_poll_list, POLLERR);
 640                 pollhead_clean(&so->so_poll_list);
 641         }
 642 
 643         if (so->so_direct != NULL)
 644                 sod_sock_fini(so);
 645 
 646         vp = SOTOV(so);
 647         vn_invalid(vp);
 648 
 649         if (so->so_peercred != NULL) {
 650                 crfree(so->so_peercred);
 651                 so->so_peercred = NULL;
 652         }
 653         /* Detach and destroy filters */
 654         if (so->so_filter_top != NULL)
 655                 sof_sonode_cleanup(so);
 656 
 657         ASSERT(list_is_empty(&so->so_acceptq_list));
 658         ASSERT(list_is_empty(&so->so_acceptq_defer));
 659         ASSERT(!list_link_active(&so->so_acceptq_node));
 660 
 661         ASSERT(so->so_rcv_queued == 0);
 662         ASSERT(so->so_rcv_q_head == NULL);
 663         ASSERT(so->so_rcv_q_last_head == NULL);
 664         ASSERT(so->so_rcv_head == NULL);
 665         ASSERT(so->so_rcv_last_head == NULL);
 666 }