illumos-gate New usr/src/lib/brand/lx/lx

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 #include <unistd.h>
  28 #include <fcntl.h>
  29 #include <errno.h>
  30 #include <signal.h>
  31 #include <stdio.h>
  32 #include <stdlib.h>
  33 #include <libintl.h>
  34 #include <strings.h>
  35 #include <alloca.h>
  36 #include <ucred.h>
  37 
  38 #include <sys/param.h>
  39 #include <sys/brand.h>
  40 #include <sys/syscall.h>
  41 #include <sys/socket.h>
  42 #include <sys/socketvar.h>
  43 #include <sys/un.h>
  44 #include <netinet/tcp.h>
  45 #include <netinet/igmp.h>
  46 #include <sys/types.h>
  47 #include <sys/stat.h>
  48 #include <sys/lx_debug.h>
  49 #include <sys/lx_syscall.h>
  50 #include <sys/lx_socket.h>
  51 #include <sys/lx_brand.h>
  52 #include <sys/lx_misc.h>
  53 
  54 /*
  55  * This string is used to prefix all abstract namespace unix sockets, ie all
  56  * abstract namespace sockets are converted to regular sockets in the /tmp
  57  * directory with .ABSK_ prefixed to their names.
  58  */
  59 #define ABST_PRFX "/tmp/.ABSK_"
  60 #define ABST_PRFX_LEN 11
  61 
  62 static int lx_socket(ulong_t *);
  63 static int lx_bind(ulong_t *);
  64 static int lx_connect(ulong_t *);
  65 static int lx_listen(ulong_t *);
  66 static int lx_accept(ulong_t *);
  67 static int lx_getsockname(ulong_t *);
  68 static int lx_getpeername(ulong_t *);
  69 static int lx_socketpair(ulong_t *);
  70 static int lx_send(ulong_t *);
  71 static int lx_recv(ulong_t *);
  72 static int lx_sendto(ulong_t *);
  73 static int lx_recvfrom(ulong_t *);
  74 static int lx_shutdown(ulong_t *);
  75 static int lx_setsockopt(ulong_t *);
  76 static int lx_getsockopt(ulong_t *);
  77 static int lx_sendmsg(ulong_t *);
  78 static int lx_recvmsg(ulong_t *);
  79 
  80 typedef int (*sockfn_t)(ulong_t *);
  81 
  82 static struct {
  83         sockfn_t s_fn;  /* Function implementing the subcommand */
  84         int s_nargs;    /* Number of arguments the function takes */
  85 } sockfns[] = {
  86         lx_socket, 3,
  87         lx_bind, 3,
  88         lx_connect, 3,
  89         lx_listen, 2,
  90         lx_accept, 3,
  91         lx_getsockname, 3,
  92         lx_getpeername, 3,
  93         lx_socketpair, 4,
  94         lx_send, 4,
  95         lx_recv, 4,
  96         lx_sendto, 6,
  97         lx_recvfrom, 6,
  98         lx_shutdown, 2,
  99         lx_setsockopt, 5,
 100         lx_getsockopt, 5,
 101         lx_sendmsg, 3,
 102         lx_recvmsg, 3
 103 };
 104 
 105 /*
 106  * What follows are a series of tables we use to translate Linux constants
 107  * into equivalent Solaris constants and back again.  I wish this were
 108  * cleaner, more programmatic, and generally nicer.  Sadly, life is messy,
 109  * and Unix networking even more so.
 110  */
 111 static const int ltos_family[LX_AF_MAX + 1] =  {
 112         AF_UNSPEC, AF_UNIX, AF_INET, AF_CCITT, AF_IPX,
 113         AF_APPLETALK, AF_NOTSUPPORTED, AF_OSI, AF_NOTSUPPORTED,
 114         AF_X25, AF_INET6, AF_CCITT, AF_DECnet,
 115         AF_802, AF_POLICY, AF_KEY, AF_ROUTE,
 116         AF_NOTSUPPORTED, AF_NOTSUPPORTED, AF_NOTSUPPORTED, AF_NOTSUPPORTED,
 117         AF_NOTSUPPORTED, AF_SNA, AF_NOTSUPPORTED, AF_NOTSUPPORTED,
 118         AF_NOTSUPPORTED, AF_NOTSUPPORTED, AF_NOTSUPPORTED, AF_NOTSUPPORTED,
 119         AF_NOTSUPPORTED, AF_NOTSUPPORTED, AF_NOTSUPPORTED, AF_NOTSUPPORTED
 120 };
 121 
 122 #define LTOS_FAMILY(d) ((d) <= LX_AF_MAX ? ltos_family[(d)] : AF_INVAL)
 123 
 124 static const int ltos_socktype[LX_SOCK_PACKET + 1] = {
 125         SOCK_NOTSUPPORTED, SOCK_STREAM, SOCK_DGRAM, SOCK_RAW,
 126         SOCK_RDM, SOCK_SEQPACKET, SOCK_NOTSUPPORTED, SOCK_NOTSUPPORTED,
 127         SOCK_NOTSUPPORTED, SOCK_NOTSUPPORTED, SOCK_NOTSUPPORTED
 128 };
 129 
 130 #define LTOS_SOCKTYPE(t)        \
 131         ((t) <= LX_SOCK_PACKET ? ltos_socktype[(t)] : SOCK_INVAL)
 132 
 133 /*
 134  * Linux socket option type definitions
 135  *
 136  * The protocol `levels` are well defined (see in.h) The option values are
 137  * not so well defined. Linux often uses different values to Solaris
 138  * although they mean the same thing. For example, IP_TOS in Linux is
 139  * defined as value 1 but in Solaris it is defined as value 3. This table
 140  * maps all the Protocol levels to their options and maps them between
 141  * Linux and Solaris and vice versa.  Hence the reason for the complexity.
 142  */
 143 
 144 typedef struct lx_proto_opts {
 145         const int *proto;       /* Linux to Solaris mapping table */
 146         int maxentries;         /* max entries in this table */
 147 } lx_proto_opts_t;
 148 
 149 #define OPTNOTSUP       -1      /* we don't support it */
 150 
 151 static const int ltos_ip_sockopts[LX_IP_DROP_MEMBERSHIP + 1] = {
 152         OPTNOTSUP, IP_TOS, IP_TTL, IP_HDRINCL,
 153         IP_OPTIONS, OPTNOTSUP, IP_RECVOPTS, IP_RETOPTS,
 154         OPTNOTSUP, OPTNOTSUP, OPTNOTSUP, OPTNOTSUP,
 155         IP_RECVTTL, OPTNOTSUP, OPTNOTSUP, OPTNOTSUP,
 156         OPTNOTSUP, OPTNOTSUP, OPTNOTSUP, OPTNOTSUP,
 157         OPTNOTSUP, OPTNOTSUP, OPTNOTSUP, OPTNOTSUP,
 158         OPTNOTSUP, OPTNOTSUP, OPTNOTSUP, OPTNOTSUP,
 159         OPTNOTSUP, OPTNOTSUP, OPTNOTSUP, OPTNOTSUP,
 160         IP_MULTICAST_IF, IP_MULTICAST_TTL, IP_MULTICAST_LOOP,
 161         IP_ADD_MEMBERSHIP, IP_DROP_MEMBERSHIP
 162 };
 163 
 164 static const int ltos_tcp_sockopts[LX_TCP_QUICKACK + 1] = {
 165         OPTNOTSUP, TCP_NODELAY, TCP_MAXSEG, OPTNOTSUP,
 166         OPTNOTSUP, OPTNOTSUP, OPTNOTSUP, OPTNOTSUP,
 167         TCP_KEEPALIVE, OPTNOTSUP, OPTNOTSUP, OPTNOTSUP,
 168         OPTNOTSUP
 169 };
 170 
 171 static const int ltos_igmp_sockopts[IGMP_MTRACE + 1] = {
 172         OPTNOTSUP, OPTNOTSUP, OPTNOTSUP, OPTNOTSUP,
 173         OPTNOTSUP, OPTNOTSUP, OPTNOTSUP, OPTNOTSUP,
 174         IGMP_MINLEN, OPTNOTSUP, OPTNOTSUP, /* XXX: was IGMP_TIMER_SCALE */
 175         OPTNOTSUP, OPTNOTSUP, OPTNOTSUP, OPTNOTSUP,
 176         OPTNOTSUP, OPTNOTSUP, IGMP_MEMBERSHIP_QUERY,
 177         IGMP_V1_MEMBERSHIP_REPORT, IGMP_DVMRP,
 178         IGMP_PIM, OPTNOTSUP, IGMP_V2_MEMBERSHIP_REPORT,
 179         IGMP_V2_LEAVE_GROUP, OPTNOTSUP, OPTNOTSUP,
 180         OPTNOTSUP, OPTNOTSUP, OPTNOTSUP, OPTNOTSUP,
 181         IGMP_MTRACE_RESP, IGMP_MTRACE
 182 };
 183 
 184 static const int ltos_socket_sockopts[LX_SO_ACCEPTCONN + 1] = {
 185         OPTNOTSUP,      SO_DEBUG,       SO_REUSEADDR,   SO_TYPE,
 186         SO_ERROR,       SO_DONTROUTE,   SO_BROADCAST,   SO_SNDBUF,
 187         SO_RCVBUF,      SO_KEEPALIVE,   SO_OOBINLINE,   OPTNOTSUP,
 188         OPTNOTSUP,      SO_LINGER,      OPTNOTSUP,      OPTNOTSUP,
 189         OPTNOTSUP,      OPTNOTSUP,      SO_RCVLOWAT,    SO_SNDLOWAT,
 190         SO_RCVTIMEO,    SO_SNDTIMEO,    OPTNOTSUP,      OPTNOTSUP,
 191         OPTNOTSUP,      OPTNOTSUP,      OPTNOTSUP,      OPTNOTSUP,
 192         OPTNOTSUP,      OPTNOTSUP,      SO_ACCEPTCONN
 193 };
 194 
 195 #define PROTO_SOCKOPTS(opts)    \
 196         { (opts), sizeof ((opts)) / sizeof ((opts)[0]) }
 197 
 198 /*
 199  * The main Linux to Solaris protocol to options mapping table
 200  * IPPROTO_TAB_SIZE can be set up to IPPROTO_MAX. All entries above
 201  * IPPROTO_TAB_SIZE are in effect not implemented,
 202  */
 203 
 204 #define IPPROTO_TAB_SIZE        8
 205 
 206 static const lx_proto_opts_t ltos_proto_opts[IPPROTO_TAB_SIZE] = {
 207         /* IPPROTO_IP           0 */
 208         PROTO_SOCKOPTS(ltos_ip_sockopts),
 209         /* SOL_SOCKET           1 */
 210         PROTO_SOCKOPTS(ltos_socket_sockopts),
 211         /* IPPROTO_IGMP         2 */
 212         PROTO_SOCKOPTS(ltos_igmp_sockopts),
 213         /* NOT IMPLEMENTED      3 */
 214         { NULL, 0 },
 215         /* NOT IMPLEMENTED      4 */
 216         { NULL, 0 },
 217         /* NOT IMPLEMENTED      5 */
 218         { NULL, 0 },
 219         /* IPPROTO_TCP          6 */
 220         PROTO_SOCKOPTS(ltos_tcp_sockopts),
 221         /* NOT IMPLEMENTED      7 */
 222         { NULL, 0 }
 223 };
 224 
 225 /*
 226  * Lifted from socket.h, since these definitions are contained within
 227  * _KERNEL guards.
 228  */
 229 #define _CMSG_HDR_ALIGNMENT     4
 230 #define _CMSG_HDR_ALIGN(x)      (((uintptr_t)(x) + _CMSG_HDR_ALIGNMENT - 1) & \
 231                                     ~(_CMSG_HDR_ALIGNMENT - 1))
 232 #define CMSG_FIRSTHDR(m)                                                \
 233         (((m)->msg_controllen < sizeof (struct cmsghdr)) ?                \
 234             (struct cmsghdr *)0 : (struct cmsghdr *)((m)->msg_control))
 235 
 236 #define CMSG_NXTHDR(m, c)                                               \
 237         (((c) == 0) ? CMSG_FIRSTHDR(m) :                        \
 238         ((((uintptr_t)_CMSG_HDR_ALIGN((char *)(c) +                     \
 239         ((struct cmsghdr *)(c))->cmsg_len) + sizeof (struct cmsghdr)) >   \
 240         (((uintptr_t)((struct lx_msghdr *)(m))->msg_control) +               \
 241         ((uintptr_t)((struct lx_msghdr *)(m))->msg_controllen))) ?   \
 242         ((struct cmsghdr *)0) :                                         \
 243         ((struct cmsghdr *)_CMSG_HDR_ALIGN((char *)(c) +                \
 244             ((struct cmsghdr *)(c))->cmsg_len))))
 245 
 246 #define LX_TO_SOL       1
 247 #define SOL_TO_LX       2
 248 
 249 static int
 250 convert_cmsgs(int direction, struct lx_msghdr *msg, char *caller)
 251 {
 252         struct cmsghdr *cmsg, *last;
 253         int err = 0;
 254 
 255         cmsg = CMSG_FIRSTHDR(msg);
 256         while (cmsg != NULL && err == 0) {
 257                 if (direction == LX_TO_SOL) {
 258                         if (cmsg->cmsg_level == LX_SOL_SOCKET) {
 259                                 cmsg->cmsg_level = SOL_SOCKET;
 260                                 if (cmsg->cmsg_type == LX_SCM_RIGHTS)
 261                                         cmsg->cmsg_type = SCM_RIGHTS;
 262                                 else if (cmsg->cmsg_type == LX_SCM_CRED)
 263                                         cmsg->cmsg_type = SCM_UCRED;
 264                                 else
 265                                         err = ENOTSUP;
 266                         } else {
 267                                 err = ENOTSUP;
 268                         }
 269                 } else {
 270                         if (cmsg->cmsg_level == SOL_SOCKET) {
 271                                 cmsg->cmsg_level = LX_SOL_SOCKET;
 272                                 if (cmsg->cmsg_type == SCM_RIGHTS)
 273                                         cmsg->cmsg_type = LX_SCM_RIGHTS;
 274                                 else if (cmsg->cmsg_type == SCM_UCRED)
 275                                         cmsg->cmsg_type = LX_SCM_CRED;
 276                                 else
 277                                         err = ENOTSUP;
 278                         } else {
 279                                 err = ENOTSUP;
 280                         }
 281                 }
 282 
 283                 last = cmsg;
 284                 cmsg = CMSG_NXTHDR(msg, last);
 285         }
 286         if (err)
 287                 lx_unsupported("Unsupported socket control message in %s\n.",
 288                     caller);
 289 
 290         return (err);
 291 }
 292 
 293 /*
 294  * If inaddr is an abstract namespace unix socket, this function expects addr
 295  * to have enough memory to hold the expanded socket name, ie it must be of
 296  * size *len + ABST_PRFX_LEN.
 297  */
 298 static int
 299 convert_sockaddr(struct sockaddr *addr, socklen_t *len,
 300         struct sockaddr *inaddr, socklen_t inlen)
 301 {
 302         sa_family_t family;
 303         int lx_in6_len;
 304         int size;
 305         int i, orig_len;
 306 
 307         /*
 308          * Note that if the buffer at inaddr is ever smaller than inlen bytes,
 309          * we may erroneously return EFAULT rather than a possible EINVAL
 310          * as the copy comes before the various checks as to whether inlen
 311          * is of the proper length for the socket type.
 312          *
 313          * This isn't an issue at present because all callers to this routine
 314          * do meet that constraint.
 315          */
 316         if ((ssize_t)inlen < 0)
 317                 return (-EINVAL);
 318         if (uucopy(inaddr, addr, inlen) != 0)
 319                 return (-errno);
 320 
 321         family = LTOS_FAMILY(addr->sa_family);
 322 
 323         switch (family) {
 324                 case (sa_family_t)AF_NOTSUPPORTED:
 325                         return (-EPROTONOSUPPORT);
 326                 case (sa_family_t)AF_INVAL:
 327                         return (-EAFNOSUPPORT);
 328                 case AF_INET:
 329                         size = sizeof (struct sockaddr);
 330 
 331                         if (inlen < size)
 332                                 return (-EINVAL);
 333 
 334                         *len = size;
 335                         break;
 336 
 337                 case AF_INET6:
 338                         /*
 339                          * The Solaris sockaddr_in6 has one more 32-bit
 340                          * field than the Linux version.
 341                          */
 342                         size = sizeof (struct sockaddr_in6);
 343                         lx_in6_len = size - sizeof (uint32_t);
 344 
 345                         if (inlen != lx_in6_len)
 346                                 return (-EINVAL);
 347 
 348                         *len = (sizeof (struct sockaddr_in6));
 349                         bzero((char *)addr + lx_in6_len, sizeof (uint32_t));
 350                         break;
 351 
 352                 case AF_UNIX:
 353                         if (inlen > sizeof (struct sockaddr_un))
 354                                 return (-EINVAL);
 355 
 356                         *len = inlen;
 357 
 358                         /*
 359                          * Linux supports abstract unix sockets, which are
 360                          * simply sockets that do not exist on the file system.
 361                          * These sockets are denoted by beginning the path with
 362                          * a NULL character. To support these, we strip out the
 363                          * leading NULL character and change the path to point
 364                          * to a real place in /tmp directory, by prepending
 365                          * ABST_PRFX and replacing all illegal characters with
 366                          * '_'.
 367                          */
 368                         if (addr->sa_data[0] == '\0') {
 369 
 370                                 /*
 371                                  * inlen is the entire size of the sockaddr_un
 372                                  * data structure, including the sun_family, so
 373                                  * we need to subtract this out. We subtract
 374                                  * 1 since we want to overwrite the leadin NULL
 375                                  * character, and thus do not include it in the
 376                                  * length.
 377                                  */
 378                                 orig_len = inlen - sizeof (addr->sa_family) - 1;
 379 
 380                                 /*
 381                                  * Since abstract paths can contain illegal
 382                                  * filename characters, we simply replace these
 383                                  * with '_'
 384                                  */
 385                                 for (i = 1; i < orig_len + 1; i++) {
 386                                         if (addr->sa_data[i] == '\0' ||
 387                                             addr->sa_data[i] == '/')
 388                                                 addr->sa_data[i] = '_';
 389                                 }
 390 
 391                                 /*
 392                                  * prepend ABST_PRFX to file name, minus the
 393                                  * leading NULL character. This places the
 394                                  * socket as a hidden file in the /tmp
 395                                  * directory.
 396                                  */
 397                                 (void) memmove(addr->sa_data + ABST_PRFX_LEN,
 398                                     addr->sa_data + 1, orig_len);
 399                                 bcopy(ABST_PRFX, addr->sa_data, ABST_PRFX_LEN);
 400 
 401                                 /*
 402                                  * Since abstract socket paths may not be NULL
 403                                  * terminated, we must explicitly NULL terminate
 404                                  * our string.
 405                                  */
 406                                 addr->sa_data[orig_len + ABST_PRFX_LEN] = '\0';
 407 
 408                                 /*
 409                                  * Make len reflect the new len of our string.
 410                                  * Although we removed the NULL character at the
 411                                  * beginning of the string, we added a NULL
 412                                  * character to the end, so the net gain in
 413                                  * length is simply ABST_PRFX_LEN.
 414                                  */
 415                                 *len = inlen + ABST_PRFX_LEN;
 416                         }
 417                         break;
 418 
 419                 default:
 420                         *len = inlen;
 421         }
 422 
 423         addr->sa_family = family;
 424         return (0);
 425 }
 426 
 427 static int
 428 convert_sock_args(int in_dom, int in_type, int in_protocol, int *out_dom,
 429     int *out_type)
 430 {
 431         int domain, type;
 432 
 433         if (in_dom < 0 || in_type < 0 || in_protocol < 0)
 434                 return (-EINVAL);
 435 
 436         domain = LTOS_FAMILY(in_dom);
 437         if (domain == AF_NOTSUPPORTED || domain == AF_UNSPEC)
 438                 return (-EAFNOSUPPORT);
 439         if (domain == AF_INVAL)
 440                 return (-EINVAL);
 441 
 442         type = LTOS_SOCKTYPE(in_type);
 443         if (type == SOCK_NOTSUPPORTED)
 444                 return (-ESOCKTNOSUPPORT);
 445         if (type == SOCK_INVAL)
 446                 return (-EINVAL);
 447 
 448         /*
 449          * Linux does not allow the app to specify IP Protocol for raw
 450          * sockets.  Solaris does, so bail out here.
 451          */
 452         if (type == SOCK_RAW && in_protocol == IPPROTO_IP)
 453                 return (-ESOCKTNOSUPPORT);
 454 
 455         *out_dom = domain;
 456         *out_type = type;
 457         return (0);
 458 }
 459 
 460 static int
 461 convert_sockflags(int lx_flags)
 462 {
 463         int solaris_flags = 0;
 464 
 465         if (lx_flags & LX_MSG_OOB)
 466                 solaris_flags |= MSG_OOB;
 467 
 468         if (lx_flags & LX_MSG_PEEK)
 469                 solaris_flags |= MSG_PEEK;
 470 
 471         if (lx_flags & LX_MSG_DONTROUTE)
 472                 solaris_flags |= MSG_DONTROUTE;
 473 
 474         if (lx_flags & LX_MSG_CTRUNC)
 475                 solaris_flags |= MSG_CTRUNC;
 476 
 477         if (lx_flags & LX_MSG_TRUNC)
 478                 solaris_flags |= MSG_TRUNC;
 479 
 480         if (lx_flags & LX_MSG_WAITALL)
 481                 solaris_flags |= MSG_WAITALL;
 482 
 483         if (lx_flags & LX_MSG_DONTWAIT)
 484                 solaris_flags |= MSG_DONTWAIT;
 485 
 486         if (lx_flags & LX_MSG_EOR)
 487                 solaris_flags |= MSG_EOR;
 488 
 489         if (lx_flags & LX_MSG_PROXY)
 490                 lx_unsupported("socket operation with MSG_PROXY flag set");
 491 
 492         if (lx_flags & LX_MSG_FIN)
 493                 lx_unsupported("socket operation with MSG_FIN flag set");
 494 
 495         if (lx_flags & LX_MSG_SYN)
 496                 lx_unsupported("socket operation with MSG_SYN flag set");
 497 
 498         if (lx_flags & LX_MSG_CONFIRM)
 499                 lx_unsupported("socket operation with MSG_CONFIRM set");
 500 
 501         if (lx_flags & LX_MSG_RST)
 502                 lx_unsupported("socket operation with MSG_RST flag set");
 503 
 504         if (lx_flags & LX_MSG_MORE)
 505                 lx_unsupported("socket operation with MSG_MORE flag set");
 506 
 507         return (solaris_flags);
 508 }
 509 
 510 static int
 511 lx_socket(ulong_t *args)
 512 {
 513         int domain;
 514         int type;
 515         int protocol = (int)args[2];
 516         int fd;
 517         int err;
 518 
 519         err = convert_sock_args((int)args[0], (int)args[1], protocol,
 520             &domain, &type);
 521         if (err != 0)
 522                 return (err);
 523 
 524         lx_debug("\tsocket(%d, %d, %d)", domain, type, protocol);
 525 
 526         /* Right now IPv6 sockets don't work */
 527         if (domain == AF_INET6)
 528                 return (-EAFNOSUPPORT);
 529 
 530         /*
 531          * Clients of the auditing subsystem used by CentOS 4 and 5 expect to
 532          * be able to create AF_ROUTE SOCK_RAW sockets to communicate with the
 533          * auditing daemons. Failure to create these sockets will cause login,
 534          * ssh and useradd, amoung other programs to fail. To trick these
 535          * programs into working, we convert the socket domain and type to
 536          * something that we do support. Then when sendto is called on these
 537          * sockets, we return an error code. See lx_sendto.
 538          */
 539         if (domain == AF_ROUTE && type == SOCK_RAW) {
 540                 domain = AF_INET;
 541                 type = SOCK_STREAM;
 542                 protocol = 0;
 543         }
 544 
 545         fd = socket(domain, type, protocol);
 546         if (fd >= 0)
 547                 return (fd);
 548 
 549         if (errno == EPROTONOSUPPORT)
 550                 return (-ESOCKTNOSUPPORT);
 551 
 552         return (-errno);
 553 }
 554 
 555 static int
 556 lx_bind(ulong_t *args)
 557 {
 558         int sockfd = (int)args[0];
 559         struct stat64 statbuf;
 560         struct sockaddr *name, oldname;
 561         socklen_t len;
 562         int r, r2, ret, tmperrno;
 563         int abst_sock;
 564         struct stat sb;
 565 
 566         if (uucopy((struct sockaddr *)args[1], &oldname,
 567             sizeof (struct sockaddr)) != 0)
 568                 return (-errno);
 569 
 570         /*
 571          * Handle Linux abstract sockets, which are UNIX sockets whose path
 572          * begins with a NULL character.
 573          */
 574         abst_sock = (oldname.sa_family == AF_UNIX) &&
 575             (oldname.sa_data[0] == '\0');
 576 
 577         /*
 578          * convert_sockaddr will expand the socket path if it is abstract, so
 579          * we need to allocate extra memory for it now.
 580          */
 581         if ((name = SAFE_ALLOCA((socklen_t)args[2] +
 582             abst_sock * ABST_PRFX_LEN)) == NULL)
 583                 return (-EINVAL);
 584 
 585         if ((r = convert_sockaddr(name, &len, (struct sockaddr *)args[1],
 586             (socklen_t)args[2])) < 0)
 587                 return (r);
 588 
 589         /*
 590          * Linux abstract namespace unix sockets are simply socket that do not
 591          * exist on the filesystem. We emulate them by changing their paths
 592          * in covert_sockaddr so that they point real files names on the
 593          * filesystem. Because in Linux they do not exist on the filesystem
 594          * applications do not have to worry about deleting files, however in
 595          * our filesystem based emulation we do. To solve this problem, we first
 596          * check to see if the socket already exists before we create one. If it
 597          * does we attempt to connect to it to see if it is in use, or just
 598          * left over from a previous lx_bind call. If we are unable to connect,
 599          * we assume it is not in use and remove the file, then continue on
 600          * as if the file never existed.
 601          */
 602         if (abst_sock && stat(name->sa_data, &sb) == 0 &&
 603             S_ISSOCK(sb.st_mode)) {
 604                 if ((r2 = socket(AF_UNIX, SOCK_STREAM, 0)) < 0)
 605                         return (-ENOSR);
 606                 ret = connect(r2, name, len);
 607                 tmperrno = errno;
 608                 if (close(r2) < 0)
 609                         return (-EINVAL);
 610 
 611                 /*
 612                  * if we can't connect to the socket, assume no one is using it
 613                  * and remove it, otherwise assume it is in use and return
 614                  * EADDRINUSE.
 615                  */
 616                 if ((ret < 0) && (tmperrno == ECONNREFUSED)) {
 617                         if (unlink(name->sa_data) < 0) {
 618                                 return (-EADDRINUSE);
 619                         }
 620                 } else {
 621                         return (-EADDRINUSE);
 622                 }
 623         }
 624 
 625         lx_debug("\tbind(%d, 0x%p, %d)", sockfd, name, len);
 626 
 627         if (name->sa_family == AF_UNIX)
 628                 lx_debug("\t\tAF_UNIX, path = %s", name->sa_data);
 629 
 630         r = bind(sockfd, name, len);
 631 
 632         /*
 633          * Linux returns EADDRINUSE for attempts to bind to UNIX domain
 634          * sockets that aren't sockets.
 635          */
 636         if ((r < 0) && (errno == EINVAL) && (name->sa_family == AF_UNIX) &&
 637             ((stat64(name->sa_data, &statbuf) == 0) &&
 638             (!S_ISSOCK(statbuf.st_mode))))
 639                 return (-EADDRINUSE);
 640 
 641         return ((r < 0) ? -errno : r);
 642 }
 643 
 644 static int
 645 lx_connect(ulong_t *args)
 646 {
 647         int sockfd = (int)args[0];
 648         struct sockaddr *name, oldname;
 649         socklen_t len;
 650         int r;
 651         int abst_sock;
 652 
 653         if (uucopy((struct sockaddr *)args[1], &oldname,
 654             sizeof (struct sockaddr)) != 0)
 655                 return (-errno);
 656 
 657 
 658         /* Handle Linux abstract sockets */
 659         abst_sock = (oldname.sa_family == AF_UNIX) &&
 660             (oldname.sa_data[0] == '\0');
 661 
 662         /*
 663          * convert_sockaddr will expand the socket path, if it is abstract, so
 664          * we need to allocate extra memory for it now.
 665          */
 666         if ((name = SAFE_ALLOCA((socklen_t)args[2] +
 667             abst_sock * ABST_PRFX_LEN)) == NULL)
 668                 return (-EINVAL);
 669 
 670         if ((r = convert_sockaddr(name, &len, (struct sockaddr *)args[1],
 671             (socklen_t)args[2])) < 0)
 672                 return (r);
 673 
 674         lx_debug("\tconnect(%d, 0x%p, %d)", sockfd, name, len);
 675 
 676         if (name->sa_family == AF_UNIX)
 677                 lx_debug("\t\tAF_UNIX, path = %s", name->sa_data);
 678 
 679         r = connect(sockfd, name, len);
 680 
 681         return ((r < 0) ? -errno : r);
 682 }
 683 
 684 static int
 685 lx_listen(ulong_t *args)
 686 {
 687         int sockfd = (int)args[0];
 688         int backlog = (int)args[1];
 689         int r;
 690 
 691         lx_debug("\tlisten(%d, %d)", sockfd, backlog);
 692         r = listen(sockfd, backlog);
 693 
 694         return ((r < 0) ? -errno : r);
 695 }
 696 
 697 static int
 698 lx_accept(ulong_t *args)
 699 {
 700         int sockfd = (int)args[0];
 701         struct sockaddr *name = (struct sockaddr *)args[1];
 702         socklen_t namelen = 0;
 703         int r;
 704 
 705         lx_debug("\taccept(%d, 0x%p, 0x%p", sockfd, args[1], args[2]);
 706 
 707         /*
 708          * The Linux man page says that -1 is returned and errno is set to
 709          * EFAULT if the "name" address is bad, but it is silent on what to
 710          * set errno to if the "namelen" address is bad.  Experimentation
 711          * shows that Linux (at least the 2.4.21 kernel in CentOS) actually
 712          * sets errno to EINVAL in both cases.
 713          *
 714          * Note that we must first check the name pointer, as the Linux
 715          * docs state nothing is copied out if the "name" pointer is NULL.
 716          * If it is NULL, we don't care about the namelen pointer's value
 717          * or about dereferencing it.
 718          *
 719          * Happily, Solaris' accept(3SOCKET) treats NULL name pointers and
 720          * zero namelens the same way.
 721          */
 722         if ((name != NULL) &&
 723             (uucopy((void *)args[2], &namelen, sizeof (socklen_t)) != 0))
 724                 return ((errno == EFAULT) ? -EINVAL : -errno);
 725 
 726         lx_debug("\taccept namelen = %d", namelen);
 727 
 728         if ((r = accept(sockfd, name, &namelen)) < 0)
 729                 return ((errno == EFAULT) ? -EINVAL : -errno);
 730 
 731         lx_debug("\taccept namelen returned %d bytes", namelen);
 732 
 733         /*
 734          * In Linux, accept()ed sockets do not inherit anything set by
 735          * fcntl(), so filter those out.
 736          */
 737         if (fcntl(r, F_SETFL, 0) < 0)
 738                 return (-errno);
 739 
 740         /*
 741          * Once again, a bad "namelen" address sets errno to EINVAL, not
 742          * EFAULT.  If namelen was zero, there's no need to copy a zero back
 743          * out.
 744          *
 745          * Logic might dictate that we should check if we can write to
 746          * the namelen pointer earlier so we don't accept a pending connection
 747          * only to fail the call because we can't write the namelen value back
 748          * out. However, testing shows Linux does indeed fail the call after
 749          * accepting the connection so we must behave in a compatible manner.
 750          */
 751         if ((name != NULL) && (namelen != 0) &&
 752             (uucopy(&namelen, (void *)args[2], sizeof (socklen_t)) != 0))
 753                 return ((errno == EFAULT) ? -EINVAL : -errno);
 754 
 755         return (r);
 756 }
 757 
 758 static int
 759 lx_getsockname(ulong_t *args)
 760 {
 761         int sockfd = (int)args[0];
 762         struct sockaddr *name = NULL;
 763         socklen_t namelen, namelen_orig;
 764 
 765         if (uucopy((void *)args[2], &namelen, sizeof (socklen_t)) != 0)
 766                 return (-errno);
 767         namelen_orig = namelen;
 768 
 769         lx_debug("\tgetsockname(%d, 0x%p, 0x%p (=%d))",
 770             sockfd, args[1], args[2], namelen);
 771 
 772         if (namelen > 0) {
 773                 if ((name = SAFE_ALLOCA(namelen)) == NULL)
 774                         return (-EINVAL);
 775                 bzero(name, namelen);
 776         }
 777 
 778         if ((getsockname(sockfd, name, &namelen)) < 0)
 779                 return (-errno);
 780 
 781         /*
 782          * If the name that getsockname() want's to return is larger
 783          * than namelen, getsockname() will copy out the maximum amount
 784          * of data possible and then update namelen to indicate the
 785          * actually size of all the data that it wanted to copy out.
 786          */
 787         if (uucopy(name, (void *)args[1], namelen_orig) != 0)
 788                 return (-errno);
 789         if (uucopy(&namelen, (void *)args[2], sizeof (socklen_t)) != 0)
 790                 return (-errno);
 791 
 792         return (0);
 793 }
 794 
 795 static int
 796 lx_getpeername(ulong_t *args)
 797 {
 798         int sockfd = (int)args[0];
 799         struct sockaddr *name;
 800         socklen_t namelen;
 801 
 802         if (uucopy((void *)args[2], &namelen, sizeof (socklen_t)) != 0)
 803                 return (-errno);
 804 
 805         lx_debug("\tgetpeername(%d, 0x%p, 0x%p (=%d))",
 806             sockfd, args[1], args[2], namelen);
 807 
 808         /*
 809          * Linux returns EFAULT in this case, even if the namelen parameter
 810          * is 0.  This check will not catch other illegal addresses, but
 811          * the benefit catching a non-null illegal address here is not
 812          * worth the cost of another system call.
 813          */
 814         if ((void *)args[1] == NULL)
 815                 return (-EFAULT);
 816 
 817         if ((name = SAFE_ALLOCA(namelen)) == NULL)
 818                 return (-EINVAL);
 819         if ((getpeername(sockfd, name, &namelen)) < 0)
 820                 return (-errno);
 821 
 822         if (uucopy(name, (void *)args[1], namelen) != 0)
 823                 return (-errno);
 824 
 825         if (uucopy(&namelen, (void *)args[2], sizeof (socklen_t)) != 0)
 826                 return (-errno);
 827 
 828         return (0);
 829 }
 830 
 831 static int
 832 lx_socketpair(ulong_t *args)
 833 {
 834         int domain;
 835         int type;
 836         int protocol = (int)args[2];
 837         int *sv = (int *)args[3];
 838         int fds[2];
 839         int r;
 840 
 841         r = convert_sock_args((int)args[0], (int)args[1], protocol,
 842             &domain, &type);
 843         if (r != 0)
 844                 return (r);
 845 
 846         lx_debug("\tsocketpair(%d, %d, %d, 0x%p)", domain, type, protocol, sv);
 847 
 848         r = socketpair(domain, type, protocol, fds);
 849 
 850         if (r == 0) {
 851                 if (uucopy(fds, sv, sizeof (fds)) != 0) {
 852                         r = errno;
 853                         (void) close(fds[0]);
 854                         (void) close(fds[1]);
 855                         return (-r);
 856                 }
 857                 return (0);
 858         }
 859 
 860         if (errno == EPROTONOSUPPORT)
 861                 return (-ESOCKTNOSUPPORT);
 862 
 863         return (-errno);
 864 }
 865 
 866 static ssize_t
 867 lx_send(ulong_t *args)
 868 {
 869         int sockfd = (int)args[0];
 870         void *buf = (void *)args[1];
 871         size_t len = (size_t)args[2];
 872         int flags = (int)args[3];
 873         ssize_t r;
 874 
 875         int nosigpipe = flags & LX_MSG_NOSIGNAL;
 876         struct sigaction newact, oact;
 877 
 878         lx_debug("\tsend(%d, 0x%p, 0x%d, 0x%x)", sockfd, buf, len, flags);
 879 
 880         flags = convert_sockflags(flags);
 881 
 882         /*
 883          * If nosigpipe is set, we want to emulate the Linux action of
 884          * not sending a SIGPIPE to the caller if the remote socket has
 885          * already been closed.
 886          *
 887          * As SIGPIPE is a directed signal sent only to the thread that
 888          * performed the action, we can emulate this behavior by momentarily
 889          * resetting the action for SIGPIPE to SIG_IGN, performing the socket
 890          * call, and resetting the action back to its previous value.
 891          */
 892         if (nosigpipe) {
 893                 newact.sa_handler = SIG_IGN;
 894                 newact.sa_flags = 0;
 895                 (void) sigemptyset(&newact.sa_mask);
 896 
 897                 if (sigaction(SIGPIPE, &newact, &oact) < 0)
 898                         lx_err_fatal(gettext(
 899                             "%s: could not ignore SIGPIPE to emulate "
 900                             "LX_MSG_NOSIGNAL"), "send()");
 901         }
 902 
 903         r = send(sockfd, buf, len, flags);
 904 
 905         if ((nosigpipe) && (sigaction(SIGPIPE, &oact, NULL) < 0))
 906                 lx_err_fatal(
 907                     gettext("%s: could not reset SIGPIPE handler to "
 908                     "emulate LX_MSG_NOSIGNAL"), "send()");
 909 
 910         return ((r < 0) ? -errno : r);
 911 }
 912 
 913 static ssize_t
 914 lx_recv(ulong_t *args)
 915 {
 916         int sockfd = (int)args[0];
 917         void *buf = (void *)args[1];
 918         size_t len = (size_t)args[2];
 919         int flags = (int)args[3];
 920         ssize_t r;
 921 
 922         int nosigpipe = flags & LX_MSG_NOSIGNAL;
 923         struct sigaction newact, oact;
 924 
 925         lx_debug("\trecv(%d, 0x%p, 0x%d, 0x%x)", sockfd, buf, len, flags);
 926 
 927         flags = convert_sockflags(flags);
 928 
 929         /*
 930          * If nosigpipe is set, we want to emulate the Linux action of
 931          * not sending a SIGPIPE to the caller if the remote socket has
 932          * already been closed.
 933          *
 934          * As SIGPIPE is a directed signal sent only to the thread that
 935          * performed the action, we can emulate this behavior by momentarily
 936          * resetting the action for SIGPIPE to SIG_IGN, performing the socket
 937          * call, and resetting the action back to its previous value.
 938          */
 939         if (nosigpipe) {
 940                 newact.sa_handler = SIG_IGN;
 941                 newact.sa_flags = 0;
 942                 (void) sigemptyset(&newact.sa_mask);
 943 
 944                 if (sigaction(SIGPIPE, &newact, &oact) < 0)
 945                         lx_err_fatal(gettext(
 946                             "%s: could not ignore SIGPIPE to emulate "
 947                             "LX_MSG_NOSIGNAL"), "recv()");
 948         }
 949 
 950         r = recv(sockfd, buf, len, flags);
 951 
 952         if ((nosigpipe) && (sigaction(SIGPIPE, &oact, NULL) < 0))
 953                 lx_err_fatal(
 954                     gettext("%s: could not reset SIGPIPE handler to "
 955                     "emulate LX_MSG_NOSIGNAL"), "recv()");
 956 
 957         return ((r < 0) ? -errno : r);
 958 }
 959 
 960 static ssize_t
 961 lx_sendto(ulong_t *args)
 962 {
 963         int sockfd = (int)args[0];
 964         void *buf = (void *)args[1];
 965         size_t len = (size_t)args[2];
 966         int flags = (int)args[3];
 967         struct sockaddr *to = NULL, oldto;
 968         socklen_t tolen = 0;
 969         ssize_t r;
 970         int abst_sock;
 971 
 972         int nosigpipe = flags & LX_MSG_NOSIGNAL;
 973         struct sigaction newact, oact;
 974 
 975         if ((args[4] != NULL) && (args[5] > 0)) {
 976                 if (uucopy((struct sockaddr *)args[4], &oldto,
 977                     sizeof (struct sockaddr)) != 0)
 978                         return (-errno);
 979 
 980                 /* Handle Linux abstract sockets */
 981                 abst_sock = (oldto.sa_family == AF_UNIX) &&
 982                     (oldto.sa_data[0] == '\0');
 983 
 984                 /*
 985                  * convert_sockaddr will expand the socket path, if it is
 986                  * abstract, so we need to allocate extra memory for it now.
 987                  */
 988                 if ((to = SAFE_ALLOCA(args[5] + abst_sock * ABST_PRFX_LEN))
 989                     == NULL)
 990                         return (-EINVAL);
 991 
 992                 if ((r = convert_sockaddr(to, &tolen,
 993                     (struct sockaddr *)args[4], (socklen_t)args[5])) < 0)
 994                         return (r);
 995         }
 996 
 997 
 998         lx_debug("\tsendto(%d, 0x%p, 0x%d, 0x%x, 0x%x, %d)", sockfd, buf, len,
 999             flags, to, tolen);
1000 
1001         flags = convert_sockflags(flags);
1002 
1003         /* return this error to make auditing subsystem happy */
1004         if (to && to->sa_family == AF_ROUTE) {
1005                 return (-ECONNREFUSED);
1006         }
1007 
1008         /*
1009          * If nosigpipe is set, we want to emulate the Linux action of
1010          * not sending a SIGPIPE to the caller if the remote socket has
1011          * already been closed.
1012          *
1013          * As SIGPIPE is a directed signal sent only to the thread that
1014          * performed the action, we can emulate this behavior by momentarily
1015          * resetting the action for SIGPIPE to SIG_IGN, performing the socket
1016          * call, and resetting the action back to its previous value.
1017          */
1018         if (nosigpipe) {
1019                 newact.sa_handler = SIG_IGN;
1020                 newact.sa_flags = 0;
1021                 (void) sigemptyset(&newact.sa_mask);
1022 
1023                 if (sigaction(SIGPIPE, &newact, &oact) < 0)
1024                         lx_err_fatal(gettext(
1025                             "%s: could not ignore SIGPIPE to emulate "
1026                             "LX_MSG_NOSIGNAL"), "sendto()");
1027         }
1028 
1029         r = sendto(sockfd, buf, len, flags, to, tolen);
1030 
1031         if ((nosigpipe) && (sigaction(SIGPIPE, &oact, NULL) < 0))
1032                 lx_err_fatal(
1033                     gettext("%s: could not reset SIGPIPE handler to "
1034                     "emulate LX_MSG_NOSIGNAL"), "sendto()");
1035 
1036         if (r < 0) {
1037                 /*
1038                  * according to the man page and LTP, the expected error in
1039                  * this case is EPIPE.
1040                  */
1041                 if (errno == ENOTCONN)
1042                         return (-EPIPE);
1043                 else
1044                         return (-errno);
1045         }
1046         return (r);
1047 }
1048 
1049 static ssize_t
1050 lx_recvfrom(ulong_t *args)
1051 {
1052         int sockfd = (int)args[0];
1053         void *buf = (void *)args[1];
1054         size_t len = (size_t)args[2];
1055         int flags = (int)args[3];
1056         struct sockaddr *from = (struct sockaddr *)args[4];
1057         socklen_t *from_lenp = (socklen_t *)args[5];
1058         ssize_t r;
1059 
1060         int nosigpipe = flags & LX_MSG_NOSIGNAL;
1061         struct sigaction newact, oact;
1062 
1063         lx_debug("\trecvfrom(%d, 0x%p, 0x%d, 0x%x, 0x%x, 0x%p)", sockfd, buf,
1064             len, flags, from, from_lenp);
1065 
1066         flags = convert_sockflags(flags);
1067 
1068         /*
1069          * If nosigpipe is set, we want to emulate the Linux action of
1070          * not sending a SIGPIPE to the caller if the remote socket has
1071          * already been closed.
1072          *
1073          * As SIGPIPE is a directed signal sent only to the thread that
1074          * performed the action, we can emulate this behavior by momentarily
1075          * resetting the action for SIGPIPE to SIG_IGN, performing the socket
1076          * call, and resetting the action back to its previous value.
1077          */
1078         if (nosigpipe) {
1079                 newact.sa_handler = SIG_IGN;
1080                 newact.sa_flags = 0;
1081                 (void) sigemptyset(&newact.sa_mask);
1082 
1083                 if (sigaction(SIGPIPE, &newact, &oact) < 0)
1084                         lx_err_fatal(gettext(
1085                             "%s: could not ignore SIGPIPE to emulate "
1086                             "LX_MSG_NOSIGNAL"), "recvfrom()");
1087         }
1088 
1089         r = recvfrom(sockfd, buf, len, flags, from, from_lenp);
1090 
1091         if ((nosigpipe) && (sigaction(SIGPIPE, &oact, NULL) < 0))
1092                 lx_err_fatal(
1093                     gettext("%s: could not reset SIGPIPE handler to "
1094                     "emulate LX_MSG_NOSIGNAL"), "recvfrom()");
1095 
1096         return ((r < 0) ? -errno : r);
1097 }
1098 
1099 static int
1100 lx_shutdown(ulong_t *args)
1101 {
1102         int sockfd = (int)args[0];
1103         int how = (int)args[1];
1104         int r;
1105 
1106         lx_debug("\tshutdown(%d, %d)", sockfd, how);
1107         r = shutdown(sockfd, how);
1108 
1109         return ((r < 0) ? -errno : r);
1110 }
1111 
1112 static int
1113 lx_setsockopt(ulong_t *args)
1114 {
1115         int sockfd = (int)args[0];
1116         int level = (int)args[1];
1117         int optname = (int)args[2];
1118         void *optval = (void *)args[3];
1119         int optlen = (int)args[4];
1120         int internal_opt;
1121         int r;
1122 
1123         lx_debug("\tsetsockopt(%d, %d, %d, 0x%p, %d)", sockfd, level, optname,
1124             optval, optlen);
1125 
1126         /*
1127          * The kernel returns EFAULT for all invalid addresses except NULL,
1128          * for which it returns EINVAL.  Linux wants EFAULT for NULL too.
1129          */
1130         if (optval == NULL)
1131                 return (-EFAULT);
1132 
1133         /*
1134          * Do a table lookup of the Solaris equivalent of the given option
1135          */
1136         if (level < IPPROTO_IP || level >= IPPROTO_TAB_SIZE)
1137                 return (-ENOPROTOOPT);
1138 
1139         if (ltos_proto_opts[level].maxentries == 0 ||
1140             optname <= 0 || optname >= (ltos_proto_opts[level].maxentries))
1141                 return (-ENOPROTOOPT);
1142 
1143         /*
1144          * Linux sets this option when it wants to send credentials over a
1145          * socket. Currently we just ignore it to make Linux programs happy.
1146          */
1147         if ((level == LX_SOL_SOCKET) && (optname == LX_SO_PASSCRED))
1148                 return (0);
1149 
1150 
1151         if ((level == IPPROTO_TCP) && (optname == LX_TCP_CORK)) {
1152                 /*
1153                  * TCP_CORK is a Linux-only option that instructs the TCP
1154                  * stack not to send out partial frames.  Solaris doesn't
1155                  * include this option but some apps require it.  So, we do
1156                  * our best to emulate the option by disabling TCP_NODELAY.
1157                  * If the app requests that we disable TCP_CORK, we just
1158                  * ignore it since enabling TCP_NODELAY may be
1159                  * overcompensating.
1160                  */
1161                 optname = TCP_NODELAY;
1162                 if (optlen != sizeof (int))
1163                         return (-EINVAL);
1164                 if (uucopy(optval, &internal_opt, sizeof (int)) != 0)
1165                         return (-errno);
1166                 if (internal_opt == 0)
1167                         return (0);
1168                 internal_opt = 1;
1169                 optval = &internal_opt;
1170         } else {
1171                 optname = ltos_proto_opts[level].proto[optname];
1172 
1173                 if (optname == OPTNOTSUP)
1174                         return (-ENOPROTOOPT);
1175         }
1176 
1177         if (level == LX_SOL_SOCKET)
1178                 level = SOL_SOCKET;
1179 
1180         r = setsockopt(sockfd, level, optname, optval, optlen);
1181 
1182         return ((r < 0) ? -errno : r);
1183 }
1184 
1185 static int
1186 lx_getsockopt(ulong_t *args)
1187 {
1188         int sockfd = (int)args[0];
1189         int level = (int)args[1];
1190         int optname = (int)args[2];
1191         void *optval = (void *)args[3];
1192         int *optlenp = (int *)args[4];
1193         int r;
1194 
1195         lx_debug("\tgetsockopt(%d, %d, %d, 0x%p, 0x%p)", sockfd, level, optname,
1196             optval, optlenp);
1197 
1198         /*
1199          * According to the Linux man page, a NULL optval should indicate
1200          * (as in Solaris) that no return value is expected.  Instead, it
1201          * actually triggers an EFAULT error.
1202          */
1203         if (optval == NULL)
1204                 return (-EFAULT);
1205 
1206         /*
1207          * Do a table lookup of the Solaris equivalent of the given option
1208          */
1209         if (level < IPPROTO_IP || level >= IPPROTO_TAB_SIZE)
1210                 return (-EOPNOTSUPP);
1211 
1212         if (ltos_proto_opts[level].maxentries == 0 ||
1213             optname <= 0 || optname >= (ltos_proto_opts[level].maxentries))
1214                 return (-ENOPROTOOPT);
1215 
1216         if (((level == LX_SOL_SOCKET) && (optname == LX_SO_PASSCRED)) ||
1217             ((level == IPPROTO_TCP) && (optname == LX_TCP_CORK))) {
1218                 /*
1219                  * Linux sets LX_SO_PASSCRED when it wants to send credentials
1220                  * over a socket. Since we do not support it, it is never set
1221                  * and we return 0.
1222                  *
1223                  * We don't support TCP_CORK but some apps rely on it.  So,
1224                  * rather than return an error we just return 0.  This
1225                  * isn't exactly a lie, since this option really isn't set,
1226                  * but it's not the whole truth either.  Fortunately, we
1227                  * aren't under oath.
1228                  */
1229                 r = 0;
1230                 if (uucopy(&r, optval, sizeof (int)) != 0)
1231                         return (-errno);
1232                 r = sizeof (int);
1233                 if (uucopy(&r, optlenp, sizeof (int)) != 0)
1234                         return (-errno);
1235                 return (0);
1236         }
1237         if ((level == LX_SOL_SOCKET) && (optname == LX_SO_PEERCRED)) {
1238                 struct lx_ucred lx_ucred;
1239                 ucred_t         *ucp;
1240 
1241                 /*
1242                  * We don't support SO_PEERCRED, but we do have equivalent
1243                  * functionality in getpeerucred() so invoke that here.
1244                  */
1245 
1246                 /* Verify there's going to be enough room for the results. */
1247                 if (uucopy(optlenp, &r, sizeof (int)) != 0)
1248                         return (-errno);
1249                 if (r < sizeof (struct lx_ucred))
1250                         return (-EOVERFLOW);
1251 
1252                 /*
1253                  * We allocate a ucred_t ourselves rather than allow
1254                  * getpeerucred() to do it for us because getpeerucred()
1255                  * uses malloc(3C) and we'd rather use SAFE_ALLOCA().
1256                  */
1257                 if ((ucp = (ucred_t *)SAFE_ALLOCA(ucred_size())) == NULL)
1258                         return (-ENOMEM);
1259 
1260                 /* Get the credential for the remote end of this socket. */
1261                 if (getpeerucred(sockfd, &ucp) != 0)
1262                         return (-errno);
1263                 if (((lx_ucred.lxu_pid = ucred_getpid(ucp)) == -1) ||
1264                     ((lx_ucred.lxu_uid = ucred_geteuid(ucp)) == (uid_t)-1) ||
1265                     ((lx_ucred.lxu_gid = ucred_getegid(ucp)) == (gid_t)-1)) {
1266                         return (-errno);
1267                 }
1268 
1269                 /* Copy out the results. */
1270                 if ((uucopy(&lx_ucred, optval, sizeof (lx_ucred))) != 0)
1271                         return (-errno);
1272                 r = sizeof (lx_ucred);
1273                 if ((uucopy(&r, optlenp, sizeof (int))) != 0)
1274                         return (-errno);
1275                 return (0);
1276         }
1277 
1278         optname = ltos_proto_opts[level].proto[optname];
1279 
1280         if (optname == OPTNOTSUP)
1281                 return (-ENOPROTOOPT);
1282 
1283         if (level == LX_SOL_SOCKET)
1284                 level = SOL_SOCKET;
1285 
1286         r = getsockopt(sockfd, level, optname, optval, optlenp);
1287 
1288         return ((r < 0) ? -errno : r);
1289 }
1290 
/*
 * libc routines that issue these system calls.  We bypass the libsocket
 * wrappers since they explicitly turn off the MSG_XPG4_2 flag we need for
 * Linux compatibility.
 *
 * Both are declared with full prototypes (descriptor, message header,
 * flags) so the compiler can check the arguments at each call site
 * instead of accepting anything through an empty parameter list.
 */
extern int _so_sendmsg(int, struct msghdr *, int);
extern int _so_recvmsg(int, struct msghdr *, int);
1298 
1299 static int
1300 lx_sendmsg(ulong_t *args)
1301 {
1302         int sockfd = (int)args[0];
1303         struct lx_msghdr msg;
1304         struct cmsghdr *cmsg;
1305         int flags = (int)args[2];
1306         int r;
1307 
1308         int nosigpipe = flags & LX_MSG_NOSIGNAL;
1309         struct sigaction newact, oact;
1310 
1311         lx_debug("\tsendmsg(%d, 0x%p, 0x%x)", sockfd, (void *)args[1], flags);
1312 
1313         flags = convert_sockflags(flags);
1314 
1315         if ((uucopy((void *)args[1], &msg, sizeof (msg))) != 0)
1316                 return (-errno);
1317 
1318         /*
1319          * If there are control messages bundled in this message, we need
1320          * to convert them from Linux to Solaris.
1321          */
1322         if (msg.msg_control != NULL) {
1323                 if (msg.msg_controllen == 0) {
1324                         cmsg = NULL;
1325                 } else {
1326                         cmsg = SAFE_ALLOCA(msg.msg_controllen);
1327                         if (cmsg == NULL)
1328                                 return (-EINVAL);
1329                 }
1330                 if ((uucopy(msg.msg_control, cmsg, msg.msg_controllen)) != 0)
1331                         return (-errno);
1332                 msg.msg_control = cmsg;
1333                 if ((r = convert_cmsgs(LX_TO_SOL, &msg, "sendmsg()")) != 0)
1334                         return (-r);
1335         }
1336 
1337         /*
1338          * If nosigpipe is set, we want to emulate the Linux action of
1339          * not sending a SIGPIPE to the caller if the remote socket has
1340          * already been closed.
1341          *
1342          * As SIGPIPE is a directed signal sent only to the thread that
1343          * performed the action, we can emulate this behavior by momentarily
1344          * resetting the action for SIGPIPE to SIG_IGN, performing the socket
1345          * call, and resetting the action back to its previous value.
1346          */
1347         if (nosigpipe) {
1348                 newact.sa_handler = SIG_IGN;
1349                 newact.sa_flags = 0;
1350                 (void) sigemptyset(&newact.sa_mask);
1351 
1352                 if (sigaction(SIGPIPE, &newact, &oact) < 0)
1353                         lx_err_fatal(gettext(
1354                             "%s: could not ignore SIGPIPE to emulate "
1355                             "LX_MSG_NOSIGNAL"), "sendmsg()");
1356         }
1357 
1358         r = _so_sendmsg(sockfd, (struct msghdr *)&msg, flags | MSG_XPG4_2);
1359 
1360         if ((nosigpipe) && (sigaction(SIGPIPE, &oact, NULL) < 0))
1361                 lx_err_fatal(
1362                     gettext("%s: could not reset SIGPIPE handler to "
1363                     "emulate LX_MSG_NOSIGNAL"), "sendmsg()");
1364 
1365         if (r < 0) {
1366                 /*
1367                  * according to the man page and LTP, the expected error in
1368                  * this case is EPIPE.
1369                  */
1370                 if (errno == ENOTCONN)
1371                         return (-EPIPE);
1372                 else
1373                         return (-errno);
1374         }
1375 
1376         return (r);
1377 }
1378 
1379 static int
1380 lx_recvmsg(ulong_t *args)
1381 {
1382         int sockfd = (int)args[0];
1383         struct lx_msghdr msg;
1384         struct lx_msghdr *msgp = (struct lx_msghdr *)args[1];
1385         struct cmsghdr *cmsg = NULL;
1386         int flags = (int)args[2];
1387         int r, err;
1388 
1389         int nosigpipe = flags & LX_MSG_NOSIGNAL;
1390         struct sigaction newact, oact;
1391 
1392         lx_debug("\trecvmsg(%d, 0x%p, 0x%x)", sockfd, (void *)args[1], flags);
1393 
1394         flags = convert_sockflags(flags);
1395 
1396         if ((uucopy(msgp, &msg, sizeof (msg))) != 0)
1397                 return (-errno);
1398 
1399         /*
1400          * If we are expecting to have to convert any control messages,
1401          * then we should receive them into our address space instead of
1402          * the app's.
1403          */
1404         if (msg.msg_control != NULL) {
1405                 cmsg = msg.msg_control;
1406                 if (msg.msg_controllen == 0) {
1407                         msg.msg_control = NULL;
1408                 } else {
1409                         msg.msg_control = SAFE_ALLOCA(msg.msg_controllen);
1410                         if (msg.msg_control == NULL)
1411                                 return (-EINVAL);
1412                 }
1413         }
1414 
1415         /*
1416          * If nosigpipe is set, we want to emulate the Linux action of
1417          * not sending a SIGPIPE to the caller if the remote socket has
1418          * already been closed.
1419          *
1420          * As SIGPIPE is a directed signal sent only to the thread that
1421          * performed the action, we can emulate this behavior by momentarily
1422          * resetting the action for SIGPIPE to SIG_IGN, performing the socket
1423          * call, and resetting the action back to its previous value.
1424          */
1425         if (nosigpipe) {
1426                 newact.sa_handler = SIG_IGN;
1427                 newact.sa_flags = 0;
1428                 (void) sigemptyset(&newact.sa_mask);
1429 
1430                 if (sigaction(SIGPIPE, &newact, &oact) < 0)
1431                         lx_err_fatal(gettext(
1432                             "%s: could not ignore SIGPIPE to emulate "
1433                             "LX_MSG_NOSIGNAL"), "recvmsg()");
1434         }
1435 
1436         r = _so_recvmsg(sockfd, (struct msghdr *)&msg, flags | MSG_XPG4_2);
1437 
1438         if ((nosigpipe) && (sigaction(SIGPIPE, &oact, NULL) < 0))
1439                 lx_err_fatal(
1440                     gettext("%s: could not reset SIGPIPE handler to "
1441                     "emulate LX_MSG_NOSIGNAL"), "recvmsg()");
1442 
1443         if (r >= 0 && msg.msg_control != NULL) {
1444                 /*
1445                  * If there are control messages bundled in this message,
1446                  * we need to convert them from Linux to Solaris.
1447                  */
1448                 if ((err = convert_cmsgs(SOL_TO_LX, &msg, "recvmsg()")) != 0)
1449                         return (-err);
1450 
1451                 if ((uucopy(msg.msg_control, cmsg, msg.msg_controllen)) != 0)
1452                         return (-errno);
1453         }
1454 
1455         /*
1456          * A handful of the values in the msghdr are set by the recvmsg()
1457          * call, so copy their values back to the caller.  Rather than iterate,
1458          * just copy the whole structure back.
1459          */
1460         if (uucopy(&msg, msgp, sizeof (msg)) != 0)
1461                 return (-errno);
1462 
1463         return ((r < 0) ? -errno : r);
1464 }
1465 
1466 int
1467 lx_socketcall(uintptr_t p1, uintptr_t p2)
1468 {
1469         int subcmd = (int)p1 - 1; /* subcommands start at 1 - not 0 */
1470         ulong_t args[6];
1471         int r;
1472 
1473         if (subcmd < 0 || subcmd >= LX_RECVMSG)
1474                 return (-EINVAL);
1475 
1476         /*
1477          * Copy the arguments to the subcommand in from the app's address
1478          * space, returning EFAULT if we get a bogus pointer.
1479          */
1480         if (uucopy((void *)p2, args,
1481             sockfns[subcmd].s_nargs * sizeof (ulong_t)))
1482                 return (-errno);
1483 
1484         r = (sockfns[subcmd].s_fn)(args);
1485 
1486         return (r);
1487 }