1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 #pragma ident   "%Z%%M% %I%     %E% SMI"
  27 
  28 #include <sys/types.h>
  29 #include <sys/systm.h>
  30 #include <sys/errno.h>
  31 #include <sys/debug.h>
  32 #include <vm/as.h>
  33 #include <vm/seg.h>
  34 #include <vm/seg_vn.h>
  35 #include <vm/page.h>
  36 #include <sys/mman.h>
  37 #include <sys/timer.h>
  38 #include <sys/condvar.h>
  39 #include <sys/inttypes.h>
  40 #include <sys/lx_futex.h>
  41 
  42 /*
  43  * Futexes are a Linux-specific implementation of inter-process mutexes.
  44  * They are designed to use shared memory for simple, uncontested
  45  * operations, and rely on the kernel to resolve any contention issues.
  46  *
  47  * Most of the information in this section comes from the paper "Futexes
  48  * Are Tricky", by Ulrich Drepper.  This paper is currently available at:
  49  * http://people.redhat.com/~drepper/futex.pdf.
  50  *
  51  * A futex itself a 4-byte integer, which must be 4-byte aligned.  The
  52  * value of this integer is expected to be modified using user-level atomic
  53  * operations.  The futex(4) design itself does not impose any semantic
  54  * constraints on the value stored in the futex; it is up to the
  55  * application to define its own protocol.
  56  *
  57  * When the application decides that kernel intervention is required, it
  58  * will use the futex(2) system call.  There are 5 different operations
  59  * that can be performed on a futex, using this system call.  Since this
  60  * interface has evolved over time, there are several different prototypes
  61  * available to the user.  Fortunately, there is only a single kernel-level
  62  * interface:
  63  *
  64  * long sys_futex(void *futex1, int cmd, int val1,
  65  *      struct timespec *timeout, void *futex2, int val2)
  66  *
  67  * The kernel-level operations that may be performed on a futex are:
  68  *
  69  * FUTEX_WAIT
  70  *
  71  *      Atomically verify that futex1 contains the value val1.  If it
  72  *      doesn't, return EWOULDBLOCK.  If it does contain the expected
  73  *      value, the thread will sleep until somebody performs a FUTEX_WAKE
  74  *      on the futex.  The caller may also specify a timeout, indicating
  75  *      the maximum time the thread should sleep.  If the timer expires,
  76  *      the call returns ETIMEDOUT.  If the thread is awoken with a signal,
  77  *      the call returns EINTR.  Otherwise, the call returns 0.
  78  *
  79  * FUTEX_WAKE
  80  *
  81  *      Wake up val1 processes that are waiting on futex1.  The call
  82  *      returns the number of blocked threads that were woken up.
  83  *
  84  * FUTEX_CMP_REQUEUE
  85  *
  86  *      If the value stored in futex1 matches that passed in in val2, wake
  87  *      up val1 processes that are waiting on futex1.  Otherwise, return
  88  *      EAGAIN.
  89  *
  90  *      If there are more than val1 threads waiting on the futex, remove
  91  *      the remaining threads from this futex, and requeue them on futex2.
  92  *      The caller can limit the number of threads being requeued by
  93  *      encoding an integral numerical value in the position usually used
  94  *      for the timeout pointer.
  95  *
  96  *      The call returns the number of blocked threads that were woken up
  97  *      or requeued.
  98  *
  99  * FUTEX_REQUEUE
 100  *
 101  *       Identical to FUTEX_CMP_REQUEUE except that it does not use val2.
 102  *       This command has been declared broken and obsolete, but we still
 103  *       need to support it.
 104  *
 105  * FUTEX_FD
 106  *
 107  *      Return a file descriptor, which can be used to refer to the futex.
 108  *      We don't support this operation.
 109  */
 110 
 111 /*
 112  * This structure is used to track all the threads currently waiting on a
 113  * futex.  There is one fwaiter_t for each blocked thread.  We store all
 114  * fwaiter_t's in a hash structure, indexed by the memid_t of the integer
 115  * containing the futex's value.
 116  *
 117  * At the moment, all fwaiter_t's for a single futex are simply dumped into
 118  * the hash bucket.  If futex contention ever becomes a hot path, we can
 119  * chain a single futex's waiters together.
 120  */
 121 typedef struct fwaiter {
 122         memid_t         fw_memid;       /* memid of the user-space futex */
 123         kcondvar_t      fw_cv;          /* cond var */
 124         struct fwaiter  *fw_next;       /* hash queue */
 125         struct fwaiter  *fw_prev;       /* hash queue */
 126         volatile int    fw_woken;
 127 } fwaiter_t;
 128 
 129 #define MEMID_COPY(s, d) \
 130         { (d)->val[0] = (s)->val[0]; (d)->val[1] = (s)->val[1]; }
 131 #define MEMID_EQUAL(s, d) \
 132         ((d)->val[0] == (s)->val[0] && (d)->val[1] == (s)->val[1])
 133 
 134 /* Borrowed from the page freelist hash code.  */
 135 #define HASH_SHIFT_SZ   7
 136 #define HASH_SIZE       (1 << HASH_SHIFT_SZ)
 137 #define HASH_FUNC(id)                                           \
 138         ((((uintptr_t)((id)->val[1]) >> PAGESHIFT) +                   \
 139         ((uintptr_t)((id)->val[1]) >> (PAGESHIFT + HASH_SHIFT_SZ)) +   \
 140         ((uintptr_t)((id)->val[0]) >> 3) +                             \
 141         ((uintptr_t)((id)->val[0]) >> (3 + HASH_SHIFT_SZ)) +           \
 142         ((uintptr_t)((id)->val[0]) >> (3 + 2 * HASH_SHIFT_SZ))) &  \
 143         (HASH_SIZE - 1))
 144 
 145 static fwaiter_t *futex_hash[HASH_SIZE];
 146 static kmutex_t futex_hash_lock[HASH_SIZE];
 147 
 148 static void
 149 futex_hashin(fwaiter_t *fwp)
 150 {
 151         int index;
 152 
 153         index = HASH_FUNC(&fwp->fw_memid);
 154         ASSERT(MUTEX_HELD(&futex_hash_lock[index]));
 155 
 156         fwp->fw_prev = NULL;
 157         fwp->fw_next = futex_hash[index];
 158         if (fwp->fw_next)
 159                 fwp->fw_next->fw_prev = fwp;
 160         futex_hash[index] = fwp;
 161 }
 162 
 163 static void
 164 futex_hashout(fwaiter_t *fwp)
 165 {
 166         int index;
 167 
 168         index = HASH_FUNC(&fwp->fw_memid);
 169         ASSERT(MUTEX_HELD(&futex_hash_lock[index]));
 170 
 171         if (fwp->fw_prev)
 172                 fwp->fw_prev->fw_next = fwp->fw_next;
 173         if (fwp->fw_next)
 174                 fwp->fw_next->fw_prev = fwp->fw_prev;
 175         if (futex_hash[index] == fwp)
 176                 futex_hash[index] = fwp->fw_next;
 177 
 178         fwp->fw_prev = NULL;
 179         fwp->fw_next = NULL;
 180 }
 181 
 182 /*
 183  * Go to sleep until somebody does a WAKE operation on this futex, we get a
 184  * signal, or the timeout expires.
 185  */
 186 static int
 187 futex_wait(memid_t *memid, caddr_t addr, int val, timespec_t *timeout)
 188 {
 189         int err, ret;
 190         int32_t curval;
 191         fwaiter_t fw;
 192         int index;
 193 
 194         fw.fw_woken = 0;
 195         MEMID_COPY(memid, &fw.fw_memid);
 196         cv_init(&fw.fw_cv, NULL, CV_DEFAULT, NULL);
 197 
 198         index = HASH_FUNC(&fw.fw_memid);
 199         mutex_enter(&futex_hash_lock[index]);
 200 
 201         if (fuword32(addr, (uint32_t *)&curval)) {
 202                 err = set_errno(EFAULT);
 203                 goto out;
 204         }
 205         if (curval != val) {
 206                 err = set_errno(EWOULDBLOCK);
 207                 goto out;
 208         }
 209 
 210         futex_hashin(&fw);
 211 
 212         err = 0;
 213         while ((fw.fw_woken == 0) && (err == 0)) {
 214                 ret = cv_waituntil_sig(&fw.fw_cv, &futex_hash_lock[index],
 215                         timeout, timechanged);
 216                 if (ret < 0)
 217                         err = set_errno(ETIMEDOUT);
 218                 else if (ret == 0)
 219                         err = set_errno(EINTR);
 220         }
 221 
 222         /*
 223          * The futex is normally hashed out in wakeup.  If we timed out or
 224          * got a signal, we need to hash it out here instead.
 225          */
 226         if (fw.fw_woken == 0)
 227                 futex_hashout(&fw);
 228 
 229 out:
 230         mutex_exit(&futex_hash_lock[index]);
 231 
 232         return (err);
 233 }
 234 
 235 /*
 236  * Wake up to wake_threads threads that are blocked on the futex at memid.
 237  */
 238 static int
 239 futex_wake(memid_t *memid, int wake_threads)
 240 {
 241         fwaiter_t *fwp, *next;
 242         int index;
 243         int ret = 0;
 244 
 245         index = HASH_FUNC(memid);
 246 
 247         mutex_enter(&futex_hash_lock[index]);
 248 
 249         for (fwp = futex_hash[index]; fwp && ret < wake_threads; fwp = next) {
 250                 next = fwp->fw_next;
 251                 if (MEMID_EQUAL(&fwp->fw_memid, memid)) {
 252                         futex_hashout(fwp);
 253                         fwp->fw_woken = 1;
 254                         cv_signal(&fwp->fw_cv);
 255                         ret++;
 256                 }
 257         }
 258 
 259         mutex_exit(&futex_hash_lock[index]);
 260 
 261         return (ret);
 262 }
 263 
 264 /*
 265  * Wake up to wake_threads waiting on the futex at memid.  If there are
 266  * more than that many threads waiting, requeue the remaining threads on
 267  * the futex at requeue_memid.
 268  */
 269 static int
 270 futex_requeue(memid_t *memid, memid_t *requeue_memid, int wake_threads,
 271         ulong_t requeue_threads, caddr_t addr, int *cmpval)
 272 {
 273         fwaiter_t *fwp, *next;
 274         int index1, index2;
 275         int ret = 0;
 276         int32_t curval;
 277         kmutex_t *l1, *l2;
 278 
 279         /*
 280          * To ensure that we don't miss a wakeup if the value of cmpval
 281          * changes, we need to grab locks on both the original and new hash
 282          * buckets.  To avoid deadlock, we always grab the lower-indexed
 283          * lock first.
 284          */
 285         index1 = HASH_FUNC(memid);
 286         index2 = HASH_FUNC(requeue_memid);
 287 
 288         if (index1 == index2) {
 289                 l1 = &futex_hash_lock[index1];
 290                 l2 = NULL;
 291         } else if (index1 < index2) {
 292                 l1 = &futex_hash_lock[index1];
 293                 l2 = &futex_hash_lock[index2];
 294         } else {
 295                 l1 = &futex_hash_lock[index2];
 296                 l2 = &futex_hash_lock[index1];
 297         }
 298 
 299         mutex_enter(l1);
 300         if (l2 != NULL)
 301                 mutex_enter(l2);
 302 
 303         if (cmpval != NULL) {
 304                 if (fuword32(addr, (uint32_t *)&curval)) {
 305                         ret = -EFAULT;
 306                         goto out;
 307                 }
 308                 if (curval != *cmpval) {
 309                         ret = -EAGAIN;
 310                         goto out;
 311                 }
 312         }
 313 
 314         for (fwp = futex_hash[index1]; fwp; fwp = next) {
 315                 next = fwp->fw_next;
 316                 if (!MEMID_EQUAL(&fwp->fw_memid, memid))
 317                         continue;
 318 
 319                 futex_hashout(fwp);
 320                 if (ret++ < wake_threads) {
 321                         fwp->fw_woken = 1;
 322                         cv_signal(&fwp->fw_cv);
 323                 } else {
 324                         MEMID_COPY(requeue_memid, &fwp->fw_memid);
 325                         futex_hashin(fwp);
 326 
 327                         if ((ret - wake_threads) >= requeue_threads)
 328                                 break;
 329                 }
 330         }
 331 
 332 out:
 333         if (l2 != NULL)
 334                 mutex_exit(l2);
 335         mutex_exit(l1);
 336 
 337         if (ret < 0)
 338                 return (set_errno(-ret));
 339         return (ret);
 340 }
 341 
 342 /*
 343  * Copy in the relative timeout provided by the application and convert it
 344  * to an absolute timeout.
 345  */
 346 static int
 347 get_timeout(void *lx_timeout, timestruc_t *timeout)
 348 {
 349         timestruc_t now;
 350 
 351         if (get_udatamodel() == DATAMODEL_NATIVE) {
 352                 if (copyin(lx_timeout, timeout, sizeof (timestruc_t)))
 353                         return (EFAULT);
 354         }
 355 #ifdef _SYSCALL32_IMPL
 356         else {
 357                 timestruc32_t timeout32;
 358                 if (copyin(lx_timeout, &timeout32, sizeof (timestruc32_t)))
 359                         return (EFAULT);
 360                 timeout->tv_sec = (time_t)timeout32.tv_sec;
 361                 timeout->tv_nsec = timeout32.tv_nsec;
 362         }
 363 #endif
 364         gethrestime(&now);
 365 
 366         if (itimerspecfix(timeout))
 367                 return (EINVAL);
 368 
 369         timespecadd(timeout, &now);
 370         return (0);
 371 }
 372 
 373 long
 374 lx_futex(uintptr_t addr, int cmd, int val, uintptr_t lx_timeout,
 375         uintptr_t addr2, int val2)
 376 {
 377         struct as *as = curproc->p_as;
 378         memid_t memid, requeue_memid;
 379         timestruc_t timeout;
 380         timestruc_t *tptr = NULL;
 381         int requeue_threads = NULL;
 382         int *requeue_cmp = NULL;
 383         int rval = 0;
 384 
 385         /* must be aligned on int boundary */
 386         if (addr & 0x3)
 387                 return (set_errno(EINVAL));
 388 
 389         /* Sanity check the futex command */
 390         if (cmd < 0 || cmd > FUTEX_MAX_CMD)
 391                 return (set_errno(EINVAL));
 392 
 393         /* Copy in the timeout structure from userspace. */
 394         if (cmd == FUTEX_WAIT && lx_timeout != NULL) {
 395                 rval = get_timeout((timespec_t *)lx_timeout, &timeout);
 396                 if (rval != 0)
 397                         return (set_errno(rval));
 398                 tptr = &timeout;
 399         }
 400 
 401         if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) {
 402                 if (cmd == FUTEX_CMP_REQUEUE)
 403                         requeue_cmp = &val2;
 404 
 405                 /*
 406                  * lx_timeout is nominally a pointer to a userspace
 407                  * address.  For these two commands, it actually contains
 408                  * an integer which indicates the maximum number of threads
 409                  * to requeue.  This is horrible, and I'm sorry.
 410                  */
 411                 requeue_threads = (int)lx_timeout;
 412         }
 413 
 414         /*
 415          * Translate the process-specific, user-space futex virtual
 416          * address(es) to universal memid.
 417          */
 418         rval = as_getmemid(as, (void *)addr, &memid);
 419         if (rval != 0)
 420                 return (set_errno(rval));
 421 
 422         if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) {
 423                 rval = as_getmemid(as, (void *)addr2, &requeue_memid);
 424                 if (rval)
 425                         return (set_errno(rval));
 426         }
 427 
 428         switch (cmd) {
 429         case FUTEX_WAIT:
 430                 rval = futex_wait(&memid, (void *)addr, val, tptr);
 431                 break;
 432 
 433         case FUTEX_WAKE:
 434                 rval = futex_wake(&memid, val);
 435                 break;
 436 
 437         case FUTEX_CMP_REQUEUE:
 438         case FUTEX_REQUEUE:
 439                 rval = futex_requeue(&memid, &requeue_memid, val,
 440                     requeue_threads, (void *)addr2, requeue_cmp);
 441 
 442                 break;
 443         }
 444 
 445         return (rval);
 446 }
 447 
 448 void
 449 lx_futex_init(void)
 450 {
 451         int i;
 452 
 453         for (i = 0; i < HASH_SIZE; i++)
 454                 mutex_init(&futex_hash_lock[i], NULL, MUTEX_DEFAULT, NULL);
 455         bzero(futex_hash, sizeof (futex_hash));
 456 }
 457 
 458 int
 459 lx_futex_fini(void)
 460 {
 461         int i, err;
 462 
 463         err = 0;
 464         for (i = 0; (err == 0) && (i < HASH_SIZE); i++) {
 465                 mutex_enter(&futex_hash_lock[i]);
 466                 if (futex_hash[i] != NULL)
 467                         err = EBUSY;
 468                 mutex_exit(&futex_hash_lock[i]);
 469         }
 470         return (err);
 471 }