1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/types.h> 29 #include <sys/systm.h> 30 #include <sys/errno.h> 31 #include <sys/debug.h> 32 #include <vm/as.h> 33 #include <vm/seg.h> 34 #include <vm/seg_vn.h> 35 #include <vm/page.h> 36 #include <sys/mman.h> 37 #include <sys/timer.h> 38 #include <sys/condvar.h> 39 #include <sys/inttypes.h> 40 #include <sys/lx_futex.h> 41 42 /* 43 * Futexes are a Linux-specific implementation of inter-process mutexes. 44 * They are designed to use shared memory for simple, uncontested 45 * operations, and rely on the kernel to resolve any contention issues. 46 * 47 * Most of the information in this section comes from the paper "Futexes 48 * Are Tricky", by Ulrich Drepper. This paper is currently available at: 49 * http://people.redhat.com/~drepper/futex.pdf. 50 * 51 * A futex itself a 4-byte integer, which must be 4-byte aligned. The 52 * value of this integer is expected to be modified using user-level atomic 53 * operations. The futex(4) design itself does not impose any semantic 54 * constraints on the value stored in the futex; it is up to the 55 * application to define its own protocol. 56 * 57 * When the application decides that kernel intervention is required, it 58 * will use the futex(2) system call. There are 5 different operations 59 * that can be performed on a futex, using this system call. Since this 60 * interface has evolved over time, there are several different prototypes 61 * available to the user. Fortunately, there is only a single kernel-level 62 * interface: 63 * 64 * long sys_futex(void *futex1, int cmd, int val1, 65 * struct timespec *timeout, void *futex2, int val2) 66 * 67 * The kernel-level operations that may be performed on a futex are: 68 * 69 * FUTEX_WAIT 70 * 71 * Atomically verify that futex1 contains the value val1. If it 72 * doesn't, return EWOULDBLOCK. If it does contain the expected 73 * value, the thread will sleep until somebody performs a FUTEX_WAKE 74 * on the futex. The caller may also specify a timeout, indicating 75 * the maximum time the thread should sleep. If the timer expires, 76 * the call returns ETIMEDOUT. If the thread is awoken with a signal, 77 * the call returns EINTR. Otherwise, the call returns 0. 78 * 79 * FUTEX_WAKE 80 * 81 * Wake up val1 processes that are waiting on futex1. The call 82 * returns the number of blocked threads that were woken up. 83 * 84 * FUTEX_CMP_REQUEUE 85 * 86 * If the value stored in futex1 matches that passed in in val2, wake 87 * up val1 processes that are waiting on futex1. Otherwise, return 88 * EAGAIN. 89 * 90 * If there are more than val1 threads waiting on the futex, remove 91 * the remaining threads from this futex, and requeue them on futex2. 92 * The caller can limit the number of threads being requeued by 93 * encoding an integral numerical value in the position usually used 94 * for the timeout pointer. 95 * 96 * The call returns the number of blocked threads that were woken up 97 * or requeued. 98 * 99 * FUTEX_REQUEUE 100 * 101 * Identical to FUTEX_CMP_REQUEUE except that it does not use val2. 102 * This command has been declared broken and obsolete, but we still 103 * need to support it. 104 * 105 * FUTEX_FD 106 * 107 * Return a file descriptor, which can be used to refer to the futex. 108 * We don't support this operation. 109 */ 110 111 /* 112 * This structure is used to track all the threads currently waiting on a 113 * futex. There is one fwaiter_t for each blocked thread. We store all 114 * fwaiter_t's in a hash structure, indexed by the memid_t of the integer 115 * containing the futex's value. 116 * 117 * At the moment, all fwaiter_t's for a single futex are simply dumped into 118 * the hash bucket. If futex contention ever becomes a hot path, we can 119 * chain a single futex's waiters together. 120 */ 121 typedef struct fwaiter { 122 memid_t fw_memid; /* memid of the user-space futex */ 123 kcondvar_t fw_cv; /* cond var */ 124 struct fwaiter *fw_next; /* hash queue */ 125 struct fwaiter *fw_prev; /* hash queue */ 126 volatile int fw_woken; 127 } fwaiter_t; 128 129 #define MEMID_COPY(s, d) \ 130 { (d)->val[0] = (s)->val[0]; (d)->val[1] = (s)->val[1]; } 131 #define MEMID_EQUAL(s, d) \ 132 ((d)->val[0] == (s)->val[0] && (d)->val[1] == (s)->val[1]) 133 134 /* Borrowed from the page freelist hash code. */ 135 #define HASH_SHIFT_SZ 7 136 #define HASH_SIZE (1 << HASH_SHIFT_SZ) 137 #define HASH_FUNC(id) \ 138 ((((uintptr_t)((id)->val[1]) >> PAGESHIFT) + \ 139 ((uintptr_t)((id)->val[1]) >> (PAGESHIFT + HASH_SHIFT_SZ)) + \ 140 ((uintptr_t)((id)->val[0]) >> 3) + \ 141 ((uintptr_t)((id)->val[0]) >> (3 + HASH_SHIFT_SZ)) + \ 142 ((uintptr_t)((id)->val[0]) >> (3 + 2 * HASH_SHIFT_SZ))) & \ 143 (HASH_SIZE - 1)) 144 145 static fwaiter_t *futex_hash[HASH_SIZE]; 146 static kmutex_t futex_hash_lock[HASH_SIZE]; 147 148 static void 149 futex_hashin(fwaiter_t *fwp) 150 { 151 int index; 152 153 index = HASH_FUNC(&fwp->fw_memid); 154 ASSERT(MUTEX_HELD(&futex_hash_lock[index])); 155 156 fwp->fw_prev = NULL; 157 fwp->fw_next = futex_hash[index]; 158 if (fwp->fw_next) 159 fwp->fw_next->fw_prev = fwp; 160 futex_hash[index] = fwp; 161 } 162 163 static void 164 futex_hashout(fwaiter_t *fwp) 165 { 166 int index; 167 168 index = HASH_FUNC(&fwp->fw_memid); 169 ASSERT(MUTEX_HELD(&futex_hash_lock[index])); 170 171 if (fwp->fw_prev) 172 fwp->fw_prev->fw_next = fwp->fw_next; 173 if (fwp->fw_next) 174 fwp->fw_next->fw_prev = fwp->fw_prev; 175 if (futex_hash[index] == fwp) 176 futex_hash[index] = fwp->fw_next; 177 178 fwp->fw_prev = NULL; 179 fwp->fw_next = NULL; 180 } 181 182 /* 183 * Go to sleep until somebody does a WAKE operation on this futex, we get a 184 * signal, or the timeout expires. 185 */ 186 static int 187 futex_wait(memid_t *memid, caddr_t addr, int val, timespec_t *timeout) 188 { 189 int err, ret; 190 int32_t curval; 191 fwaiter_t fw; 192 int index; 193 194 fw.fw_woken = 0; 195 MEMID_COPY(memid, &fw.fw_memid); 196 cv_init(&fw.fw_cv, NULL, CV_DEFAULT, NULL); 197 198 index = HASH_FUNC(&fw.fw_memid); 199 mutex_enter(&futex_hash_lock[index]); 200 201 if (fuword32(addr, (uint32_t *)&curval)) { 202 err = set_errno(EFAULT); 203 goto out; 204 } 205 if (curval != val) { 206 err = set_errno(EWOULDBLOCK); 207 goto out; 208 } 209 210 futex_hashin(&fw); 211 212 err = 0; 213 while ((fw.fw_woken == 0) && (err == 0)) { 214 ret = cv_waituntil_sig(&fw.fw_cv, &futex_hash_lock[index], 215 timeout, timechanged); 216 if (ret < 0) 217 err = set_errno(ETIMEDOUT); 218 else if (ret == 0) 219 err = set_errno(EINTR); 220 } 221 222 /* 223 * The futex is normally hashed out in wakeup. If we timed out or 224 * got a signal, we need to hash it out here instead. 225 */ 226 if (fw.fw_woken == 0) 227 futex_hashout(&fw); 228 229 out: 230 mutex_exit(&futex_hash_lock[index]); 231 232 return (err); 233 } 234 235 /* 236 * Wake up to wake_threads threads that are blocked on the futex at memid. 237 */ 238 static int 239 futex_wake(memid_t *memid, int wake_threads) 240 { 241 fwaiter_t *fwp, *next; 242 int index; 243 int ret = 0; 244 245 index = HASH_FUNC(memid); 246 247 mutex_enter(&futex_hash_lock[index]); 248 249 for (fwp = futex_hash[index]; fwp && ret < wake_threads; fwp = next) { 250 next = fwp->fw_next; 251 if (MEMID_EQUAL(&fwp->fw_memid, memid)) { 252 futex_hashout(fwp); 253 fwp->fw_woken = 1; 254 cv_signal(&fwp->fw_cv); 255 ret++; 256 } 257 } 258 259 mutex_exit(&futex_hash_lock[index]); 260 261 return (ret); 262 } 263 264 /* 265 * Wake up to wake_threads waiting on the futex at memid. If there are 266 * more than that many threads waiting, requeue the remaining threads on 267 * the futex at requeue_memid. 268 */ 269 static int 270 futex_requeue(memid_t *memid, memid_t *requeue_memid, int wake_threads, 271 ulong_t requeue_threads, caddr_t addr, int *cmpval) 272 { 273 fwaiter_t *fwp, *next; 274 int index1, index2; 275 int ret = 0; 276 int32_t curval; 277 kmutex_t *l1, *l2; 278 279 /* 280 * To ensure that we don't miss a wakeup if the value of cmpval 281 * changes, we need to grab locks on both the original and new hash 282 * buckets. To avoid deadlock, we always grab the lower-indexed 283 * lock first. 284 */ 285 index1 = HASH_FUNC(memid); 286 index2 = HASH_FUNC(requeue_memid); 287 288 if (index1 == index2) { 289 l1 = &futex_hash_lock[index1]; 290 l2 = NULL; 291 } else if (index1 < index2) { 292 l1 = &futex_hash_lock[index1]; 293 l2 = &futex_hash_lock[index2]; 294 } else { 295 l1 = &futex_hash_lock[index2]; 296 l2 = &futex_hash_lock[index1]; 297 } 298 299 mutex_enter(l1); 300 if (l2 != NULL) 301 mutex_enter(l2); 302 303 if (cmpval != NULL) { 304 if (fuword32(addr, (uint32_t *)&curval)) { 305 ret = -EFAULT; 306 goto out; 307 } 308 if (curval != *cmpval) { 309 ret = -EAGAIN; 310 goto out; 311 } 312 } 313 314 for (fwp = futex_hash[index1]; fwp; fwp = next) { 315 next = fwp->fw_next; 316 if (!MEMID_EQUAL(&fwp->fw_memid, memid)) 317 continue; 318 319 futex_hashout(fwp); 320 if (ret++ < wake_threads) { 321 fwp->fw_woken = 1; 322 cv_signal(&fwp->fw_cv); 323 } else { 324 MEMID_COPY(requeue_memid, &fwp->fw_memid); 325 futex_hashin(fwp); 326 327 if ((ret - wake_threads) >= requeue_threads) 328 break; 329 } 330 } 331 332 out: 333 if (l2 != NULL) 334 mutex_exit(l2); 335 mutex_exit(l1); 336 337 if (ret < 0) 338 return (set_errno(-ret)); 339 return (ret); 340 } 341 342 /* 343 * Copy in the relative timeout provided by the application and convert it 344 * to an absolute timeout. 345 */ 346 static int 347 get_timeout(void *lx_timeout, timestruc_t *timeout) 348 { 349 timestruc_t now; 350 351 if (get_udatamodel() == DATAMODEL_NATIVE) { 352 if (copyin(lx_timeout, timeout, sizeof (timestruc_t))) 353 return (EFAULT); 354 } 355 #ifdef _SYSCALL32_IMPL 356 else { 357 timestruc32_t timeout32; 358 if (copyin(lx_timeout, &timeout32, sizeof (timestruc32_t))) 359 return (EFAULT); 360 timeout->tv_sec = (time_t)timeout32.tv_sec; 361 timeout->tv_nsec = timeout32.tv_nsec; 362 } 363 #endif 364 gethrestime(&now); 365 366 if (itimerspecfix(timeout)) 367 return (EINVAL); 368 369 timespecadd(timeout, &now); 370 return (0); 371 } 372 373 long 374 lx_futex(uintptr_t addr, int cmd, int val, uintptr_t lx_timeout, 375 uintptr_t addr2, int val2) 376 { 377 struct as *as = curproc->p_as; 378 memid_t memid, requeue_memid; 379 timestruc_t timeout; 380 timestruc_t *tptr = NULL; 381 int requeue_threads = NULL; 382 int *requeue_cmp = NULL; 383 int rval = 0; 384 385 /* must be aligned on int boundary */ 386 if (addr & 0x3) 387 return (set_errno(EINVAL)); 388 389 /* Sanity check the futex command */ 390 if (cmd < 0 || cmd > FUTEX_MAX_CMD) 391 return (set_errno(EINVAL)); 392 393 /* Copy in the timeout structure from userspace. */ 394 if (cmd == FUTEX_WAIT && lx_timeout != NULL) { 395 rval = get_timeout((timespec_t *)lx_timeout, &timeout); 396 if (rval != 0) 397 return (set_errno(rval)); 398 tptr = &timeout; 399 } 400 401 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) { 402 if (cmd == FUTEX_CMP_REQUEUE) 403 requeue_cmp = &val2; 404 405 /* 406 * lx_timeout is nominally a pointer to a userspace 407 * address. For these two commands, it actually contains 408 * an integer which indicates the maximum number of threads 409 * to requeue. This is horrible, and I'm sorry. 410 */ 411 requeue_threads = (int)lx_timeout; 412 } 413 414 /* 415 * Translate the process-specific, user-space futex virtual 416 * address(es) to universal memid. 417 */ 418 rval = as_getmemid(as, (void *)addr, &memid); 419 if (rval != 0) 420 return (set_errno(rval)); 421 422 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) { 423 rval = as_getmemid(as, (void *)addr2, &requeue_memid); 424 if (rval) 425 return (set_errno(rval)); 426 } 427 428 switch (cmd) { 429 case FUTEX_WAIT: 430 rval = futex_wait(&memid, (void *)addr, val, tptr); 431 break; 432 433 case FUTEX_WAKE: 434 rval = futex_wake(&memid, val); 435 break; 436 437 case FUTEX_CMP_REQUEUE: 438 case FUTEX_REQUEUE: 439 rval = futex_requeue(&memid, &requeue_memid, val, 440 requeue_threads, (void *)addr2, requeue_cmp); 441 442 break; 443 } 444 445 return (rval); 446 } 447 448 void 449 lx_futex_init(void) 450 { 451 int i; 452 453 for (i = 0; i < HASH_SIZE; i++) 454 mutex_init(&futex_hash_lock[i], NULL, MUTEX_DEFAULT, NULL); 455 bzero(futex_hash, sizeof (futex_hash)); 456 } 457 458 int 459 lx_futex_fini(void) 460 { 461 int i, err; 462 463 err = 0; 464 for (i = 0; (err == 0) && (i < HASH_SIZE); i++) { 465 mutex_enter(&futex_hash_lock[i]); 466 if (futex_hash[i] != NULL) 467 err = EBUSY; 468 mutex_exit(&futex_hash_lock[i]); 469 } 470 return (err); 471 }