1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  27 /*        All Rights Reserved   */
  28 
  29 /*
  30  * Inter-Process Communication Semaphore Facility.
  31  *
  32  * See os/ipc.c for a description of common IPC functionality.
  33  *
  34  * Resource controls
  35  * -----------------
  36  *
  37  * Control:      zone.max-sem-ids (rc_zone_semmni)
  38  * Description:  Maximum number of semaphore ids allowed a zone.
  39  *
  40  *   When semget() is used to allocate a semaphore set, one id is
  41  *   allocated.  If the id allocation doesn't succeed, semget() fails
  42  *   and errno is set to ENOSPC.  Upon successful semctl(, IPC_RMID)
  43  *   the id is deallocated.
  44  *
  45  * Control:      project.max-sem-ids (rc_project_semmni)
  46  * Description:  Maximum number of semaphore ids allowed a project.
  47  *
  48  *   When semget() is used to allocate a semaphore set, one id is
  49  *   allocated.  If the id allocation doesn't succeed, semget() fails
  50  *   and errno is set to ENOSPC.  Upon successful semctl(, IPC_RMID)
  51  *   the id is deallocated.
  52  *
  53  * Control:      process.max-sem-nsems (rc_process_semmsl)
  54  * Description:  Maximum number of semaphores allowed per semaphore set.
  55  *
  56  *   When semget() is used to allocate a semaphore set, the size of the
  57  *   set is compared with this limit.  If the number of semaphores
  58  *   exceeds the limit, semget() fails and errno is set to EINVAL.
  59  *
  60  * Control:      process.max-sem-ops (rc_process_semopm)
  61  * Description:  Maximum number of semaphore operations allowed per
  62  *               semop call.
  63  *
  64  *   When semget() successfully allocates a semaphore set, the minimum
  65  *   enforced value of this limit is used to initialize the
  66  *   "system-imposed maximum" number of operations a semop() call for
  67  *   this set can perform.
  68  *
  69  * Undo structures
  70  * ---------------
  71  *
  72  * Removing the undo structure tunables involved a serious redesign of
  73  * how they were implemented.  There is now one undo structure for
  74  * every process/semaphore array combination (lazily allocated, of
  75  * course), and each is equal in size to the semaphore it corresponds
  76  * to.  To avoid scalability and performance problems, the undo
  77  * structures are stored in two places: a per-process AVL tree sorted
  78  * by ksemid pointer (p_semacct, protected by p_lock) and an unsorted
  79  * per-semaphore linked list (sem_undos, protected by the semaphore's
  80  * ID lock).  The former is used by semop, where a lookup is performed
  81  * once and cached if SEM_UNDO is specified for any of the operations,
  82  * and at process exit where the undoable operations are rolled back.
  83  * The latter is used when removing the semaphore, so the undo
  84  * structures can be removed from the appropriate processes' trees.
  85  *
  86  * The undo structure itself contains pointers to the ksemid and proc
  87  * to which it corresponds, a list node, an AVL node, and an array of
  88  * adjust-on-exit (AOE) values.  When an undo structure is allocated it
  89  * is immediately added to both the process's tree and the semaphore's
  90  * list.  Lastly, the reference count on the semaphore is increased.
  91  *
  92  * Avoiding a lock ordering violation between p_lock and the ID lock,
  93  * wont to occur when there is a race between a process exiting and the
  94  * removal of a semaphore, mandates the delicate dance that exists
  95  * between semexit and sem_rmid.
  96  *
  97  * sem_rmid, holding the ID lock, iterates through all undo structures
  98  * and for each takes the appropriate process's p_lock and checks to
  99  * see if p_semacct is NULL.  If it is, it skips that undo structure
 100  * and continues to the next.  Otherwise, it removes the undo structure
 101  * from both the AVL tree and the semaphore's list, and releases the
 102  * hold that the undo structure had on the semaphore.
 103  *
 104  * The important other half of this is semexit, which will immediately
 105  * take p_lock, obtain the AVL pointer, clear p_semacct, and drop
 106  * p_lock.  From this point on it is semexit's responsibility to clean
 107  * up all undo structures found in the tree -- a coexecuting sem_rmid
 108  * will see the NULL p_semacct and skip that undo structure.  It walks
 109  * the AVL tree (using avl_destroy_nodes) and for each undo structure
 110  * takes the appropriate semaphore's ID lock (always legal since the
 111  * undo structure has a hold on the semaphore), updates all semaphores
 112  * with non-zero AOE values, and removes the structure from the
 113  * semaphore's list.  It then drops the structure's reference on the
 114  * semaphore, drops the ID lock, and frees the undo structure.
 115  */
 116 
 117 #include <sys/types.h>
 118 #include <sys/t_lock.h>
 119 #include <sys/param.h>
 120 #include <sys/systm.h>
 121 #include <sys/sysmacros.h>
 122 #include <sys/cred.h>
 123 #include <sys/vmem.h>
 124 #include <sys/kmem.h>
 125 #include <sys/errno.h>
 126 #include <sys/time.h>
 127 #include <sys/ipc.h>
 128 #include <sys/ipc_impl.h>
 129 #include <sys/sem.h>
 130 #include <sys/sem_impl.h>
 131 #include <sys/user.h>
 132 #include <sys/proc.h>
 133 #include <sys/cpuvar.h>
 134 #include <sys/debug.h>
 135 #include <sys/var.h>
 136 #include <sys/cmn_err.h>
 137 #include <sys/modctl.h>
 138 #include <sys/syscall.h>
 139 #include <sys/avl.h>
 140 #include <sys/list.h>
 141 #include <sys/zone.h>
 142 
 143 #include <c2/audit.h>
 144 
/*
 * Resource control handles used by this file (see "Resource controls"
 * at the top of the file).  Only their extern declarations are visible
 * here; they are defined elsewhere.
 */
extern rctl_hndl_t rc_zone_semmni;
extern rctl_hndl_t rc_project_semmni;
extern rctl_hndl_t rc_process_semmsl;
extern rctl_hndl_t rc_process_semopm;
/* IPC service handle for semaphores; set by ipcs_create() in _init(). */
static ipc_service_t *sem_svc;
/* Zone key created in _init(), with sem_remove_zone() as its callback. */
static zone_key_t sem_zone_key;
 151 
/*
 * The following tunables are obsolete.  Though for compatibility we
 * still read and interpret seminfo_semmsl, seminfo_semopm and
 * seminfo_semmni (see os/project.c and os/rctl_proc.c), the preferred
 * mechanism for administering the IPC Semaphore facility is through
 * the resource controls described at the top of this file.
 */
int seminfo_semaem = 16384;     /* (obsolete) */
int seminfo_semmap = 10;        /* (obsolete) */
int seminfo_semmni = 10;        /* (obsolete) */
int seminfo_semmns = 60;        /* (obsolete) */
int seminfo_semmnu = 30;        /* (obsolete) */
int seminfo_semmsl = 25;        /* (obsolete) */
int seminfo_semopm = 10;        /* (obsolete) */
int seminfo_semume = 10;        /* (obsolete) */
int seminfo_semusz = 96;        /* (obsolete) */
int seminfo_semvmx = 32767;     /* (obsolete) */
 169 
 170 #define SEM_MAXUCOPS    4096    /* max # of unchecked ops per semop call */
 171 #define SEM_UNDOSZ(n)   (sizeof (struct sem_undo) + (n - 1) * sizeof (int))
 172 
/*
 * Forward declarations.  semsys() is referenced by ipcsem_sysent below;
 * sem_dtor(), sem_rmid() and sem_remove_zone() are handed out as
 * callbacks in _init() before their definitions appear.
 */
static int semsys(int opcode, uintptr_t a0, uintptr_t a1,
    uintptr_t a2, uintptr_t a3);
static void sem_dtor(kipc_perm_t *);
static void sem_rmid(kipc_perm_t *);
static void sem_remove_zone(zoneid_t, void *);
 178 
/*
 * System call entry for the semaphore facility; semsys() is the handler.
 * NOTE(review): the leading 5 is presumably the argument count (semsys
 * takes an opcode plus four uintptr_t arguments) -- confirm against the
 * definition of struct sysent.
 */
static struct sysent ipcsem_sysent = {
        5,
        SE_NOUNLOAD | SE_ARGC | SE_32RVAL1,
        semsys
};
 184 
/*
 * Module linkage information for the kernel.
 *
 * Both linkage structures point at the same sysent entry; the 32-bit
 * variant is only built and linked in when _SYSCALL32_IMPL is defined.
 */
static struct modlsys modlsys = {
        &mod_syscallops, "System V semaphore facility", &ipcsem_sysent
};

#ifdef _SYSCALL32_IMPL
static struct modlsys modlsys32 = {
        &mod_syscallops32, "32-bit System V semaphore facility", &ipcsem_sysent
};
#endif

/* NULL-terminated list of the linkage structures handed to mod_install(). */
static struct modlinkage modlinkage = {
        MODREV_1,
        &modlsys,
#ifdef _SYSCALL32_IMPL
        &modlsys32,
#endif
        NULL
};
 206 
 207 
 208 int
 209 _init(void)
 210 {
 211         int result;
 212 
 213         sem_svc = ipcs_create("semids", rc_project_semmni, rc_zone_semmni,
 214             sizeof (ksemid_t), sem_dtor, sem_rmid, AT_IPC_SEM,
 215             offsetof(ipc_rqty_t, ipcq_semmni));
 216         zone_key_create(&sem_zone_key, NULL, sem_remove_zone, NULL);
 217 
 218         if ((result = mod_install(&modlinkage)) == 0)
 219                 return (0);
 220 
 221         (void) zone_key_delete(sem_zone_key);
 222         ipcs_destroy(sem_svc);
 223 
 224         return (result);
 225 }
 226 
/*
 * _fini - module unload entry point.
 *
 * Unconditionally returns EBUSY.  NOTE(review): presumably this is what
 * keeps the module from ever being unloaded (the sysent entry is also
 * flagged SE_NOUNLOAD) -- confirm against the module framework.
 */
int
_fini(void)
{
        return (EBUSY);
}
 232 
/*
 * _info - module information entry point.
 *
 * Passes this module's linkage structure and the caller's modinfo
 * buffer to mod_info() and returns its result.
 */
int
_info(struct modinfo *modinfop)
{
        return (mod_info(&modlinkage, modinfop));
}
 238 
 239 static void
 240 sem_dtor(kipc_perm_t *perm)
 241 {
 242         ksemid_t *sp = (ksemid_t *)perm;
 243 
 244         kmem_free(sp->sem_base,
 245             P2ROUNDUP(sp->sem_nsems * sizeof (struct sem), 64));
 246         list_destroy(&sp->sem_undos);
 247 }
 248 
 249 /*
 250  * sem_undo_add - Create or update adjust on exit entry.
 251  */
 252 static int
 253 sem_undo_add(short val, ushort_t num, struct sem_undo *undo)
 254 {
 255         int newval = undo->un_aoe[num] - val;
 256 
 257         if (newval > USHRT_MAX || newval < -USHRT_MAX)
 258                 return (ERANGE);
 259         undo->un_aoe[num] = newval;
 260 
 261         return (0);
 262 }
 263 
 264 /*
 265  * sem_undo_clear - clears all undo entries for specified semaphores
 266  *
 267  * Used when semaphores are reset by SETVAL or SETALL.
 268  */
 269 static void
 270 sem_undo_clear(ksemid_t *sp, ushort_t low, ushort_t high)
 271 {
 272         struct sem_undo *undo;
 273         int i;
 274 
 275         ASSERT(low <= high);
 276         ASSERT(high < sp->sem_nsems);
 277 
 278         for (undo = list_head(&sp->sem_undos); undo;
 279             undo = list_next(&sp->sem_undos, undo))
 280                 for (i = low; i <= high; i++)
 281                         undo->un_aoe[i] = 0;
 282 }
 283 
 284 /*
 285  * sem_rollback - roll back work done so far if unable to complete operation
 286  */
 287 static void
 288 sem_rollback(ksemid_t *sp, struct sembuf *op, int n, struct sem_undo *undo)
 289 {
 290         struct sem *semp;       /* semaphore ptr */
 291 
 292         for (op += n - 1; n--; op--) {
 293                 if (op->sem_op == 0)
 294                         continue;
 295                 semp = &sp->sem_base[op->sem_num];
 296                 semp->semval -= op->sem_op;
 297                 if (op->sem_flg & SEM_UNDO) {
 298                         ASSERT(undo != NULL);
 299                         (void) sem_undo_add(-op->sem_op, op->sem_num, undo);
 300                 }
 301         }
 302 }
 303 
/*
 * sem_rmid - removal callback given to ipcs_create() in _init().
 *
 * Detaches every undo structure from the set being removed, then resets
 * each semaphore and wakes all of its waiters.  Per the "Undo
 * structures" discussion at the top of this file, this runs with the ID
 * lock held and cooperates with semexit(): an undo structure whose
 * process has already cleared p_semacct is only unlinked from sem_undos
 * here; freeing it and dropping its hold is left to semexit().
 */
static void
sem_rmid(kipc_perm_t *perm)
{
        ksemid_t *sp = (ksemid_t *)perm;
        struct sem *semp;
        struct sem_undo *undo;
        size_t size = SEM_UNDOSZ(sp->sem_nsems);
        int i;

        /*LINTED*/
        while (undo = list_head(&sp->sem_undos)) {
                list_remove(&sp->sem_undos, undo);
                mutex_enter(&undo->un_proc->p_lock);
                if (undo->un_proc->p_semacct == NULL) {
                        /*
                         * The owning process is already in semexit(),
                         * which frees this structure and releases its
                         * hold on the set; do neither here.
                         */
                        mutex_exit(&undo->un_proc->p_lock);
                        continue;
                }
                /* Still in the process's tree: unlink, free, drop hold. */
                avl_remove(undo->un_proc->p_semacct, undo);
                mutex_exit(&undo->un_proc->p_lock);
                kmem_free(undo, size);
                ipc_rele_locked(sem_svc, (kipc_perm_t *)sp);
        }

        /*
         * Reset every semaphore and wake anything sleeping on it, zeroing
         * the waiter counts once the broadcasts have been issued.
         */
        for (i = 0; i < sp->sem_nsems; i++) {
                semp = &sp->sem_base[i];
                semp->semval = semp->sempid = 0;
                if (semp->semncnt) {
                        cv_broadcast(&semp->semncnt_cv);
                        semp->semncnt = 0;
                }
                if (semp->semzcnt) {
                        cv_broadcast(&semp->semzcnt_cv);
                        semp->semzcnt = 0;
                }
        }
}
 340 
 341 /*
 342  * semctl - Semctl system call.
 343  */
 344 static int
 345 semctl(int semid, uint_t semnum, int cmd, uintptr_t arg)
 346 {
 347         ksemid_t                *sp;    /* ptr to semaphore header */
 348         struct sem              *p;     /* ptr to semaphore */
 349         unsigned int            i;      /* loop control */
 350         ushort_t                *vals, *vp;
 351         size_t                  vsize = 0;
 352         int                     error = 0;
 353         int                     retval = 0;
 354         struct cred             *cr;
 355         kmutex_t                *lock;
 356         model_t                 mdl = get_udatamodel();
 357         STRUCT_DECL(semid_ds, sid);
 358         struct semid_ds64       ds64;
 359 
 360         STRUCT_INIT(sid, mdl);
 361         cr = CRED();
 362 
 363         /*
 364          * Perform pre- or non-lookup actions (e.g. copyins, RMID).
 365          */
 366         switch (cmd) {
 367         case IPC_SET:
 368                 if (copyin((void *)arg, STRUCT_BUF(sid), STRUCT_SIZE(sid)))
 369                         return (set_errno(EFAULT));
 370                 break;
 371 
 372         case IPC_SET64:
 373                 if (copyin((void *)arg, &ds64, sizeof (struct semid_ds64)))
 374                         return (set_errno(EFAULT));
 375                 break;
 376 
 377         case SETALL:
 378                 if ((lock = ipc_lookup(sem_svc, semid,
 379                     (kipc_perm_t **)&sp)) == NULL)
 380                         return (set_errno(EINVAL));
 381                 vsize = sp->sem_nsems * sizeof (*vals);
 382                 mutex_exit(lock);
 383 
 384                 /* allocate space to hold all semaphore values */
 385                 vals = kmem_alloc(vsize, KM_SLEEP);
 386 
 387                 if (copyin((void *)arg, vals, vsize)) {
 388                         kmem_free(vals, vsize);
 389                         return (set_errno(EFAULT));
 390                 }
 391                 break;
 392 
 393         case IPC_RMID:
 394                 if (error = ipc_rmid(sem_svc, semid, cr))
 395                         return (set_errno(error));
 396                 return (0);
 397         }
 398 
 399         if ((lock = ipc_lookup(sem_svc, semid, (kipc_perm_t **)&sp)) == NULL) {
 400                 if (vsize != 0)
 401                         kmem_free(vals, vsize);
 402                 return (set_errno(EINVAL));
 403         }
 404         switch (cmd) {
 405         /* Set ownership and permissions. */
 406         case IPC_SET:
 407 
 408                 if (error = ipcperm_set(sem_svc, cr, &sp->sem_perm,
 409                     &STRUCT_BUF(sid)->sem_perm, mdl)) {
 410                         mutex_exit(lock);
 411                         return (set_errno(error));
 412                 }
 413                 sp->sem_ctime = gethrestime_sec();
 414                 mutex_exit(lock);
 415                 return (0);
 416 
 417         /* Get semaphore data structure. */
 418         case IPC_STAT:
 419 
 420                 if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
 421                         mutex_exit(lock);
 422                         return (set_errno(error));
 423                 }
 424 
 425                 ipcperm_stat(&STRUCT_BUF(sid)->sem_perm, &sp->sem_perm, mdl);
 426                 STRUCT_FSETP(sid, sem_base, NULL);      /* kernel addr */
 427                 STRUCT_FSET(sid, sem_nsems, sp->sem_nsems);
 428                 STRUCT_FSET(sid, sem_otime, sp->sem_otime);
 429                 STRUCT_FSET(sid, sem_ctime, sp->sem_ctime);
 430                 STRUCT_FSET(sid, sem_binary, sp->sem_binary);
 431                 mutex_exit(lock);
 432 
 433                 if (copyout(STRUCT_BUF(sid), (void *)arg, STRUCT_SIZE(sid)))
 434                         return (set_errno(EFAULT));
 435                 return (0);
 436 
 437         case IPC_SET64:
 438 
 439                 if (error = ipcperm_set64(sem_svc, cr, &sp->sem_perm,
 440                     &ds64.semx_perm)) {
 441                         mutex_exit(lock);
 442                         return (set_errno(error));
 443                 }
 444                 sp->sem_ctime = gethrestime_sec();
 445                 mutex_exit(lock);
 446                 return (0);
 447 
 448         case IPC_STAT64:
 449 
 450                 ipcperm_stat64(&ds64.semx_perm, &sp->sem_perm);
 451                 ds64.semx_nsems = sp->sem_nsems;
 452                 ds64.semx_otime = sp->sem_otime;
 453                 ds64.semx_ctime = sp->sem_ctime;
 454 
 455                 mutex_exit(lock);
 456                 if (copyout(&ds64, (void *)arg, sizeof (struct semid_ds64)))
 457                         return (set_errno(EFAULT));
 458 
 459                 return (0);
 460 
 461         /* Get # of processes sleeping for greater semval. */
 462         case GETNCNT:
 463                 if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
 464                         mutex_exit(lock);
 465                         return (set_errno(error));
 466                 }
 467                 if (semnum >= sp->sem_nsems) {
 468                         mutex_exit(lock);
 469                         return (set_errno(EINVAL));
 470                 }
 471                 retval = sp->sem_base[semnum].semncnt;
 472                 mutex_exit(lock);
 473                 return (retval);
 474 
 475         /* Get pid of last process to operate on semaphore. */
 476         case GETPID:
 477                 if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
 478                         mutex_exit(lock);
 479                         return (set_errno(error));
 480                 }
 481                 if (semnum >= sp->sem_nsems) {
 482                         mutex_exit(lock);
 483                         return (set_errno(EINVAL));
 484                 }
 485                 retval = sp->sem_base[semnum].sempid;
 486                 mutex_exit(lock);
 487                 return (retval);
 488 
 489         /* Get semval of one semaphore. */
 490         case GETVAL:
 491                 if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
 492                         mutex_exit(lock);
 493                         return (set_errno(error));
 494                 }
 495                 if (semnum >= sp->sem_nsems) {
 496                         mutex_exit(lock);
 497                         return (set_errno(EINVAL));
 498                 }
 499                 retval = sp->sem_base[semnum].semval;
 500                 mutex_exit(lock);
 501                 return (retval);
 502 
 503         /* Get all semvals in set. */
 504         case GETALL:
 505                 if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
 506                         mutex_exit(lock);
 507                         return (set_errno(error));
 508                 }
 509 
 510                 /* allocate space to hold all semaphore values */
 511                 vsize = sp->sem_nsems * sizeof (*vals);
 512                 vals = vp = kmem_alloc(vsize, KM_SLEEP);
 513 
 514                 for (i = sp->sem_nsems, p = sp->sem_base; i--; p++, vp++)
 515                         bcopy(&p->semval, vp, sizeof (p->semval));
 516 
 517                 mutex_exit(lock);
 518 
 519                 if (copyout((void *)vals, (void *)arg, vsize)) {
 520                         kmem_free(vals, vsize);
 521                         return (set_errno(EFAULT));
 522                 }
 523 
 524                 kmem_free(vals, vsize);
 525                 return (0);
 526 
 527         /* Get # of processes sleeping for semval to become zero. */
 528         case GETZCNT:
 529                 if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
 530                         mutex_exit(lock);
 531                         return (set_errno(error));
 532                 }
 533                 if (semnum >= sp->sem_nsems) {
 534                         mutex_exit(lock);
 535                         return (set_errno(EINVAL));
 536                 }
 537                 retval = sp->sem_base[semnum].semzcnt;
 538                 mutex_exit(lock);
 539                 return (retval);
 540 
 541         /* Set semval of one semaphore. */
 542         case SETVAL:
 543                 if (error = ipcperm_access(&sp->sem_perm, SEM_A, cr)) {
 544                         mutex_exit(lock);
 545                         return (set_errno(error));
 546                 }
 547                 if (semnum >= sp->sem_nsems) {
 548                         mutex_exit(lock);
 549                         return (set_errno(EINVAL));
 550                 }
 551                 if ((uint_t)arg > USHRT_MAX) {
 552                         mutex_exit(lock);
 553                         return (set_errno(ERANGE));
 554                 }
 555                 p = &sp->sem_base[semnum];
 556                 if ((p->semval = (ushort_t)arg) != 0) {
 557                         if (p->semncnt) {
 558                                 cv_broadcast(&p->semncnt_cv);
 559                         }
 560                 } else if (p->semzcnt) {
 561                         cv_broadcast(&p->semzcnt_cv);
 562                 }
 563                 p->sempid = curproc->p_pid;
 564                 sem_undo_clear(sp, (ushort_t)semnum, (ushort_t)semnum);
 565                 mutex_exit(lock);
 566                 return (0);
 567 
 568         /* Set semvals of all semaphores in set. */
 569         case SETALL:
 570                 /* Check if semaphore set has been deleted and reallocated. */
 571                 if (sp->sem_nsems * sizeof (*vals) != vsize) {
 572                         error = set_errno(EINVAL);
 573                         goto seterr;
 574                 }
 575                 if (error = ipcperm_access(&sp->sem_perm, SEM_A, cr)) {
 576                         error = set_errno(error);
 577                         goto seterr;
 578                 }
 579                 sem_undo_clear(sp, 0, sp->sem_nsems - 1);
 580                 for (i = 0, p = sp->sem_base; i < sp->sem_nsems;
 581                     (p++)->sempid = curproc->p_pid) {
 582                         if ((p->semval = vals[i++]) != 0) {
 583                                 if (p->semncnt) {
 584                                         cv_broadcast(&p->semncnt_cv);
 585                                 }
 586                         } else if (p->semzcnt) {
 587                                 cv_broadcast(&p->semzcnt_cv);
 588                         }
 589                 }
 590 seterr:
 591                 mutex_exit(lock);
 592                 kmem_free(vals, vsize);
 593                 return (error);
 594 
 595         default:
 596                 mutex_exit(lock);
 597                 return (set_errno(EINVAL));
 598         }
 599 
 600         /* NOTREACHED */
 601 }
 602 
 603 /*
 604  * semexit - Called by exit() to clean up on process exit.
 605  */
 606 void
 607 semexit(proc_t *pp)
 608 {
 609         avl_tree_t      *tree;
 610         struct sem_undo *undo;
 611         void            *cookie = NULL;
 612 
 613         mutex_enter(&pp->p_lock);
 614         tree = pp->p_semacct;
 615         pp->p_semacct = NULL;
 616         mutex_exit(&pp->p_lock);
 617 
 618         while (undo = avl_destroy_nodes(tree, &cookie)) {
 619                 ksemid_t *sp = undo->un_sp;
 620                 size_t size = SEM_UNDOSZ(sp->sem_nsems);
 621                 int i;
 622 
 623                 (void) ipc_lock(sem_svc, sp->sem_perm.ipc_id);
 624                 if (!IPC_FREE(&sp->sem_perm)) {
 625                         for (i = 0; i < sp->sem_nsems; i++) {
 626                                 int adj = undo->un_aoe[i];
 627                                 if (adj) {
 628                                         struct sem *semp = &sp->sem_base[i];
 629                                         int v = (int)semp->semval + adj;
 630 
 631                                         if (v < 0 || v > USHRT_MAX)
 632                                                 continue;
 633                                         semp->semval = (ushort_t)v;
 634                                         if (v == 0 && semp->semzcnt)
 635                                                 cv_broadcast(&semp->semzcnt_cv);
 636                                         if (adj > 0 && semp->semncnt)
 637                                                 cv_broadcast(&semp->semncnt_cv);
 638                                 }
 639                         }
 640                         list_remove(&sp->sem_undos, undo);
 641                 }
 642                 ipc_rele(sem_svc, (kipc_perm_t *)sp);
 643                 kmem_free(undo, size);
 644         }
 645 
 646         avl_destroy(tree);
 647         kmem_free(tree, sizeof (avl_tree_t));
 648 }
 649 
/*
 * Remove all semaphores associated with a given zone.  Called by
 * zone_shutdown when the zone is halted.
 *
 * This is the callback given to zone_key_create() in _init(); the work
 * is delegated to ipc_remove_zone().  The arg parameter is unused.
 */
/*ARGSUSED1*/
static void
sem_remove_zone(zoneid_t zoneid, void *arg)
{
        ipc_remove_zone(sem_svc, zoneid);
}
 660 
/*
 * semget - Semget system call.
 *
 *   key     IPC key, passed through to ipc_get()
 *   nsems   number of semaphores wanted in the set
 *   semflg  flags, passed through to ipc_get() and ipc_commit_begin()
 *
 * If ipc_get() finds an existing set, nsems must lie between 0 and the
 * set's size.  Otherwise a new set is initialized: nsems must be
 * positive and permitted by the process.max-sem-nsems resource control,
 * the semaphore array is allocated, and the set is published through
 * ipc_commit_begin()/ipc_commit_end().
 *
 * Returns the set's id on success.  On failure returns the result of
 * set_errno(), with EINVAL for an unacceptable nsems or the error
 * reported by ipc_get()/ipc_commit_begin().
 */
static int
semget(key_t key, int nsems, int semflg)
{
        ksemid_t        *sp;
        kmutex_t        *lock;
        int             id, error;
        proc_t          *pp = curproc;

top:
        if (error = ipc_get(sem_svc, key, semflg, (kipc_perm_t **)&sp, &lock))
                return (set_errno(error));

        if (!IPC_FREE(&sp->sem_perm)) {
                /*
                 * A semaphore with the requested key exists.
                 */
                if (!((nsems >= 0) && (nsems <= sp->sem_nsems))) {
                        mutex_exit(lock);
                        return (set_errno(EINVAL));
                }
        } else {
                /*
                 * This is a new semaphore set.  Finish initialization.
                 *
                 * NOTE(review): both lock and pp->p_lock are released
                 * below, so ipc_get() evidently returns with p_lock held
                 * in this case -- confirm against os/ipc.c.
                 */
                if (nsems <= 0 || (rctl_test(rc_process_semmsl, pp->p_rctls, pp,
                    nsems, RCA_SAFE) & RCT_DENY)) {
                        mutex_exit(lock);
                        mutex_exit(&pp->p_lock);
                        ipc_cleanup(sem_svc, (kipc_perm_t *)sp);
                        return (set_errno(EINVAL));
                }
                mutex_exit(lock);
                mutex_exit(&pp->p_lock);

                /*
                 * We round the allocation up to coherency granularity
                 * so that multiple semaphore allocations won't result
                 * in the false sharing of their sem structures.
                 */
                sp->sem_base =
                    kmem_zalloc(P2ROUNDUP(nsems * sizeof (struct sem), 64),
                    KM_SLEEP);
                sp->sem_binary = (nsems == 1);
                sp->sem_nsems = (ushort_t)nsems;
                sp->sem_ctime = gethrestime_sec();
                sp->sem_otime = 0;
                list_create(&sp->sem_undos, sizeof (struct sem_undo),
                    offsetof(struct sem_undo, un_list));

                /*
                 * EAGAIN restarts the whole lookup from the top.
                 * NOTE(review): sp is not cleaned up here on failure;
                 * presumably ipc_commit_begin() disposes of it itself
                 * -- confirm against os/ipc.c.
                 */
                if (error = ipc_commit_begin(sem_svc, key, semflg,
                    (kipc_perm_t *)sp)) {
                        if (error == EAGAIN)
                                goto top;
                        return (set_errno(error));
                }
                sp->sem_maxops =
                    rctl_enforced_value(rc_process_semopm, pp->p_rctls, pp);
                /*
                 * The semmsl limit is tested a second time here, after
                 * the locks were dropped for the allocation above.
                 */
                if (rctl_test(rc_process_semmsl, pp->p_rctls, pp, nsems,
                    RCA_SAFE) & RCT_DENY) {
                        ipc_cleanup(sem_svc, (kipc_perm_t *)sp);
                        return (set_errno(EINVAL));
                }
                lock = ipc_commit_end(sem_svc, &sp->sem_perm);
        }

        if (AU_AUDITING())
                audit_ipcget(AT_IPC_SEM, (void *)sp);

        /* Read the id while the lock is still held, then release it. */
        id = sp->sem_perm.ipc_id;
        mutex_exit(lock);
        return (id);
}
 736 
 737 /*
 738  * semids system call.
 739  */
 740 static int
 741 semids(int *buf, uint_t nids, uint_t *pnids)
 742 {
 743         int error;
 744 
 745         if (error = ipc_ids(sem_svc, buf, nids, pnids))
 746                 return (set_errno(error));
 747 
 748         return (0);
 749 }
 750 
 751 
 752 /*
 753  * Helper function for semop - copies in the provided timespec and
 754  * computes the absolute future time after which we must return.
 755  */
 756 static int
 757 compute_timeout(timespec_t **tsp, timespec_t *ts, timespec_t *now,
 758         timespec_t *timeout)
 759 {
 760         model_t datamodel = get_udatamodel();
 761 
 762         if (datamodel == DATAMODEL_NATIVE) {
 763                 if (copyin(timeout, ts, sizeof (timespec_t)))
 764                         return (EFAULT);
 765         } else {
 766                 timespec32_t ts32;
 767 
 768                 if (copyin(timeout, &ts32, sizeof (timespec32_t)))
 769                         return (EFAULT);
 770                 TIMESPEC32_TO_TIMESPEC(ts, &ts32)
 771         }
 772 
 773         if (itimerspecfix(ts))
 774                 return (EINVAL);
 775 
 776         /*
 777          * Convert the timespec value into absolute time.
 778          */
 779         timespecadd(ts, now);
 780         *tsp = ts;
 781 
 782         return (0);
 783 }
 784 
 785 /*
 786  * Undo structure comparator.  We sort based on ksemid_t pointer.
 787  */
 788 static int
 789 sem_undo_compar(const void *x, const void *y)
 790 {
 791         struct sem_undo *undo1 = (struct sem_undo *)x;
 792         struct sem_undo *undo2 = (struct sem_undo *)y;
 793 
 794         if (undo1->un_sp < undo2->un_sp)
 795                 return (-1);
 796         if (undo1->un_sp > undo2->un_sp)
 797                 return (1);
 798         return (0);
 799 }
 800 
/*
 * Helper function for semop - creates an undo structure and adds it to
 * the process's avl tree and the semaphore's list.
 *
 *   pp        process the undo structure belongs to
 *   sp        semaphore set the undo structure is for
 *   lock      in/out: the set's ID lock.  It is held on entry, dropped
 *             around the sleeping allocations, and re-acquired through
 *             ipc_lock(); *lock is held again on every return path.
 *   template  search key handed to avl_find() (the tree's comparator
 *             only examines un_sp)
 *   un        out: the undo structure to use -- either one already in
 *             the tree or the newly inserted one
 *
 * Returns 0 on success, or EIDRM if the set was removed while the lock
 * was dropped (*un is not set in that case).
 */
static int
sem_undo_alloc(proc_t *pp, ksemid_t *sp, kmutex_t **lock,
    struct sem_undo *template, struct sem_undo **un)
{
        size_t size;
        struct sem_undo *undo;
        avl_tree_t *tree = NULL;
        avl_index_t where;

        /* Drop the ID lock before the KM_SLEEP allocations below. */
        mutex_exit(*lock);

        size = SEM_UNDOSZ(sp->sem_nsems);
        undo = kmem_zalloc(size, KM_SLEEP);
        undo->un_proc = pp;
        undo->un_sp = sp;

        /*
         * Peek at p_semacct without p_lock to decide whether a tree may
         * be needed; the decision is rechecked under p_lock below.
         */
        if (pp->p_semacct == NULL)
                tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);

        *lock = ipc_lock(sem_svc, sp->sem_perm.ipc_id);
        if (IPC_FREE(&sp->sem_perm)) {
                /* The set was removed while the lock was dropped. */
                kmem_free(undo, size);
                if (tree)
                        kmem_free(tree, sizeof (avl_tree_t));
                return (EIDRM);
        }

        mutex_enter(&pp->p_lock);
        if (tree) {
                if (pp->p_semacct == NULL) {
                        avl_create(tree, sem_undo_compar,
                            sizeof (struct sem_undo),
                            offsetof(struct sem_undo, un_avl));
                        pp->p_semacct = tree;
                } else {
                        /* A tree appeared in the meantime; ours is spare. */
                        kmem_free(tree, sizeof (avl_tree_t));
                }
        }

        if (*un = avl_find(pp->p_semacct, template, &where)) {
                /* An undo structure for this set already exists; use it. */
                mutex_exit(&pp->p_lock);
                kmem_free(undo, size);
        } else {
                /*
                 * Publish the new structure in the process's tree and on
                 * the set's list, and take a hold on the set for it.
                 */
                *un = undo;
                avl_insert(pp->p_semacct, undo, where);
                mutex_exit(&pp->p_lock);
                list_insert_head(&sp->sem_undos, undo);
                ipc_hold(sem_svc, (kipc_perm_t *)sp);
        }


        return (0);
}
 858 
 859 /*
 860  * semop - Semop system call.
 861  */
 862 static int
 863 semop(int semid, struct sembuf *sops, size_t nsops, timespec_t *timeout)
 864 {
 865         ksemid_t        *sp = NULL;
 866         kmutex_t        *lock;
 867         struct sembuf   *op;    /* ptr to operation */
 868         int             i;      /* loop control */
 869         struct sem      *semp;  /* ptr to semaphore */
 870         int             error = 0;
 871         struct sembuf   *uops;  /* ptr to copy of user ops */
 872         struct sembuf   x_sem;  /* avoid kmem_alloc's */
 873         timespec_t      now, ts, *tsp = NULL;
 874         int             timecheck = 0;
 875         int             cvres, needundo, mode;
 876         struct sem_undo *undo;
 877         proc_t          *pp = curproc;
 878         int             held = 0;
 879 
 880         CPU_STATS_ADDQ(CPU, sys, sema, 1); /* bump semaphore op count */
 881 
 882         /*
 883          * To avoid the cost of copying in 'timeout' in the common
 884          * case, we could only grab the time here and defer the copyin
 885          * and associated computations until we are about to block.
 886          *
 887          * The down side to this is that we would then have to spin
 888          * some goto top nonsense to avoid the copyin behind the semid
 889          * lock.  As a common use of timed semaphores is as an explicit
 890          * blocking mechanism, this could incur a greater penalty.
 891          *
 892          * If we eventually decide that this would be a wise route to
 893          * take, the deferrable functionality is completely contained
 894          * in 'compute_timeout', and the interface is defined such that
 895          * we can legally not validate 'timeout' if it is unused.
 896          */
 897         if (timeout != NULL) {
 898                 timecheck = timechanged;
 899                 gethrestime(&now);
 900                 if (error = compute_timeout(&tsp, &ts, &now, timeout))
 901                         return (set_errno(error));
 902         }
 903 
 904         /*
 905          * Allocate space to hold the vector of semaphore ops.  If
 906          * there is only 1 operation we use a preallocated buffer on
 907          * the stack for speed.
 908          *
 909          * Since we don't want to allow the user to allocate an
 910          * arbitrary amount of kernel memory, we need to check against
 911          * the number of operations allowed by the semaphore.  We only
 912          * bother doing this if the number of operations is larger than
 913          * SEM_MAXUCOPS.
 914          */
 915         if (nsops == 1)
 916                 uops = &x_sem;
 917         else if (nsops == 0)
 918                 return (0);
 919         else if (nsops <= SEM_MAXUCOPS)
 920                 uops = kmem_alloc(nsops * sizeof (*uops), KM_SLEEP);
 921 
 922         if (nsops > SEM_MAXUCOPS) {
 923                 if ((lock = ipc_lookup(sem_svc, semid,
 924                     (kipc_perm_t **)&sp)) == NULL)
 925                         return (set_errno(EFAULT));
 926 
 927                 if (nsops > sp->sem_maxops) {
 928                         mutex_exit(lock);
 929                         return (set_errno(E2BIG));
 930                 }
 931                 held = 1;
 932                 ipc_hold(sem_svc, (kipc_perm_t *)sp);
 933                 mutex_exit(lock);
 934 
 935                 uops = kmem_alloc(nsops * sizeof (*uops), KM_SLEEP);
 936                 if (copyin(sops, uops, nsops * sizeof (*op))) {
 937                         error = EFAULT;
 938                         (void) ipc_lock(sem_svc, sp->sem_perm.ipc_id);
 939                         goto semoperr;
 940                 }
 941 
 942                 lock = ipc_lock(sem_svc, sp->sem_perm.ipc_id);
 943                 if (IPC_FREE(&sp->sem_perm)) {
 944                         error = EIDRM;
 945                         goto semoperr;
 946                 }
 947         } else {
 948                 /*
 949                  * This could be interleaved with the above code, but
 950                  * keeping them separate improves readability.
 951                  */
 952                 if (copyin(sops, uops, nsops * sizeof (*op))) {
 953                         error = EFAULT;
 954                         goto semoperr_unlocked;
 955                 }
 956 
 957                 if ((lock = ipc_lookup(sem_svc, semid,
 958                     (kipc_perm_t **)&sp)) == NULL) {
 959                         error = EINVAL;
 960                         goto semoperr_unlocked;
 961                 }
 962 
 963                 if (nsops > sp->sem_maxops) {
 964                         error = E2BIG;
 965                         goto semoperr;
 966                 }
 967         }
 968 
 969         /*
 970          * Scan all operations.  Verify that sem #s are in range and
 971          * this process is allowed the requested operations.  If any
 972          * operations are marked SEM_UNDO, find (or allocate) the undo
 973          * structure for this process and semaphore.
 974          */
 975         needundo = 0;
 976         mode = 0;
 977         for (i = 0, op = uops; i++ < nsops; op++) {
 978                 mode |= op->sem_op ? SEM_A : SEM_R;
 979                 if (op->sem_num >= sp->sem_nsems) {
 980                         error = EFBIG;
 981                         goto semoperr;
 982                 }
 983                 if ((op->sem_flg & SEM_UNDO) && op->sem_op)
 984                         needundo = 1;
 985         }
 986         if (error = ipcperm_access(&sp->sem_perm, mode, CRED()))
 987                 goto semoperr;
 988 
 989         if (needundo) {
 990                 struct sem_undo template;
 991 
 992                 template.un_sp = sp;
 993                 mutex_enter(&pp->p_lock);
 994                 if (pp->p_semacct)
 995                         undo = avl_find(pp->p_semacct, &template, NULL);
 996                 else
 997                         undo = NULL;
 998                 mutex_exit(&pp->p_lock);
 999                 if (undo == NULL) {
1000                         if (!held) {
1001                                 held = 1;
1002                                 ipc_hold(sem_svc, (kipc_perm_t *)sp);
1003                         }
1004                         if (error = sem_undo_alloc(pp, sp, &lock, &template,
1005                             &undo))
1006                                 goto semoperr;
1007 
1008                         /* sem_undo_alloc unlocks the semaphore */
1009                         if (error = ipcperm_access(&sp->sem_perm, mode, CRED()))
1010                                 goto semoperr;
1011                 }
1012         }
1013 
1014 check:
1015         /*
1016          * Loop waiting for the operations to be satisfied atomically.
1017          * Actually, do the operations and undo them if a wait is needed
1018          * or an error is detected.
1019          */
1020         for (i = 0; i < nsops; i++) {
1021                 op = &uops[i];
1022                 semp = &sp->sem_base[op->sem_num];
1023 
1024                 /*
1025                  * Raise the semaphore (i.e. sema_v)
1026                  */
1027                 if (op->sem_op > 0) {
1028                         if (op->sem_op + (int)semp->semval > USHRT_MAX ||
1029                             ((op->sem_flg & SEM_UNDO) &&
1030                             (error = sem_undo_add(op->sem_op, op->sem_num,
1031                             undo)))) {
1032                                 if (i)
1033                                         sem_rollback(sp, uops, i, undo);
1034                                 if (error == 0)
1035                                         error = ERANGE;
1036                                 goto semoperr;
1037                         }
1038                         semp->semval += op->sem_op;
1039                         /*
1040                          * If we are only incrementing the semaphore value
1041                          * by one on a binary semaphore, we can cv_signal.
1042                          */
1043                         if (semp->semncnt) {
1044                                 if (op->sem_op == 1 && sp->sem_binary)
1045                                         cv_signal(&semp->semncnt_cv);
1046                                 else
1047                                         cv_broadcast(&semp->semncnt_cv);
1048                         }
1049                         if (semp->semzcnt && !semp->semval)
1050                                 cv_broadcast(&semp->semzcnt_cv);
1051                         continue;
1052                 }
1053 
1054                 /*
1055                  * Lower the semaphore (i.e. sema_p)
1056                  */
1057                 if (op->sem_op < 0) {
1058                         if (semp->semval >= (unsigned)(-op->sem_op)) {
1059                                 if ((op->sem_flg & SEM_UNDO) &&
1060                                     (error = sem_undo_add(op->sem_op,
1061                                     op->sem_num, undo))) {
1062                                         if (i)
1063                                                 sem_rollback(sp, uops, i, undo);
1064                                         goto semoperr;
1065                                 }
1066                                 semp->semval += op->sem_op;
1067                                 if (semp->semzcnt && !semp->semval)
1068                                         cv_broadcast(&semp->semzcnt_cv);
1069                                 continue;
1070                         }
1071                         if (i)
1072                                 sem_rollback(sp, uops, i, undo);
1073                         if (op->sem_flg & IPC_NOWAIT) {
1074                                 error = EAGAIN;
1075                                 goto semoperr;
1076                         }
1077 
1078                         /*
1079                          * Mark the semaphore set as not a binary type
1080                          * if we are decrementing the value by more than 1.
1081                          *
1082                          * V operations will resort to cv_broadcast
1083                          * for this set because there are too many weird
1084                          * cases that have to be caught.
1085                          */
1086                         if (op->sem_op < -1)
1087                                 sp->sem_binary = 0;
1088                         if (!held) {
1089                                 held = 1;
1090                                 ipc_hold(sem_svc, (kipc_perm_t *)sp);
1091                         }
1092                         semp->semncnt++;
1093                         cvres = cv_waituntil_sig(&semp->semncnt_cv, lock,
1094                             tsp, timecheck);
1095                         lock = ipc_relock(sem_svc, sp->sem_perm.ipc_id, lock);
1096 
1097                         if (!IPC_FREE(&sp->sem_perm)) {
1098                                 ASSERT(semp->semncnt != 0);
1099                                 semp->semncnt--;
1100                                 if (cvres > 0)       /* normal wakeup */
1101                                         goto check;
1102                         }
1103 
1104                         /* EINTR or EAGAIN overrides EIDRM */
1105                         if (cvres == 0)
1106                                 error = EINTR;
1107                         else if (cvres < 0)
1108                                 error = EAGAIN;
1109                         else
1110                                 error = EIDRM;
1111                         goto semoperr;
1112                 }
1113 
1114                 /*
1115                  * Wait for zero value
1116                  */
1117                 if (semp->semval) {
1118                         if (i)
1119                                 sem_rollback(sp, uops, i, undo);
1120                         if (op->sem_flg & IPC_NOWAIT) {
1121                                 error = EAGAIN;
1122                                 goto semoperr;
1123                         }
1124 
1125                         if (!held) {
1126                                 held = 1;
1127                                 ipc_hold(sem_svc, (kipc_perm_t *)sp);
1128                         }
1129                         semp->semzcnt++;
1130                         cvres = cv_waituntil_sig(&semp->semzcnt_cv, lock,
1131                             tsp, timecheck);
1132                         lock = ipc_relock(sem_svc, sp->sem_perm.ipc_id, lock);
1133 
1134                         /*
1135                          * Don't touch semp if the semaphores have been removed.
1136                          */
1137                         if (!IPC_FREE(&sp->sem_perm)) {
1138                                 ASSERT(semp->semzcnt != 0);
1139                                 semp->semzcnt--;
1140                                 if (cvres > 0)       /* normal wakeup */
1141                                         goto check;
1142                         }
1143 
1144                         /* EINTR or EAGAIN overrides EIDRM */
1145                         if (cvres == 0)
1146                                 error = EINTR;
1147                         else if (cvres < 0)
1148                                 error = EAGAIN;
1149                         else
1150                                 error = EIDRM;
1151                         goto semoperr;
1152                 }
1153         }
1154 
1155         /* All operations succeeded.  Update sempid for accessed semaphores. */
1156         for (i = 0, op = uops; i++ < nsops;
1157             sp->sem_base[(op++)->sem_num].sempid = pp->p_pid)
1158                 ;
1159         sp->sem_otime = gethrestime_sec();
1160         if (held)
1161                 ipc_rele(sem_svc, (kipc_perm_t *)sp);
1162         else
1163                 mutex_exit(lock);
1164 
1165         /* Before leaving, deallocate the buffer that held the user semops */
1166         if (nsops != 1)
1167                 kmem_free(uops, sizeof (*uops) * nsops);
1168         return (0);
1169 
1170         /*
1171          * Error return labels
1172          */
1173 semoperr:
1174         if (held)
1175                 ipc_rele(sem_svc, (kipc_perm_t *)sp);
1176         else
1177                 mutex_exit(lock);
1178 
1179 semoperr_unlocked:
1180 
1181         /* Before leaving, deallocate the buffer that held the user semops */
1182         if (nsops != 1)
1183                 kmem_free(uops, sizeof (*uops) * nsops);
1184         return (set_errno(error));
1185 }
1186 
1187 /*
1188  * semsys - System entry point for semctl, semget, and semop system calls.
1189  */
1190 static int
1191 semsys(int opcode, uintptr_t a1, uintptr_t a2, uintptr_t a3, uintptr_t a4)
1192 {
1193         int error;
1194 
1195         switch (opcode) {
1196         case SEMCTL:
1197                 error = semctl((int)a1, (uint_t)a2, (int)a3, a4);
1198                 break;
1199         case SEMGET:
1200                 error = semget((key_t)a1, (int)a2, (int)a3);
1201                 break;
1202         case SEMOP:
1203                 error = semop((int)a1, (struct sembuf *)a2, (size_t)a3, 0);
1204                 break;
1205         case SEMIDS:
1206                 error = semids((int *)a1, (uint_t)a2, (uint_t *)a3);
1207                 break;
1208         case SEMTIMEDOP:
1209                 error = semop((int)a1, (struct sembuf *)a2, (size_t)a3,
1210                     (timespec_t *)a4);
1211                 break;
1212         default:
1213                 error = set_errno(EINVAL);
1214                 break;
1215         }
1216         return (error);
1217 }