1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  27 /*        All Rights Reserved   */
  28 
  29 /*
  30  * Inter-Process Communication Semaphore Facility.
  31  *
  32  * See os/ipc.c for a description of common IPC functionality.
  33  *
  34  * Resource controls
  35  * -----------------
  36  *
  37  * Control:      zone.max-sem-ids (rc_zone_semmni)
  38  * Description:  Maximum number of semaphore ids allowed a zone.
  39  *
  40  *   When semget() is used to allocate a semaphore set, one id is
  41  *   allocated.  If the id allocation doesn't succeed, semget() fails
  42  *   and errno is set to ENOSPC.  Upon successful semctl(, IPC_RMID)
  43  *   the id is deallocated.
  44  *
  45  * Control:      project.max-sem-ids (rc_project_semmni)
  46  * Description:  Maximum number of semaphore ids allowed a project.
  47  *
  48  *   When semget() is used to allocate a semaphore set, one id is
  49  *   allocated.  If the id allocation doesn't succeed, semget() fails
  50  *   and errno is set to ENOSPC.  Upon successful semctl(, IPC_RMID)
  51  *   the id is deallocated.
  52  *
  53  * Control:      process.max-sem-nsems (rc_process_semmsl)
  54  * Description:  Maximum number of semaphores allowed per semaphore set.
  55  *
  56  *   When semget() is used to allocate a semaphore set, the size of the
  57  *   set is compared with this limit.  If the number of semaphores
  58  *   exceeds the limit, semget() fails and errno is set to EINVAL.
  59  *
  60  * Control:      process.max-sem-ops (rc_process_semopm)
  61  * Description:  Maximum number of semaphore operations allowed per
  62  *               semop call.
  63  *
  64  *   When semget() successfully allocates a semaphore set, the minimum
  65  *   enforced value of this limit is used to initialize the
  66  *   "system-imposed maximum" number of operations a semop() call for
  67  *   this set can perform.
  68  *
  69  * Undo structures
  70  * ---------------
  71  *
  72  * Removing the undo structure tunables involved a serious redesign of
  73  * how they were implemented.  There is now one undo structure for
  74  * every process/semaphore array combination (lazily allocated, of
  75  * course), and each is equal in size to the semaphore it corresponds
  76  * to.  To avoid scalability and performance problems, the undo
  77  * structures are stored in two places: a per-process AVL tree sorted
  78  * by ksemid pointer (p_semacct, protected by p_lock) and an unsorted
  79  * per-semaphore linked list (sem_undos, protected by the semaphore's
  80  * ID lock).  The former is used by semop, where a lookup is performed
  81  * once and cached if SEM_UNDO is specified for any of the operations,
  82  * and at process exit where the undoable operations are rolled back.
  83  * The latter is used when removing the semaphore, so the undo
  84  * structures can be removed from the appropriate processes' trees.
  85  *
  86  * The undo structure itself contains pointers to the ksemid and proc
  87  * to which it corresponds, a list node, an AVL node, and an array of
  88  * adjust-on-exit (AOE) values.  When an undo structure is allocated it
  89  * is immediately added to both the process's tree and the semaphore's
  90  * list.  Lastly, the reference count on the semaphore is increased.
  91  *
  92  * Avoiding a lock ordering violation between p_lock and the ID lock,
  93  * wont to occur when there is a race between a process exiting and the
  94  * removal of a semaphore, mandates the delicate dance that exists
  95  * between semexit and sem_rmid.
  96  *
  97  * sem_rmid, holding the ID lock, iterates through all undo structures
  98  * and for each takes the appropriate process's p_lock and checks to
  99  * see if p_semacct is NULL.  If it is, it skips that undo structure
 100  * and continues to the next.  Otherwise, it removes the undo structure
 101  * from both the AVL tree and the semaphore's list, and releases the
 102  * hold that the undo structure had on the semaphore.
 103  *
 104  * The important other half of this is semexit, which will immediately
 105  * take p_lock, obtain the AVL pointer, clear p_semacct, and drop
 106  * p_lock.  From this point on it is semexit's responsibility to clean
 107  * up all undo structures found in the tree -- a coexecuting sem_rmid
 108  * will see the NULL p_semacct and skip that undo structure.  It walks
 109  * the AVL tree (using avl_destroy_nodes) and for each undo structure
 110  * takes the appropriate semaphore's ID lock (always legal since the
 111  * undo structure has a hold on the semaphore), updates all semaphores
 112  * with non-zero AOE values, and removes the structure from the
 113  * semaphore's list.  It then drops the structure's reference on the
 114  * semaphore, drops the ID lock, and frees the undo structure.
 115  */
 116 
 117 #include <sys/types.h>
 118 #include <sys/t_lock.h>
 119 #include <sys/param.h>
 120 #include <sys/systm.h>
 121 #include <sys/sysmacros.h>
 122 #include <sys/cred.h>
 123 #include <sys/vmem.h>
 124 #include <sys/kmem.h>
 125 #include <sys/errno.h>
 126 #include <sys/time.h>
 127 #include <sys/ipc.h>
 128 #include <sys/ipc_impl.h>
 129 #include <sys/sem.h>
 130 #include <sys/sem_impl.h>
 131 #include <sys/user.h>
 132 #include <sys/proc.h>
 133 #include <sys/cpuvar.h>
 134 #include <sys/debug.h>
 135 #include <sys/var.h>
 136 #include <sys/cmn_err.h>
 137 #include <sys/modctl.h>
 138 #include <sys/syscall.h>
 139 #include <sys/avl.h>
 140 #include <sys/list.h>
 141 #include <sys/zone.h>
 142 
 143 #include <c2/audit.h>
 144 
 145 extern rctl_hndl_t rc_zone_semmni;
 146 extern rctl_hndl_t rc_project_semmni;
 147 extern rctl_hndl_t rc_process_semmsl;
 148 extern rctl_hndl_t rc_process_semopm;
 149 static ipc_service_t *sem_svc;
 150 static zone_key_t sem_zone_key;
 151 
 152 /*
 153  * The following tunables are obsolete.  Though for compatibility we
 154  * still read and interpret seminfo_semmsl, seminfo_semopm and
 155  * seminfo_semmni (see os/project.c and os/rctl_proc.c), the preferred
 156  * mechanism for administrating the IPC Semaphore facility is through
 157  * the resource controls described at the top of this file.
 158  */
 159 int seminfo_semaem = 16384;     /* (obsolete) */
 160 int seminfo_semmap = 10;        /* (obsolete) */
 161 int seminfo_semmni = 10;        /* (obsolete) */
 162 int seminfo_semmns = 60;        /* (obsolete) */
 163 int seminfo_semmnu = 30;        /* (obsolete) */
 164 int seminfo_semmsl = 25;        /* (obsolete) */
 165 int seminfo_semopm = 10;        /* (obsolete) */
 166 int seminfo_semume = 10;        /* (obsolete) */
 167 int seminfo_semusz = 96;        /* (obsolete) */
 168 int seminfo_semvmx = 32767;     /* (obsolete) */
 169 
 170 #define SEM_MAXUCOPS    4096    /* max # of unchecked ops per semop call */
 171 #define SEM_UNDOSZ(n)   (sizeof (struct sem_undo) + (n - 1) * sizeof (int))
 172 
 173 static int semsys(int opcode, uintptr_t a0, uintptr_t a1,
 174     uintptr_t a2, uintptr_t a3);
 175 static void sem_dtor(kipc_perm_t *);
 176 static void sem_rmid(kipc_perm_t *);
 177 static void sem_remove_zone(zoneid_t, void *);
 178 
 179 static struct sysent ipcsem_sysent = {
 180         5,
 181         SE_NOUNLOAD | SE_ARGC | SE_32RVAL1,
 182         semsys
 183 };
 184 
 185 /*
 186  * Module linkage information for the kernel.
 187  */
 188 static struct modlsys modlsys = {
 189         &mod_syscallops, "System V semaphore facility", &ipcsem_sysent
 190 };
 191 
 192 #ifdef _SYSCALL32_IMPL
 193 static struct modlsys modlsys32 = {
 194         &mod_syscallops32, "32-bit System V semaphore facility", &ipcsem_sysent
 195 };
 196 #endif
 197 
 198 static struct modlinkage modlinkage = {
 199         MODREV_1,
 200         {   &modlsys,
 201 #ifdef _SYSCALL32_IMPL
 202             &modlsys32,
 203 #endif
 204             NULL
 205         }
 206 };
 207 
 208 
 209 int
 210 _init(void)
 211 {
 212         int result;
 213 
 214         sem_svc = ipcs_create("semids", rc_project_semmni, rc_zone_semmni,
 215             sizeof (ksemid_t), sem_dtor, sem_rmid, AT_IPC_SEM,
 216             offsetof(ipc_rqty_t, ipcq_semmni));
 217         zone_key_create(&sem_zone_key, NULL, sem_remove_zone, NULL);
 218 
 219         if ((result = mod_install(&modlinkage)) == 0)
 220                 return (0);
 221 
 222         (void) zone_key_delete(sem_zone_key);
 223         ipcs_destroy(sem_svc);
 224 
 225         return (result);
 226 }
 227 
 228 int
 229 _fini(void)
 230 {
 231         return (EBUSY);
 232 }
 233 
 234 int
 235 _info(struct modinfo *modinfop)
 236 {
 237         return (mod_info(&modlinkage, modinfop));
 238 }
 239 
 240 static void
 241 sem_dtor(kipc_perm_t *perm)
 242 {
 243         ksemid_t *sp = (ksemid_t *)perm;
 244 
 245         kmem_free(sp->sem_base,
 246             P2ROUNDUP(sp->sem_nsems * sizeof (struct sem), 64));
 247         list_destroy(&sp->sem_undos);
 248 }
 249 
 250 /*
 251  * sem_undo_add - Create or update adjust on exit entry.
 252  */
 253 static int
 254 sem_undo_add(short val, ushort_t num, struct sem_undo *undo)
 255 {
 256         int newval = undo->un_aoe[num] - val;
 257 
 258         if (newval > USHRT_MAX || newval < -USHRT_MAX)
 259                 return (ERANGE);
 260         undo->un_aoe[num] = newval;
 261 
 262         return (0);
 263 }
 264 
 265 /*
 266  * sem_undo_clear - clears all undo entries for specified semaphores
 267  *
 268  * Used when semaphores are reset by SETVAL or SETALL.
 269  */
 270 static void
 271 sem_undo_clear(ksemid_t *sp, ushort_t low, ushort_t high)
 272 {
 273         struct sem_undo *undo;
 274         int i;
 275 
 276         ASSERT(low <= high);
 277         ASSERT(high < sp->sem_nsems);
 278 
 279         for (undo = list_head(&sp->sem_undos); undo;
 280             undo = list_next(&sp->sem_undos, undo))
 281                 for (i = low; i <= high; i++)
 282                         undo->un_aoe[i] = 0;
 283 }
 284 
 285 /*
 286  * sem_rollback - roll back work done so far if unable to complete operation
 287  */
 288 static void
 289 sem_rollback(ksemid_t *sp, struct sembuf *op, int n, struct sem_undo *undo)
 290 {
 291         struct sem *semp;       /* semaphore ptr */
 292 
 293         for (op += n - 1; n--; op--) {
 294                 if (op->sem_op == 0)
 295                         continue;
 296                 semp = &sp->sem_base[op->sem_num];
 297                 semp->semval -= op->sem_op;
 298                 if (op->sem_flg & SEM_UNDO) {
 299                         ASSERT(undo != NULL);
 300                         (void) sem_undo_add(-op->sem_op, op->sem_num, undo);
 301                 }
 302         }
 303 }
 304 
 305 static void
 306 sem_rmid(kipc_perm_t *perm)
 307 {
 308         ksemid_t *sp = (ksemid_t *)perm;
 309         struct sem *semp;
 310         struct sem_undo *undo;
 311         size_t size = SEM_UNDOSZ(sp->sem_nsems);
 312         int i;
 313 
 314         /*LINTED*/
 315         while (undo = list_head(&sp->sem_undos)) {
 316                 list_remove(&sp->sem_undos, undo);
 317                 mutex_enter(&undo->un_proc->p_lock);
 318                 if (undo->un_proc->p_semacct == NULL) {
 319                         mutex_exit(&undo->un_proc->p_lock);
 320                         continue;
 321                 }
 322                 avl_remove(undo->un_proc->p_semacct, undo);
 323                 mutex_exit(&undo->un_proc->p_lock);
 324                 kmem_free(undo, size);
 325                 ipc_rele_locked(sem_svc, (kipc_perm_t *)sp);
 326         }
 327 
 328         for (i = 0; i < sp->sem_nsems; i++) {
 329                 semp = &sp->sem_base[i];
 330                 semp->semval = semp->sempid = 0;
 331                 if (semp->semncnt) {
 332                         cv_broadcast(&semp->semncnt_cv);
 333                         semp->semncnt = 0;
 334                 }
 335                 if (semp->semzcnt) {
 336                         cv_broadcast(&semp->semzcnt_cv);
 337                         semp->semzcnt = 0;
 338                 }
 339         }
 340 }
 341 
 342 /*
 343  * semctl - Semctl system call.
 344  */
 345 static int
 346 semctl(int semid, uint_t semnum, int cmd, uintptr_t arg)
 347 {
 348         ksemid_t                *sp;    /* ptr to semaphore header */
 349         struct sem              *p;     /* ptr to semaphore */
 350         unsigned int            i;      /* loop control */
 351         ushort_t                *vals, *vp;
 352         size_t                  vsize = 0;
 353         int                     error = 0;
 354         int                     retval = 0;
 355         struct cred             *cr;
 356         kmutex_t                *lock;
 357         model_t                 mdl = get_udatamodel();
 358         STRUCT_DECL(semid_ds, sid);
 359         struct semid_ds64       ds64;
 360 
 361         STRUCT_INIT(sid, mdl);
 362         cr = CRED();
 363 
 364         /*
 365          * Perform pre- or non-lookup actions (e.g. copyins, RMID).
 366          */
 367         switch (cmd) {
 368         case IPC_SET:
 369                 if (copyin((void *)arg, STRUCT_BUF(sid), STRUCT_SIZE(sid)))
 370                         return (set_errno(EFAULT));
 371                 break;
 372 
 373         case IPC_SET64:
 374                 if (copyin((void *)arg, &ds64, sizeof (struct semid_ds64)))
 375                         return (set_errno(EFAULT));
 376                 break;
 377 
 378         case SETALL:
 379                 if ((lock = ipc_lookup(sem_svc, semid,
 380                     (kipc_perm_t **)&sp)) == NULL)
 381                         return (set_errno(EINVAL));
 382                 vsize = sp->sem_nsems * sizeof (*vals);
 383                 mutex_exit(lock);
 384 
 385                 /* allocate space to hold all semaphore values */
 386                 vals = kmem_alloc(vsize, KM_SLEEP);
 387 
 388                 if (copyin((void *)arg, vals, vsize)) {
 389                         kmem_free(vals, vsize);
 390                         return (set_errno(EFAULT));
 391                 }
 392                 break;
 393 
 394         case IPC_RMID:
 395                 if (error = ipc_rmid(sem_svc, semid, cr))
 396                         return (set_errno(error));
 397                 return (0);
 398         }
 399 
 400         if ((lock = ipc_lookup(sem_svc, semid, (kipc_perm_t **)&sp)) == NULL) {
 401                 if (vsize != 0)
 402                         kmem_free(vals, vsize);
 403                 return (set_errno(EINVAL));
 404         }
 405         switch (cmd) {
 406         /* Set ownership and permissions. */
 407         case IPC_SET:
 408 
 409                 if (error = ipcperm_set(sem_svc, cr, &sp->sem_perm,
 410                     &STRUCT_BUF(sid)->sem_perm, mdl)) {
 411                         mutex_exit(lock);
 412                         return (set_errno(error));
 413                 }
 414                 sp->sem_ctime = gethrestime_sec();
 415                 mutex_exit(lock);
 416                 return (0);
 417 
 418         /* Get semaphore data structure. */
 419         case IPC_STAT:
 420 
 421                 if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
 422                         mutex_exit(lock);
 423                         return (set_errno(error));
 424                 }
 425 
 426                 ipcperm_stat(&STRUCT_BUF(sid)->sem_perm, &sp->sem_perm, mdl);
 427                 STRUCT_FSETP(sid, sem_base, NULL);      /* kernel addr */
 428                 STRUCT_FSET(sid, sem_nsems, sp->sem_nsems);
 429                 STRUCT_FSET(sid, sem_otime, sp->sem_otime);
 430                 STRUCT_FSET(sid, sem_ctime, sp->sem_ctime);
 431                 STRUCT_FSET(sid, sem_binary, sp->sem_binary);
 432                 mutex_exit(lock);
 433 
 434                 if (copyout(STRUCT_BUF(sid), (void *)arg, STRUCT_SIZE(sid)))
 435                         return (set_errno(EFAULT));
 436                 return (0);
 437 
 438         case IPC_SET64:
 439 
 440                 if (error = ipcperm_set64(sem_svc, cr, &sp->sem_perm,
 441                     &ds64.semx_perm)) {
 442                         mutex_exit(lock);
 443                         return (set_errno(error));
 444                 }
 445                 sp->sem_ctime = gethrestime_sec();
 446                 mutex_exit(lock);
 447                 return (0);
 448 
 449         case IPC_STAT64:
 450 
 451                 ipcperm_stat64(&ds64.semx_perm, &sp->sem_perm);
 452                 ds64.semx_nsems = sp->sem_nsems;
 453                 ds64.semx_otime = sp->sem_otime;
 454                 ds64.semx_ctime = sp->sem_ctime;
 455 
 456                 mutex_exit(lock);
 457                 if (copyout(&ds64, (void *)arg, sizeof (struct semid_ds64)))
 458                         return (set_errno(EFAULT));
 459 
 460                 return (0);
 461 
 462         /* Get # of processes sleeping for greater semval. */
 463         case GETNCNT:
 464                 if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
 465                         mutex_exit(lock);
 466                         return (set_errno(error));
 467                 }
 468                 if (semnum >= sp->sem_nsems) {
 469                         mutex_exit(lock);
 470                         return (set_errno(EINVAL));
 471                 }
 472                 retval = sp->sem_base[semnum].semncnt;
 473                 mutex_exit(lock);
 474                 return (retval);
 475 
 476         /* Get pid of last process to operate on semaphore. */
 477         case GETPID:
 478                 if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
 479                         mutex_exit(lock);
 480                         return (set_errno(error));
 481                 }
 482                 if (semnum >= sp->sem_nsems) {
 483                         mutex_exit(lock);
 484                         return (set_errno(EINVAL));
 485                 }
 486                 retval = sp->sem_base[semnum].sempid;
 487                 mutex_exit(lock);
 488                 return (retval);
 489 
 490         /* Get semval of one semaphore. */
 491         case GETVAL:
 492                 if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
 493                         mutex_exit(lock);
 494                         return (set_errno(error));
 495                 }
 496                 if (semnum >= sp->sem_nsems) {
 497                         mutex_exit(lock);
 498                         return (set_errno(EINVAL));
 499                 }
 500                 retval = sp->sem_base[semnum].semval;
 501                 mutex_exit(lock);
 502                 return (retval);
 503 
 504         /* Get all semvals in set. */
 505         case GETALL:
 506                 if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
 507                         mutex_exit(lock);
 508                         return (set_errno(error));
 509                 }
 510 
 511                 /* allocate space to hold all semaphore values */
 512                 vsize = sp->sem_nsems * sizeof (*vals);
 513                 vals = vp = kmem_alloc(vsize, KM_SLEEP);
 514 
 515                 for (i = sp->sem_nsems, p = sp->sem_base; i--; p++, vp++)
 516                         bcopy(&p->semval, vp, sizeof (p->semval));
 517 
 518                 mutex_exit(lock);
 519 
 520                 if (copyout((void *)vals, (void *)arg, vsize)) {
 521                         kmem_free(vals, vsize);
 522                         return (set_errno(EFAULT));
 523                 }
 524 
 525                 kmem_free(vals, vsize);
 526                 return (0);
 527 
 528         /* Get # of processes sleeping for semval to become zero. */
 529         case GETZCNT:
 530                 if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
 531                         mutex_exit(lock);
 532                         return (set_errno(error));
 533                 }
 534                 if (semnum >= sp->sem_nsems) {
 535                         mutex_exit(lock);
 536                         return (set_errno(EINVAL));
 537                 }
 538                 retval = sp->sem_base[semnum].semzcnt;
 539                 mutex_exit(lock);
 540                 return (retval);
 541 
 542         /* Set semval of one semaphore. */
 543         case SETVAL:
 544                 if (error = ipcperm_access(&sp->sem_perm, SEM_A, cr)) {
 545                         mutex_exit(lock);
 546                         return (set_errno(error));
 547                 }
 548                 if (semnum >= sp->sem_nsems) {
 549                         mutex_exit(lock);
 550                         return (set_errno(EINVAL));
 551                 }
 552                 if ((uint_t)arg > USHRT_MAX) {
 553                         mutex_exit(lock);
 554                         return (set_errno(ERANGE));
 555                 }
 556                 p = &sp->sem_base[semnum];
 557                 if ((p->semval = (ushort_t)arg) != 0) {
 558                         if (p->semncnt) {
 559                                 cv_broadcast(&p->semncnt_cv);
 560                         }
 561                 } else if (p->semzcnt) {
 562                         cv_broadcast(&p->semzcnt_cv);
 563                 }
 564                 p->sempid = curproc->p_pid;
 565                 sem_undo_clear(sp, (ushort_t)semnum, (ushort_t)semnum);
 566                 mutex_exit(lock);
 567                 return (0);
 568 
 569         /* Set semvals of all semaphores in set. */
 570         case SETALL:
 571                 /* Check if semaphore set has been deleted and reallocated. */
 572                 if (sp->sem_nsems * sizeof (*vals) != vsize) {
 573                         error = set_errno(EINVAL);
 574                         goto seterr;
 575                 }
 576                 if (error = ipcperm_access(&sp->sem_perm, SEM_A, cr)) {
 577                         error = set_errno(error);
 578                         goto seterr;
 579                 }
 580                 sem_undo_clear(sp, 0, sp->sem_nsems - 1);
 581                 for (i = 0, p = sp->sem_base; i < sp->sem_nsems;
 582                     (p++)->sempid = curproc->p_pid) {
 583                         if ((p->semval = vals[i++]) != 0) {
 584                                 if (p->semncnt) {
 585                                         cv_broadcast(&p->semncnt_cv);
 586                                 }
 587                         } else if (p->semzcnt) {
 588                                 cv_broadcast(&p->semzcnt_cv);
 589                         }
 590                 }
 591 seterr:
 592                 mutex_exit(lock);
 593                 kmem_free(vals, vsize);
 594                 return (error);
 595 
 596         default:
 597                 mutex_exit(lock);
 598                 return (set_errno(EINVAL));
 599         }
 600 
 601         /* NOTREACHED */
 602 }
 603 
 604 /*
 605  * semexit - Called by exit() to clean up on process exit.
 606  */
 607 void
 608 semexit(proc_t *pp)
 609 {
 610         avl_tree_t      *tree;
 611         struct sem_undo *undo;
 612         void            *cookie = NULL;
 613 
 614         mutex_enter(&pp->p_lock);
 615         tree = pp->p_semacct;
 616         pp->p_semacct = NULL;
 617         mutex_exit(&pp->p_lock);
 618 
 619         while (undo = avl_destroy_nodes(tree, &cookie)) {
 620                 ksemid_t *sp = undo->un_sp;
 621                 size_t size = SEM_UNDOSZ(sp->sem_nsems);
 622                 int i;
 623 
 624                 (void) ipc_lock(sem_svc, sp->sem_perm.ipc_id);
 625                 if (!IPC_FREE(&sp->sem_perm)) {
 626                         for (i = 0; i < sp->sem_nsems; i++) {
 627                                 int adj = undo->un_aoe[i];
 628                                 if (adj) {
 629                                         struct sem *semp = &sp->sem_base[i];
 630                                         int v = (int)semp->semval + adj;
 631 
 632                                         if (v < 0 || v > USHRT_MAX)
 633                                                 continue;
 634                                         semp->semval = (ushort_t)v;
 635                                         if (v == 0 && semp->semzcnt)
 636                                                 cv_broadcast(&semp->semzcnt_cv);
 637                                         if (adj > 0 && semp->semncnt)
 638                                                 cv_broadcast(&semp->semncnt_cv);
 639                                 }
 640                         }
 641                         list_remove(&sp->sem_undos, undo);
 642                 }
 643                 ipc_rele(sem_svc, (kipc_perm_t *)sp);
 644                 kmem_free(undo, size);
 645         }
 646 
 647         avl_destroy(tree);
 648         kmem_free(tree, sizeof (avl_tree_t));
 649 }
 650 
 651 /*
 652  * Remove all semaphores associated with a given zone.  Called by
 653  * zone_shutdown when the zone is halted.
 654  */
 655 /*ARGSUSED1*/
 656 static void
 657 sem_remove_zone(zoneid_t zoneid, void *arg)
 658 {
 659         ipc_remove_zone(sem_svc, zoneid);
 660 }
 661 
 662 /*
 663  * semget - Semget system call.
 664  */
 665 static int
 666 semget(key_t key, int nsems, int semflg)
 667 {
 668         ksemid_t        *sp;
 669         kmutex_t        *lock;
 670         int             id, error;
 671         proc_t          *pp = curproc;
 672 
 673 top:
 674         if (error = ipc_get(sem_svc, key, semflg, (kipc_perm_t **)&sp, &lock))
 675                 return (set_errno(error));
 676 
 677         if (!IPC_FREE(&sp->sem_perm)) {
 678                 /*
 679                  * A semaphore with the requested key exists.
 680                  */
 681                 if (!((nsems >= 0) && (nsems <= sp->sem_nsems))) {
 682                         mutex_exit(lock);
 683                         return (set_errno(EINVAL));
 684                 }
 685         } else {
 686                 /*
 687                  * This is a new semaphore set.  Finish initialization.
 688                  */
 689                 if (nsems <= 0 || (rctl_test(rc_process_semmsl, pp->p_rctls, pp,
 690                     nsems, RCA_SAFE) & RCT_DENY)) {
 691                         mutex_exit(lock);
 692                         mutex_exit(&pp->p_lock);
 693                         ipc_cleanup(sem_svc, (kipc_perm_t *)sp);
 694                         return (set_errno(EINVAL));
 695                 }
 696                 mutex_exit(lock);
 697                 mutex_exit(&pp->p_lock);
 698 
 699                 /*
 700                  * We round the allocation up to coherency granularity
 701                  * so that multiple semaphore allocations won't result
 702                  * in the false sharing of their sem structures.
 703                  */
 704                 sp->sem_base =
 705                     kmem_zalloc(P2ROUNDUP(nsems * sizeof (struct sem), 64),
 706                     KM_SLEEP);
 707                 sp->sem_binary = (nsems == 1);
 708                 sp->sem_nsems = (ushort_t)nsems;
 709                 sp->sem_ctime = gethrestime_sec();
 710                 sp->sem_otime = 0;
 711                 list_create(&sp->sem_undos, sizeof (struct sem_undo),
 712                     offsetof(struct sem_undo, un_list));
 713 
 714                 if (error = ipc_commit_begin(sem_svc, key, semflg,
 715                     (kipc_perm_t *)sp)) {
 716                         if (error == EAGAIN)
 717                                 goto top;
 718                         return (set_errno(error));
 719                 }
 720                 sp->sem_maxops =
 721                     rctl_enforced_value(rc_process_semopm, pp->p_rctls, pp);
 722                 if (rctl_test(rc_process_semmsl, pp->p_rctls, pp, nsems,
 723                     RCA_SAFE) & RCT_DENY) {
 724                         ipc_cleanup(sem_svc, (kipc_perm_t *)sp);
 725                         return (set_errno(EINVAL));
 726                 }
 727                 lock = ipc_commit_end(sem_svc, &sp->sem_perm);
 728         }
 729 
 730         if (AU_AUDITING())
 731                 audit_ipcget(AT_IPC_SEM, (void *)sp);
 732 
 733         id = sp->sem_perm.ipc_id;
 734         mutex_exit(lock);
 735         return (id);
 736 }
 737 
 738 /*
 739  * semids system call.
 740  */
 741 static int
 742 semids(int *buf, uint_t nids, uint_t *pnids)
 743 {
 744         int error;
 745 
 746         if (error = ipc_ids(sem_svc, buf, nids, pnids))
 747                 return (set_errno(error));
 748 
 749         return (0);
 750 }
 751 
 752 
 753 /*
 754  * Helper function for semop - copies in the provided timespec and
 755  * computes the absolute future time after which we must return.
 756  */
 757 static int
 758 compute_timeout(timespec_t **tsp, timespec_t *ts, timespec_t *now,
 759         timespec_t *timeout)
 760 {
 761         model_t datamodel = get_udatamodel();
 762 
 763         if (datamodel == DATAMODEL_NATIVE) {
 764                 if (copyin(timeout, ts, sizeof (timespec_t)))
 765                         return (EFAULT);
 766         } else {
 767                 timespec32_t ts32;
 768 
 769                 if (copyin(timeout, &ts32, sizeof (timespec32_t)))
 770                         return (EFAULT);
 771                 TIMESPEC32_TO_TIMESPEC(ts, &ts32)
 772         }
 773 
 774         if (itimerspecfix(ts))
 775                 return (EINVAL);
 776 
 777         /*
 778          * Convert the timespec value into absolute time.
 779          */
 780         timespecadd(ts, now);
 781         *tsp = ts;
 782 
 783         return (0);
 784 }
 785 
 786 /*
 787  * Undo structure comparator.  We sort based on ksemid_t pointer.
 788  */
 789 static int
 790 sem_undo_compar(const void *x, const void *y)
 791 {
 792         struct sem_undo *undo1 = (struct sem_undo *)x;
 793         struct sem_undo *undo2 = (struct sem_undo *)y;
 794 
 795         if (undo1->un_sp < undo2->un_sp)
 796                 return (-1);
 797         if (undo1->un_sp > undo2->un_sp)
 798                 return (1);
 799         return (0);
 800 }
 801 
 802 /*
 803  * Helper function for semop - creates an undo structure and adds it to
 804  * the process's avl tree and the semaphore's list.
 805  */
 806 static int
 807 sem_undo_alloc(proc_t *pp, ksemid_t *sp, kmutex_t **lock,
 808     struct sem_undo *template, struct sem_undo **un)
 809 {
 810         size_t size;
 811         struct sem_undo *undo;
 812         avl_tree_t *tree = NULL;
 813         avl_index_t where;
 814 
 815         mutex_exit(*lock);
 816 
 817         size = SEM_UNDOSZ(sp->sem_nsems);
 818         undo = kmem_zalloc(size, KM_SLEEP);
 819         undo->un_proc = pp;
 820         undo->un_sp = sp;
 821 
 822         if (pp->p_semacct == NULL)
 823                 tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
 824 
 825         *lock = ipc_lock(sem_svc, sp->sem_perm.ipc_id);
 826         if (IPC_FREE(&sp->sem_perm)) {
 827                 kmem_free(undo, size);
 828                 if (tree)
 829                         kmem_free(tree, sizeof (avl_tree_t));
 830                 return (EIDRM);
 831         }
 832 
 833         mutex_enter(&pp->p_lock);
 834         if (tree) {
 835                 if (pp->p_semacct == NULL) {
 836                         avl_create(tree, sem_undo_compar,
 837                             sizeof (struct sem_undo),
 838                             offsetof(struct sem_undo, un_avl));
 839                         pp->p_semacct = tree;
 840                 } else {
 841                         kmem_free(tree, sizeof (avl_tree_t));
 842                 }
 843         }
 844 
 845         if (*un = avl_find(pp->p_semacct, template, &where)) {
 846                 mutex_exit(&pp->p_lock);
 847                 kmem_free(undo, size);
 848         } else {
 849                 *un = undo;
 850                 avl_insert(pp->p_semacct, undo, where);
 851                 mutex_exit(&pp->p_lock);
 852                 list_insert_head(&sp->sem_undos, undo);
 853                 ipc_hold(sem_svc, (kipc_perm_t *)sp);
 854         }
 855 
 856 
 857         return (0);
 858 }
 859 
 860 /*
 861  * semop - Semop system call.
 862  */
 863 static int
 864 semop(int semid, struct sembuf *sops, size_t nsops, timespec_t *timeout)
 865 {
 866         ksemid_t        *sp = NULL;
 867         kmutex_t        *lock;
 868         struct sembuf   *op;    /* ptr to operation */
 869         int             i;      /* loop control */
 870         struct sem      *semp;  /* ptr to semaphore */
 871         int             error = 0;
 872         struct sembuf   *uops;  /* ptr to copy of user ops */
 873         struct sembuf   x_sem;  /* avoid kmem_alloc's */
 874         timespec_t      now, ts, *tsp = NULL;
 875         int             timecheck = 0;
 876         int             cvres, needundo, mode;
 877         struct sem_undo *undo;
 878         proc_t          *pp = curproc;
 879         int             held = 0;
 880 
 881         CPU_STATS_ADDQ(CPU, sys, sema, 1); /* bump semaphore op count */
 882 
 883         /*
 884          * To avoid the cost of copying in 'timeout' in the common
 885          * case, we could only grab the time here and defer the copyin
 886          * and associated computations until we are about to block.
 887          *
 888          * The down side to this is that we would then have to spin
 889          * some goto top nonsense to avoid the copyin behind the semid
 890          * lock.  As a common use of timed semaphores is as an explicit
 891          * blocking mechanism, this could incur a greater penalty.
 892          *
 893          * If we eventually decide that this would be a wise route to
 894          * take, the deferrable functionality is completely contained
 895          * in 'compute_timeout', and the interface is defined such that
 896          * we can legally not validate 'timeout' if it is unused.
 897          */
 898         if (timeout != NULL) {
 899                 timecheck = timechanged;
 900                 gethrestime(&now);
 901                 if (error = compute_timeout(&tsp, &ts, &now, timeout))
 902                         return (set_errno(error));
 903         }
 904 
 905         /*
 906          * Allocate space to hold the vector of semaphore ops.  If
 907          * there is only 1 operation we use a preallocated buffer on
 908          * the stack for speed.
 909          *
 910          * Since we don't want to allow the user to allocate an
 911          * arbitrary amount of kernel memory, we need to check against
 912          * the number of operations allowed by the semaphore.  We only
 913          * bother doing this if the number of operations is larger than
 914          * SEM_MAXUCOPS.
 915          */
 916         if (nsops == 1)
 917                 uops = &x_sem;
 918         else if (nsops == 0)
 919                 return (0);
 920         else if (nsops <= SEM_MAXUCOPS)
 921                 uops = kmem_alloc(nsops * sizeof (*uops), KM_SLEEP);
 922 
 923         if (nsops > SEM_MAXUCOPS) {
 924                 if ((lock = ipc_lookup(sem_svc, semid,
 925                     (kipc_perm_t **)&sp)) == NULL)
 926                         return (set_errno(EFAULT));
 927 
 928                 if (nsops > sp->sem_maxops) {
 929                         mutex_exit(lock);
 930                         return (set_errno(E2BIG));
 931                 }
 932                 held = 1;
 933                 ipc_hold(sem_svc, (kipc_perm_t *)sp);
 934                 mutex_exit(lock);
 935 
 936                 uops = kmem_alloc(nsops * sizeof (*uops), KM_SLEEP);
 937                 if (copyin(sops, uops, nsops * sizeof (*op))) {
 938                         error = EFAULT;
 939                         (void) ipc_lock(sem_svc, sp->sem_perm.ipc_id);
 940                         goto semoperr;
 941                 }
 942 
 943                 lock = ipc_lock(sem_svc, sp->sem_perm.ipc_id);
 944                 if (IPC_FREE(&sp->sem_perm)) {
 945                         error = EIDRM;
 946                         goto semoperr;
 947                 }
 948         } else {
 949                 /*
 950                  * This could be interleaved with the above code, but
 951                  * keeping them separate improves readability.
 952                  */
 953                 if (copyin(sops, uops, nsops * sizeof (*op))) {
 954                         error = EFAULT;
 955                         goto semoperr_unlocked;
 956                 }
 957 
 958                 if ((lock = ipc_lookup(sem_svc, semid,
 959                     (kipc_perm_t **)&sp)) == NULL) {
 960                         error = EINVAL;
 961                         goto semoperr_unlocked;
 962                 }
 963 
 964                 if (nsops > sp->sem_maxops) {
 965                         error = E2BIG;
 966                         goto semoperr;
 967                 }
 968         }
 969 
 970         /*
 971          * Scan all operations.  Verify that sem #s are in range and
 972          * this process is allowed the requested operations.  If any
 973          * operations are marked SEM_UNDO, find (or allocate) the undo
 974          * structure for this process and semaphore.
 975          */
 976         needundo = 0;
 977         mode = 0;
 978         for (i = 0, op = uops; i++ < nsops; op++) {
 979                 mode |= op->sem_op ? SEM_A : SEM_R;
 980                 if (op->sem_num >= sp->sem_nsems) {
 981                         error = EFBIG;
 982                         goto semoperr;
 983                 }
 984                 if ((op->sem_flg & SEM_UNDO) && op->sem_op)
 985                         needundo = 1;
 986         }
 987         if (error = ipcperm_access(&sp->sem_perm, mode, CRED()))
 988                 goto semoperr;
 989 
 990         if (needundo) {
 991                 struct sem_undo template;
 992 
 993                 template.un_sp = sp;
 994                 mutex_enter(&pp->p_lock);
 995                 if (pp->p_semacct)
 996                         undo = avl_find(pp->p_semacct, &template, NULL);
 997                 else
 998                         undo = NULL;
 999                 mutex_exit(&pp->p_lock);
1000                 if (undo == NULL) {
1001                         if (!held) {
1002                                 held = 1;
1003                                 ipc_hold(sem_svc, (kipc_perm_t *)sp);
1004                         }
1005                         if (error = sem_undo_alloc(pp, sp, &lock, &template,
1006                             &undo))
1007                                 goto semoperr;
1008 
1009                         /* sem_undo_alloc unlocks the semaphore */
1010                         if (error = ipcperm_access(&sp->sem_perm, mode, CRED()))
1011                                 goto semoperr;
1012                 }
1013         }
1014 
1015 check:
1016         /*
1017          * Loop waiting for the operations to be satisfied atomically.
1018          * Actually, do the operations and undo them if a wait is needed
1019          * or an error is detected.
1020          */
1021         for (i = 0; i < nsops; i++) {
1022                 op = &uops[i];
1023                 semp = &sp->sem_base[op->sem_num];
1024 
1025                 /*
1026                  * Raise the semaphore (i.e. sema_v)
1027                  */
1028                 if (op->sem_op > 0) {
1029                         if (op->sem_op + (int)semp->semval > USHRT_MAX ||
1030                             ((op->sem_flg & SEM_UNDO) &&
1031                             (error = sem_undo_add(op->sem_op, op->sem_num,
1032                             undo)))) {
1033                                 if (i)
1034                                         sem_rollback(sp, uops, i, undo);
1035                                 if (error == 0)
1036                                         error = ERANGE;
1037                                 goto semoperr;
1038                         }
1039                         semp->semval += op->sem_op;
1040                         /*
1041                          * If we are only incrementing the semaphore value
1042                          * by one on a binary semaphore, we can cv_signal.
1043                          */
1044                         if (semp->semncnt) {
1045                                 if (op->sem_op == 1 && sp->sem_binary)
1046                                         cv_signal(&semp->semncnt_cv);
1047                                 else
1048                                         cv_broadcast(&semp->semncnt_cv);
1049                         }
1050                         if (semp->semzcnt && !semp->semval)
1051                                 cv_broadcast(&semp->semzcnt_cv);
1052                         continue;
1053                 }
1054 
1055                 /*
1056                  * Lower the semaphore (i.e. sema_p)
1057                  */
1058                 if (op->sem_op < 0) {
1059                         if (semp->semval >= (unsigned)(-op->sem_op)) {
1060                                 if ((op->sem_flg & SEM_UNDO) &&
1061                                     (error = sem_undo_add(op->sem_op,
1062                                     op->sem_num, undo))) {
1063                                         if (i)
1064                                                 sem_rollback(sp, uops, i, undo);
1065                                         goto semoperr;
1066                                 }
1067                                 semp->semval += op->sem_op;
1068                                 if (semp->semzcnt && !semp->semval)
1069                                         cv_broadcast(&semp->semzcnt_cv);
1070                                 continue;
1071                         }
1072                         if (i)
1073                                 sem_rollback(sp, uops, i, undo);
1074                         if (op->sem_flg & IPC_NOWAIT) {
1075                                 error = EAGAIN;
1076                                 goto semoperr;
1077                         }
1078 
1079                         /*
1080                          * Mark the semaphore set as not a binary type
1081                          * if we are decrementing the value by more than 1.
1082                          *
1083                          * V operations will resort to cv_broadcast
1084                          * for this set because there are too many weird
1085                          * cases that have to be caught.
1086                          */
1087                         if (op->sem_op < -1)
1088                                 sp->sem_binary = 0;
1089                         if (!held) {
1090                                 held = 1;
1091                                 ipc_hold(sem_svc, (kipc_perm_t *)sp);
1092                         }
1093                         semp->semncnt++;
1094                         cvres = cv_waituntil_sig(&semp->semncnt_cv, lock,
1095                             tsp, timecheck);
1096                         lock = ipc_relock(sem_svc, sp->sem_perm.ipc_id, lock);
1097 
1098                         if (!IPC_FREE(&sp->sem_perm)) {
1099                                 ASSERT(semp->semncnt != 0);
1100                                 semp->semncnt--;
1101                                 if (cvres > 0)       /* normal wakeup */
1102                                         goto check;
1103                         }
1104 
1105                         /* EINTR or EAGAIN overrides EIDRM */
1106                         if (cvres == 0)
1107                                 error = EINTR;
1108                         else if (cvres < 0)
1109                                 error = EAGAIN;
1110                         else
1111                                 error = EIDRM;
1112                         goto semoperr;
1113                 }
1114 
1115                 /*
1116                  * Wait for zero value
1117                  */
1118                 if (semp->semval) {
1119                         if (i)
1120                                 sem_rollback(sp, uops, i, undo);
1121                         if (op->sem_flg & IPC_NOWAIT) {
1122                                 error = EAGAIN;
1123                                 goto semoperr;
1124                         }
1125 
1126                         if (!held) {
1127                                 held = 1;
1128                                 ipc_hold(sem_svc, (kipc_perm_t *)sp);
1129                         }
1130                         semp->semzcnt++;
1131                         cvres = cv_waituntil_sig(&semp->semzcnt_cv, lock,
1132                             tsp, timecheck);
1133                         lock = ipc_relock(sem_svc, sp->sem_perm.ipc_id, lock);
1134 
1135                         /*
1136                          * Don't touch semp if the semaphores have been removed.
1137                          */
1138                         if (!IPC_FREE(&sp->sem_perm)) {
1139                                 ASSERT(semp->semzcnt != 0);
1140                                 semp->semzcnt--;
1141                                 if (cvres > 0)       /* normal wakeup */
1142                                         goto check;
1143                         }
1144 
1145                         /* EINTR or EAGAIN overrides EIDRM */
1146                         if (cvres == 0)
1147                                 error = EINTR;
1148                         else if (cvres < 0)
1149                                 error = EAGAIN;
1150                         else
1151                                 error = EIDRM;
1152                         goto semoperr;
1153                 }
1154         }
1155 
1156         /* All operations succeeded.  Update sempid for accessed semaphores. */
1157         for (i = 0, op = uops; i++ < nsops;
1158             sp->sem_base[(op++)->sem_num].sempid = pp->p_pid)
1159                 ;
1160         sp->sem_otime = gethrestime_sec();
1161         if (held)
1162                 ipc_rele(sem_svc, (kipc_perm_t *)sp);
1163         else
1164                 mutex_exit(lock);
1165 
1166         /* Before leaving, deallocate the buffer that held the user semops */
1167         if (nsops != 1)
1168                 kmem_free(uops, sizeof (*uops) * nsops);
1169         return (0);
1170 
1171         /*
1172          * Error return labels
1173          */
1174 semoperr:
1175         if (held)
1176                 ipc_rele(sem_svc, (kipc_perm_t *)sp);
1177         else
1178                 mutex_exit(lock);
1179 
1180 semoperr_unlocked:
1181 
1182         /* Before leaving, deallocate the buffer that held the user semops */
1183         if (nsops != 1)
1184                 kmem_free(uops, sizeof (*uops) * nsops);
1185         return (set_errno(error));
1186 }
1187 
1188 /*
1189  * semsys - System entry point for semctl, semget, and semop system calls.
1190  */
1191 static int
1192 semsys(int opcode, uintptr_t a1, uintptr_t a2, uintptr_t a3, uintptr_t a4)
1193 {
1194         int error;
1195 
1196         switch (opcode) {
1197         case SEMCTL:
1198                 error = semctl((int)a1, (uint_t)a2, (int)a3, a4);
1199                 break;
1200         case SEMGET:
1201                 error = semget((key_t)a1, (int)a2, (int)a3);
1202                 break;
1203         case SEMOP:
1204                 error = semop((int)a1, (struct sembuf *)a2, (size_t)a3, 0);
1205                 break;
1206         case SEMIDS:
1207                 error = semids((int *)a1, (uint_t)a2, (uint_t *)a3);
1208                 break;
1209         case SEMTIMEDOP:
1210                 error = semop((int)a1, (struct sembuf *)a2, (size_t)a3,
1211                     (timespec_t *)a4);
1212                 break;
1213         default:
1214                 error = set_errno(EINVAL);
1215                 break;
1216         }
1217         return (error);
1218 }