1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  28 /*      All Rights Reserved */
  29 
  30 /*
  31  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  32  */
  33 
  34 #include <sys/flock_impl.h>
  35 #include <sys/vfs.h>
  36 #include <sys/t_lock.h>           /* for <sys/callb.h> */
  37 #include <sys/callb.h>
  38 #include <sys/clconf.h>
  39 #include <sys/cladm.h>
  40 #include <sys/nbmlock.h>
  41 #include <sys/cred.h>
  42 #include <sys/policy.h>
  43 
/*
 * The following variables are for statistics purposes and they are
 * not protected by locks.  They may not be accurate but will at least be
 * close to the actual values.
 */
  49 
  50 int     flk_lock_allocs;
  51 int     flk_lock_frees;
  52 int     edge_allocs;
  53 int     edge_frees;
  54 int     flk_proc_vertex_allocs;
  55 int     flk_proc_edge_allocs;
  56 int     flk_proc_vertex_frees;
  57 int     flk_proc_edge_frees;
  58 
  59 static kmutex_t flock_lock;
  60 
  61 #ifdef DEBUG
  62 int check_debug = 0;
  63 #define CHECK_ACTIVE_LOCKS(gp)  if (check_debug) \
  64                                         check_active_locks(gp);
  65 #define CHECK_SLEEPING_LOCKS(gp)        if (check_debug) \
  66                                                 check_sleeping_locks(gp);
  67 #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp)   \
  68                 if (check_debug)        \
  69                         check_owner_locks(gp, pid, sysid, vp);
#define CHECK_LOCK_TRANSITION(old_state, new_state) \
	{ \
		if (check_lock_transition(old_state, new_state)) { \
			cmn_err(CE_PANIC, "Illegal lock transition " \
			    "from %d to %d", old_state, new_state); \
		} \
	}
  77 #else
  78 
  79 #define CHECK_ACTIVE_LOCKS(gp)
  80 #define CHECK_SLEEPING_LOCKS(gp)
  81 #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp)
  82 #define CHECK_LOCK_TRANSITION(old_state, new_state)
  83 
  84 #endif /* DEBUG */
  85 
  86 struct kmem_cache       *flk_edge_cache;
  87 
  88 graph_t         *lock_graph[HASH_SIZE];
  89 proc_graph_t    pgraph;
  90 
  91 /*
  92  * Clustering.
  93  *
  94  * NLM REGISTRY TYPE IMPLEMENTATION
  95  *
  96  * Assumptions:
  97  *  1.  Nodes in a cluster are numbered starting at 1; always non-negative
  98  *      integers; maximum node id is returned by clconf_maximum_nodeid().
  99  *  2.  We use this node id to identify the node an NLM server runs on.
 100  */
 101 
 102 /*
 103  * NLM registry object keeps track of NLM servers via their
 104  * nlmids (which are the node ids of the node in the cluster they run on)
 105  * that have requested locks at this LLM with which this registry is
 106  * associated.
 107  *
 108  * Representation of abstraction:
 109  *    rep = record[     states: array[nlm_state],
 110  *                      lock: mutex]
 111  *
 *    Representation invariants:
 *      1.  Index i of rep.states is between 0 and n - 1, where n is the
 *          number of elements in the array, which happens to be the maximum
 *          number of nodes in the cluster configuration + 1.
 *      2.  Map nlmid to index i of rep.states:
 *              0   -> 0
 *              1   -> 1
 *              2   -> 2
 *              n-1 -> clconf_maximum_nodeid()
 *      3.  This 1-1 mapping is quite convenient and it avoids errors
 *          resulting from forgetting to subtract 1 from the index.
 123  *      4.  The reason we keep the 0th index is the following.  A legitimate
 124  *          cluster configuration includes making a UFS file system NFS
 125  *          exportable.  The code is structured so that if you're in a cluster
 126  *          you do one thing; otherwise, you do something else.  The problem
 127  *          is what to do if you think you're in a cluster with PXFS loaded,
 128  *          but you're using UFS not PXFS?  The upper two bytes of the sysid
 129  *          encode the node id of the node where NLM server runs; these bytes
 130  *          are zero for UFS.  Since the nodeid is used to index into the
 131  *          registry, we can record the NLM server state information at index
 132  *          0 using the same mechanism used for PXFS file locks!
 133  */
 134 static flk_nlm_status_t *nlm_reg_status = NULL; /* state array 0..N-1 */
static kmutex_t nlm_reg_lock;			/* lock to protect array */
 136 static uint_t nlm_status_size;                  /* size of state array */
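/*
 * To illustrate the identity mapping described above, a registry lookup
 * reduces to a direct array index by nlmid.  The helper below is only a
 * sketch for illustration (nlm_reg_peek_state is not part of this file);
 * the real accesses go through the FLK_REGISTRY_* macros in flock_impl.h
 * while holding nlm_reg_lock.
 *
 *	static flk_nlm_status_t
 *	nlm_reg_peek_state(int nlmid)
 *	{
 *		flk_nlm_status_t state;
 *
 *		ASSERT(nlmid >= 0 && (uint_t)nlmid < nlm_status_size);
 *		mutex_enter(&nlm_reg_lock);
 *		state = nlm_reg_status[nlmid];	(index == nlmid, no offset)
 *		mutex_exit(&nlm_reg_lock);
 *		return (state);
 *	}
 */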
 137 
 138 /*
 139  * Although we need a global lock dependency graph (and associated data
 140  * structures), we also need a per-zone notion of whether the lock manager is
 141  * running, and so whether to allow lock manager requests or not.
 142  *
 143  * Thus, on a per-zone basis we maintain a ``global'' variable
 144  * (flk_lockmgr_status), protected by flock_lock, and set when the lock
 145  * manager is determined to be changing state (starting or stopping).
 146  *
 147  * Each graph/zone pair also has a copy of this variable, which is protected by
 148  * the graph's mutex.
 149  *
 150  * The per-graph copies are used to synchronize lock requests with shutdown
 151  * requests.  The global copy is used to initialize the per-graph field when a
 152  * new graph is created.
 153  */
 154 struct flock_globals {
 155         flk_lockmgr_status_t flk_lockmgr_status;
 156         flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
 157 };
 158 
 159 zone_key_t flock_zone_key;
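/*
 * A minimal sketch of the initialization described above, assuming a new
 * graph is being created at hash index 'index' (this is only an
 * illustration, not the actual graph-creation code):
 *
 *	mutex_enter(&flock_lock);
 *	fg = flk_get_globals();
 *	fg->lockmgr_status[index] = fg->flk_lockmgr_status;
 *	mutex_exit(&flock_lock);
 *
 * Subsequent lock requests consult the per-graph copy under the graph's
 * own mutex, so a lock manager shutdown only needs to update each graph
 * once.
 */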
 160 
 161 static void create_flock(lock_descriptor_t *, flock64_t *);
 162 static lock_descriptor_t        *flk_get_lock(void);
 163 static void     flk_free_lock(lock_descriptor_t *lock);
 164 static void     flk_get_first_blocking_lock(lock_descriptor_t *request);
 165 static int flk_process_request(lock_descriptor_t *);
 166 static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
 167 static edge_t *flk_get_edge(void);
 168 static int flk_wait_execute_request(lock_descriptor_t *);
 169 static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
 170 static void flk_insert_active_lock(lock_descriptor_t *);
 171 static void flk_delete_active_lock(lock_descriptor_t *, int);
 172 static void flk_insert_sleeping_lock(lock_descriptor_t *);
 173 static void flk_graph_uncolor(graph_t *);
 174 static void flk_wakeup(lock_descriptor_t *, int);
 175 static void flk_free_edge(edge_t *);
 176 static void flk_recompute_dependencies(lock_descriptor_t *,
 177                         lock_descriptor_t **,  int, int);
 178 static int flk_find_barriers(lock_descriptor_t *);
 179 static void flk_update_barriers(lock_descriptor_t *);
 180 static int flk_color_reachables(lock_descriptor_t *);
 181 static int flk_canceled(lock_descriptor_t *);
 182 static void flk_delete_locks_by_sysid(lock_descriptor_t *);
 183 static void report_blocker(lock_descriptor_t *, lock_descriptor_t *);
 184 static void wait_for_lock(lock_descriptor_t *);
 185 static void unlock_lockmgr_granted(struct flock_globals *);
 186 static void wakeup_sleeping_lockmgr_locks(struct flock_globals *);
 187 
 188 /* Clustering hooks */
 189 static void cl_flk_change_nlm_state_all_locks(int, flk_nlm_status_t);
 190 static void cl_flk_wakeup_sleeping_nlm_locks(int);
 191 static void cl_flk_unlock_nlm_granted(int);
 192 
 193 #ifdef DEBUG
 194 static int check_lock_transition(int, int);
 195 static void check_sleeping_locks(graph_t *);
 196 static void check_active_locks(graph_t *);
 197 static int no_path(lock_descriptor_t *, lock_descriptor_t *);
 198 static void path(lock_descriptor_t *, lock_descriptor_t *);
 199 static void check_owner_locks(graph_t *, pid_t, int, vnode_t *);
 200 static int level_one_path(lock_descriptor_t *, lock_descriptor_t *);
 201 static int level_two_path(lock_descriptor_t *, lock_descriptor_t *, int);
 202 #endif
 203 
 204 /*      proc_graph function definitions */
 205 static int flk_check_deadlock(lock_descriptor_t *);
 206 static void flk_proc_graph_uncolor(void);
 207 static proc_vertex_t *flk_get_proc_vertex(lock_descriptor_t *);
 208 static proc_edge_t *flk_get_proc_edge(void);
 209 static void flk_proc_release(proc_vertex_t *);
 210 static void flk_free_proc_edge(proc_edge_t *);
 211 static void flk_update_proc_graph(edge_t *, int);
 212 
 213 /* Non-blocking mandatory locking */
 214 static int lock_blocks_io(nbl_op_t, u_offset_t, ssize_t, int, u_offset_t,
 215                         u_offset_t);
 216 
 217 static struct flock_globals *
 218 flk_get_globals(void)
 219 {
 220         /*
 221          * The KLM module had better be loaded if we're attempting to handle
 222          * lockmgr requests.
 223          */
 224         ASSERT(flock_zone_key != ZONE_KEY_UNINITIALIZED);
 225         return (zone_getspecific(flock_zone_key, curproc->p_zone));
 226 }
 227 
 228 static flk_lockmgr_status_t
 229 flk_get_lockmgr_status(void)
 230 {
 231         struct flock_globals *fg;
 232 
 233         ASSERT(MUTEX_HELD(&flock_lock));
 234 
 235         if (flock_zone_key == ZONE_KEY_UNINITIALIZED) {
 236                 /*
 237                  * KLM module not loaded; lock manager definitely not running.
 238                  */
 239                 return (FLK_LOCKMGR_DOWN);
 240         }
 241         fg = flk_get_globals();
 242         return (fg->flk_lockmgr_status);
 243 }
 244 
 245 /*
 246  * Routine called from fs_frlock in fs/fs_subr.c
 247  */
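/*
 * A minimal sketch of a query-style call, assuming the caller has already
 * filled in an flock64_t and has 'flag' and 'offset' from its own context
 * (the surrounding checks done by fs_frlock are omitted here):
 *
 *	flock64_t bf;
 *
 *	bf.l_type = F_WRLCK;		(probe for any conflicting lock)
 *	bf.l_whence = 0;
 *	bf.l_start = 0;
 *	bf.l_len = 0;			(0 means to the end of the file)
 *	bf.l_sysid = 0;
 *	bf.l_pid = curproc->p_pid;
 *
 *	error = reclock(vp, &bf, 0, flag, offset, NULL);
 *
 * With neither SETFLCK nor INOFLCK in 'cmd', reclock() treats this as a
 * query: on return bf describes the first blocking lock, or has l_type
 * set to F_UNLCK when nothing blocks the probed range.
 */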
 248 
 249 int
 250 reclock(vnode_t         *vp,
 251         flock64_t       *lckdat,
 252         int             cmd,
 253         int             flag,
 254         u_offset_t      offset,
 255         flk_callback_t  *flk_cbp)
 256 {
 257         lock_descriptor_t       stack_lock_request;
 258         lock_descriptor_t       *lock_request;
 259         int error = 0;
 260         graph_t *gp;
 261         int                     nlmid;
 262 
 263         /*
 264          * Check access permissions
 265          */
 266         if ((cmd & SETFLCK) &&
 267                 ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
 268                 (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
 269                         return (EBADF);
 270 
 271         /*
 272          * for query and unlock we use the stack_lock_request
 273          */
 274 
 275         if ((lckdat->l_type == F_UNLCK) ||
 276                         !((cmd & INOFLCK) || (cmd & SETFLCK))) {
 277                 lock_request = &stack_lock_request;
 278                 (void) bzero((caddr_t)lock_request,
 279                                 sizeof (lock_descriptor_t));
 280 
		/*
		 * The following is added to make the assertions in
		 * flk_execute_request() pass.
		 */
 285 
 286                 lock_request->l_edge.edge_in_next = &lock_request->l_edge;
 287                 lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
 288                 lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
 289                 lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
 290                 lock_request->l_status = FLK_INITIAL_STATE;
 291         } else {
 292                 lock_request = flk_get_lock();
 293         }
 294         lock_request->l_state = 0;
 295         lock_request->l_vnode = vp;
 296         lock_request->l_zoneid = getzoneid();
 297 
 298         /*
 299          * Convert the request range into the canonical start and end
 300          * values.  The NLM protocol supports locking over the entire
 301          * 32-bit range, so there's no range checking for remote requests,
 302          * but we still need to verify that local requests obey the rules.
 303          */
 304         /* Clustering */
 305         if ((cmd & (RCMDLCK | PCMDLCK)) != 0) {
 306                 ASSERT(lckdat->l_whence == 0);
 307                 lock_request->l_start = lckdat->l_start;
 308                 lock_request->l_end = (lckdat->l_len == 0) ? MAX_U_OFFSET_T :
 309                         lckdat->l_start + (lckdat->l_len - 1);
 310         } else {
 311                 /* check the validity of the lock range */
 312                 error = flk_convert_lock_data(vp, lckdat,
 313                         &lock_request->l_start, &lock_request->l_end,
 314                         offset);
 315                 if (error) {
 316                         goto done;
 317                 }
 318                 error = flk_check_lock_data(lock_request->l_start,
 319                                             lock_request->l_end, MAXEND);
 320                 if (error) {
 321                         goto done;
 322                 }
 323         }
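	/*
	 * Worked example (illustrative): with l_whence already normalized,
	 * l_start = 100 and l_len = 50 describe the byte range [100, 149];
	 * l_len = 0 means "to the largest possible offset", as in the
	 * clustering branch above.  flk_check_lock_data() then rejects
	 * ranges that do not fit within [0, MAXEND].
	 */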
 324 
 325         ASSERT(lock_request->l_end >= lock_request->l_start);
 326 
 327         lock_request->l_type = lckdat->l_type;
 328         if (cmd & INOFLCK)
 329                 lock_request->l_state |= IO_LOCK;
 330         if (cmd & SLPFLCK)
 331                 lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
 332         if (cmd & RCMDLCK)
 333                 lock_request->l_state |= LOCKMGR_LOCK;
 334         if (cmd & NBMLCK)
 335                 lock_request->l_state |= NBMAND_LOCK;
 336         /*
 337          * Clustering: set flag for PXFS locks
 338          * We do not _only_ check for the PCMDLCK flag because PXFS locks could
 339          * also be of type 'RCMDLCK'.
 340          * We do not _only_ check the GETPXFSID() macro because local PXFS
 341          * clients use a pxfsid of zero to permit deadlock detection in the LLM.
 342          */
 343 
 344         if ((cmd & PCMDLCK) || (GETPXFSID(lckdat->l_sysid) != 0)) {
 345                 lock_request->l_state |= PXFS_LOCK;
 346         }
 347         if (!((cmd & SETFLCK) || (cmd & INOFLCK))) {
 348                 if (lock_request->l_type == F_RDLCK ||
 349                         lock_request->l_type == F_WRLCK)
 350                         lock_request->l_state |= QUERY_LOCK;
 351         }
 352         lock_request->l_flock = (*lckdat);
 353         lock_request->l_callbacks = flk_cbp;
 354 
 355         /*
 356          * We are ready for processing the request
 357          */
 358         if (IS_LOCKMGR(lock_request)) {
 359                 /*
 360                  * If the lock request is an NLM server request ....
 361                  */
 362                 if (nlm_status_size == 0) { /* not booted as cluster */
 363                         mutex_enter(&flock_lock);
 364                         /*
 365                          * Bail out if this is a lock manager request and the
 366                          * lock manager is not supposed to be running.
 367                          */
 368                         if (flk_get_lockmgr_status() != FLK_LOCKMGR_UP) {
 369                                 mutex_exit(&flock_lock);
 370                                 error = ENOLCK;
 371                                 goto done;
 372                         }
 373                         mutex_exit(&flock_lock);
 374                 } else {                        /* booted as a cluster */
 375                         nlmid = GETNLMID(lock_request->l_flock.l_sysid);
 376                         ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
 377 
 378                         mutex_enter(&nlm_reg_lock);
 379                         /*
 380                          * If the NLM registry does not know about this
 381                          * NLM server making the request, add its nlmid
 382                          * to the registry.
 383                          */
 384                         if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status,
 385                                 nlmid)) {
 386                                 FLK_REGISTRY_ADD_NLMID(nlm_reg_status, nlmid);
 387                         } else if (!FLK_REGISTRY_IS_NLM_UP(nlm_reg_status,
 388                                 nlmid)) {
				/*
				 * If the NLM server is already known (has made
				 * previous lock requests) and its state is
				 * not NLM_UP (meaning that the NLM server is
				 * shutting down), then bail out with an
				 * error to deny the lock request.
				 */
 396                                 mutex_exit(&nlm_reg_lock);
 397                                 error = ENOLCK;
 398                                 goto done;
 399                         }
 400                         mutex_exit(&nlm_reg_lock);
 401                 }
 402         }
 403 
 404         /* Now get the lock graph for a particular vnode */
 405         gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);
 406 
	/*
	 * We drop the rwlock here; otherwise this might end up causing a
	 * deadlock if this IO_LOCK sleeps (bugid # 1183392).
	 */
 411 
 412         if (IS_IO_LOCK(lock_request)) {
 413                 VOP_RWUNLOCK(vp,
 414                         (lock_request->l_type == F_RDLCK) ?
 415                                 V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
 416         }
 417         mutex_enter(&gp->gp_mutex);
 418 
 419         lock_request->l_state |= REFERENCED_LOCK;
 420         lock_request->l_graph = gp;
 421 
 422         switch (lock_request->l_type) {
 423         case F_RDLCK:
 424         case F_WRLCK:
 425                 if (IS_QUERY_LOCK(lock_request)) {
 426                         flk_get_first_blocking_lock(lock_request);
 427                         (*lckdat) = lock_request->l_flock;
 428                         break;
 429                 }
 430 
 431                 /* process the request now */
 432 
 433                 error = flk_process_request(lock_request);
 434                 break;
 435 
 436         case F_UNLCK:
 437                 /* unlock request will not block so execute it immediately */
 438 
 439                 if (IS_LOCKMGR(lock_request) &&
 440                     flk_canceled(lock_request)) {
 441                         error = 0;
 442                 } else {
 443                         error = flk_execute_request(lock_request);
 444                 }
 445                 break;
 446 
 447         case F_UNLKSYS:
		/*
		 * Recovery mechanism to release lock manager locks when
		 * an NFS client crashes and restarts.  The NFS server will
		 * clear the old locks and grant new locks.
		 */
 453 
 454                 if (lock_request->l_flock.l_sysid == 0) {
 455                         mutex_exit(&gp->gp_mutex);
 456                         return (EINVAL);
 457                 }
 458                 if (secpolicy_nfs(CRED()) != 0) {
 459                         mutex_exit(&gp->gp_mutex);
 460                         return (EPERM);
 461                 }
 462                 flk_delete_locks_by_sysid(lock_request);
 463                 lock_request->l_state &= ~REFERENCED_LOCK;
 464                 flk_set_state(lock_request, FLK_DEAD_STATE);
 465                 flk_free_lock(lock_request);
 466                 mutex_exit(&gp->gp_mutex);
 467                 return (0);
 468 
 469         default:
 470                 error = EINVAL;
 471                 break;
 472         }
 473 
 474         /* Clustering: For blocked PXFS locks, return */
 475         if (error == PXFS_LOCK_BLOCKED) {
 476                 lock_request->l_state &= ~REFERENCED_LOCK;
 477                 mutex_exit(&gp->gp_mutex);
 478                 return (error);
 479         }
 480 
 481         /*
 482          * Now that we have seen the status of locks in the system for
 483          * this vnode we acquire the rwlock if it is an IO_LOCK.
 484          */
 485 
 486         if (IS_IO_LOCK(lock_request)) {
 487                 (void) VOP_RWLOCK(vp,
 488                         (lock_request->l_type == F_RDLCK) ?
 489                                 V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
 490                 if (!error) {
 491                         lckdat->l_type = F_UNLCK;
 492 
			/*
			 * This wakeup is needed; otherwise, if the IO_LOCK
			 * has slept, the dependents on this lock will not
			 * be woken up at all (bugid # 1185482).
			 */
 498 
 499                         flk_wakeup(lock_request, 1);
 500                         flk_set_state(lock_request, FLK_DEAD_STATE);
 501                         flk_free_lock(lock_request);
 502                 }
		/*
		 * Otherwise, if an error occurred, either flk_process_request()
		 * returned EDEADLK, in which case there are no dependents for
		 * this lock, or flk_wait_execute_request() returned EINTR, in
		 * which case flk_cancel_sleeping_lock() has already been done.
		 * The same is true for EBADF.
		 */
 510         }
 511 
 512         if (lock_request == &stack_lock_request) {
 513                 flk_set_state(lock_request, FLK_DEAD_STATE);
 514         } else {
 515                 lock_request->l_state &= ~REFERENCED_LOCK;
 516                 if ((error != 0) || IS_DELETED(lock_request)) {
 517                         flk_set_state(lock_request, FLK_DEAD_STATE);
 518                         flk_free_lock(lock_request);
 519                 }
 520         }
 521 
 522         mutex_exit(&gp->gp_mutex);
 523         return (error);
 524 
 525 done:
 526         flk_set_state(lock_request, FLK_DEAD_STATE);
 527         if (lock_request != &stack_lock_request)
 528                 flk_free_lock(lock_request);
 529         return (error);
 530 }
 531 
 532 /*
 533  * Invoke the callbacks in the given list.  If before sleeping, invoke in
 534  * list order.  If after sleeping, invoke in reverse order.
 535  *
 536  * CPR (suspend/resume) support: if one of the callbacks returns a
 537  * callb_cpr_t, return it.   This will be used to make the thread CPR-safe
 538  * while it is sleeping.  There should be at most one callb_cpr_t for the
 539  * thread.
 540  * XXX This is unnecessarily complicated.  The CPR information should just
 541  * get passed in directly through VOP_FRLOCK and reclock, rather than
 542  * sneaking it in via a callback.
 543  */
 544 
 545 callb_cpr_t *
 546 flk_invoke_callbacks(flk_callback_t *cblist, flk_cb_when_t when)
 547 {
 548         callb_cpr_t *cpr_callbackp = NULL;
 549         callb_cpr_t *one_result;
 550         flk_callback_t *cb;
 551 
 552         if (cblist == NULL)
 553                 return (NULL);
 554 
 555         if (when == FLK_BEFORE_SLEEP) {
 556                 cb = cblist;
 557                 do {
 558                         one_result = (*cb->cb_callback)(when, cb->cb_data);
 559                         if (one_result != NULL) {
 560                                 ASSERT(cpr_callbackp == NULL);
 561                                 cpr_callbackp = one_result;
 562                         }
 563                         cb = cb->cb_next;
 564                 } while (cb != cblist);
 565         } else {
 566                 cb = cblist->cb_prev;
 567                 do {
 568                         one_result = (*cb->cb_callback)(when, cb->cb_data);
 569                         if (one_result != NULL) {
 570                                 cpr_callbackp = one_result;
 571                         }
 572                         cb = cb->cb_prev;
 573                 } while (cb != cblist->cb_prev);
 574         }
 575 
 576         return (cpr_callbackp);
 577 }
 578 
 579 /*
 580  * Initialize a flk_callback_t to hold the given callback.
 581  */
 582 
 583 void
 584 flk_init_callback(flk_callback_t *flk_cb,
 585         callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), void *cbdata)
 586 {
 587         flk_cb->cb_next = flk_cb;
 588         flk_cb->cb_prev = flk_cb;
 589         flk_cb->cb_callback = cb_fcn;
 590         flk_cb->cb_data = cbdata;
 591 }
 592 
 593 /*
 594  * Initialize an flk_callback_t and then link it into the head of an
 595  * existing list (which may be NULL).
 596  */
 597 
 598 void
 599 flk_add_callback(flk_callback_t *newcb,
 600                 callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *),
 601                 void *cbdata, flk_callback_t *cblist)
 602 {
 603         flk_init_callback(newcb, cb_fcn, cbdata);
 604 
 605         if (cblist == NULL)
 606                 return;
 607 
 608         newcb->cb_prev = cblist->cb_prev;
 609         newcb->cb_next = cblist;
 610         cblist->cb_prev->cb_next = newcb;
 611         cblist->cb_prev = newcb;
 612 }
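/*
 * A hypothetical usage sketch (the callback and its data are made up for
 * illustration; real callers such as the NFS/NLM paths supply their own):
 *
 *	static callb_cpr_t *
 *	my_frlock_cb(flk_cb_when_t when, void *data)
 *	{
 *		if (when == FLK_BEFORE_SLEEP) {
 *			(release resources that must not be held while
 *			 the requesting thread sleeps)
 *		} else {
 *			(FLK_AFTER_SLEEP: reacquire them)
 *		}
 *		return (NULL);	(no CPR information to hand back)
 *	}
 *
 *	flk_callback_t flk_cb;
 *	flk_init_callback(&flk_cb, my_frlock_cb, my_data);
 *
 * &flk_cb can then be passed as the flk_cbp argument to reclock();
 * flk_wait_execute_request() invokes the list with FLK_BEFORE_SLEEP
 * before blocking and FLK_AFTER_SLEEP after waking, in the orders
 * described above flk_invoke_callbacks().
 */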
 613 
 614 /*
 615  * Initialize the flk_edge_cache data structure and create the
 616  * nlm_reg_status array.
 617  */
 618 
 619 void
 620 flk_init(void)
 621 {
 622         uint_t  i;
 623 
 624         flk_edge_cache = kmem_cache_create("flk_edges",
 625                 sizeof (struct edge), 0, NULL, NULL, NULL, NULL, NULL, 0);
 626         if (flk_edge_cache == NULL) {
 627                 cmn_err(CE_PANIC, "Couldn't create flk_edge_cache\n");
 628         }
 629         /*
 630          * Create the NLM registry object.
 631          */
 632 
 633         if (cluster_bootflags & CLUSTER_BOOTED) {
 634                 /*
 635                  * This routine tells you the maximum node id that will be used
 636                  * in the cluster.  This number will be the size of the nlm
 637                  * registry status array.  We add 1 because we will be using
 638                  * all entries indexed from 0 to maxnodeid; e.g., from 0
 639                  * to 64, for a total of 65 entries.
 640                  */
 641                 nlm_status_size = clconf_maximum_nodeid() + 1;
 642         } else {
 643                 nlm_status_size = 0;
 644         }
 645 
 646         if (nlm_status_size != 0) {     /* booted as a cluster */
 647                 nlm_reg_status = (flk_nlm_status_t *)
 648                         kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size,
 649                                 KM_SLEEP);
 650 
 651                 /* initialize all NLM states in array to NLM_UNKNOWN */
 652                 for (i = 0; i < nlm_status_size; i++) {
 653                         nlm_reg_status[i] = FLK_NLM_UNKNOWN;
 654                 }
 655         }
 656 }
 657 
 658 /*
 659  * Zone constructor/destructor callbacks to be executed when a zone is
 660  * created/destroyed.
 661  */
 662 /* ARGSUSED */
 663 void *
 664 flk_zone_init(zoneid_t zoneid)
 665 {
 666         struct flock_globals *fg;
 667         uint_t i;
 668 
 669         fg = kmem_alloc(sizeof (*fg), KM_SLEEP);
 670         fg->flk_lockmgr_status = FLK_LOCKMGR_UP;
 671         for (i = 0; i < HASH_SIZE; i++)
 672                 fg->lockmgr_status[i] = FLK_LOCKMGR_UP;
 673         return (fg);
 674 }
 675 
 676 /* ARGSUSED */
 677 void
 678 flk_zone_fini(zoneid_t zoneid, void *data)
 679 {
 680         struct flock_globals *fg = data;
 681 
 682         kmem_free(fg, sizeof (*fg));
 683 }
 684 
 685 /*
 686  * Get a lock_descriptor structure with initialization of edge lists.
 687  */
 688 
 689 static lock_descriptor_t *
 690 flk_get_lock(void)
 691 {
 692         lock_descriptor_t       *l;
 693 
 694         l = kmem_zalloc(sizeof (lock_descriptor_t), KM_SLEEP);
 695 
 696         cv_init(&l->l_cv, NULL, CV_DRIVER, NULL);
 697         l->l_edge.edge_in_next = &l->l_edge;
 698         l->l_edge.edge_in_prev = &l->l_edge;
 699         l->l_edge.edge_adj_next = &l->l_edge;
 700         l->l_edge.edge_adj_prev = &l->l_edge;
 701         l->pvertex = -1;
 702         l->l_status = FLK_INITIAL_STATE;
 703         flk_lock_allocs++;
 704         return (l);
 705 }
 706 
/*
 * Free a lock_descriptor structure.  If some thread still holds a reference
 * to it (as in reclock()), just set the DELETED_LOCK flag instead.
 */
 711 
 712 void
 713 flk_free_lock(lock_descriptor_t *lock)
 714 {
 715         ASSERT(IS_DEAD(lock));
 716         if (IS_REFERENCED(lock)) {
 717                 lock->l_state |= DELETED_LOCK;
 718                 return;
 719         }
 720         flk_lock_frees++;
 721         kmem_free((void *)lock, sizeof (lock_descriptor_t));
 722 }
 723 
 724 void
 725 flk_set_state(lock_descriptor_t *lock, int new_state)
 726 {
 727         /*
 728          * Locks in the sleeping list may be woken up in a number of ways,
 729          * and more than once.  If a sleeping lock is signaled awake more
 730          * than once, then it may or may not change state depending on its
 731          * current state.
 732          * Also note that NLM locks that are sleeping could be moved to an
 733          * interrupted state more than once if the unlock request is
 734          * retransmitted by the NLM client - the second time around, this is
 735          * just a nop.
 736          * The ordering of being signaled awake is:
 737          * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE.
 738          * The checks below implement this ordering.
 739          */
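	/*
	 * For example, if a sleeping lock has already been moved to
	 * FLK_INTERRUPTED_STATE by a signal, a later attempt to set
	 * FLK_GRANTED_STATE (or a duplicate cancel) is ignored by the
	 * checks below.
	 */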
 740         if (IS_INTERRUPTED(lock)) {
 741                 if ((new_state == FLK_CANCELLED_STATE) ||
 742                     (new_state == FLK_GRANTED_STATE) ||
 743                     (new_state == FLK_INTERRUPTED_STATE)) {
 744                         return;
 745                 }
 746         }
 747         if (IS_CANCELLED(lock)) {
 748                 if ((new_state == FLK_GRANTED_STATE) ||
 749                     (new_state == FLK_CANCELLED_STATE)) {
 750                         return;
 751                 }
 752         }
 753         CHECK_LOCK_TRANSITION(lock->l_status, new_state);
 754         if (IS_PXFS(lock)) {
 755                 cl_flk_state_transition_notify(lock, lock->l_status, new_state);
 756         }
 757         lock->l_status = new_state;
 758 }
 759 
/*
 * Routine that checks whether there are any blocking locks in the system.
 *
 * The policy is that if a write lock is sleeping, we don't allow read
 * locks before this write lock even though there may not be any active
 * locks corresponding to the read locks' region.
 *
 * The flk_add_edge() function adds an edge between l1 and l2 iff there
 * is no path between l1 and l2.  This is done to have a "minimum
 * storage representation" of the dependency graph.
 *
 * Another property of the graph is that, since only the new request throws
 * edges to the existing locks in the graph, the graph is always topologically
 * ordered.
 */
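/*
 * Example of the write-priority policy above (illustrative, not part of
 * the code): if one process holds an active read lock on a region, a
 * second process sleeps waiting for a write lock on the same region, and
 * a third process then issues a non-blocking read request for that region,
 * the third request is treated as blocked (EAGAIN) even though it would be
 * compatible with the active read lock, so that the sleeping writer is not
 * starved.
 */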
 775 
 776 static int
 777 flk_process_request(lock_descriptor_t *request)
 778 {
 779         graph_t *gp = request->l_graph;
 780         lock_descriptor_t *lock;
 781         int request_blocked_by_active = 0;
 782         int request_blocked_by_granted = 0;
 783         int request_blocked_by_sleeping = 0;
 784         vnode_t *vp = request->l_vnode;
 785         int     error = 0;
 786         int request_will_wait = 0;
 787         int found_covering_lock = 0;
 788         lock_descriptor_t *covered_by = NULL;
 789 
 790         ASSERT(MUTEX_HELD(&gp->gp_mutex));
 791         request_will_wait = IS_WILLING_TO_SLEEP(request);
 792 
 793         /*
 794          * check active locks
 795          */
 796 
 797         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
 798 
 799 
 800         if (lock) {
 801                 do {
 802                         if (BLOCKS(lock, request)) {
 803                                 if (!request_will_wait)
 804                                         return (EAGAIN);
 805                                 request_blocked_by_active = 1;
 806                                 break;
 807                         }
			/*
			 * Grant the lock if it is for the same owner holding
			 * an active lock that covers the request.
			 */
 812 
 813                         if (SAME_OWNER(lock, request) &&
 814                                         COVERS(lock, request) &&
 815                                                 (request->l_type == F_RDLCK))
 816                                 return (flk_execute_request(request));
 817                         lock = lock->l_next;
 818                 } while (lock->l_vnode == vp);
 819         }
 820 
	if (!request_blocked_by_active) {
		lock_descriptor_t *lk[1];
		lock_descriptor_t *first_glock = NULL;
		/*
		 * We cannot simply grant the request here.  Locks that were
		 * just granted may still be on the sleep queue; their threads
		 * have been woken up, so those locks are effectively active.
		 */
 830                 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
 831                 if (lock) {
 832                         do {
 833                                 if (BLOCKS(lock, request)) {
 834                                         if (IS_GRANTED(lock)) {
 835                                                 request_blocked_by_granted = 1;
 836                                         } else {
 837                                                 request_blocked_by_sleeping = 1;
 838                                         }
 839                                 }
 840 
 841                                 lock = lock->l_next;
 842                         } while ((lock->l_vnode == vp));
 843                         first_glock = lock->l_prev;
 844                         ASSERT(first_glock->l_vnode == vp);
 845                 }
 846 
 847                 if (request_blocked_by_granted)
 848                         goto block;
 849 
 850                 if (!request_blocked_by_sleeping) {
 851                         /*
 852                          * If the request isn't going to be blocked by a
 853                          * sleeping request, we know that it isn't going to
 854                          * be blocked; we can just execute the request --
 855                          * without performing costly deadlock detection.
 856                          */
 857                         ASSERT(!request_blocked_by_active);
 858                         return (flk_execute_request(request));
 859                 } else if (request->l_type == F_RDLCK) {
 860                         /*
 861                          * If we have a sleeping writer in the requested
 862                          * lock's range, block.
 863                          */
 864                         goto block;
 865                 }
 866 
 867                 lk[0] = request;
 868                 request->l_state |= RECOMPUTE_LOCK;
 869                 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
 870                 if (lock) {
 871                         do {
 872                                 flk_recompute_dependencies(lock, lk, 1, 0);
 873                                 lock = lock->l_next;
 874                         } while (lock->l_vnode == vp);
 875                 }
 876                 lock = first_glock;
 877                 if (lock) {
 878                         do {
 879                                 if (IS_GRANTED(lock)) {
 880                                 flk_recompute_dependencies(lock, lk, 1, 0);
 881                                 }
 882                                 lock = lock->l_prev;
 883                         } while ((lock->l_vnode == vp));
 884                 }
 885                 request->l_state &= ~RECOMPUTE_LOCK;
 886                 if (!NO_DEPENDENTS(request) && flk_check_deadlock(request))
 887                         return (EDEADLK);
 888                 return (flk_execute_request(request));
 889         }
 890 
 891 block:
 892         if (request_will_wait)
 893                 flk_graph_uncolor(gp);
 894 
 895         /* check sleeping locks */
 896 
 897         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
 898 
	/*
	 * If we find a sleeping write lock that is a superset of the
	 * region wanted by the request, we can be assured that by adding an
	 * edge to this write lock we have paths to all locks in the
	 * graph that block the request, except in one case, which is why
	 * there is another check for SAME_OWNER in the loop below.  The
	 * exception is when the process that owns the sleeping write lock
	 * 'l1' has other locks l2, l3, l4 in the system that arrived
	 * before l1.  l1 does not have paths to these locks as they are
	 * from the same process.  We break when we find a second covering
	 * sleeping lock l5 owned by a process different from the one owning
	 * l1, because none of l2, l3, l4, etc. can have arrived before l5;
	 * if any had, l1 would have produced a deadlock already.
	 */
 913 
 914         if (lock) {
 915                 do {
 916                         if (BLOCKS(lock, request)) {
 917                                 if (!request_will_wait)
 918                                         return (EAGAIN);
 919                                 if (COVERS(lock, request) &&
 920                                                 lock->l_type == F_WRLCK) {
 921                                         if (found_covering_lock &&
 922                                             !SAME_OWNER(lock, covered_by)) {
 923                                                 found_covering_lock++;
 924                                                 break;
 925                                         }
 926                                         found_covering_lock = 1;
 927                                         covered_by = lock;
 928                                 }
 929                                 if (found_covering_lock &&
 930                                         !SAME_OWNER(lock, covered_by)) {
 931                                         lock = lock->l_next;
 932                                         continue;
 933                                 }
 934                                 if ((error = flk_add_edge(request, lock,
 935                                                 !found_covering_lock, 0)))
 936                                         return (error);
 937                         }
 938                         lock = lock->l_next;
 939                 } while (lock->l_vnode == vp);
 940         }
 941 
/*
 * found_covering_lock == 2 iff at this point 'request' has paths
 * to all locks that block 'request'.  found_covering_lock == 1 iff at this
 * point 'request' has paths to all locks that block 'request' whose owners
 * are not the same as the one that covers 'request' (covered_by above), and
 * there may be locks whose owner is the same as covered_by in the active
 * list.
 */
 949 
 950         if (request_blocked_by_active && found_covering_lock != 2) {
 951                 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
 952                 ASSERT(lock != NULL);
 953                 do {
 954                         if (BLOCKS(lock, request)) {
 955                                 if (found_covering_lock &&
 956                                         !SAME_OWNER(lock, covered_by)) {
 957                                         lock = lock->l_next;
 958                                         continue;
 959                                 }
 960                                 if ((error = flk_add_edge(request, lock,
 961                                                         CHECK_CYCLE, 0)))
 962                                         return (error);
 963                         }
 964                         lock = lock->l_next;
 965                 } while (lock->l_vnode == vp);
 966         }
 967 
 968         if (NOT_BLOCKED(request)) {
 969                 /*
 970                  * request not dependent on any other locks
 971                  * so execute this request
 972                  */
 973                 return (flk_execute_request(request));
 974         } else {
 975                 /*
 976                  * check for deadlock
 977                  */
 978                 if (flk_check_deadlock(request))
 979                         return (EDEADLK);
 980                 /*
 981                  * this thread has to sleep
 982                  */
 983                 return (flk_wait_execute_request(request));
 984         }
 985 }
 986 
/*
 * The actual execution of the request in the simple case is only to
 * insert the 'request' into the list of active locks if it is not an
 * UNLOCK.
 * We have to consider the existing active locks' relation to
 * this 'request' if they are owned by the same process.  flk_relation()
 * does this job and sees to it that the dependency graph information is
 * maintained properly.
 */
 996 
 997 int
 998 flk_execute_request(lock_descriptor_t *request)
 999 {
1000         graph_t *gp = request->l_graph;
1001         vnode_t *vp = request->l_vnode;
1002         lock_descriptor_t       *lock, *lock1;
1003         int done_searching = 0;
1004 
1005         CHECK_SLEEPING_LOCKS(gp);
1006         CHECK_ACTIVE_LOCKS(gp);
1007 
1008         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1009 
1010         flk_set_state(request, FLK_START_STATE);
1011 
1012         ASSERT(NOT_BLOCKED(request));
1013 
1014         /* IO_LOCK requests are only to check status */
1015 
1016         if (IS_IO_LOCK(request))
1017                 return (0);
1018 
1019         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1020 
1021         if (lock == NULL && request->l_type == F_UNLCK)
1022                 return (0);
1023         if (lock == NULL) {
1024                 flk_insert_active_lock(request);
1025                 return (0);
1026         }
1027 
1028         do {
1029                 lock1 = lock->l_next;
1030                 if (SAME_OWNER(request, lock)) {
1031                         done_searching = flk_relation(lock, request);
1032                 }
1033                 lock = lock1;
1034         } while (lock->l_vnode == vp && !done_searching);
1035 
1036         /*
1037          * insert in active queue
1038          */
1039 
1040         if (request->l_type != F_UNLCK)
1041                 flk_insert_active_lock(request);
1042 
1043         return (0);
1044 }
1045 
/*
 * 'request' is blocked by someone, so we put it on the sleep queue.
 */
1049 static int
1050 flk_wait_execute_request(lock_descriptor_t *request)
1051 {
1052         graph_t *gp = request->l_graph;
1053         callb_cpr_t     *cprp;          /* CPR info from callback */
1054         struct flock_globals *fg;
1055         int index;
1056 
1057         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1058         ASSERT(IS_WILLING_TO_SLEEP(request));
1059 
1060         flk_insert_sleeping_lock(request);
1061 
1062         if (IS_LOCKMGR(request)) {
1063                 index = HASH_INDEX(request->l_vnode);
1064                 fg = flk_get_globals();
1065 
1066                 if (nlm_status_size == 0) {     /* not booted as a cluster */
1067                         if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP) {
1068                                 flk_cancel_sleeping_lock(request, 1);
1069                                 return (ENOLCK);
1070                         }
1071                 } else {                        /* booted as a cluster */
1072                         /*
1073                          * If the request is an NLM server lock request,
1074                          * and the NLM state of the lock request is not
1075                          * NLM_UP (because the NLM server is shutting
1076                          * down), then cancel the sleeping lock and
1077                          * return error ENOLCK that will encourage the
1078                          * client to retransmit.
1079                          */
1080                         if (!IS_NLM_UP(request)) {
1081                                 flk_cancel_sleeping_lock(request, 1);
1082                                 return (ENOLCK);
1083                         }
1084                 }
1085         }
1086 
1087         /* Clustering: For blocking PXFS locks, return */
1088         if (IS_PXFS(request)) {
1089                 /*
1090                  * PXFS locks sleep on the client side.
1091                  * The callback argument is used to wake up the sleeper
1092                  * when the lock is granted.
		 * We return -1 (rather than an errno value) to indicate
		 * that the client side should sleep.
		 */
1096                 return (PXFS_LOCK_BLOCKED);
1097         }
1098 
1099         if (request->l_callbacks != NULL) {
1100                 /*
1101                  * To make sure the shutdown code works correctly, either
1102                  * the callback must happen after putting the lock on the
1103                  * sleep list, or we must check the shutdown status after
1104                  * returning from the callback (and before sleeping).  At
1105                  * least for now, we'll use the first option.  If a
1106                  * shutdown or signal or whatever happened while the graph
1107                  * mutex was dropped, that will be detected by
1108                  * wait_for_lock().
1109                  */
1110                 mutex_exit(&gp->gp_mutex);
1111 
1112                 cprp = flk_invoke_callbacks(request->l_callbacks,
1113                                             FLK_BEFORE_SLEEP);
1114 
1115                 mutex_enter(&gp->gp_mutex);
1116 
1117                 if (cprp == NULL) {
1118                         wait_for_lock(request);
1119                 } else {
1120                         mutex_enter(cprp->cc_lockp);
1121                         CALLB_CPR_SAFE_BEGIN(cprp);
1122                         mutex_exit(cprp->cc_lockp);
1123                         wait_for_lock(request);
1124                         mutex_enter(cprp->cc_lockp);
1125                         CALLB_CPR_SAFE_END(cprp, cprp->cc_lockp);
1126                         mutex_exit(cprp->cc_lockp);
1127                 }
1128 
1129                 mutex_exit(&gp->gp_mutex);
1130                 (void) flk_invoke_callbacks(request->l_callbacks,
1131                                             FLK_AFTER_SLEEP);
1132                 mutex_enter(&gp->gp_mutex);
1133         } else {
1134                 wait_for_lock(request);
1135         }
1136 
1137         if (IS_LOCKMGR(request)) {
1138                 /*
1139                  * If the lock manager is shutting down, return an
1140                  * error that will encourage the client to retransmit.
1141                  */
1142                 if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP &&
1143                         !IS_GRANTED(request)) {
1144                         flk_cancel_sleeping_lock(request, 1);
1145                         return (ENOLCK);
1146                 }
1147         }
1148 
1149         if (IS_INTERRUPTED(request)) {
1150                 /* we got a signal, or act like we did */
1151                 flk_cancel_sleeping_lock(request, 1);
1152                 return (EINTR);
1153         }
1154 
1155         /* Cancelled if some other thread has closed the file */
1156 
1157         if (IS_CANCELLED(request)) {
1158                 flk_cancel_sleeping_lock(request, 1);
1159                 return (EBADF);
1160         }
1161 
1162         request->l_state &= ~GRANTED_LOCK;
1163         REMOVE_SLEEP_QUEUE(request);
1164         return (flk_execute_request(request));
1165 }
1166 
/*
 * This routine adds an edge between from_lock and to_lock because from_lock
 * depends on to_lock.  If asked to check for deadlock, it checks whether
 * any locks reachable from "from_lock" are owned by the same process
 * as "from_lock".
 * NOTE: It is the caller's responsibility to make sure that the color
 * of the graph is consistent between the calls to flk_add_edge, as done
 * in flk_process_request.  This routine does not color and check for
 * deadlock explicitly.
 */
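/*
 * Deadlock example for the cycle check below (a sketch): process P1 is
 * sleeping on request R1, which is blocked by process P2's active lock L2
 * (so the edge R1 -> L2 already exists).  If P2 now issues a request R2
 * that is blocked by R1, adding the edge R2 -> R1 lets the depth-first
 * search below walk R2 -> R1 -> L2 and find a vertex (L2) with the same
 * owner as R2, so EDEADLK is returned.
 */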
1177 
1178 static int
1179 flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock,
1180                         int check_cycle, int update_graph)
1181 {
1182         edge_t  *edge;
1183         edge_t  *ep;
1184         lock_descriptor_t       *vertex;
1185         lock_descriptor_t *vertex_stack;
1186 
1187         STACK_INIT(vertex_stack);
1188 
	/*
	 * If the to vertex is already colored, just return without adding
	 * an edge; it is already reachable from the from vertex.
	 */
1194 
1195         if (COLORED(to_lock))
1196                 return (0);
1197 
1198         edge = flk_get_edge();
1199 
1200         /*
1201          * set the from and to vertex
1202          */
1203 
1204         edge->from_vertex = from_lock;
1205         edge->to_vertex = to_lock;
1206 
1207         /*
1208          * put in adjacency list of from vertex
1209          */
1210 
1211         from_lock->l_edge.edge_adj_next->edge_adj_prev = edge;
1212         edge->edge_adj_next = from_lock->l_edge.edge_adj_next;
1213         edge->edge_adj_prev = &from_lock->l_edge;
1214         from_lock->l_edge.edge_adj_next = edge;
1215 
	/*
	 * put on the in-edge list of the to vertex
	 */
1219 
1220         to_lock->l_edge.edge_in_next->edge_in_prev = edge;
1221         edge->edge_in_next = to_lock->l_edge.edge_in_next;
1222         to_lock->l_edge.edge_in_next = edge;
1223         edge->edge_in_prev = &to_lock->l_edge;
1224 
1225 
1226         if (update_graph) {
1227                 flk_update_proc_graph(edge, 0);
1228                 return (0);
1229         }
1230         if (!check_cycle) {
1231                 return (0);
1232         }
1233 
1234         STACK_PUSH(vertex_stack, from_lock, l_stack);
1235 
1236         while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1237 
1238                 STACK_POP(vertex_stack, l_stack);
1239 
1240                 for (ep = FIRST_ADJ(vertex);
1241                         ep != HEAD(vertex);
1242                                 ep = NEXT_ADJ(ep)) {
1243                         if (COLORED(ep->to_vertex))
1244                                 continue;
1245                         COLOR(ep->to_vertex);
1246                         if (SAME_OWNER(ep->to_vertex, from_lock))
1247                                 goto dead_lock;
1248                         STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1249                 }
1250         }
1251         return (0);
1252 
1253 dead_lock:
1254 
1255         /*
1256          * remove all edges
1257          */
1258 
1259         ep = FIRST_ADJ(from_lock);
1260 
1261         while (ep != HEAD(from_lock)) {
1262                 IN_LIST_REMOVE(ep);
1263                 from_lock->l_sedge = NEXT_ADJ(ep);
1264                 ADJ_LIST_REMOVE(ep);
1265                 flk_free_edge(ep);
1266                 ep = from_lock->l_sedge;
1267         }
1268         return (EDEADLK);
1269 }
1270 
1271 /*
1272  * Get an edge structure for representing the dependency between two locks.
1273  */
1274 
1275 static edge_t *
1276 flk_get_edge()
1277 {
1278         edge_t  *ep;
1279 
1280         ASSERT(flk_edge_cache != NULL);
1281 
1282         ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP);
1283         edge_allocs++;
1284         return (ep);
1285 }
1286 
1287 /*
1288  * Free the edge structure.
1289  */
1290 
1291 static void
1292 flk_free_edge(edge_t *ep)
1293 {
1294         edge_frees++;
1295         kmem_cache_free(flk_edge_cache, (void *)ep);
1296 }
1297 
/*
 * Check the relationship of request with lock, perform the
 * recomputation of dependencies, break the lock if required, and return
 * 1 if request cannot have any further relationship with the next
 * active locks.
 * The 'lock' and 'request' are compared, and in case of overlap we
 * delete 'lock' and form new locks to represent the non-overlapping
 * portion of the original 'lock'.  This function has side effects:
 * 'lock' may be freed and new locks may be added to the active list.
 */
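/*
 * Worked example (illustrative): suppose the same owner holds an active
 * write lock 'lock' on [0, 99] and 'request' is an unlock (FLK_UNLOCK) of
 * [40, 59].  The ranges overlap in the middle, so lock1 and lock2 below
 * are created to cover [0, 39] and [60, 99], topology[] records lock1,
 * lock2, and the request for dependency recomputation, and the rest of
 * this function deletes the original 'lock' and inserts lock1 and lock2
 * into the active list in its place.
 */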
1308 
1309 static int
1310 flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request)
1311 {
1312         int lock_effect;
1313         lock_descriptor_t *lock1, *lock2;
1314         lock_descriptor_t *topology[3];
1315         int nvertex = 0;
1316         int i;
1317         edge_t  *ep;
1318         graph_t *gp = (lock->l_graph);
1319 
1320 
1321         CHECK_SLEEPING_LOCKS(gp);
1322         CHECK_ACTIVE_LOCKS(gp);
1323 
1324         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1325 
1326         topology[0] = topology[1] = topology[2] = NULL;
1327 
1328         if (request->l_type == F_UNLCK)
1329                 lock_effect = FLK_UNLOCK;
1330         else if (request->l_type == F_RDLCK &&
1331                         lock->l_type == F_WRLCK)
1332                 lock_effect = FLK_DOWNGRADE;
1333         else if (request->l_type == F_WRLCK &&
1334                         lock->l_type == F_RDLCK)
1335                 lock_effect = FLK_UPGRADE;
1336         else
1337                 lock_effect = FLK_STAY_SAME;
1338 
1339         if (lock->l_end < request->l_start) {
1340                 if (lock->l_end == request->l_start - 1 &&
1341                                 lock_effect == FLK_STAY_SAME) {
1342                         topology[0] = request;
1343                         request->l_start = lock->l_start;
1344                         nvertex = 1;
1345                         goto recompute;
1346                 } else {
1347                         return (0);
1348                 }
1349         }
1350 
1351         if (lock->l_start > request->l_end) {
1352                 if (request->l_end == lock->l_start - 1 &&
1353                                         lock_effect == FLK_STAY_SAME) {
1354                         topology[0] = request;
1355                         request->l_end = lock->l_end;
1356                         nvertex = 1;
1357                         goto recompute;
1358                 } else {
1359                         return (1);
1360                 }
1361         }
1362 
1363         if (request->l_end < lock->l_end) {
1364                 if (request->l_start > lock->l_start) {
1365                         if (lock_effect == FLK_STAY_SAME) {
1366                                 request->l_start = lock->l_start;
1367                                 request->l_end = lock->l_end;
1368                                 topology[0] = request;
1369                                 nvertex = 1;
1370                         } else {
1371                                 lock1 = flk_get_lock();
1372                                 lock2 = flk_get_lock();
1373                                 COPY(lock1, lock);
1374                                 COPY(lock2, lock);
1375                                 lock1->l_start = lock->l_start;
1376                                 lock1->l_end = request->l_start - 1;
1377                                 lock2->l_start = request->l_end + 1;
1378                                 lock2->l_end = lock->l_end;
1379                                 topology[0] = lock1;
1380                                 topology[1] = lock2;
1381                                 topology[2] = request;
1382                                 nvertex = 3;
1383                         }
1384                 } else if (request->l_start < lock->l_start) {
1385                         if (lock_effect == FLK_STAY_SAME) {
1386                                 request->l_end = lock->l_end;
1387                                 topology[0] = request;
1388                                 nvertex = 1;
1389                         } else {
1390                                 lock1 = flk_get_lock();
1391                                 COPY(lock1, lock);
1392                                 lock1->l_start = request->l_end + 1;
1393                                 topology[0] = lock1;
1394                                 topology[1] = request;
1395                                 nvertex = 2;
1396                         }
1397                 } else  {
1398                         if (lock_effect == FLK_STAY_SAME) {
1399                                 request->l_start = lock->l_start;
1400                                 request->l_end = lock->l_end;
1401                                 topology[0] = request;
1402                                 nvertex = 1;
1403                         } else {
1404                                 lock1 = flk_get_lock();
1405                                 COPY(lock1, lock);
1406                                 lock1->l_start = request->l_end + 1;
1407                                 topology[0] = lock1;
1408                                 topology[1] = request;
1409                                 nvertex = 2;
1410                         }
1411                 }
1412         } else if (request->l_end > lock->l_end) {
1413                 if (request->l_start > lock->l_start)  {
1414                         if (lock_effect == FLK_STAY_SAME) {
1415                                 request->l_start = lock->l_start;
1416                                 topology[0] = request;
1417                                 nvertex = 1;
1418                         } else {
1419                                 lock1 = flk_get_lock();
1420                                 COPY(lock1, lock);
1421                                 lock1->l_end = request->l_start - 1;
1422                                 topology[0] = lock1;
1423                                 topology[1] = request;
1424                                 nvertex = 2;
1425                         }
1426                 } else if (request->l_start < lock->l_start)  {
1427                         topology[0] = request;
1428                         nvertex = 1;
1429                 } else {
1430                         topology[0] = request;
1431                         nvertex = 1;
1432                 }
1433         } else {
1434                 if (request->l_start > lock->l_start) {
1435                         if (lock_effect == FLK_STAY_SAME) {
1436                                 request->l_start = lock->l_start;
1437                                 topology[0] = request;
1438                                 nvertex = 1;
1439                         } else {
1440                                 lock1 = flk_get_lock();
1441                                 COPY(lock1, lock);
1442                                 lock1->l_end = request->l_start - 1;
1443                                 topology[0] = lock1;
1444                                 topology[1] = request;
1445                                 nvertex = 2;
1446                         }
1447                 } else if (request->l_start < lock->l_start) {
1448                         topology[0] = request;
1449                         nvertex = 1;
1450                 } else {
1451                         if (lock_effect !=  FLK_UNLOCK) {
1452                                 topology[0] = request;
1453                                 nvertex = 1;
1454                         } else {
1455                                 flk_delete_active_lock(lock, 0);
1456                                 flk_wakeup(lock, 1);
1457                                 flk_free_lock(lock);
1458                                 CHECK_SLEEPING_LOCKS(gp);
1459                                 CHECK_ACTIVE_LOCKS(gp);
1460                                 return (1);
1461                         }
1462                 }
1463         }
1464 
1465 recompute:
1466 
        /*
         * For unlock we don't send the 'request' for recomputing
         * dependencies, because no lock will ever add an edge to it.
         */
1471 
1472         if (lock_effect == FLK_UNLOCK) {
1473                 topology[nvertex-1] = NULL;
1474                 nvertex--;
1475         }
1476         for (i = 0; i < nvertex; i++) {
1477                 topology[i]->l_state |= RECOMPUTE_LOCK;
1478                 topology[i]->l_color = NO_COLOR;
1479         }
1480 
1481         ASSERT(FIRST_ADJ(lock) == HEAD(lock));
1482 
        /*
         * Remove each incoming edge of this vertex 'lock' from the
         * adjacency list of the vertex it originates from.
         */
1487 
1488         ep = FIRST_IN(lock);
1489         while (ep != HEAD(lock)) {
1490                 ADJ_LIST_REMOVE(ep);
1491                 ep = NEXT_IN(ep);
1492         }
1493 
1494         flk_delete_active_lock(lock, 0);
1495 
1496         /* We are ready for recomputing the dependencies now */
1497 
1498         flk_recompute_dependencies(lock, topology, nvertex, 1);
1499 
1500         for (i = 0; i < nvertex; i++) {
1501                 topology[i]->l_state &= ~RECOMPUTE_LOCK;
1502                 topology[i]->l_color = NO_COLOR;
1503         }
1504 
1505 
1506         if (lock_effect == FLK_UNLOCK) {
1507                 nvertex++;
1508         }
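        /*
         * Put the replacement pieces carved out of 'lock' back on the
         * active list.  The loop stops one short because the last topology
         * slot holds the 'request' itself (NULLed above in the unlock
         * case); inserting the request, when needed, is left to the
         * caller.
         */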
1509         for (i = 0; i < nvertex - 1; i++) {
1510                 flk_insert_active_lock(topology[i]);
1511         }
1512 
1513 
1514         if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) {
1515                 flk_wakeup(lock, 0);
1516         } else {
1517                 ep = FIRST_IN(lock);
1518                 while (ep != HEAD(lock)) {
1519                         lock->l_sedge = NEXT_IN(ep);
1520                         IN_LIST_REMOVE(ep);
1521                         flk_update_proc_graph(ep, 1);
1522                         flk_free_edge(ep);
1523                         ep = lock->l_sedge;
1524                 }
1525         }
1526         flk_free_lock(lock);
1527 
1528         CHECK_SLEEPING_LOCKS(gp);
1529         CHECK_ACTIVE_LOCKS(gp);
1530         return (0);
1531 }
1532 
1533 /*
1534  * Insert a lock into the active queue.
1535  */
1536 
1537 static void
1538 flk_insert_active_lock(lock_descriptor_t *new_lock)
1539 {
1540         graph_t *gp = new_lock->l_graph;
1541         vnode_t *vp = new_lock->l_vnode;
1542         lock_descriptor_t *first_lock, *lock;
1543 
1544         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1545 
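        /*
         * Active locks on a vnode are kept sorted by l_start; skip the
         * locks on this vnode that start before the new lock and insert
         * in front of the first one that does not.
         */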
1546         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1547         first_lock = lock;
1548 
1549         if (first_lock != NULL) {
1550                 for (; (lock->l_vnode == vp &&
1551                         lock->l_start < new_lock->l_start); lock = lock->l_next)
1552                         ;
1553         } else {
1554                 lock = ACTIVE_HEAD(gp);
1555         }
1556 
1557         lock->l_prev->l_next = new_lock;
1558         new_lock->l_next = lock;
1559         new_lock->l_prev = lock->l_prev;
1560         lock->l_prev = new_lock;
1561 
1562         if (first_lock == NULL || (new_lock->l_start <= first_lock->l_start)) {
1563                 vp->v_filocks = (struct filock *)new_lock;
1564         }
1565         flk_set_state(new_lock, FLK_ACTIVE_STATE);
1566         new_lock->l_state |= ACTIVE_LOCK;
1567 
1568         CHECK_ACTIVE_LOCKS(gp);
1569         CHECK_SLEEPING_LOCKS(gp);
1570 }
1571 
/*
 * Delete an active lock: depending on the value of the second parameter,
 * either just remove the lock from the active lists, or remove it and
 * free it as well.
 */
1577 
1578 static void
1579 flk_delete_active_lock(lock_descriptor_t *lock, int free_lock)
1580 {
1581         vnode_t *vp = lock->l_vnode;
1582         graph_t *gp = lock->l_graph;
1583 
1584         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1585         if (free_lock)
1586                 ASSERT(NO_DEPENDENTS(lock));
1587         ASSERT(NOT_BLOCKED(lock));
1588         ASSERT(IS_ACTIVE(lock));
1589 
1590         ASSERT((vp->v_filocks != NULL));
1591 
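        /*
         * If this lock is the vnode's first active lock, advance
         * v_filocks to the next lock on the same vnode, or NULL if
         * there is none.
         */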
1592         if (vp->v_filocks == (struct filock *)lock) {
1593                 vp->v_filocks = (struct filock *)
1594                                 ((lock->l_next->l_vnode == vp) ? lock->l_next :
1595                                                                 NULL);
1596         }
1597         lock->l_next->l_prev = lock->l_prev;
1598         lock->l_prev->l_next = lock->l_next;
1599         lock->l_next = lock->l_prev = NULL;
1600         flk_set_state(lock, FLK_DEAD_STATE);
1601         lock->l_state &= ~ACTIVE_LOCK;
1602 
1603         if (free_lock)
1604                 flk_free_lock(lock);
1605         CHECK_ACTIVE_LOCKS(gp);
1606         CHECK_SLEEPING_LOCKS(gp);
1607 }
1608 
1609 /*
1610  * Insert into the sleep queue.
1611  */
1612 
1613 static void
1614 flk_insert_sleeping_lock(lock_descriptor_t *request)
1615 {
1616         graph_t *gp = request->l_graph;
1617         vnode_t *vp = request->l_vnode;
1618         lock_descriptor_t       *lock;
1619 
1620         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1621         ASSERT(IS_INITIAL(request));
1622 
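        /*
         * The sleeping list is kept sorted by vnode; skip entries for
         * lower vnodes and link the request in before the first entry
         * whose vnode is not less than vp.
         */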
1623         for (lock = gp->sleeping_locks.l_next; (lock != &gp->sleeping_locks &&
1624                 lock->l_vnode < vp); lock = lock->l_next)
1625                 ;
1626 
1627         lock->l_prev->l_next = request;
1628         request->l_prev = lock->l_prev;
1629         lock->l_prev = request;
1630         request->l_next = lock;
1631         flk_set_state(request, FLK_SLEEPING_STATE);
1632         request->l_state |= SLEEPING_LOCK;
1633 }
1634 
/*
 * Cancelling a sleeping lock implies removing a vertex from the
 * dependency graph, so we must recompute the dependencies of all
 * vertices that have a path to this vertex, w.r.t. all vertices
 * reachable from this vertex.
 */
1641 
1642 void
1643 flk_cancel_sleeping_lock(lock_descriptor_t *request, int remove_from_queue)
1644 {
1645         graph_t *gp = request->l_graph;
1646         vnode_t *vp = request->l_vnode;
1647         lock_descriptor_t **topology = NULL;
1648         edge_t  *ep;
1649         lock_descriptor_t *vertex, *lock;
1650         int nvertex = 0;
1651         int i;
1652         lock_descriptor_t *vertex_stack;
1653 
1654         STACK_INIT(vertex_stack);
1655 
1656         ASSERT(MUTEX_HELD(&gp->gp_mutex));
        /*
         * Count the number of vertex pointers that have to be allocated:
         * all vertices that are reachable from 'request'.
         */
1661 
1662         STACK_PUSH(vertex_stack, request, l_stack);
1663 
1664         while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1665                 STACK_POP(vertex_stack, l_stack);
1666                 for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
1667                                         ep = NEXT_ADJ(ep)) {
1668                         if (IS_RECOMPUTE(ep->to_vertex))
1669                                 continue;
1670                         ep->to_vertex->l_state |= RECOMPUTE_LOCK;
1671                         STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1672                         nvertex++;
1673                 }
1674         }
1675 
1676         /*
1677          * allocate memory for holding the vertex pointers
1678          */
1679 
1680         if (nvertex) {
1681                 topology = kmem_zalloc(nvertex * sizeof (lock_descriptor_t *),
1682                                                 KM_SLEEP);
1683         }
1684 
        /*
         * One more pass to actually store the vertices in the allocated
         * array.  We check sleeping locks first and then active locks,
         * so that the topology array ends up in topological order.
         */
1692 
1693         nvertex = 0;
1694         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1695 
1696         if (lock) {
1697                 do {
1698                         if (IS_RECOMPUTE(lock)) {
1699                                 lock->l_index = nvertex;
1700                                 topology[nvertex++] = lock;
1701                         }
1702                         lock->l_color = NO_COLOR;
1703                         lock = lock->l_next;
1704                 } while (lock->l_vnode == vp);
1705         }
1706 
1707         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1708 
1709         if (lock) {
1710                 do {
1711                         if (IS_RECOMPUTE(lock)) {
1712                                 lock->l_index = nvertex;
1713                                 topology[nvertex++] = lock;
1714                         }
1715                         lock->l_color = NO_COLOR;
1716                         lock = lock->l_next;
1717                 } while (lock->l_vnode == vp);
1718         }
1719 
        /*
         * Remove the in- and out-edges of 'request'.
         * They are freed after updating the proc graph below.
         */
1724 
1725         for (ep = FIRST_IN(request); ep != HEAD(request); ep = NEXT_IN(ep)) {
1726                 ADJ_LIST_REMOVE(ep);
1727         }
1728 
1729 
1730         if (remove_from_queue)
1731                 REMOVE_SLEEP_QUEUE(request);
1732 
1733         /* we are ready to recompute */
1734 
1735         flk_recompute_dependencies(request, topology, nvertex, 1);
1736 
1737         ep = FIRST_ADJ(request);
1738         while (ep != HEAD(request)) {
1739                 IN_LIST_REMOVE(ep);
1740                 request->l_sedge = NEXT_ADJ(ep);
1741                 ADJ_LIST_REMOVE(ep);
1742                 flk_update_proc_graph(ep, 1);
1743                 flk_free_edge(ep);
1744                 ep = request->l_sedge;
1745         }
1746 
1747 
1748         /*
1749          * unset the RECOMPUTE flag in those vertices
1750          */
1751 
1752         for (i = 0; i < nvertex; i++) {
1753                 topology[i]->l_state &= ~RECOMPUTE_LOCK;
1754         }
1755 
1756         /*
1757          * free the topology
1758          */
1759         if (nvertex)
1760                 kmem_free((void *)topology,
1761                         (nvertex * sizeof (lock_descriptor_t *)));
        /*
         * Some sleeping locks may be unblocked now.
         */
1765 
1766         flk_wakeup(request, 0);
1767 
        /*
         * We expect to have a correctly recomputed graph now.
         */
1771         flk_set_state(request, FLK_DEAD_STATE);
1772         flk_free_lock(request);
1773         CHECK_SLEEPING_LOCKS(gp);
1774         CHECK_ACTIVE_LOCKS(gp);
1775 
1776 }
1777 
/*
 * Uncoloring the graph is done simply by incrementing the graph's mark
 * value.  Only when the mark wraps around do we explicitly reset the
 * color of every vertex in the graph.
 */
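/*
 * (COLOR() and COLORED() presumably work by setting and comparing a
 * vertex's l_color against the graph's current mark, so bumping gp->mark
 * effectively uncolors every vertex without touching them individually.)
 */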
1783 
1784 static void
1785 flk_graph_uncolor(graph_t *gp)
1786 {
1787         lock_descriptor_t *lock;
1788 
1789         if (gp->mark == UINT_MAX) {
1790                 gp->mark = 1;
                for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
                    lock = lock->l_next)
                        lock->l_color = 0;

                for (lock = SLEEPING_HEAD(gp)->l_next;
                    lock != SLEEPING_HEAD(gp); lock = lock->l_next)
                        lock->l_color = 0;
1798         } else {
1799                 gp->mark++;
1800         }
1801 }
1802 
1803 /*
1804  * Wake up locks that are blocked on the given lock.
1805  */
1806 
1807 static void
1808 flk_wakeup(lock_descriptor_t *lock, int adj_list_remove)
1809 {
1810         edge_t  *ep;
1811         graph_t *gp = lock->l_graph;
1812         lock_descriptor_t       *lck;
1813 
1814         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1815         if (NO_DEPENDENTS(lock))
1816                 return;
1817         ep = FIRST_IN(lock);
1818         do {
                /*
                 * Delete the edge from the adjacency list of the
                 * 'from' vertex.  If that vertex has no more adjacent
                 * edges, wake up its process.
                 */
1824                 lck = ep->from_vertex;
1825                 if (adj_list_remove)
1826                         ADJ_LIST_REMOVE(ep);
1827                 flk_update_proc_graph(ep, 1);
1828                 if (NOT_BLOCKED(lck)) {
1829                         GRANT_WAKEUP(lck);
1830                 }
1831                 lock->l_sedge = NEXT_IN(ep);
1832                 IN_LIST_REMOVE(ep);
1833                 flk_free_edge(ep);
1834                 ep = lock->l_sedge;
1835         } while (ep != HEAD(lock));
1836         ASSERT(NO_DEPENDENTS(lock));
1837 }
1838 
/*
 * The dependents of 'request' are checked for their dependency against the
 * locks in 'topology' (so called because the array is, and should be, in
 * topological order for this algorithm; if it is not, the inner loop below
 * might add more edges than necessary.  A topological ordering of the
 * vertices satisfies the property that all edges go from left to right,
 * i.e., topology[i] can have an edge to topology[j] only if i < j).
 * If a lock l1 in the dependent set of 'request' is dependent on (blocked
 * by) a lock l2 in 'topology' but does not already have a path to it, we
 * add an edge in the inner loop below.
 *
 * We don't want to add an edge between l1 and l2 if a path from l1 to l2
 * already exists, so care has to be taken with those vertices that have
 * two paths to 'request'.  These vertices are referred to here as barrier
 * locks.
 *
 * The barriers (those vertices that originally had two paths to 'request')
 * have to be found, because otherwise we might add unnecessary edges to
 * vertices in 'topology': a barrier vertex could end up with both an edge
 * to a vertex in 'topology' and a path to it.
 */
1860 
1861 static void
1862 flk_recompute_dependencies(lock_descriptor_t *request,
1863                 lock_descriptor_t **topology,
1864                         int nvertex, int update_graph)
1865 {
1866         lock_descriptor_t *vertex, *lock;
1867         graph_t *gp = request->l_graph;
1868         int i, count;
1869         int barrier_found = 0;
1870         edge_t  *ep;
1871         lock_descriptor_t *vertex_stack;
1872 
1873         STACK_INIT(vertex_stack);
1874 
1875         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1876         if (nvertex == 0)
1877                 return;
1878         flk_graph_uncolor(request->l_graph);
1879         barrier_found = flk_find_barriers(request);
1880         request->l_state |= RECOMPUTE_DONE;
1881 
1882         STACK_PUSH(vertex_stack, request, l_stack);
1883         request->l_sedge = FIRST_IN(request);
1884 
1885 
1886         while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1887                 if (vertex->l_state & RECOMPUTE_DONE) {
1888                         count = 0;
1889                         goto next_in_edge;
1890                 }
1891                 if (IS_BARRIER(vertex)) {
1892                         /* decrement the barrier count */
1893                         if (vertex->l_index) {
1894                                 vertex->l_index--;
                                /*
                                 * This vertex will be pushed again via
                                 * another path; defer recomputing its
                                 * dependencies until then.
                                 */
1896                                 STACK_POP(vertex_stack, l_stack);
1897                                 if (vertex->l_index == 0)  {
1898                                 /*
1899                                  * barrier is over we can recompute
1900                                  * dependencies for this lock in the
1901                                  * next stack pop
1902                                  */
1903                                         vertex->l_state &= ~BARRIER_LOCK;
1904                                 }
1905                                 continue;
1906                         }
1907                 }
1908                 vertex->l_state |= RECOMPUTE_DONE;
1909                 flk_graph_uncolor(gp);
1910                 count = flk_color_reachables(vertex);
1911                 for (i = 0; i < nvertex; i++) {
1912                         lock = topology[i];
1913                         if (COLORED(lock))
1914                                 continue;
1915                         if (BLOCKS(lock, vertex)) {
1916                                 (void) flk_add_edge(vertex, lock,
1917                                     NO_CHECK_CYCLE, update_graph);
1918                                 COLOR(lock);
1919                                 count++;
1920                                 count += flk_color_reachables(lock);
1921                         }
1922 
1923                 }
1924 
1925 next_in_edge:
1926                 if (count == nvertex ||
1927                                 vertex->l_sedge == HEAD(vertex)) {
1928                         /* prune the tree below this */
1929                         STACK_POP(vertex_stack, l_stack);
1930                         vertex->l_state &= ~RECOMPUTE_DONE;
1931                         /* update the barrier locks below this! */
1932                         if (vertex->l_sedge != HEAD(vertex) && barrier_found) {
1933                                 flk_graph_uncolor(gp);
1934                                 flk_update_barriers(vertex);
1935                         }
1936                         continue;
1937                 }
1938 
1939                 ep = vertex->l_sedge;
1940                 lock = ep->from_vertex;
1941                 STACK_PUSH(vertex_stack, lock, l_stack);
1942                 lock->l_sedge = FIRST_IN(lock);
1943                 vertex->l_sedge = NEXT_IN(ep);
1944         }
1945 
1946 }
1947 
/*
 * Color all not-yet-colored vertices reachable from 'vertex', and return
 * the number of them that belong to the topology (i.e., those that have
 * RECOMPUTE_LOCK set in their state).
 *
 * Note: we need to use a different stack link, l_stack1, because this is
 * called from flk_recompute_dependencies(), which already uses a stack
 * with l_stack as the stack link.
 */
1956 
1957 static int
1958 flk_color_reachables(lock_descriptor_t *vertex)
1959 {
1960         lock_descriptor_t *ver, *lock;
1961         int count;
1962         edge_t  *ep;
1963         lock_descriptor_t *vertex_stack;
1964 
1965         STACK_INIT(vertex_stack);
1966 
1967         STACK_PUSH(vertex_stack, vertex, l_stack1);
1968         count = 0;
1969         while ((ver = STACK_TOP(vertex_stack)) != NULL) {
1970 
1971                 STACK_POP(vertex_stack, l_stack1);
1972                 for (ep = FIRST_ADJ(ver); ep != HEAD(ver);
1973                                         ep = NEXT_ADJ(ep)) {
1974                         lock = ep->to_vertex;
1975                         if (COLORED(lock))
1976                                 continue;
1977                         COLOR(lock);
1978                         if (IS_RECOMPUTE(lock))
1979                                 count++;
1980                         STACK_PUSH(vertex_stack, lock, l_stack1);
1981                 }
1982 
1983         }
1984         return (count);
1985 }
1986 
/*
 * Called from flk_recompute_dependencies(), this routine decrements the
 * barrier count of the barrier vertices that are reachable from 'lock'.
 */
1991 
1992 static void
1993 flk_update_barriers(lock_descriptor_t *lock)
1994 {
1995         lock_descriptor_t *vertex, *lck;
1996         edge_t  *ep;
1997         lock_descriptor_t *vertex_stack;
1998 
1999         STACK_INIT(vertex_stack);
2000 
2001         STACK_PUSH(vertex_stack, lock, l_stack1);
2002 
2003         while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2004                 STACK_POP(vertex_stack, l_stack1);
2005                 for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2006                                                 ep = NEXT_IN(ep)) {
2007                         lck = ep->from_vertex;
2008                         if (COLORED(lck)) {
2009                                 if (IS_BARRIER(lck)) {
2010                                         ASSERT(lck->l_index > 0);
2011                                         lck->l_index--;
2012                                         if (lck->l_index == 0)
2013                                                 lck->l_state &= ~BARRIER_LOCK;
2014                                 }
2015                                 continue;
2016                         }
2017                         COLOR(lck);
2018                         if (IS_BARRIER(lck)) {
2019                                 ASSERT(lck->l_index > 0);
2020                                 lck->l_index--;
2021                                 if (lck->l_index == 0)
2022                                         lck->l_state &= ~BARRIER_LOCK;
2023                         }
2024                         STACK_PUSH(vertex_stack, lck, l_stack1);
2025                 }
2026         }
2027 }
2028 
/*
 * Find all vertices that are reachable from 'lock' through more than one
 * path, mark them as barrier vertices, and set their barrier count.
 * The barrier count is one less than the total number of paths from 'lock'
 * to that vertex.  Returns non-zero if any barrier vertex was found.
 */
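/*
 * For example, if there are three distinct paths from 'lock' to some
 * vertex, that vertex ends up with BARRIER_LOCK set and a barrier count
 * (l_index) of 2.
 */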
2035 
2036 static int
2037 flk_find_barriers(lock_descriptor_t *lock)
2038 {
2039         lock_descriptor_t *vertex, *lck;
2040         int found = 0;
2041         edge_t  *ep;
2042         lock_descriptor_t *vertex_stack;
2043 
2044         STACK_INIT(vertex_stack);
2045 
2046         STACK_PUSH(vertex_stack, lock, l_stack1);
2047 
2048         while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2049                 STACK_POP(vertex_stack, l_stack1);
2050                 for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2051                                                 ep = NEXT_IN(ep)) {
2052                         lck = ep->from_vertex;
2053                         if (COLORED(lck)) {
2054                                 /* this is a barrier */
2055                                 lck->l_state |= BARRIER_LOCK;
2056                                 /* index will have barrier count */
2057                                 lck->l_index++;
2058                                 if (!found)
2059                                         found = 1;
2060                                 continue;
2061                         }
2062                         COLOR(lck);
2063                         lck->l_index = 0;
2064                         STACK_PUSH(vertex_stack, lck, l_stack1);
2065                 }
2066         }
2067         return (found);
2068 }
2069 
2070 /*
2071  * Finds the first lock that is mainly responsible for blocking this
2072  * request.  If there is no such lock, request->l_flock.l_type is set to
2073  * F_UNLCK.  Otherwise, request->l_flock is filled in with the particulars
2074  * of the blocking lock.
2075  *
2076  * Note: It is possible a request is blocked by a sleeping lock because
2077  * of the fairness policy used in flk_process_request() to construct the
2078  * dependencies. (see comments before flk_process_request()).
2079  */
2080 
2081 static void
2082 flk_get_first_blocking_lock(lock_descriptor_t *request)
2083 {
2084         graph_t *gp = request->l_graph;
2085         vnode_t *vp = request->l_vnode;
2086         lock_descriptor_t *lock, *blocker;
2087 
2088         ASSERT(MUTEX_HELD(&gp->gp_mutex));
2089         blocker = NULL;
2090         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2091 
2092         if (lock) {
2093                 do {
2094                         if (BLOCKS(lock, request)) {
2095                                 blocker = lock;
2096                                 break;
2097                         }
2098                         lock = lock->l_next;
2099                 } while (lock->l_vnode == vp);
2100         }
2101 
2102         if (blocker) {
2103                 report_blocker(blocker, request);
2104         } else
2105                 request->l_flock.l_type = F_UNLCK;
2106 }
2107 
2108 /*
2109  * Get the graph_t structure associated with a vnode.
2110  * If 'initialize' is non-zero, and the graph_t structure for this vnode has
2111  * not yet been initialized, then a new element is allocated and returned.
2112  */
2113 graph_t *
2114 flk_get_lock_graph(vnode_t *vp, int initialize)
2115 {
2116         graph_t *gp;
2117         graph_t *gp_alloc = NULL;
2118         int index = HASH_INDEX(vp);
2119 
2120         if (initialize == FLK_USE_GRAPH) {
2121                 mutex_enter(&flock_lock);
2122                 gp = lock_graph[index];
2123                 mutex_exit(&flock_lock);
2124                 return (gp);
2125         }
2126 
2127         ASSERT(initialize == FLK_INIT_GRAPH);
2128 
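        /*
         * Speculatively allocate and initialize a new graph without
         * holding flock_lock; the slot is rechecked under the lock below
         * and the allocation is discarded if another thread won the race.
         */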
2129         if (lock_graph[index] == NULL) {
2130 
2131                 gp_alloc = kmem_zalloc(sizeof (graph_t), KM_SLEEP);
2132 
2133                 /* Initialize the graph */
2134 
2135                 gp_alloc->active_locks.l_next =
2136                     gp_alloc->active_locks.l_prev =
2137                     (lock_descriptor_t *)ACTIVE_HEAD(gp_alloc);
2138                 gp_alloc->sleeping_locks.l_next =
2139                     gp_alloc->sleeping_locks.l_prev =
2140                     (lock_descriptor_t *)SLEEPING_HEAD(gp_alloc);
2141                 gp_alloc->index = index;
2142                 mutex_init(&gp_alloc->gp_mutex, NULL, MUTEX_DEFAULT, NULL);
2143         }
2144 
2145         mutex_enter(&flock_lock);
2146 
2147         gp = lock_graph[index];
2148 
2149         /* Recheck the value within flock_lock */
2150         if (gp == NULL) {
2151                 struct flock_globals *fg;
2152 
2153                 /* We must have previously allocated the graph_t structure */
2154                 ASSERT(gp_alloc != NULL);
2155                 lock_graph[index] = gp = gp_alloc;
2156                 /*
2157                  * The lockmgr status is only needed if KLM is loaded.
2158                  */
2159                 if (flock_zone_key != ZONE_KEY_UNINITIALIZED) {
2160                         fg = flk_get_globals();
2161                         fg->lockmgr_status[index] = fg->flk_lockmgr_status;
2162                 }
2163         }
2164 
2165         mutex_exit(&flock_lock);
2166 
2167         if ((gp_alloc != NULL) && (gp != gp_alloc)) {
2168                 /* There was a race to allocate the graph_t and we lost */
2169                 mutex_destroy(&gp_alloc->gp_mutex);
2170                 kmem_free(gp_alloc, sizeof (graph_t));
2171         }
2172 
2173         return (gp);
2174 }
2175 
2176 /*
2177  * PSARC case 1997/292
2178  */
2179 int
2180 cl_flk_has_remote_locks_for_nlmid(vnode_t *vp, int nlmid)
2181 {
2182         lock_descriptor_t *lock;
2183         int result = 0;
2184         graph_t *gp;
2185         int                     lock_nlmid;
2186 
2187         /*
2188          * Check to see if node is booted as a cluster. If not, return.
2189          */
2190         if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2191                 return (0);
2192         }
2193 
2194         gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2195         if (gp == NULL) {
2196                 return (0);
2197         }
2198 
2199         mutex_enter(&gp->gp_mutex);
2200 
2201         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2202 
2203         if (lock) {
2204                 while (lock->l_vnode == vp) {
2205                         /* get NLM id from sysid */
2206                         lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2207 
2208                         /*
2209                          * If NLM server request _and_ nlmid of lock matches
2210                          * nlmid of argument, then we've found a remote lock.
2211                          */
2212                         if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2213                                 result = 1;
2214                                 goto done;
2215                         }
2216                         lock = lock->l_next;
2217                 }
2218         }
2219 
2220         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2221 
2222         if (lock) {
2223                 while (lock->l_vnode == vp) {
2224                         /* get NLM id from sysid */
2225                         lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2226 
2227                         /*
2228                          * If NLM server request _and_ nlmid of lock matches
2229                          * nlmid of argument, then we've found a remote lock.
2230                          */
2231                         if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2232                                 result = 1;
2233                                 goto done;
2234                         }
2235                         lock = lock->l_next;
2236                 }
2237         }
2238 
2239 done:
2240         mutex_exit(&gp->gp_mutex);
2241         return (result);
2242 }
2243 
2244 /*
2245  * Determine whether there are any locks for the given vnode with a remote
2246  * sysid.  Returns zero if not, non-zero if there are.
2247  *
2248  * Note that the return value from this function is potentially invalid
2249  * once it has been returned.  The caller is responsible for providing its
2250  * own synchronization mechanism to ensure that the return value is useful
2251  * (e.g., see nfs_lockcompletion()).
2252  */
2253 int
2254 flk_has_remote_locks(vnode_t *vp)
2255 {
2256         lock_descriptor_t *lock;
2257         int result = 0;
2258         graph_t *gp;
2259 
2260         gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2261         if (gp == NULL) {
2262                 return (0);
2263         }
2264 
2265         mutex_enter(&gp->gp_mutex);
2266 
2267         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2268 
2269         if (lock) {
2270                 while (lock->l_vnode == vp) {
2271                         if (IS_REMOTE(lock)) {
2272                                 result = 1;
2273                                 goto done;
2274                         }
2275                         lock = lock->l_next;
2276                 }
2277         }
2278 
2279         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2280 
2281         if (lock) {
2282                 while (lock->l_vnode == vp) {
2283                         if (IS_REMOTE(lock)) {
2284                                 result = 1;
2285                                 goto done;
2286                         }
2287                         lock = lock->l_next;
2288                 }
2289         }
2290 
2291 done:
2292         mutex_exit(&gp->gp_mutex);
2293         return (result);
2294 }
2295 
/*
 * Determine whether there are any locks for the given vnode with a remote
 * sysid matching the given sysid.
 * Used by the new (open source) NFS Lock Manager (NLM).
 */
2301 int
2302 flk_has_remote_locks_for_sysid(vnode_t *vp, int sysid)
2303 {
2304         lock_descriptor_t *lock;
2305         int result = 0;
2306         graph_t *gp;
2307 
2308         if (sysid == 0)
2309                 return (0);
2310 
2311         gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2312         if (gp == NULL) {
2313                 return (0);
2314         }
2315 
2316         mutex_enter(&gp->gp_mutex);
2317 
2318         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2319 
2320         if (lock) {
2321                 while (lock->l_vnode == vp) {
2322                         if (lock->l_flock.l_sysid == sysid) {
2323                                 result = 1;
2324                                 goto done;
2325                         }
2326                         lock = lock->l_next;
2327                 }
2328         }
2329 
2330         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2331 
2332         if (lock) {
2333                 while (lock->l_vnode == vp) {
2334                         if (lock->l_flock.l_sysid == sysid) {
2335                                 result = 1;
2336                                 goto done;
2337                         }
2338                         lock = lock->l_next;
2339                 }
2340         }
2341 
2342 done:
2343         mutex_exit(&gp->gp_mutex);
2344         return (result);
2345 }
2346 
2347 /*
2348  * Determine if there are any locks owned by the given sysid.
2349  * Returns zero if not, non-zero if there are.  Note that this return code
2350  * could be derived from flk_get_{sleeping,active}_locks, but this routine
2351  * avoids all the memory allocations of those routines.
2352  *
2353  * This routine has the same synchronization issues as
2354  * flk_has_remote_locks.
2355  */
2356 
2357 int
2358 flk_sysid_has_locks(int sysid, int lck_type)
2359 {
2360         int             has_locks = 0;
2361         lock_descriptor_t       *lock;
2362         graph_t         *gp;
2363         int             i;
2364 
2365         for (i = 0; i < HASH_SIZE && !has_locks; i++) {
2366                 mutex_enter(&flock_lock);
2367                 gp = lock_graph[i];
2368                 mutex_exit(&flock_lock);
2369                 if (gp == NULL) {
2370                         continue;
2371                 }
2372 
2373                 mutex_enter(&gp->gp_mutex);
2374 
2375                 if (lck_type & FLK_QUERY_ACTIVE) {
2376                         for (lock = ACTIVE_HEAD(gp)->l_next;
2377                             lock != ACTIVE_HEAD(gp) && !has_locks;
2378                             lock = lock->l_next) {
2379                                 if (lock->l_flock.l_sysid == sysid)
2380                                         has_locks = 1;
2381                         }
2382                 }
2383 
2384                 if (lck_type & FLK_QUERY_SLEEPING) {
2385                         for (lock = SLEEPING_HEAD(gp)->l_next;
2386                                 lock != SLEEPING_HEAD(gp) && !has_locks;
2387                                 lock = lock->l_next) {
2388                                 if (lock->l_flock.l_sysid == sysid)
2389                                         has_locks = 1;
2390                         }
2391                 }
2392                 mutex_exit(&gp->gp_mutex);
2393         }
2394 
2395         return (has_locks);
2396 }
2397 
2398 
/*
 * PSARC case 1997/292
 *
 * Requires: "sysid" is a pair [nlmid, sysid].  The lower half is a 16-bit
 *  quantity, the real sysid generated by the NLM server; the upper half
 *  identifies the node of the cluster where the NLM server ran.
 *  This routine is only called by an NLM server running in a cluster.
 * Effects: Remove all locks held on behalf of the client identified
 *  by "sysid."
 */
2409 void
2410 cl_flk_remove_locks_by_sysid(int sysid)
2411 {
2412         graph_t *gp;
2413         int i;
2414         lock_descriptor_t *lock, *nlock;
2415 
2416         /*
2417          * Check to see if node is booted as a cluster. If not, return.
2418          */
2419         if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2420                 return;
2421         }
2422 
2423         ASSERT(sysid != 0);
2424         for (i = 0; i < HASH_SIZE; i++) {
2425                 mutex_enter(&flock_lock);
2426                 gp = lock_graph[i];
2427                 mutex_exit(&flock_lock);
2428 
2429                 if (gp == NULL)
2430                         continue;
2431 
2432                 mutex_enter(&gp->gp_mutex);      /*  get mutex on lock graph */
2433 
2434                 /* signal sleeping requests so that they bail out */
2435                 lock = SLEEPING_HEAD(gp)->l_next;
2436                 while (lock != SLEEPING_HEAD(gp)) {
2437                         nlock = lock->l_next;
2438                         if (lock->l_flock.l_sysid == sysid) {
2439                                 INTERRUPT_WAKEUP(lock);
2440                         }
2441                         lock = nlock;
2442                 }
2443 
2444                 /* delete active locks */
2445                 lock = ACTIVE_HEAD(gp)->l_next;
2446                 while (lock != ACTIVE_HEAD(gp)) {
2447                         nlock = lock->l_next;
2448                         if (lock->l_flock.l_sysid == sysid) {
2449                                 flk_delete_active_lock(lock, 0);
2450                                 flk_wakeup(lock, 1);
2451                                 flk_free_lock(lock);
2452                         }
2453                         lock = nlock;
2454                 }
2455                 mutex_exit(&gp->gp_mutex);    /* release mutex on lock graph */
2456         }
2457 }
2458 
/*
 * Delete all locks in the system that belong to the sysid of the request.
 */
2462 
2463 static void
2464 flk_delete_locks_by_sysid(lock_descriptor_t *request)
2465 {
2466         int     sysid  = request->l_flock.l_sysid;
2467         lock_descriptor_t *lock, *nlock;
2468         graph_t *gp;
2469         int i;
2470 
2471         ASSERT(MUTEX_HELD(&request->l_graph->gp_mutex));
2472         ASSERT(sysid != 0);
2473 
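        /*
         * Drop the caller's graph mutex while sweeping every hash bucket;
         * it is reacquired before returning, so the caller's locking state
         * is unchanged.
         */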
2474         mutex_exit(&request->l_graph->gp_mutex);
2475 
2476         for (i = 0; i < HASH_SIZE; i++) {
2477                 mutex_enter(&flock_lock);
2478                 gp = lock_graph[i];
2479                 mutex_exit(&flock_lock);
2480 
2481                 if (gp == NULL)
2482                         continue;
2483 
2484                 mutex_enter(&gp->gp_mutex);
2485 
2486                 /* signal sleeping requests so that they bail out */
2487                 lock = SLEEPING_HEAD(gp)->l_next;
2488                 while (lock != SLEEPING_HEAD(gp)) {
2489                         nlock = lock->l_next;
2490                         if (lock->l_flock.l_sysid == sysid) {
2491                                 INTERRUPT_WAKEUP(lock);
2492                         }
2493                         lock = nlock;
2494                 }
2495 
2496                 /* delete active locks */
2497                 lock = ACTIVE_HEAD(gp)->l_next;
2498                 while (lock != ACTIVE_HEAD(gp)) {
2499                         nlock = lock->l_next;
2500                         if (lock->l_flock.l_sysid == sysid) {
2501                                 flk_delete_active_lock(lock, 0);
2502                                 flk_wakeup(lock, 1);
2503                                 flk_free_lock(lock);
2504                         }
2505                         lock = nlock;
2506                 }
2507                 mutex_exit(&gp->gp_mutex);
2508         }
2509 
2510         mutex_enter(&request->l_graph->gp_mutex);
2511 }
2512 
2513 /*
2514  * Clustering: Deletes PXFS locks
2515  * Effects: Delete all locks on files in the given file system and with the
2516  *  given PXFS id.
2517  */
2518 void
2519 cl_flk_delete_pxfs_locks(struct vfs *vfsp, int pxfsid)
2520 {
2521         lock_descriptor_t *lock, *nlock;
2522         graph_t *gp;
2523         int i;
2524 
2525         for (i = 0; i < HASH_SIZE; i++) {
2526                 mutex_enter(&flock_lock);
2527                 gp = lock_graph[i];
2528                 mutex_exit(&flock_lock);
2529 
2530                 if (gp == NULL)
2531                         continue;
2532 
2533                 mutex_enter(&gp->gp_mutex);
2534 
2535                 /* signal sleeping requests so that they bail out */
2536                 lock = SLEEPING_HEAD(gp)->l_next;
2537                 while (lock != SLEEPING_HEAD(gp)) {
2538                         nlock = lock->l_next;
2539                         if (lock->l_vnode->v_vfsp == vfsp) {
2540                                 ASSERT(IS_PXFS(lock));
2541                                 if (GETPXFSID(lock->l_flock.l_sysid) ==
2542                                     pxfsid) {
2543                                         flk_set_state(lock,
2544                                             FLK_CANCELLED_STATE);
2545                                         flk_cancel_sleeping_lock(lock, 1);
2546                                 }
2547                         }
2548                         lock = nlock;
2549                 }
2550 
2551                 /* delete active locks */
2552                 lock = ACTIVE_HEAD(gp)->l_next;
2553                 while (lock != ACTIVE_HEAD(gp)) {
2554                         nlock = lock->l_next;
2555                         if (lock->l_vnode->v_vfsp == vfsp) {
2556                                 ASSERT(IS_PXFS(lock));
2557                                 if (GETPXFSID(lock->l_flock.l_sysid) ==
2558                                     pxfsid) {
2559                                         flk_delete_active_lock(lock, 0);
2560                                         flk_wakeup(lock, 1);
2561                                         flk_free_lock(lock);
2562                                 }
2563                         }
2564                         lock = nlock;
2565                 }
2566                 mutex_exit(&gp->gp_mutex);
2567         }
2568 }
2569 
2570 /*
2571  * Search for a sleeping lock manager lock which matches exactly this lock
2572  * request; if one is found, fake a signal to cancel it.
2573  *
2574  * Return 1 if a matching lock was found, 0 otherwise.
2575  */
2576 
2577 static int
2578 flk_canceled(lock_descriptor_t *request)
2579 {
2580         lock_descriptor_t *lock, *nlock;
2581         graph_t *gp = request->l_graph;
2582         vnode_t *vp = request->l_vnode;
2583 
2584         ASSERT(MUTEX_HELD(&gp->gp_mutex));
2585         ASSERT(IS_LOCKMGR(request));
2586         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2587 
2588         if (lock) {
2589                 while (lock->l_vnode == vp) {
2590                         nlock = lock->l_next;
2591                         if (SAME_OWNER(lock, request) &&
2592                                 lock->l_start == request->l_start &&
2593                                         lock->l_end == request->l_end) {
2594                                 INTERRUPT_WAKEUP(lock);
2595                                 return (1);
2596                         }
2597                         lock = nlock;
2598                 }
2599         }
2600         return (0);
2601 }
2602 
2603 /*
2604  * Remove all the locks for the vnode belonging to the given pid and sysid.
2605  */
2606 
2607 void
2608 cleanlocks(vnode_t *vp, pid_t pid, int sysid)
2609 {
2610         graph_t *gp;
2611         lock_descriptor_t *lock, *nlock;
2612         lock_descriptor_t *link_stack;
2613 
2614         STACK_INIT(link_stack);
2615 
2616         gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2617 
2618         if (gp == NULL)
2619                 return;
2620         mutex_enter(&gp->gp_mutex);
2621 
2622         CHECK_SLEEPING_LOCKS(gp);
2623         CHECK_ACTIVE_LOCKS(gp);
2624 
2625         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2626 
2627         if (lock) {
2628                 do {
2629                         nlock = lock->l_next;
2630                         if ((lock->l_flock.l_pid == pid ||
2631                                         pid == IGN_PID) &&
2632                                 lock->l_flock.l_sysid == sysid) {
2633                                 CANCEL_WAKEUP(lock);
2634                         }
2635                         lock = nlock;
2636                 } while (lock->l_vnode == vp);
2637         }
2638 
2639         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2640 
2641         if (lock) {
2642                 do {
2643                         nlock = lock->l_next;
2644                         if ((lock->l_flock.l_pid == pid ||
2645                                         pid == IGN_PID) &&
2646                                 lock->l_flock.l_sysid == sysid) {
2647                                 flk_delete_active_lock(lock, 0);
2648                                 STACK_PUSH(link_stack, lock, l_stack);
2649                         }
2650                         lock = nlock;
2651                 } while (lock->l_vnode == vp);
2652         }
2653 
2654         while ((lock = STACK_TOP(link_stack)) != NULL) {
2655                 STACK_POP(link_stack, l_stack);
2656                 flk_wakeup(lock, 1);
2657                 flk_free_lock(lock);
2658         }
2659 
2660         CHECK_SLEEPING_LOCKS(gp);
2661         CHECK_ACTIVE_LOCKS(gp);
2662         CHECK_OWNER_LOCKS(gp, pid, sysid, vp);
2663         mutex_exit(&gp->gp_mutex);
2664 }
2665 
2666 
2667 /*
2668  * Called from 'fs' read and write routines for files that have mandatory
2669  * locking enabled.
2670  */
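/*
 * An illustrative sketch (not lifted from any particular file system) of
 * how a read path with mandatory locking enabled might call chklock();
 * 'va' and 'uiop' stand in for the caller's vattr and uio:
 *
 *	if (MANDLOCK(vp, va.va_mode)) {
 *		error = chklock(vp, FREAD, uiop->uio_loffset,
 *		    uiop->uio_resid, uiop->uio_fmode, ct);
 *		if (error != 0)
 *			return (error);
 *	}
 */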
2671 
2672 int
2673 chklock(
2674         struct vnode    *vp,
2675         int             iomode,
2676         u_offset_t      offset,
2677         ssize_t         len,
2678         int             fmode,
2679         caller_context_t *ct)
2680 {
2681         register int    i;
2682         struct flock64  bf;
2683         int             error = 0;
2684 
2685         bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK;
2686         bf.l_whence = 0;
2687         bf.l_start = offset;
2688         bf.l_len = len;
2689         if (ct == NULL) {
2690                 bf.l_pid = curproc->p_pid;
2691                 bf.l_sysid = 0;
2692         } else {
2693                 bf.l_pid = ct->cc_pid;
2694                 bf.l_sysid = ct->cc_sysid;
2695         }
2696         i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK;
2697         if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 ||
2698             bf.l_type != F_UNLCK)
2699                 error = i ? i : EAGAIN;
2700         return (error);
2701 }
2702 
/*
 * convoff - convert the given lock data (l_start, l_whence) so that it is
 * expressed relative to the given 'whence'.
 */
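/*
 * For example, lock data passed in with l_whence == 1 (SEEK_CUR) and
 * l_start == 100, converted with whence == 0 at file offset 500, comes
 * back with l_whence == 0 and l_start == 600.
 */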
2707 int
convoff(struct vnode *vp, struct flock64 *lckdat, int whence, offset_t offset)
{
2714         int             error;
2715         struct vattr    vattr;
2716 
2717         if ((lckdat->l_whence == 2) || (whence == 2)) {
2718                 vattr.va_mask = AT_SIZE;
2719                 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
2720                         return (error);
2721         }
2722 
2723         switch (lckdat->l_whence) {
2724         case 1:
2725                 lckdat->l_start += offset;
2726                 break;
2727         case 2:
2728                 lckdat->l_start += vattr.va_size;
2729                 /* FALLTHRU */
2730         case 0:
2731                 break;
2732         default:
2733                 return (EINVAL);
2734         }
2735 
2736         if (lckdat->l_start < 0)
2737                 return (EINVAL);
2738 
2739         switch (whence) {
2740         case 1:
2741                 lckdat->l_start -= offset;
2742                 break;
2743         case 2:
2744                 lckdat->l_start -= vattr.va_size;
2745                 /* FALLTHRU */
2746         case 0:
2747                 break;
2748         default:
2749                 return (EINVAL);
2750         }
2751 
2752         lckdat->l_whence = (short)whence;
2753         return (0);
2754 }
2755 
2756 
2757 /*      proc_graph function definitions */
2758 
/*
 * Check for a deadlock caused by the new 'lock'.  If a deadlock is found,
 * the edges of this lock are freed and 1 is returned.
 */
2763 
2764 static int
2765 flk_check_deadlock(lock_descriptor_t *lock)
2766 {
2767         proc_vertex_t   *start_vertex, *pvertex;
2768         proc_vertex_t *dvertex;
2769         proc_edge_t *pep, *ppep;
2770         edge_t  *ep, *nep;
2771         proc_vertex_t *process_stack;
2772 
2773         STACK_INIT(process_stack);
2774 
2775         mutex_enter(&flock_lock);
2776         start_vertex = flk_get_proc_vertex(lock);
2777         ASSERT(start_vertex != NULL);
2778 
2779         /* construct the edges from this process to other processes */
2780 
2781         ep = FIRST_ADJ(lock);
2782         while (ep != HEAD(lock)) {
2783                 proc_vertex_t *adj_proc;
2784 
2785                 adj_proc = flk_get_proc_vertex(ep->to_vertex);
2786                 for (pep = start_vertex->edge; pep != NULL; pep = pep->next) {
2787                         if (pep->to_proc == adj_proc) {
2788                                 ASSERT(pep->refcount);
2789                                 pep->refcount++;
2790                                 break;
2791                         }
2792                 }
2793                 if (pep == NULL) {
2794                         pep = flk_get_proc_edge();
2795                         pep->to_proc = adj_proc;
2796                         pep->refcount = 1;
2797                         adj_proc->incount++;
2798                         pep->next = start_vertex->edge;
2799                         start_vertex->edge = pep;
2800                 }
2801                 ep = NEXT_ADJ(ep);
2802         }
2803 
2804         ep = FIRST_IN(lock);
2805 
2806         while (ep != HEAD(lock)) {
2807                 proc_vertex_t *in_proc;
2808 
2809                 in_proc = flk_get_proc_vertex(ep->from_vertex);
2810 
2811                 for (pep = in_proc->edge; pep != NULL; pep = pep->next) {
2812                         if (pep->to_proc == start_vertex) {
2813                                 ASSERT(pep->refcount);
2814                                 pep->refcount++;
2815                                 break;
2816                         }
2817                 }
2818                 if (pep == NULL) {
2819                         pep = flk_get_proc_edge();
2820                         pep->to_proc = start_vertex;
2821                         pep->refcount = 1;
2822                         start_vertex->incount++;
2823                         pep->next = in_proc->edge;
2824                         in_proc->edge = pep;
2825                 }
2826                 ep = NEXT_IN(ep);
2827         }
2828 
2829         if (start_vertex->incount == 0) {
2830                 mutex_exit(&flock_lock);
2831                 return (0);
2832         }
2833 
2834         flk_proc_graph_uncolor();
2835 
2836         start_vertex->p_sedge = start_vertex->edge;
2837 
2838         STACK_PUSH(process_stack, start_vertex, p_stack);
2839 
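        /*
         * Depth-first search of the process graph starting at
         * start_vertex.  A vertex that has "arrived" but not yet
         * "departed" is still being expanded; an edge leading back to
         * such a vertex closes a cycle, i.e. a deadlock.
         */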
2840         while ((pvertex = STACK_TOP(process_stack)) != NULL) {
2841                 for (pep = pvertex->p_sedge; pep != NULL; pep = pep->next) {
2842                         dvertex = pep->to_proc;
2843                         if (!PROC_ARRIVED(dvertex)) {
2844                                 STACK_PUSH(process_stack, dvertex, p_stack);
2845                                 dvertex->p_sedge = dvertex->edge;
2846                                 PROC_ARRIVE(pvertex);
2847                                 pvertex->p_sedge = pep->next;
2848                                 break;
2849                         }
2850                         if (!PROC_DEPARTED(dvertex))
2851                                 goto deadlock;
2852                 }
2853                 if (pep == NULL) {
2854                         PROC_DEPART(pvertex);
2855                         STACK_POP(process_stack, p_stack);
2856                 }
2857         }
2858         mutex_exit(&flock_lock);
2859         return (0);
2860 
2861 deadlock:
2862 
2863         /* we remove all lock edges and proc edges */
2864 
2865         ep = FIRST_ADJ(lock);
2866         while (ep != HEAD(lock)) {
2867                 proc_vertex_t *adj_proc;
2868                 adj_proc = flk_get_proc_vertex(ep->to_vertex);
2869                 nep = NEXT_ADJ(ep);
2870                 IN_LIST_REMOVE(ep);
2871                 ADJ_LIST_REMOVE(ep);
2872                 flk_free_edge(ep);
2873                 ppep = start_vertex->edge;
2874                 for (pep = start_vertex->edge; pep != NULL; ppep = pep,
2875                                                 pep = ppep->next) {
2876                         if (pep->to_proc == adj_proc) {
2877                                 pep->refcount--;
2878                                 if (pep->refcount == 0) {
2879                                         if (pep == ppep) {
2880                                                 start_vertex->edge = pep->next;
2881                                         } else {
2882                                                 ppep->next = pep->next;
2883                                         }
2884                                         adj_proc->incount--;
2885                                         flk_proc_release(adj_proc);
2886                                         flk_free_proc_edge(pep);
2887                                 }
2888                                 break;
2889                         }
2890                 }
2891                 ep = nep;
2892         }
2893         ep = FIRST_IN(lock);
2894         while (ep != HEAD(lock)) {
2895                 proc_vertex_t *in_proc;
2896                 in_proc = flk_get_proc_vertex(ep->from_vertex);
2897                 nep = NEXT_IN(ep);
2898                 IN_LIST_REMOVE(ep);
2899                 ADJ_LIST_REMOVE(ep);
2900                 flk_free_edge(ep);
2901                 ppep = in_proc->edge;
2902                 for (pep = in_proc->edge; pep != NULL; ppep = pep,
2903                                                 pep = ppep->next) {
2904                         if (pep->to_proc == start_vertex) {
2905                                 pep->refcount--;
2906                                 if (pep->refcount == 0) {
2907                                         if (pep == ppep) {
2908                                                 in_proc->edge = pep->next;
2909                                         } else {
2910                                                 ppep->next = pep->next;
2911                                         }
2912                                         start_vertex->incount--;
2913                                         flk_proc_release(in_proc);
2914                                         flk_free_proc_edge(pep);
2915                                 }
2916                                 break;
2917                         }
2918                 }
2919                 ep = nep;
2920         }
2921         flk_proc_release(start_vertex);
2922         mutex_exit(&flock_lock);
2923         return (1);
2924 }
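
/*
 * Illustrative sketch (not part of the implementation): the loop above is an
 * iterative depth-first search over the process graph.  Each vertex carries
 * "arrived" and "departed" marks for the current pass; reaching a vertex that
 * has arrived but not yet departed means a back edge was followed, i.e. there
 * is a cycle through start_vertex and the request would deadlock.  In
 * pseudo-C, with hypothetical helpers push()/pop()/top() and
 * arrived()/departed()/mark_arrived()/mark_departed():
 *
 *	push(start_vertex);
 *	while ((v = top()) != NULL) {
 *		for (e = v->p_sedge; e != NULL; e = e->next) {
 *			if (!arrived(e->to_proc)) {
 *				mark_arrived(v);
 *				v->p_sedge = e->next;
 *				e->to_proc->p_sedge = e->to_proc->edge;
 *				push(e->to_proc);
 *				break;
 *			}
 *			if (!departed(e->to_proc))
 *				return (1);	-- back edge: deadlock
 *		}
 *		if (e == NULL) {
 *			mark_departed(v);
 *			pop();
 *		}
 *	}
 *	return (0);	-- no cycle reachable from start_vertex
 */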
2925 
2926 /*
2927  * Get a proc vertex. If the lock's pvertex value indexes a matching proc
2928  * vertex in the list, return it; otherwise search the list and, failing
2929  * that, allocate a new vertex, growing the list of vertices if necessary.
2930  */
2931 
2932 static proc_vertex_t *
2933 flk_get_proc_vertex(lock_descriptor_t *lock)
2934 {
2935         int i;
2936         proc_vertex_t   *pv;
2937         proc_vertex_t   **palloc;
2938 
2939         ASSERT(MUTEX_HELD(&flock_lock));
2940         if (lock->pvertex != -1) {
2941                 ASSERT(lock->pvertex >= 0);
2942                 pv = pgraph.proc[lock->pvertex];
2943                 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
2944                         return (pv);
2945                 }
2946         }
2947         for (i = 0; i < pgraph.gcount; i++) {
2948                 pv = pgraph.proc[i];
2949                 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
2950                         lock->pvertex = pv->index = i;
2951                         return (pv);
2952                 }
2953         }
2954         pv = kmem_zalloc(sizeof (struct proc_vertex), KM_SLEEP);
2955         pv->pid = lock->l_flock.l_pid;
2956         pv->sysid = lock->l_flock.l_sysid;
2957         flk_proc_vertex_allocs++;
2958         if (pgraph.free != 0) {
2959                 for (i = 0; i < pgraph.gcount; i++) {
2960                         if (pgraph.proc[i] == NULL) {
2961                                 pgraph.proc[i] = pv;
2962                                 lock->pvertex = pv->index = i;
2963                                 pgraph.free--;
2964                                 return (pv);
2965                         }
2966                 }
2967         }
2968         palloc = kmem_zalloc((pgraph.gcount + PROC_CHUNK) *
2969                                 sizeof (proc_vertex_t *), KM_SLEEP);
2970 
2971         if (pgraph.proc) {
2972                 bcopy(pgraph.proc, palloc,
2973                         pgraph.gcount * sizeof (proc_vertex_t *));
2974 
2975                 kmem_free(pgraph.proc,
2976                         pgraph.gcount * sizeof (proc_vertex_t *));
2977         }
2978         pgraph.proc = palloc;
2979         pgraph.free += (PROC_CHUNK - 1);
2980         pv->index = lock->pvertex = pgraph.gcount;
2981         pgraph.gcount += PROC_CHUNK;
2982         pgraph.proc[pv->index] = pv;
2983         return (pv);
2984 }
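
/*
 * Illustrative note: the vertex array grows in PROC_CHUNK-sized increments
 * and pgraph.free counts the empty slots.  For example, starting from an
 * empty graph (gcount == 0), the first call above allocates PROC_CHUNK
 * slots, installs the new vertex in slot 0 and leaves free == PROC_CHUNK - 1;
 * later calls reuse free slots before another chunk is added, and
 * flk_proc_release() below returns a slot by clearing its entry and
 * incrementing pgraph.free.
 */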
2985 
2986 /*
2987  * Allocate a proc edge.
2988  */
2989 
2990 static proc_edge_t *
2991 flk_get_proc_edge()
2992 {
2993         proc_edge_t *pep;
2994 
2995         pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP);
2996         flk_proc_edge_allocs++;
2997         return (pep);
2998 }
2999 
3000 /*
3001  * Free the proc edge. Called whenever its reference count goes to zero.
3002  */
3003 
3004 static void
3005 flk_free_proc_edge(proc_edge_t *pep)
3006 {
3007         ASSERT(pep->refcount == 0);
3008         kmem_free((void *)pep, sizeof (proc_edge_t));
3009         flk_proc_edge_frees++;
3010 }
3011 
3012 /*
3013  * Uncolor the graph; an explicit reset is done only when the mark value wraps.
3014  */
3015 
3016 static void
3017 flk_proc_graph_uncolor()
3018 {
3019         int i;
3020 
3021         if (pgraph.mark == UINT_MAX) {
3022                 for (i = 0; i < pgraph.gcount; i++)
3023                         if (pgraph.proc[i] != NULL) {
3024                                 pgraph.proc[i]->atime = 0;
3025                                 pgraph.proc[i]->dtime = 0;
3026                         }
3027                 pgraph.mark = 1;
3028         } else {
3029                 pgraph.mark++;
3030         }
3031 }
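
/*
 * Illustrative sketch (not part of the implementation): pgraph.mark acts as
 * a generation counter, so "uncoloring" every vertex normally costs O(1).
 * The PROC_ARRIVE()/PROC_ARRIVED() style macros used in the deadlock check
 * above presumably compare the per-vertex timestamps against the current
 * mark, roughly:
 *
 *	mark_arrived(pv)	--  pv->atime = pgraph.mark
 *	arrived(pv)		--  pv->atime == pgraph.mark
 *
 * Bumping pgraph.mark therefore invalidates all old marks at once; the full
 * sweep that clears atime and dtime is needed only when the counter is about
 * to wrap.
 */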
3032 
3033 /*
3034  * Release the proc vertex iff it has neither in edges nor out edges.
3035  */
3036 
3037 static void
3038 flk_proc_release(proc_vertex_t *proc)
3039 {
3040         ASSERT(MUTEX_HELD(&flock_lock));
3041         if (proc->edge == NULL && proc->incount == 0) {
3042                 pgraph.proc[proc->index] = NULL;
3043                 pgraph.free++;
3044                 kmem_free(proc, sizeof (proc_vertex_t));
3045                 flk_proc_vertex_frees++;
3046         }
3047 }
3048 
3049 /*
3050  * Update the process graph to reflect a change in a lock graph.
3051  * Note: We should call this function only after we have a correctly
3052  * recomputed lock graph; otherwise we might miss a deadlock detection.
3053  * E.g., flk_relation() calls this function after
3054  * flk_recompute_dependencies(); otherwise, if a process tries to lock a
3055  * vnode hashed into another graph, it might sleep forever.
3056  */
3057 
3058 static void
3059 flk_update_proc_graph(edge_t *ep, int delete)
3060 {
3061         proc_vertex_t *toproc, *fromproc;
3062         proc_edge_t *pep, *prevpep;
3063 
3064         mutex_enter(&flock_lock);
3065         toproc = flk_get_proc_vertex(ep->to_vertex);
3066         fromproc = flk_get_proc_vertex(ep->from_vertex);
3067 
3068         if (!delete)
3069                 goto add;
3070         pep = prevpep = fromproc->edge;
3071 
3072         ASSERT(pep != NULL);
3073         while (pep != NULL) {
3074                 if (pep->to_proc == toproc) {
3075                         ASSERT(pep->refcount > 0);
3076                         pep->refcount--;
3077                         if (pep->refcount == 0) {
3078                                 if (pep == prevpep) {
3079                                         fromproc->edge = pep->next;
3080                                 } else {
3081                                         prevpep->next = pep->next;
3082                                 }
3083                                 toproc->incount--;
3084                                 flk_proc_release(toproc);
3085                                 flk_free_proc_edge(pep);
3086                         }
3087                         break;
3088                 }
3089                 prevpep = pep;
3090                 pep = pep->next;
3091         }
3092         flk_proc_release(fromproc);
3093         mutex_exit(&flock_lock);
3094         return;
3095 add:
3096 
3097         pep = fromproc->edge;
3098 
3099         while (pep != NULL) {
3100                 if (pep->to_proc == toproc) {
3101                         ASSERT(pep->refcount > 0);
3102                         pep->refcount++;
3103                         break;
3104                 }
3105                 pep = pep->next;
3106         }
3107         if (pep == NULL) {
3108                 pep = flk_get_proc_edge();
3109                 pep->to_proc = toproc;
3110                 pep->refcount = 1;
3111                 toproc->incount++;
3112                 pep->next = fromproc->edge;
3113                 fromproc->edge = pep;
3114         }
3115         mutex_exit(&flock_lock);
3116 }
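
/*
 * Illustrative note: multiple lock-graph edges between the same pair of
 * processes collapse into a single proc edge whose refcount records how many
 * lock edges it represents.  For example, if process A already blocks on two
 * of process B's locks, A's vertex has one edge to B's vertex with
 * refcount == 2; adding a third blocking relation merely increments the
 * count, deleting lock edges decrements it, and only when the count reaches
 * zero is the proc edge freed and the vertices possibly released.
 */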
3117 
3118 /*
3119  * Set the control status for lock manager requests.
3120  */
3122 
3123 /*
3124  * PSARC case 1997/292
3125  *
3126  * Requires: "nlmid" must be >= 1 and <= clconf_maximum_nodeid().
3127  * Effects: Set the state of the NLM server identified by "nlmid"
3128  *   in the NLM registry to state "nlm_state."
3129  *   Raises exception no_such_nlm if "nlmid" doesn't identify a known
3130  *   NLM server to this LLM.
3131  *   Note that when this routine is called with NLM_SHUTTING_DOWN there
3132  *   may be lock requests that have gotten started but not finished.  In
3133  *   particular, there may be blocking requests that are in the callback code
3134  *   before sleeping (so they're not holding the lock for the graph).  If
3135  *   such a thread reacquires the graph's lock (to go to sleep) after
3136  *   the NLM state in the NLM registry is set to a non-up value,
3137  *   it will notice the status and bail out.  If the request gets
3138  *   granted before the thread can check the NLM registry, let it
3139  *   continue normally.  It will get flushed when we are called with NLM_DOWN.
3140  *
3141  * Modifies: nlm_reg_obj (global)
3142  * Arguments:
3143  *    nlmid     (IN):    id uniquely identifying an NLM server
3144  *    nlm_state (IN):    NLM server state to change "nlmid" to
3145  */
3146 void
3147 cl_flk_set_nlm_status(int nlmid, flk_nlm_status_t nlm_state)
3148 {
3149         /*
3150          * Check to see if node is booted as a cluster. If not, return.
3151          */
3152         if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3153                 return;
3154         }
3155 
3156         /*
3157          * Check for development/debugging.  It is possible to boot a node
3158          * in non-cluster mode, and then run a special script, currently
3159          * available only to developers, to bring up the node as part of a
3160          * cluster.  The problem is that running such a script does not
3161          * result in the routine flk_init() being called and hence global array
3162          * nlm_reg_status is NULL.  The NLM thinks it's in cluster mode,
3163          * but the LLM needs to do an additional check to see if the global
3164          * array has been created or not. If nlm_reg_status is NULL, then
3165          * return, else continue.
3166          */
3167         if (nlm_reg_status == NULL) {
3168                 return;
3169         }
3170 
3171         ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3172         mutex_enter(&nlm_reg_lock);
3173 
3174         if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status, nlmid)) {
3175                 /*
3176                  * If the NLM server "nlmid" is unknown in the NLM registry,
3177                  * add it to the registry in the nlm shutting down state.
3178                  */
3179                 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3180                         FLK_NLM_SHUTTING_DOWN);
3181         } else {
3182                 /*
3183                  * Change the state of the NLM server identified by "nlmid"
3184                  * in the NLM registry to the argument "nlm_state."
3185                  */
3186                 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3187                         nlm_state);
3188         }
3189 
3190         /*
3191          *  The reason we must register the NLM server that is shutting down
3192          *  with an LLM that doesn't already know about it (never sent a lock
3193          *  request) is to handle correctly a race between shutdown and a new
3194          *  lock request.  Suppose that a shutdown request from the NLM server
3195          *  invokes this routine at the LLM, and a thread is spawned to
3196          *  service the request. Now suppose a new lock request is in
3197          *  progress and has already passed the first line of defense in
3198  *  reclock(), which denies new lock requests from NLM servers
3199          *  that are not in the NLM_UP state.  After the current routine
3200          *  is invoked for both phases of shutdown, the routine will return,
3201          *  having done nothing, and the lock request will proceed and
3202          *  probably be granted.  The problem is that the shutdown was ignored
3203          *  by the lock request because there was no record of that NLM server
3204  *  shutting down.  We will be in the peculiar position of thinking
3205  *  that we've shut down the NLM server and all locks at all LLMs have
3206  *  been discarded, but in fact there's still one lock held.
3207  *  The solution is to record the existence of the NLM server and change
3208          *  its state immediately to NLM_SHUTTING_DOWN.  The lock request in
3209          *  progress may proceed because the next phase NLM_DOWN will catch
3210          *  this lock and discard it.
3211          */
3212         mutex_exit(&nlm_reg_lock);
3213 
3214         switch (nlm_state) {
3215         case FLK_NLM_UP:
3216                 /*
3217                  * Change the NLM state of all locks still held on behalf of
3218                  * the NLM server identified by "nlmid" to NLM_UP.
3219                  */
3220                 cl_flk_change_nlm_state_all_locks(nlmid, FLK_NLM_UP);
3221                 break;
3222 
3223         case FLK_NLM_SHUTTING_DOWN:
3224                 /*
3225                  * Wake up all sleeping locks for the NLM server identified
3226                  * by "nlmid." Note that eventually all woken threads will
3227                  * have their lock requests cancelled and descriptors
3228                  * removed from the sleeping lock list.  Note that the NLM
3229                  * server state associated with each lock descriptor is
3230                  * changed to FLK_NLM_SHUTTING_DOWN.
3231                  */
3232                 cl_flk_wakeup_sleeping_nlm_locks(nlmid);
3233                 break;
3234 
3235         case FLK_NLM_DOWN:
3236                 /*
3237                  * Discard all active, granted locks for this NLM server
3238                  * identified by "nlmid."
3239                  */
3240                 cl_flk_unlock_nlm_granted(nlmid);
3241                 break;
3242 
3243         default:
3244                 panic("cl_set_nlm_status: bad status (%d)", nlm_state);
3245         }
3246 }
3247 
3248 /*
3249  * Set the control status for lock manager requests.
3250  *
3251  * Note that when this routine is called with FLK_WAKEUP_SLEEPERS, there
3252  * may be lock requests that have gotten started but not finished.  In
3253  * particular, there may be blocking requests that are in the callback code
3254  * before sleeping (so they're not holding the lock for the graph).  If
3255  * such a thread reacquires the graph's lock (to go to sleep) after
3256  * flk_lockmgr_status is set to a non-up value, it will notice the status
3257  * and bail out.  If the request gets granted before the thread can check
3258  * flk_lockmgr_status, let it continue normally.  It will get flushed when
3259  * we are called with FLK_LOCKMGR_DOWN.
3260  */
3261 
3262 void
3263 flk_set_lockmgr_status(flk_lockmgr_status_t status)
3264 {
3265         int i;
3266         graph_t *gp;
3267         struct flock_globals *fg;
3268 
3269         fg = flk_get_globals();
3270         ASSERT(fg != NULL);
3271 
3272         mutex_enter(&flock_lock);
3273         fg->flk_lockmgr_status = status;
3274         mutex_exit(&flock_lock);
3275 
3276         /*
3277          * If the lock manager is coming back up, all that's needed is to
3278          * propagate this information to the graphs.  If the lock manager
3279          * is going down, additional action is required, and each graph's
3280          * copy of the state is updated atomically with this other action.
3281          */
3282         switch (status) {
3283         case FLK_LOCKMGR_UP:
3284                 for (i = 0; i < HASH_SIZE; i++) {
3285                         mutex_enter(&flock_lock);
3286                         gp = lock_graph[i];
3287                         mutex_exit(&flock_lock);
3288                         if (gp == NULL)
3289                                 continue;
3290                         mutex_enter(&gp->gp_mutex);
3291                         fg->lockmgr_status[i] = status;
3292                         mutex_exit(&gp->gp_mutex);
3293                 }
3294                 break;
3295         case FLK_WAKEUP_SLEEPERS:
3296                 wakeup_sleeping_lockmgr_locks(fg);
3297                 break;
3298         case FLK_LOCKMGR_DOWN:
3299                 unlock_lockmgr_granted(fg);
3300                 break;
3301         default:
3302                 panic("flk_set_lockmgr_status: bad status (%d)", status);
3303                 break;
3304         }
3305 }
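
/*
 * Illustrative call sequence (an assumption about the lock manager shutdown
 * path, based on the comment above): the status values are applied in two
 * phases, first waking sleepers and then flushing granted locks, e.g.:
 *
 *	flk_set_lockmgr_status(FLK_WAKEUP_SLEEPERS);
 *		-- blocked lockmgr requests notice the non-up status and bail
 *	flk_set_lockmgr_status(FLK_LOCKMGR_DOWN);
 *		-- remaining granted lockmgr locks are discarded
 *	flk_set_lockmgr_status(FLK_LOCKMGR_UP);
 *		-- on restart, propagate the up state to every graph
 */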
3306 
3307 /*
3308  * This routine returns all the locks that are active or sleeping and are
3309  * associated with a particular set of identifiers.  If lock_state != 0, then
3310  * only locks that match the lock_state are returned. If lock_state == 0, then
3311  * all locks are returned. If pid == NOPID, the pid is ignored.  If
3312  * use_sysid is FALSE, then the sysid is ignored.  If vp is NULL, then the
3313  * vnode pointer is ignored.
3314  *
3315  * A list containing the vnode pointer and an flock structure
3316  * describing the lock is returned.  Each element in the list is
3317  * dynamically allocated and must be freed by the caller.  The
3318  * last item in the list is denoted by a NULL value in the ll_next
3319  * field.
3320  *
3321  * The vnode pointers returned are held.  The caller is responsible
3322  * for releasing these.  Note that the returned list is only a snapshot of
3323  * the current lock information, and that it is a snapshot of a moving
3324  * target (only one graph is locked at a time).
3325  */
3326 
3327 locklist_t *
3328 get_lock_list(int list_type, int lock_state, int sysid, boolean_t use_sysid,
3329                 pid_t pid, const vnode_t *vp, zoneid_t zoneid)
3330 {
3331         lock_descriptor_t       *lock;
3332         lock_descriptor_t       *graph_head;
3333         locklist_t              listhead;
3334         locklist_t              *llheadp;
3335         locklist_t              *llp;
3336         locklist_t              *lltp;
3337         graph_t                 *gp;
3338         int                     i;
3339         int                     first_index; /* graph index */
3340         int                     num_indexes; /* graph index */
3341 
3342         ASSERT((list_type == FLK_ACTIVE_STATE) ||
3343             (list_type == FLK_SLEEPING_STATE));
3344 
3345         /*
3346          * Get a pointer to something to use as a list head while building
3347          * the rest of the list.
3348          */
3349         llheadp = &listhead;
3350         lltp = llheadp;
3351         llheadp->ll_next = (locklist_t *)NULL;
3352 
3353         /* Figure out which graphs we want to look at. */
3354         if (vp == NULL) {
3355                 first_index = 0;
3356                 num_indexes = HASH_SIZE;
3357         } else {
3358                 first_index = HASH_INDEX(vp);
3359                 num_indexes = 1;
3360         }
3361 
3362         for (i = first_index; i < first_index + num_indexes; i++) {
3363                 mutex_enter(&flock_lock);
3364                 gp = lock_graph[i];
3365                 mutex_exit(&flock_lock);
3366                 if (gp == NULL) {
3367                         continue;
3368                 }
3369 
3370                 mutex_enter(&gp->gp_mutex);
3371                 graph_head = (list_type == FLK_ACTIVE_STATE) ?
3372                         ACTIVE_HEAD(gp) : SLEEPING_HEAD(gp);
3373                 for (lock = graph_head->l_next;
3374                     lock != graph_head;
3375                     lock = lock->l_next) {
3376                         if (use_sysid && lock->l_flock.l_sysid != sysid)
3377                                 continue;
3378                         if (pid != NOPID && lock->l_flock.l_pid != pid)
3379                                 continue;
3380                         if (vp != NULL && lock->l_vnode != vp)
3381                                 continue;
3382                         if (lock_state && !(lock_state & lock->l_state))
3383                                 continue;
3384                         if (zoneid != lock->l_zoneid && zoneid != ALL_ZONES)
3385                                 continue;
3386                         /*
3387                          * A matching lock was found.  Allocate
3388                          * space for a new locklist entry and fill
3389                          * it in.
3390                          */
3391                         llp = kmem_alloc(sizeof (locklist_t), KM_SLEEP);
3392                         lltp->ll_next = llp;
3393                         VN_HOLD(lock->l_vnode);
3394                         llp->ll_vp = lock->l_vnode;
3395                         create_flock(lock, &(llp->ll_flock));
3396                         llp->ll_next = (locklist_t *)NULL;
3397                         lltp = llp;
3398                 }
3399                 mutex_exit(&gp->gp_mutex);
3400         }
3401 
3402         llp = llheadp->ll_next;
3403         return (llp);
3404 }
3405 
3406 /*
3407  * These two functions are simply interfaces to get_lock_list.  They return
3408  * a list of sleeping or active locks for the given sysid and pid.  See
3409  * get_lock_list for details.
3410  *
3411  * In either case we don't particularly care to specify the zone of interest;
3412  * the sysid-space is global across zones, so the sysid will map to exactly one
3413  * zone, and we'll return information for that zone.
3414  */
3415 
3416 locklist_t *
3417 flk_get_sleeping_locks(int sysid, pid_t pid)
3418 {
3419         return (get_lock_list(FLK_SLEEPING_STATE, 0, sysid, B_TRUE, pid, NULL,
3420                     ALL_ZONES));
3421 }
3422 
3423 locklist_t *
3424 flk_get_active_locks(int sysid, pid_t pid)
3425 {
3426         return (get_lock_list(FLK_ACTIVE_STATE, 0, sysid, B_TRUE, pid, NULL,
3427                     ALL_ZONES));
3428 }
3429 
3430 /*
3431  * Another interface to get_lock_list.  This one returns all the active
3432  * locks for a given vnode.  Again, see get_lock_list for details.
3433  *
3434  * We don't need to specify which zone's locks we're interested in.  The matter
3435  * would only be interesting if the vnode belonged to NFS, and NFS vnodes can't
3436  * be used by multiple zones, so the list of locks will all be from the right
3437  * zone.
3438  */
3439 
3440 locklist_t *
3441 flk_active_locks_for_vp(const vnode_t *vp)
3442 {
3443         return (get_lock_list(FLK_ACTIVE_STATE, 0, 0, B_FALSE, NOPID, vp,
3444                     ALL_ZONES));
3445 }
3446 
3447 /*
3448  * Another interface to get_lock_list.  This one returns all the active
3449  * nbmand locks for a given vnode.  Again, see get_lock_list for details.
3450  *
3451  * See the comment for flk_active_locks_for_vp() for why we don't care to
3452  * specify the particular zone of interest.
3453  */
3454 locklist_t *
3455 flk_active_nbmand_locks_for_vp(const vnode_t *vp)
3456 {
3457         return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3458                                 NOPID, vp, ALL_ZONES));
3459 }
3460 
3461 /*
3462  * Another interface to get_lock_list.  This one returns all the active
3463  * nbmand locks for a given pid.  Again, see get_lock_list for details.
3464  *
3465  * The zone doesn't need to be specified here; the locks held by a
3466  * particular process will either be local (ie, non-NFS) or from the zone
3467  * the process is executing in.  This is because other parts of the system
3468  * ensure that an NFS vnode can't be used in a zone other than that in
3469  * which it was opened.
3470  */
3471 locklist_t *
3472 flk_active_nbmand_locks(pid_t pid)
3473 {
3474         return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3475                                 pid, NULL, ALL_ZONES));
3476 }
3477 
3478 /*
3479  * Free up all entries in the locklist.
3480  */
3481 void
3482 flk_free_locklist(locklist_t *llp)
3483 {
3484         locklist_t *next_llp;
3485 
3486         while (llp) {
3487                 next_llp = llp->ll_next;
3488                 VN_RELE(llp->ll_vp);
3489                 kmem_free(llp, sizeof (*llp));
3490                 llp = next_llp;
3491         }
3492 }
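
/*
 * Illustrative usage sketch (not part of the implementation): a caller of
 * the get_lock_list() wrappers above walks the returned snapshot and then
 * releases it with flk_free_locklist(), which VN_RELE()s each held vnode and
 * frees every entry, e.g.:
 *
 *	locklist_t *ll, *llp;
 *
 *	ll = flk_get_active_locks(sysid, pid);
 *	for (llp = ll; llp != NULL; llp = llp->ll_next) {
 *		-- examine llp->ll_vp and llp->ll_flock here
 *	}
 *	flk_free_locklist(ll);
 */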
3493 
3494 static void
3495 cl_flk_change_nlm_state_all_locks(int nlmid, flk_nlm_status_t nlm_state)
3496 {
3497         /*
3498          * For each graph "lg" in the hash table lock_graph do
3499          * a.  Get the list of sleeping locks
3500          * b.  For each lock descriptor in the list do
3501          *      i.   If the requested lock is an NLM server request AND
3502          *              the nlmid is the same as the routine argument then
3503          *              change the lock descriptor's state field to
3504          *              "nlm_state."
3505          * c.  Get the list of active locks
3506          * d.  For each lock descriptor in the list do
3507          *      i.   If the requested lock is an NLM server request AND
3508          *              the nlmid is the same as the routine argument then
3509          *              change the lock descriptor's state field to
3510          *              "nlm_state."
3511          */
3512 
3513         int                     i;
3514         graph_t                 *gp;                    /* lock graph */
3515         lock_descriptor_t       *lock;                  /* lock */
3516         lock_descriptor_t       *nlock = NULL;          /* next lock */
3517         int                     lock_nlmid;
3518 
3519         for (i = 0; i < HASH_SIZE; i++) {
3520                 mutex_enter(&flock_lock);
3521                 gp = lock_graph[i];
3522                 mutex_exit(&flock_lock);
3523                 if (gp == NULL) {
3524                         continue;
3525                 }
3526 
3527                 /* Get list of sleeping locks in current lock graph. */
3528                 mutex_enter(&gp->gp_mutex);
3529                 for (lock = SLEEPING_HEAD(gp)->l_next;
3530                     lock != SLEEPING_HEAD(gp);
3531                     lock = nlock) {
3532                         nlock = lock->l_next;
3533                         /* get NLM id */
3534                         lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3535 
3536                         /*
3537                          * If NLM server request AND nlmid of lock matches
3538                          * nlmid of argument, then set the NLM state of the
3539                          * lock to "nlm_state."
3540                          */
3541                         if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3542                                 SET_NLM_STATE(lock, nlm_state);
3543                         }
3544                 }
3545 
3546                 /* Get list of active locks in current lock graph. */
3547                 for (lock = ACTIVE_HEAD(gp)->l_next;
3548                     lock != ACTIVE_HEAD(gp);
3549                     lock = nlock) {
3550                         nlock = lock->l_next;
3551                         /* get NLM id */
3552                         lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3553 
3554                         /*
3555                          * If NLM server request AND nlmid of lock matches
3556                          * nlmid of argument, then set the NLM state of the
3557                          * lock to "nlm_state."
3558                          */
3559                         if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3560                                 ASSERT(IS_ACTIVE(lock));
3561                                 SET_NLM_STATE(lock, nlm_state);
3562                         }
3563                 }
3564                 mutex_exit(&gp->gp_mutex);
3565         }
3566 }
3567 
3568 /*
3569  * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid().
3570  * Effects: Find all sleeping lock manager requests _only_ for the NLM server
3571  *   identified by "nlmid." Poke those lock requests.
3572  */
3573 static void
3574 cl_flk_wakeup_sleeping_nlm_locks(int nlmid)
3575 {
3576         lock_descriptor_t *lock;
3577         lock_descriptor_t *nlock = NULL; /* next lock */
3578         int i;
3579         graph_t *gp;
3580         int     lock_nlmid;
3581 
3582         for (i = 0; i < HASH_SIZE; i++) {
3583                 mutex_enter(&flock_lock);
3584                 gp = lock_graph[i];
3585                 mutex_exit(&flock_lock);
3586                 if (gp == NULL) {
3587                         continue;
3588                 }
3589 
3590                 mutex_enter(&gp->gp_mutex);
3591                 for (lock = SLEEPING_HEAD(gp)->l_next;
3592                     lock != SLEEPING_HEAD(gp);
3593                     lock = nlock) {
3594                         nlock = lock->l_next;
3595                         /*
3596                          * If NLM server request _and_ nlmid of lock matches
3597                          * nlmid of argument, then set the NLM state of the
3598                          * lock to NLM_SHUTTING_DOWN, and wake up sleeping
3599                          * request.
3600                          */
3601                         if (IS_LOCKMGR(lock)) {
3602                                 /* get NLM id */
3603                                 lock_nlmid =
3604                                         GETNLMID(lock->l_flock.l_sysid);
3605                                 if (nlmid == lock_nlmid) {
3606                                         SET_NLM_STATE(lock,
3607                                                 FLK_NLM_SHUTTING_DOWN);
3608                                         INTERRUPT_WAKEUP(lock);
3609                                 }
3610                         }
3611                 }
3612                 mutex_exit(&gp->gp_mutex);
3613         }
3614 }
3615 
3616 /*
3617  * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid()
3618  * Effects:  Find all active (granted) lock manager locks _only_ for the
3619  *   NLM server identified by "nlmid" and release them.
3620  */
3621 static void
3622 cl_flk_unlock_nlm_granted(int nlmid)
3623 {
3624         lock_descriptor_t *lock;
3625         lock_descriptor_t *nlock = NULL; /* next lock */
3626         int i;
3627         graph_t *gp;
3628         int     lock_nlmid;
3629 
3630         for (i = 0; i < HASH_SIZE; i++) {
3631                 mutex_enter(&flock_lock);
3632                 gp = lock_graph[i];
3633                 mutex_exit(&flock_lock);
3634                 if (gp == NULL) {
3635                         continue;
3636                 }
3637 
3638                 mutex_enter(&gp->gp_mutex);
3639                 for (lock = ACTIVE_HEAD(gp)->l_next;
3640                     lock != ACTIVE_HEAD(gp);
3641                     lock = nlock) {
3642                         nlock = lock->l_next;
3643                         ASSERT(IS_ACTIVE(lock));
3644 
3645                         /*
3646                          * If it's an NLM server request _and_ the nlmid of
3647                          * the lock matches the nlmid argument, then
3648                          * remove the active lock from the list, wake up blocked
3649                          * threads, and free the storage for the lock.
3650                          * Note that there's no need to mark the NLM state
3651                          * of this lock to NLM_DOWN because the lock will
3652                          * be deleted anyway and its storage freed.
3653                          */
3654                         if (IS_LOCKMGR(lock)) {
3655                                 /* get NLM id */
3656                                 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3657                                 if (nlmid == lock_nlmid) {
3658                                         flk_delete_active_lock(lock, 0);
3659                                         flk_wakeup(lock, 1);
3660                                         flk_free_lock(lock);
3661                                 }
3662                         }
3663                 }
3664                 mutex_exit(&gp->gp_mutex);
3665         }
3666 }
3667 
3668 /*
3669  * Find all sleeping lock manager requests and poke them.
3670  */
3671 static void
3672 wakeup_sleeping_lockmgr_locks(struct flock_globals *fg)
3673 {
3674         lock_descriptor_t *lock;
3675         lock_descriptor_t *nlock = NULL; /* next lock */
3676         int i;
3677         graph_t *gp;
3678         zoneid_t zoneid = getzoneid();
3679 
3680         for (i = 0; i < HASH_SIZE; i++) {
3681                 mutex_enter(&flock_lock);
3682                 gp = lock_graph[i];
3683                 mutex_exit(&flock_lock);
3684                 if (gp == NULL) {
3685                         continue;
3686                 }
3687 
3688                 mutex_enter(&gp->gp_mutex);
3689                 fg->lockmgr_status[i] = FLK_WAKEUP_SLEEPERS;
3690                 for (lock = SLEEPING_HEAD(gp)->l_next;
3691                     lock != SLEEPING_HEAD(gp);
3692                     lock = nlock) {
3693                         nlock = lock->l_next;
3694                         if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
3695                                 INTERRUPT_WAKEUP(lock);
3696                         }
3697                 }
3698                 mutex_exit(&gp->gp_mutex);
3699         }
3700 }
3701 
3702 
3703 /*
3704  * Find all active (granted) lock manager locks and release them.
3705  */
3706 static void
3707 unlock_lockmgr_granted(struct flock_globals *fg)
3708 {
3709         lock_descriptor_t *lock;
3710         lock_descriptor_t *nlock = NULL; /* next lock */
3711         int i;
3712         graph_t *gp;
3713         zoneid_t zoneid = getzoneid();
3714 
3715         for (i = 0; i < HASH_SIZE; i++) {
3716                 mutex_enter(&flock_lock);
3717                 gp = lock_graph[i];
3718                 mutex_exit(&flock_lock);
3719                 if (gp == NULL) {
3720                         continue;
3721                 }
3722 
3723                 mutex_enter(&gp->gp_mutex);
3724                 fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN;
3725                 for (lock = ACTIVE_HEAD(gp)->l_next;
3726                     lock != ACTIVE_HEAD(gp);
3727                     lock = nlock) {
3728                         nlock = lock->l_next;
3729                         if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
3730                                 ASSERT(IS_ACTIVE(lock));
3731                                 flk_delete_active_lock(lock, 0);
3732                                 flk_wakeup(lock, 1);
3733                                 flk_free_lock(lock);
3734                         }
3735                 }
3736                 mutex_exit(&gp->gp_mutex);
3737         }
3738 }
3739 
3740 
3741 /*
3742  * Wait until a lock is granted, cancelled, or interrupted.
3743  */
3744 
3745 static void
3746 wait_for_lock(lock_descriptor_t *request)
3747 {
3748         graph_t *gp = request->l_graph;
3749 
3750         ASSERT(MUTEX_HELD(&gp->gp_mutex));
3751 
3752         while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) &&
3753             !(IS_INTERRUPTED(request))) {
3754                 if (!cv_wait_sig(&request->l_cv, &gp->gp_mutex)) {
3755                         flk_set_state(request, FLK_INTERRUPTED_STATE);
3756                         request->l_state |= INTERRUPTED_LOCK;
3757                 }
3758         }
3759 }
3760 
3761 /*
3762  * Create an flock structure from the existing lock information
3763  *
3764  * This routine is used to create flock structures for the lock manager
3765  * to use in a reclaim request.  Since the lock originated on this
3766  * host, it must conform to UNIX semantics, so no checking is
3767  * done to make sure it falls within the lower half of the 32-bit range.
3768  */
3769 
3770 static void
3771 create_flock(lock_descriptor_t *lp, flock64_t *flp)
3772 {
3773         ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND);
3774         ASSERT(lp->l_end >= lp->l_start);
3775 
3776         flp->l_type = lp->l_type;
3777         flp->l_whence = 0;
3778         flp->l_start = lp->l_start;
3779         flp->l_len = (lp->l_end == MAX_U_OFFSET_T) ? 0 :
3780                 (lp->l_end - lp->l_start + 1);
3781         flp->l_sysid = lp->l_flock.l_sysid;
3782         flp->l_pid = lp->l_flock.l_pid;
3783 }
3784 
3785 /*
3786  * Convert flock_t data describing a lock range into unsigned long starting
3787  * and ending points, which are put into lock_request.  Returns 0 or an
3788  * errno value.
3789  * Large Files: max is passed by the caller and we return EOVERFLOW
3790  * as defined by LFS API.
3791  */
3792 
3793 int
3794 flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
3795     u_offset_t *start, u_offset_t *end, offset_t offset)
3796 {
3797         struct vattr    vattr;
3798         int     error;
3799 
3800         /*
3801          * Determine the starting point of the request
3802          */
3803         switch (flp->l_whence) {
3804         case 0:         /* SEEK_SET */
3805                 *start = (u_offset_t)flp->l_start;
3806                 break;
3807         case 1:         /* SEEK_CUR */
3808                 *start = (u_offset_t)(flp->l_start + offset);
3809                 break;
3810         case 2:         /* SEEK_END */
3811                 vattr.va_mask = AT_SIZE;
3812                 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
3813                         return (error);
3814                 *start = (u_offset_t)(flp->l_start + vattr.va_size);
3815                 break;
3816         default:
3817                 return (EINVAL);
3818         }
3819 
3820         /*
3821          * Determine the range covered by the request.
3822          */
3823         if (flp->l_len == 0)
3824                 *end = MAX_U_OFFSET_T;
3825         else if ((offset_t)flp->l_len > 0) {
3826                 *end = (u_offset_t)(*start + (flp->l_len - 1));
3827         } else {
3828                 /*
3829                  * Negative length; why do we even allow this?
3830                  * Because this allows easy specification of
3831                  * the last n bytes of the file.
3832                  */
3833                 *end = *start;
3834                 *start += (u_offset_t)flp->l_len;
3835                 (*start)++;
3836         }
3837         return (0);
3838 }
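
/*
 * Worked examples (illustrative only), using l_whence == 0 (SEEK_SET) so
 * that *start begins as l_start:
 *
 *	l_start = 100, l_len = 10   ->  *start = 100, *end = 109
 *	l_start = 100, l_len = 0    ->  *start = 100, *end = MAX_U_OFFSET_T
 *					    ("to end of file")
 *	l_start = 100, l_len = -10  ->  *start = 91,  *end = 100
 *					    (the 10 bytes ending at offset 100)
 */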
3839 
3840 /*
3841  * Check the validity of lock data.  This can be used by the NFS
3842  * frlock routines to check data before contacting the server.  The
3843  * server must support semantics that aren't as restrictive as
3844  * the UNIX API, so the NFS client is required to check.
3845  * The maximum is now passed in by the caller.
3846  */
3847 
3848 int
3849 flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
3850 {
3851         /*
3852          * The end (length) for local locking should never be greater
3853          * than MAXEND. However, the representation for
3854          * the entire file is MAX_U_OFFSET_T.
3855          */
3856         if ((start > max) ||
3857             ((end > max) && (end != MAX_U_OFFSET_T))) {
3858                 return (EINVAL);
3859         }
3860         if (start > end) {
3861                 return (EINVAL);
3862         }
3863         return (0);
3864 }
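
/*
 * Illustrative note: given the checks above, start = 0 with
 * end = MAX_U_OFFSET_T (the whole-file sentinel) is accepted even though it
 * exceeds max, whereas any other end beyond max, a start beyond max, or
 * start > end is rejected with EINVAL.
 */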
3865 
3866 /*
3867  * Fill in request->l_flock with information about the lock blocking the
3868  * request.  The complexity here is that lock manager requests are allowed
3869  * to see into the upper part of the 32-bit address range, whereas local
3870  * requests are only allowed to see signed values.
3871  *
3872  * What should be done when "blocker" is a lock manager lock that uses the
3873  * upper portion of the 32-bit range, but "request" is local?  Since the
3874  * request has already been determined to have been blocked by the blocker,
3875  * at least some portion of "blocker" must be in the range of the request,
3876  * or the request extends to the end of file.  For the first case, the
3877  * portion in the lower range is returned with the indication that it goes
3878  * "to EOF."  For the second case, the last byte of the lower range is
3879  * returned with the indication that it goes "to EOF."
3880  */
3881 
3882 static void
3883 report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request)
3884 {
3885         flock64_t *flrp;                        /* l_flock portion of request */
3886 
3887         ASSERT(blocker != NULL);
3888 
3889         flrp = &request->l_flock;
3890         flrp->l_whence = 0;
3891         flrp->l_type = blocker->l_type;
3892         flrp->l_pid = blocker->l_flock.l_pid;
3893         flrp->l_sysid = blocker->l_flock.l_sysid;
3894 
3895         if (IS_LOCKMGR(request)) {
3896                 flrp->l_start = blocker->l_start;
3897                 if (blocker->l_end == MAX_U_OFFSET_T)
3898                         flrp->l_len = 0;
3899                 else
3900                         flrp->l_len = blocker->l_end - blocker->l_start + 1;
3901         } else {
3902                 if (blocker->l_start > MAXEND) {
3903                         flrp->l_start = MAXEND;
3904                         flrp->l_len = 0;
3905                 } else {
3906                         flrp->l_start = blocker->l_start;
3907                         if (blocker->l_end == MAX_U_OFFSET_T)
3908                                 flrp->l_len = 0;
3909                         else
3910                                 flrp->l_len = blocker->l_end -
3911                                         blocker->l_start + 1;
3912                 }
3913         }
3914 }
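
/*
 * Worked example (illustrative only): suppose a local request is blocked by
 * a lock manager lock covering [MAXEND + 10, MAX_U_OFFSET_T].  Because the
 * blocker's l_start exceeds MAXEND, the local requester is shown
 * l_start = MAXEND and l_len = 0, i.e. "from the last representable local
 * offset to EOF".  A blocker starting at or below MAXEND keeps its real
 * start, with l_len = 0 substituted only when it extends to the whole-file
 * end sentinel.
 */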
3915 
3916 /*
3917  * PSARC case 1997/292
3918  */
3919 /*
3920  * This is the public routine exported by flock.h.
3921  */
3922 void
3923 cl_flk_change_nlm_state_to_unknown(int nlmid)
3924 {
3925         /*
3926          * Check to see if node is booted as a cluster. If not, return.
3927          */
3928         if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3929                 return;
3930         }
3931 
3932         /*
3933          * See comment in cl_flk_set_nlm_status().
3934          */
3935         if (nlm_reg_status == NULL) {
3936                 return;
3937         }
3938 
3939         /*
3940          * protect NLM registry state with a mutex.
3941          */
3942         ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3943         mutex_enter(&nlm_reg_lock);
3944         FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, FLK_NLM_UNKNOWN);
3945         mutex_exit(&nlm_reg_lock);
3946 }
3947 
3948 /*
3949  * Return non-zero if the given I/O request conflicts with an active NBMAND
3950  * lock.
3951  * If svmand is non-zero, it means look at all active locks, not just NBMAND
3952  * locks.
3953  */
3954 
3955 int
3956 nbl_lock_conflict(vnode_t *vp, nbl_op_t op, u_offset_t offset,
3957                 ssize_t length, int svmand, caller_context_t *ct)
3958 {
3959         int conflict = 0;
3960         graph_t                 *gp;
3961         lock_descriptor_t       *lock;
3962         pid_t pid;
3963         int sysid;
3964 
3965         if (ct == NULL) {
3966                 pid = curproc->p_pid;
3967                 sysid = 0;
3968         } else {
3969                 pid = ct->cc_pid;
3970                 sysid = ct->cc_sysid;
3971         }
3972 
3973         mutex_enter(&flock_lock);
3974         gp = lock_graph[HASH_INDEX(vp)];
3975         mutex_exit(&flock_lock);
3976         if (gp == NULL)
3977                 return (0);
3978 
3979         mutex_enter(&gp->gp_mutex);
3980         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
3981 
3982         for (; lock && lock->l_vnode == vp; lock = lock->l_next) {
3983                 if ((svmand || (lock->l_state & NBMAND_LOCK)) &&
3984                     (lock->l_flock.l_sysid != sysid ||
3985                     lock->l_flock.l_pid != pid) &&
3986                     lock_blocks_io(op, offset, length,
3987                                 lock->l_type, lock->l_start, lock->l_end)) {
3988                         conflict = 1;
3989                         break;
3990                 }
3991         }
3992         mutex_exit(&gp->gp_mutex);
3993 
3994         return (conflict);
3995 }
3996 
3997 /*
3998  * Return non-zero if the given I/O request conflicts with the given lock.
3999  */
4000 
4001 static int
4002 lock_blocks_io(nbl_op_t op, u_offset_t offset, ssize_t length,
4003             int lock_type, u_offset_t lock_start, u_offset_t lock_end)
4004 {
4005         ASSERT(op == NBL_READ || op == NBL_WRITE || op == NBL_READWRITE);
4006         ASSERT(lock_type == F_RDLCK || lock_type == F_WRLCK);
4007 
4008         if (op == NBL_READ && lock_type == F_RDLCK)
4009                 return (0);
4010 
4011         if (offset <= lock_start && lock_start < offset + length)
4012                 return (1);
4013         if (lock_start <= offset && offset <= lock_end)
4014                 return (1);
4015 
4016         return (0);
4017 }
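
/*
 * Worked example (illustrative only): a write covering [50, 149]
 * (offset = 50, length = 100) conflicts with a write lock over [100, 200]
 * because offset <= lock_start < offset + length, but not with a lock over
 * [150, 200], since the I/O ends at offset 149.  A read checked against an
 * F_RDLCK lock never conflicts, regardless of overlap.
 */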
4018 
4019 #ifdef DEBUG
4020 static void
4021 check_active_locks(graph_t *gp)
4022 {
4023         lock_descriptor_t *lock, *lock1;
4024         edge_t  *ep;
4025 
4026         for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
4027                                                 lock = lock->l_next) {
4028                 ASSERT(IS_ACTIVE(lock));
4029                 ASSERT(NOT_BLOCKED(lock));
4030                 ASSERT(!IS_BARRIER(lock));
4031 
4032                 ep = FIRST_IN(lock);
4033 
4034                 while (ep != HEAD(lock)) {
4035                         ASSERT(IS_SLEEPING(ep->from_vertex));
4036                         ASSERT(!NOT_BLOCKED(ep->from_vertex));
4037                         ep = NEXT_IN(ep);
4038                 }
4039 
4040                 for (lock1 = lock->l_next; lock1 != ACTIVE_HEAD(gp);
4041                                         lock1 = lock1->l_next) {
4042                         if (lock1->l_vnode == lock->l_vnode) {
4043                                 if (BLOCKS(lock1, lock)) {
4044                                         cmn_err(CE_PANIC,
4045                                             "active lock %p blocks %p",
4046                                             (void *)lock1, (void *)lock);
4047                                 } else if (BLOCKS(lock, lock1)) {
4048                                         cmn_err(CE_PANIC,
4049                                             "active lock %p blocks %p",
4050                                             (void *)lock, (void *)lock1);
4051                                 }
4052                         }
4053                 }
4054         }
4055 }
4056 
4057 /*
4058  * Effect: This function checks to see if the transition from 'old_state' to
4059  *      'new_state' is a valid one.  It returns 0 if the transition is valid
4060  *      and 1 if it is not.
4061  *      For a map of valid transitions, see sys/flock_impl.h
4062  */
4063 static int
4064 check_lock_transition(int old_state, int new_state)
4065 {
4066         switch (old_state) {
4067         case FLK_INITIAL_STATE:
4068                 if ((new_state == FLK_START_STATE) ||
4069                     (new_state == FLK_SLEEPING_STATE) ||
4070                     (new_state == FLK_ACTIVE_STATE) ||
4071                     (new_state == FLK_DEAD_STATE)) {
4072                         return (0);
4073                 } else {
4074                         return (1);
4075                 }
4076         case FLK_START_STATE:
4077                 if ((new_state == FLK_ACTIVE_STATE) ||
4078                     (new_state == FLK_DEAD_STATE)) {
4079                         return (0);
4080                 } else {
4081                         return (1);
4082                 }
4083         case FLK_ACTIVE_STATE:
4084                 if (new_state == FLK_DEAD_STATE) {
4085                         return (0);
4086                 } else {
4087                         return (1);
4088                 }
4089         case FLK_SLEEPING_STATE:
4090                 if ((new_state == FLK_GRANTED_STATE) ||
4091                     (new_state == FLK_INTERRUPTED_STATE) ||
4092                     (new_state == FLK_CANCELLED_STATE)) {
4093                         return (0);
4094                 } else {
4095                         return (1);
4096                 }
4097         case FLK_GRANTED_STATE:
4098                 if ((new_state == FLK_START_STATE) ||
4099                     (new_state == FLK_INTERRUPTED_STATE) ||
4100                     (new_state == FLK_CANCELLED_STATE)) {
4101                         return (0);
4102                 } else {
4103                         return (1);
4104                 }
4105         case FLK_CANCELLED_STATE:
4106                 if ((new_state == FLK_INTERRUPTED_STATE) ||
4107                     (new_state == FLK_DEAD_STATE)) {
4108                         return (0);
4109                 } else {
4110                         return (1);
4111                 }
4112         case FLK_INTERRUPTED_STATE:
4113                 if (new_state == FLK_DEAD_STATE) {
4114                         return (0);
4115                 } else {
4116                         return (1);
4117                 }
4118         case FLK_DEAD_STATE:
4119                 /* May be set more than once */
4120                 if (new_state == FLK_DEAD_STATE) {
4121                         return (0);
4122                 } else {
4123                         return (1);
4124                 }
4125         default:
4126                 return (1);
4127         }
4128 }
4129 
4130 static void
4131 check_sleeping_locks(graph_t *gp)
4132 {
4133         lock_descriptor_t *lock1, *lock2;
4134         edge_t *ep;
4135         for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp);
4136             lock1 = lock1->l_next) {
4137                 ASSERT(!IS_BARRIER(lock1));
4138                 for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp);
4139                     lock2 = lock2->l_next) {
4140                         if (lock1->l_vnode == lock2->l_vnode) {
4141                                 if (BLOCKS(lock2, lock1)) {
4142                                         ASSERT(!IS_GRANTED(lock1));
4143                                         ASSERT(!NOT_BLOCKED(lock1));
4144                                         path(lock1, lock2);
4145                                 }
4146                         }
4147                 }
4148 
4149                 for (lock2 = ACTIVE_HEAD(gp)->l_next;
4150                     lock2 != ACTIVE_HEAD(gp); lock2 = lock2->l_next) {
4151                         ASSERT(!IS_BARRIER(lock1));
4152                         if (lock1->l_vnode == lock2->l_vnode) {
4153                                 if (BLOCKS(lock2, lock1)) {
4154                                         ASSERT(!IS_GRANTED(lock1));
4155                                         ASSERT(!NOT_BLOCKED(lock1));
4156                                         path(lock1, lock2);
4157                                 }
4158                         }
4159                 }
4160                 ep = FIRST_ADJ(lock1);
4161                 while (ep != HEAD(lock1)) {
4162                         ASSERT(BLOCKS(ep->to_vertex, lock1));
4163                         ep = NEXT_ADJ(ep);
4164                 }
4165         }
4166 }
4167 
4168 static int
4169 level_two_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2, int no_path)
4170 {
4171         edge_t  *ep;
4172         lock_descriptor_t       *vertex;
4173         lock_descriptor_t *vertex_stack;
4174 
4175         STACK_INIT(vertex_stack);
4176 
4177         flk_graph_uncolor(lock1->l_graph);
4178         ep = FIRST_ADJ(lock1);
4179         ASSERT(ep != HEAD(lock1));
4180         while (ep != HEAD(lock1)) {
4181                 if (no_path)
4182                         ASSERT(ep->to_vertex != lock2);
4183                 STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4184                 COLOR(ep->to_vertex);
4185                 ep = NEXT_ADJ(ep);
4186         }
4187 
4188         while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
4189                 STACK_POP(vertex_stack, l_dstack);
4190                 for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
4191                                                 ep = NEXT_ADJ(ep)) {
4192                         if (COLORED(ep->to_vertex))
4193                                 continue;
4194                         COLOR(ep->to_vertex);
4195                         if (ep->to_vertex == lock2)
4196                                 return (1);
4197 
4198                         STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4199                 }
4200         }
4201         return (0);
4202 }
4203 
4204 static void
4205 check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp)
4206 {
4207         lock_descriptor_t *lock;
4208 
4209         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
4210 
4211         if (lock) {
4212                 while (lock != ACTIVE_HEAD(gp) && (lock->l_vnode == vp)) {
4213                         if (lock->l_flock.l_pid == pid &&
4214                             lock->l_flock.l_sysid == sysid)
4215                                 cmn_err(CE_PANIC,
4216                                     "owner pid %d's lock %p in active queue",
4217                                     pid, (void *)lock);
4218                         lock = lock->l_next;
4219                 }
4220         }
4221         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
4222 
4223         if (lock) {
4224                 while (lock != SLEEPING_HEAD(gp) && (lock->l_vnode == vp)) {
4225                         if (lock->l_flock.l_pid == pid &&
4226                             lock->l_flock.l_sysid == sysid)
4227                                 cmn_err(CE_PANIC,
4228                                     "owner pid %d's lock %p in sleep queue",
4229                                     pid, (void *)lock);
4230                         lock = lock->l_next;
4231                 }
4232         }
4233 }
4234 
4235 static int
4236 level_one_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4237 {
4238         edge_t *ep = FIRST_ADJ(lock1);
4239 
4240         while (ep != HEAD(lock1)) {
4241                 if (ep->to_vertex == lock2)
4242                         return (1);
4243                 else
4244                         ep = NEXT_ADJ(ep);
4245         }
4246         return (0);
4247 }
4248 
4249 static int
4250 no_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4251 {
4252         return (!level_two_path(lock1, lock2, 1));
4253 }
4254 
4255 static void
4256 path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4257 {
4258         if (level_one_path(lock1, lock2)) {
4259                 if (level_two_path(lock1, lock2, 0) != 0) {
4260                         cmn_err(CE_WARN,
4261                             "one edge one path from lock1 %p lock2 %p",
4262                             (void *)lock1, (void *)lock2);
4263                 }
4264         } else if (no_path(lock1, lock2)) {
4265                 cmn_err(CE_PANIC,
4266                     "No path from  lock1 %p to lock2 %p",
4267                     (void *)lock1, (void *)lock2);
4268         }
4269 }
4270 #endif /* DEBUG */