dlpx-os-diff Old usr/src/uts/common/os/flock.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  28 /*      All Rights Reserved */
  29 
  30 #include <sys/flock_impl.h>
  31 #include <sys/vfs.h>
  32 #include <sys/t_lock.h>           /* for <sys/callb.h> */
  33 #include <sys/callb.h>
  34 #include <sys/clconf.h>
  35 #include <sys/cladm.h>
  36 #include <sys/nbmlock.h>
  37 #include <sys/cred.h>
  38 #include <sys/policy.h>
  39 
  40 /*
  41  * The following four variables are for statistics purposes and they are
  42  * not protected by locks. They may not be accurate but will at least be
  43  * close to the actual value.
  44  */
  45 
  46 int     flk_lock_allocs;
  47 int     flk_lock_frees;
  48 int     edge_allocs;
  49 int     edge_frees;
  50 int     flk_proc_vertex_allocs;
  51 int     flk_proc_edge_allocs;
  52 int     flk_proc_vertex_frees;
  53 int     flk_proc_edge_frees;
  54 
  55 static kmutex_t flock_lock;
  56 
  57 #ifdef DEBUG
  58 int check_debug = 0;
  59 #define CHECK_ACTIVE_LOCKS(gp)  if (check_debug) \
  60                                         check_active_locks(gp);
  61 #define CHECK_SLEEPING_LOCKS(gp)        if (check_debug) \
  62                                                 check_sleeping_locks(gp);
  63 #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp)   \
  64                 if (check_debug)        \
  65                         check_owner_locks(gp, pid, sysid, vp);
  66 #define CHECK_LOCK_TRANSITION(old_state, new_state) \
  67         { \
  68                 if (check_lock_transition(old_state, new_state)) { \
  69                         cmn_err(CE_PANIC, "Illegal lock transition \
  70                             from %d to %d", old_state, new_state); \
  71                 } \
  72         }
  73 #else
  74 
  75 #define CHECK_ACTIVE_LOCKS(gp)
  76 #define CHECK_SLEEPING_LOCKS(gp)
  77 #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp)
  78 #define CHECK_LOCK_TRANSITION(old_state, new_state)
  79 
  80 #endif /* DEBUG */
  81 
  82 struct kmem_cache       *flk_edge_cache;
  83 
  84 graph_t         *lock_graph[HASH_SIZE];
  85 proc_graph_t    pgraph;
  86 
  87 /*
  88  * Clustering.
  89  *
  90  * NLM REGISTRY TYPE IMPLEMENTATION
  91  *
  92  * Assumptions:
  93  *  1.  Nodes in a cluster are numbered starting at 1; always non-negative
  94  *      integers; maximum node id is returned by clconf_maximum_nodeid().
  95  *  2.  We use this node id to identify the node an NLM server runs on.
  96  */
  97 
  98 /*
  99  * NLM registry object keeps track of NLM servers via their
 100  * nlmids (which are the node ids of the node in the cluster they run on)
 101  * that have requested locks at this LLM with which this registry is
 102  * associated.
 103  *
 104  * Representation of abstraction:
 105  *    rep = record[     states: array[nlm_state],
 106  *                      lock: mutex]
 107  *
 108  *    Representation invariants:
 109  *      1. index i of rep.states is between 0 and n - 1 where n is number
 110  *         of elements in the array, which happen to be the maximum number
 111  *         of nodes in the cluster configuration + 1.
 112  *      2. map nlmid to index i of rep.states
 113  *              0   -> 0
 114  *              1   -> 1
 115  *              2   -> 2
 116  *              n-1 -> clconf_maximum_nodeid()+1
 117  *      3.  This 1-1 mapping is quite convenient and it avoids errors resulting
 118  *          from forgetting to subtract 1 from the index.
 119  *      4.  The reason we keep the 0th index is the following.  A legitimate
 120  *          cluster configuration includes making a UFS file system NFS
 121  *          exportable.  The code is structured so that if you're in a cluster
 122  *          you do one thing; otherwise, you do something else.  The problem
 123  *          is what to do if you think you're in a cluster with PXFS loaded,
 124  *          but you're using UFS not PXFS?  The upper two bytes of the sysid
 125  *          encode the node id of the node where NLM server runs; these bytes
 126  *          are zero for UFS.  Since the nodeid is used to index into the
 127  *          registry, we can record the NLM server state information at index
 128  *          0 using the same mechanism used for PXFS file locks!
 129  */
 130 static flk_nlm_status_t *nlm_reg_status = NULL; /* state array 0..N-1 */
 131 static kmutex_t nlm_reg_lock;                   /* lock to protect arrary */
 132 static uint_t nlm_status_size;                  /* size of state array */
 133 
 134 /*
 135  * Although we need a global lock dependency graph (and associated data
 136  * structures), we also need a per-zone notion of whether the lock manager is
 137  * running, and so whether to allow lock manager requests or not.
 138  *
 139  * Thus, on a per-zone basis we maintain a ``global'' variable
 140  * (flk_lockmgr_status), protected by flock_lock, and set when the lock
 141  * manager is determined to be changing state (starting or stopping).
 142  *
 143  * Each graph/zone pair also has a copy of this variable, which is protected by
 144  * the graph's mutex.
 145  *
 146  * The per-graph copies are used to synchronize lock requests with shutdown
 147  * requests.  The global copy is used to initialize the per-graph field when a
 148  * new graph is created.
 149  */
 150 struct flock_globals {
 151         flk_lockmgr_status_t flk_lockmgr_status;
 152         flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
 153 };
 154 
 155 zone_key_t flock_zone_key;
 156 
 157 static void create_flock(lock_descriptor_t *, flock64_t *);
 158 static lock_descriptor_t        *flk_get_lock(void);
 159 static void     flk_free_lock(lock_descriptor_t *lock);
 160 static void     flk_get_first_blocking_lock(lock_descriptor_t *request);
 161 static int flk_process_request(lock_descriptor_t *);
 162 static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
 163 static edge_t *flk_get_edge(void);
 164 static int flk_wait_execute_request(lock_descriptor_t *);
 165 static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
 166 static void flk_insert_active_lock(lock_descriptor_t *);
 167 static void flk_delete_active_lock(lock_descriptor_t *, int);
 168 static void flk_insert_sleeping_lock(lock_descriptor_t *);
 169 static void flk_graph_uncolor(graph_t *);
 170 static void flk_wakeup(lock_descriptor_t *, int);
 171 static void flk_free_edge(edge_t *);
 172 static void flk_recompute_dependencies(lock_descriptor_t *,
 173                         lock_descriptor_t **,  int, int);
 174 static int flk_find_barriers(lock_descriptor_t *);
 175 static void flk_update_barriers(lock_descriptor_t *);
 176 static int flk_color_reachables(lock_descriptor_t *);
 177 static int flk_canceled(lock_descriptor_t *);
 178 static void flk_delete_locks_by_sysid(lock_descriptor_t *);
 179 static void report_blocker(lock_descriptor_t *, lock_descriptor_t *);
 180 static void wait_for_lock(lock_descriptor_t *);
 181 static void unlock_lockmgr_granted(struct flock_globals *);
 182 static void wakeup_sleeping_lockmgr_locks(struct flock_globals *);
 183 
 184 /* Clustering hooks */
 185 static void cl_flk_change_nlm_state_all_locks(int, flk_nlm_status_t);
 186 static void cl_flk_wakeup_sleeping_nlm_locks(int);
 187 static void cl_flk_unlock_nlm_granted(int);
 188 
 189 #ifdef DEBUG
 190 static int check_lock_transition(int, int);
 191 static void check_sleeping_locks(graph_t *);
 192 static void check_active_locks(graph_t *);
 193 static int no_path(lock_descriptor_t *, lock_descriptor_t *);
 194 static void path(lock_descriptor_t *, lock_descriptor_t *);
 195 static void check_owner_locks(graph_t *, pid_t, int, vnode_t *);
 196 static int level_one_path(lock_descriptor_t *, lock_descriptor_t *);
 197 static int level_two_path(lock_descriptor_t *, lock_descriptor_t *, int);
 198 #endif
 199 
 200 /*      proc_graph function definitions */
 201 static int flk_check_deadlock(lock_descriptor_t *);
 202 static void flk_proc_graph_uncolor(void);
 203 static proc_vertex_t *flk_get_proc_vertex(lock_descriptor_t *);
 204 static proc_edge_t *flk_get_proc_edge(void);
 205 static void flk_proc_release(proc_vertex_t *);
 206 static void flk_free_proc_edge(proc_edge_t *);
 207 static void flk_update_proc_graph(edge_t *, int);
 208 
 209 /* Non-blocking mandatory locking */
 210 static int lock_blocks_io(nbl_op_t, u_offset_t, ssize_t, int, u_offset_t,
 211                         u_offset_t);
 212 
 213 static struct flock_globals *
 214 flk_get_globals(void)
 215 {
 216         /*
 217          * The KLM module had better be loaded if we're attempting to handle
 218          * lockmgr requests.
 219          */
 220         ASSERT(flock_zone_key != ZONE_KEY_UNINITIALIZED);
 221         return (zone_getspecific(flock_zone_key, curproc->p_zone));
 222 }
 223 
 224 static flk_lockmgr_status_t
 225 flk_get_lockmgr_status(void)
 226 {
 227         struct flock_globals *fg;
 228 
 229         ASSERT(MUTEX_HELD(&flock_lock));
 230 
 231         if (flock_zone_key == ZONE_KEY_UNINITIALIZED) {
 232                 /*
 233                  * KLM module not loaded; lock manager definitely not running.
 234                  */
 235                 return (FLK_LOCKMGR_DOWN);
 236         }
 237         fg = flk_get_globals();
 238         return (fg->flk_lockmgr_status);
 239 }
 240 
 241 /*
 242  * Routine called from fs_frlock in fs/fs_subr.c
 243  */
 244 
 245 int
 246 reclock(vnode_t         *vp,
 247         flock64_t       *lckdat,
 248         int             cmd,
 249         int             flag,
 250         u_offset_t      offset,
 251         flk_callback_t  *flk_cbp)
 252 {
 253         lock_descriptor_t       stack_lock_request;
 254         lock_descriptor_t       *lock_request;
 255         int error = 0;
 256         graph_t *gp;
 257         int                     nlmid;
 258 
 259         /*
 260          * Check access permissions
 261          */
 262         if ((cmd & SETFLCK) &&
 263                 ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
 264                 (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
 265                         return (EBADF);
 266 
 267         /*
 268          * for query and unlock we use the stack_lock_request
 269          */
 270 
 271         if ((lckdat->l_type == F_UNLCK) ||
 272                         !((cmd & INOFLCK) || (cmd & SETFLCK))) {
 273                 lock_request = &stack_lock_request;
 274                 (void) bzero((caddr_t)lock_request,
 275                                 sizeof (lock_descriptor_t));
 276 
 277                 /*
 278                  * following is added to make the assertions in
 279                  * flk_execute_request() to pass through
 280                  */
 281 
 282                 lock_request->l_edge.edge_in_next = &lock_request->l_edge;
 283                 lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
 284                 lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
 285                 lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
 286                 lock_request->l_status = FLK_INITIAL_STATE;
 287         } else {
 288                 lock_request = flk_get_lock();
 289         }
 290         lock_request->l_state = 0;
 291         lock_request->l_vnode = vp;
 292         lock_request->l_zoneid = getzoneid();
 293 
 294         /*
 295          * Convert the request range into the canonical start and end
 296          * values.  The NLM protocol supports locking over the entire
 297          * 32-bit range, so there's no range checking for remote requests,
 298          * but we still need to verify that local requests obey the rules.
 299          */
 300         /* Clustering */
 301         if ((cmd & (RCMDLCK | PCMDLCK)) != 0) {
 302                 ASSERT(lckdat->l_whence == 0);
 303                 lock_request->l_start = lckdat->l_start;
 304                 lock_request->l_end = (lckdat->l_len == 0) ? MAX_U_OFFSET_T :
 305                         lckdat->l_start + (lckdat->l_len - 1);
 306         } else {
 307                 /* check the validity of the lock range */
 308                 error = flk_convert_lock_data(vp, lckdat,
 309                         &lock_request->l_start, &lock_request->l_end,
 310                         offset);
 311                 if (error) {
 312                         goto done;
 313                 }
 314                 error = flk_check_lock_data(lock_request->l_start,
 315                                             lock_request->l_end, MAXEND);
 316                 if (error) {
 317                         goto done;
 318                 }
 319         }
 320 
 321         ASSERT(lock_request->l_end >= lock_request->l_start);
 322 
 323         lock_request->l_type = lckdat->l_type;
 324         if (cmd & INOFLCK)
 325                 lock_request->l_state |= IO_LOCK;
 326         if (cmd & SLPFLCK)
 327                 lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
 328         if (cmd & RCMDLCK)
 329                 lock_request->l_state |= LOCKMGR_LOCK;
 330         if (cmd & NBMLCK)
 331                 lock_request->l_state |= NBMAND_LOCK;
 332         /*
 333          * Clustering: set flag for PXFS locks
 334          * We do not _only_ check for the PCMDLCK flag because PXFS locks could
 335          * also be of type 'RCMDLCK'.
 336          * We do not _only_ check the GETPXFSID() macro because local PXFS
 337          * clients use a pxfsid of zero to permit deadlock detection in the LLM.
 338          */
 339 
 340         if ((cmd & PCMDLCK) || (GETPXFSID(lckdat->l_sysid) != 0)) {
 341                 lock_request->l_state |= PXFS_LOCK;
 342         }
 343         if (!((cmd & SETFLCK) || (cmd & INOFLCK))) {
 344                 if (lock_request->l_type == F_RDLCK ||
 345                         lock_request->l_type == F_WRLCK)
 346                         lock_request->l_state |= QUERY_LOCK;
 347         }
 348         lock_request->l_flock = (*lckdat);
 349         lock_request->l_callbacks = flk_cbp;
 350 
 351         /*
 352          * We are ready for processing the request
 353          */
 354         if (IS_LOCKMGR(lock_request)) {
 355                 /*
 356                  * If the lock request is an NLM server request ....
 357                  */
 358                 if (nlm_status_size == 0) { /* not booted as cluster */
 359                         mutex_enter(&flock_lock);
 360                         /*
 361                          * Bail out if this is a lock manager request and the
 362                          * lock manager is not supposed to be running.
 363                          */
 364                         if (flk_get_lockmgr_status() != FLK_LOCKMGR_UP) {
 365                                 mutex_exit(&flock_lock);
 366                                 error = ENOLCK;
 367                                 goto done;
 368                         }
 369                         mutex_exit(&flock_lock);
 370                 } else {                        /* booted as a cluster */
 371                         nlmid = GETNLMID(lock_request->l_flock.l_sysid);
 372                         ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
 373 
 374                         mutex_enter(&nlm_reg_lock);
 375                         /*
 376                          * If the NLM registry does not know about this
 377                          * NLM server making the request, add its nlmid
 378                          * to the registry.
 379                          */
 380                         if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status,
 381                                 nlmid)) {
 382                                 FLK_REGISTRY_ADD_NLMID(nlm_reg_status, nlmid);
 383                         } else if (!FLK_REGISTRY_IS_NLM_UP(nlm_reg_status,
 384                                 nlmid)) {
 385                                 /*
 386                                  * If the NLM server is already known (has made
 387                                  * previous lock requests) and its state is
 388                                  * not NLM_UP (means that NLM server is
 389                                  * shutting down), then bail out with an
 390                                  * error to deny the lock request.
 391                                  */
 392                                 mutex_exit(&nlm_reg_lock);
 393                                 error = ENOLCK;
 394                                 goto done;
 395                         }
 396                         mutex_exit(&nlm_reg_lock);
 397                 }
 398         }
 399 
 400         /* Now get the lock graph for a particular vnode */
 401         gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);
 402 
 403         /*
 404          * We drop rwlock here otherwise this might end up causing a
 405          * deadlock if this IOLOCK sleeps. (bugid # 1183392).
 406          */
 407 
 408         if (IS_IO_LOCK(lock_request)) {
 409                 VOP_RWUNLOCK(vp,
 410                         (lock_request->l_type == F_RDLCK) ?
 411                                 V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
 412         }
 413         mutex_enter(&gp->gp_mutex);
 414 
 415         lock_request->l_state |= REFERENCED_LOCK;
 416         lock_request->l_graph = gp;
 417 
 418         switch (lock_request->l_type) {
 419         case F_RDLCK:
 420         case F_WRLCK:
 421                 if (IS_QUERY_LOCK(lock_request)) {
 422                         flk_get_first_blocking_lock(lock_request);
 423                         (*lckdat) = lock_request->l_flock;
 424                         break;
 425                 }
 426 
 427                 /* process the request now */
 428 
 429                 error = flk_process_request(lock_request);
 430                 break;
 431 
 432         case F_UNLCK:
 433                 /* unlock request will not block so execute it immediately */
 434 
 435                 if (IS_LOCKMGR(lock_request) &&
 436                     flk_canceled(lock_request)) {
 437                         error = 0;
 438                 } else {
 439                         error = flk_execute_request(lock_request);
 440                 }
 441                 break;
 442 
 443         case F_UNLKSYS:
 444                 /*
 445                  * Recovery mechanism to release lock manager locks when
 446                  * NFS client crashes and restart. NFS server will clear
 447                  * old locks and grant new locks.
 448                  */
 449 
 450                 if (lock_request->l_flock.l_sysid == 0) {
 451                         mutex_exit(&gp->gp_mutex);
 452                         return (EINVAL);
 453                 }
 454                 if (secpolicy_nfs(CRED()) != 0) {
 455                         mutex_exit(&gp->gp_mutex);
 456                         return (EPERM);
 457                 }
 458                 flk_delete_locks_by_sysid(lock_request);
 459                 lock_request->l_state &= ~REFERENCED_LOCK;
 460                 flk_set_state(lock_request, FLK_DEAD_STATE);
 461                 flk_free_lock(lock_request);
 462                 mutex_exit(&gp->gp_mutex);
 463                 return (0);
 464 
 465         default:
 466                 error = EINVAL;
 467                 break;
 468         }
 469 
 470         /* Clustering: For blocked PXFS locks, return */
 471         if (error == PXFS_LOCK_BLOCKED) {
 472                 lock_request->l_state &= ~REFERENCED_LOCK;
 473                 mutex_exit(&gp->gp_mutex);
 474                 return (error);
 475         }
 476 
 477         /*
 478          * Now that we have seen the status of locks in the system for
 479          * this vnode we acquire the rwlock if it is an IO_LOCK.
 480          */
 481 
 482         if (IS_IO_LOCK(lock_request)) {
 483                 (void) VOP_RWLOCK(vp,
 484                         (lock_request->l_type == F_RDLCK) ?
 485                                 V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
 486                 if (!error) {
 487                         lckdat->l_type = F_UNLCK;
 488 
 489                         /*
 490                          * This wake up is needed otherwise
 491                          * if IO_LOCK has slept the dependents on this
 492                          * will not be woken up at all. (bugid # 1185482).
 493                          */
 494 
 495                         flk_wakeup(lock_request, 1);
 496                         flk_set_state(lock_request, FLK_DEAD_STATE);
 497                         flk_free_lock(lock_request);
 498                 }
 499                 /*
 500                  * else if error had occurred either flk_process_request()
 501                  * has returned EDEADLK in which case there will be no
 502                  * dependents for this lock or EINTR from flk_wait_execute_
 503                  * request() in which case flk_cancel_sleeping_lock()
 504                  * would have been done. same is true with EBADF.
 505                  */
 506         }
 507 
 508         if (lock_request == &stack_lock_request) {
 509                 flk_set_state(lock_request, FLK_DEAD_STATE);
 510         } else {
 511                 lock_request->l_state &= ~REFERENCED_LOCK;
 512                 if ((error != 0) || IS_DELETED(lock_request)) {
 513                         flk_set_state(lock_request, FLK_DEAD_STATE);
 514                         flk_free_lock(lock_request);
 515                 }
 516         }
 517 
 518         mutex_exit(&gp->gp_mutex);
 519         return (error);
 520 
 521 done:
 522         flk_set_state(lock_request, FLK_DEAD_STATE);
 523         if (lock_request != &stack_lock_request)
 524                 flk_free_lock(lock_request);
 525         return (error);
 526 }
 527 
 528 /*
 529  * Invoke the callbacks in the given list.  If before sleeping, invoke in
 530  * list order.  If after sleeping, invoke in reverse order.
 531  *
 532  * CPR (suspend/resume) support: if one of the callbacks returns a
 533  * callb_cpr_t, return it.   This will be used to make the thread CPR-safe
 534  * while it is sleeping.  There should be at most one callb_cpr_t for the
 535  * thread.
 536  * XXX This is unnecessarily complicated.  The CPR information should just
 537  * get passed in directly through VOP_FRLOCK and reclock, rather than
 538  * sneaking it in via a callback.
 539  */
 540 
 541 callb_cpr_t *
 542 flk_invoke_callbacks(flk_callback_t *cblist, flk_cb_when_t when)
 543 {
 544         callb_cpr_t *cpr_callbackp = NULL;
 545         callb_cpr_t *one_result;
 546         flk_callback_t *cb;
 547 
 548         if (cblist == NULL)
 549                 return (NULL);
 550 
 551         if (when == FLK_BEFORE_SLEEP) {
 552                 cb = cblist;
 553                 do {
 554                         one_result = (*cb->cb_callback)(when, cb->cb_data);
 555                         if (one_result != NULL) {
 556                                 ASSERT(cpr_callbackp == NULL);
 557                                 cpr_callbackp = one_result;
 558                         }
 559                         cb = cb->cb_next;
 560                 } while (cb != cblist);
 561         } else {
 562                 cb = cblist->cb_prev;
 563                 do {
 564                         one_result = (*cb->cb_callback)(when, cb->cb_data);
 565                         if (one_result != NULL) {
 566                                 cpr_callbackp = one_result;
 567                         }
 568                         cb = cb->cb_prev;
 569                 } while (cb != cblist->cb_prev);
 570         }
 571 
 572         return (cpr_callbackp);
 573 }
 574 
 575 /*
 576  * Initialize a flk_callback_t to hold the given callback.
 577  */
 578 
 579 void
 580 flk_init_callback(flk_callback_t *flk_cb,
 581         callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), void *cbdata)
 582 {
 583         flk_cb->cb_next = flk_cb;
 584         flk_cb->cb_prev = flk_cb;
 585         flk_cb->cb_callback = cb_fcn;
 586         flk_cb->cb_data = cbdata;
 587 }
 588 
 589 /*
 590  * Initialize an flk_callback_t and then link it into the head of an
 591  * existing list (which may be NULL).
 592  */
 593 
 594 void
 595 flk_add_callback(flk_callback_t *newcb,
 596                 callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *),
 597                 void *cbdata, flk_callback_t *cblist)
 598 {
 599         flk_init_callback(newcb, cb_fcn, cbdata);
 600 
 601         if (cblist == NULL)
 602                 return;
 603 
 604         newcb->cb_prev = cblist->cb_prev;
 605         newcb->cb_next = cblist;
 606         cblist->cb_prev->cb_next = newcb;
 607         cblist->cb_prev = newcb;
 608 }
 609 
 610 /*
 611  * Initialize the flk_edge_cache data structure and create the
 612  * nlm_reg_status array.
 613  */
 614 
 615 void
 616 flk_init(void)
 617 {
 618         uint_t  i;
 619 
 620         flk_edge_cache = kmem_cache_create("flk_edges",
 621                 sizeof (struct edge), 0, NULL, NULL, NULL, NULL, NULL, 0);
 622         if (flk_edge_cache == NULL) {
 623                 cmn_err(CE_PANIC, "Couldn't create flk_edge_cache\n");
 624         }
 625         /*
 626          * Create the NLM registry object.
 627          */
 628 
 629         if (cluster_bootflags & CLUSTER_BOOTED) {
 630                 /*
 631                  * This routine tells you the maximum node id that will be used
 632                  * in the cluster.  This number will be the size of the nlm
 633                  * registry status array.  We add 1 because we will be using
 634                  * all entries indexed from 0 to maxnodeid; e.g., from 0
 635                  * to 64, for a total of 65 entries.
 636                  */
 637                 nlm_status_size = clconf_maximum_nodeid() + 1;
 638         } else {
 639                 nlm_status_size = 0;
 640         }
 641 
 642         if (nlm_status_size != 0) {     /* booted as a cluster */
 643                 nlm_reg_status = (flk_nlm_status_t *)
 644                         kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size,
 645                                 KM_SLEEP);
 646 
 647                 /* initialize all NLM states in array to NLM_UNKNOWN */
 648                 for (i = 0; i < nlm_status_size; i++) {
 649                         nlm_reg_status[i] = FLK_NLM_UNKNOWN;
 650                 }
 651         }
 652 }
 653 
 654 /*
 655  * Zone constructor/destructor callbacks to be executed when a zone is
 656  * created/destroyed.
 657  */
 658 /* ARGSUSED */
 659 void *
 660 flk_zone_init(zoneid_t zoneid)
 661 {
 662         struct flock_globals *fg;
 663         uint_t i;
 664 
 665         fg = kmem_alloc(sizeof (*fg), KM_SLEEP);
 666         fg->flk_lockmgr_status = FLK_LOCKMGR_UP;
 667         for (i = 0; i < HASH_SIZE; i++)
 668                 fg->lockmgr_status[i] = FLK_LOCKMGR_UP;
 669         return (fg);
 670 }
 671 
 672 /* ARGSUSED */
 673 void
 674 flk_zone_fini(zoneid_t zoneid, void *data)
 675 {
 676         struct flock_globals *fg = data;
 677 
 678         kmem_free(fg, sizeof (*fg));
 679 }
 680 
 681 /*
 682  * Get a lock_descriptor structure with initialization of edge lists.
 683  */
 684 
 685 static lock_descriptor_t *
 686 flk_get_lock(void)
 687 {
 688         lock_descriptor_t       *l;
 689 
 690         l = kmem_zalloc(sizeof (lock_descriptor_t), KM_SLEEP);
 691 
 692         cv_init(&l->l_cv, NULL, CV_DRIVER, NULL);
 693         l->l_edge.edge_in_next = &l->l_edge;
 694         l->l_edge.edge_in_prev = &l->l_edge;
 695         l->l_edge.edge_adj_next = &l->l_edge;
 696         l->l_edge.edge_adj_prev = &l->l_edge;
 697         l->pvertex = -1;
 698         l->l_status = FLK_INITIAL_STATE;
 699         flk_lock_allocs++;
 700         return (l);
 701 }
 702 
 703 /*
 704  * Free a lock_descriptor structure. Just sets the DELETED_LOCK flag
 705  * when some thread has a reference to it as in reclock().
 706  */
 707 
 708 void
 709 flk_free_lock(lock_descriptor_t *lock)
 710 {
 711         ASSERT(IS_DEAD(lock));
 712         if (IS_REFERENCED(lock)) {
 713                 lock->l_state |= DELETED_LOCK;
 714                 return;
 715         }
 716         flk_lock_frees++;
 717         kmem_free((void *)lock, sizeof (lock_descriptor_t));
 718 }
 719 
 720 void
 721 flk_set_state(lock_descriptor_t *lock, int new_state)
 722 {
 723         /*
 724          * Locks in the sleeping list may be woken up in a number of ways,
 725          * and more than once.  If a sleeping lock is signaled awake more
 726          * than once, then it may or may not change state depending on its
 727          * current state.
 728          * Also note that NLM locks that are sleeping could be moved to an
 729          * interrupted state more than once if the unlock request is
 730          * retransmitted by the NLM client - the second time around, this is
 731          * just a nop.
 732          * The ordering of being signaled awake is:
 733          * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE.
 734          * The checks below implement this ordering.
 735          */
 736         if (IS_INTERRUPTED(lock)) {
 737                 if ((new_state == FLK_CANCELLED_STATE) ||
 738                     (new_state == FLK_GRANTED_STATE) ||
 739                     (new_state == FLK_INTERRUPTED_STATE)) {
 740                         return;
 741                 }
 742         }
 743         if (IS_CANCELLED(lock)) {
 744                 if ((new_state == FLK_GRANTED_STATE) ||
 745                     (new_state == FLK_CANCELLED_STATE)) {
 746                         return;
 747                 }
 748         }
 749         CHECK_LOCK_TRANSITION(lock->l_status, new_state);
 750         if (IS_PXFS(lock)) {
 751                 cl_flk_state_transition_notify(lock, lock->l_status, new_state);
 752         }
 753         lock->l_status = new_state;
 754 }
 755 
 756 /*
 757  * Routine that checks whether there are any blocking locks in the system.
 758  *
 759  * The policy followed is if a write lock is sleeping we don't allow read
 760  * locks before this write lock even though there may not be any active
 761  * locks corresponding to the read locks' region.
 762  *
 763  * flk_add_edge() function adds an edge between l1 and l2 iff there
 764  * is no path between l1 and l2. This is done to have a "minimum
 765  * storage representation" of the dependency graph.
 766  *
 767  * Another property of the graph is since only the new request throws
 768  * edges to the existing locks in the graph, the graph is always topologically
 769  * ordered.
 770  */
 771 
 772 static int
 773 flk_process_request(lock_descriptor_t *request)
 774 {
 775         graph_t *gp = request->l_graph;
 776         lock_descriptor_t *lock;
 777         int request_blocked_by_active = 0;
 778         int request_blocked_by_granted = 0;
 779         int request_blocked_by_sleeping = 0;
 780         vnode_t *vp = request->l_vnode;
 781         int     error = 0;
 782         int request_will_wait = 0;
 783         int found_covering_lock = 0;
 784         lock_descriptor_t *covered_by = NULL;
 785 
 786         ASSERT(MUTEX_HELD(&gp->gp_mutex));
 787         request_will_wait = IS_WILLING_TO_SLEEP(request);
 788 
 789         /*
 790          * check active locks
 791          */
 792 
 793         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
 794 
 795 
 796         if (lock) {
 797                 do {
 798                         if (BLOCKS(lock, request)) {
 799                                 if (!request_will_wait)
 800                                         return (EAGAIN);
 801                                 request_blocked_by_active = 1;
 802                                 break;
 803                         }
 804                         /*
 805                          * Grant lock if it is for the same owner holding active
 806                          * lock that covers the request.
 807                          */
 808 
 809                         if (SAME_OWNER(lock, request) &&
 810                                         COVERS(lock, request) &&
 811                                                 (request->l_type == F_RDLCK))
 812                                 return (flk_execute_request(request));
 813                         lock = lock->l_next;
 814                 } while (lock->l_vnode == vp);
 815         }
 816 
 817         if (!request_blocked_by_active) {
 818                         lock_descriptor_t *lk[1];
 819                         lock_descriptor_t *first_glock = NULL;
 820                 /*
 821                  * Shall we grant this?! NO!!
 822                  * What about those locks that were just granted and still
 823                  * in sleep queue. Those threads are woken up and so locks
 824                  * are almost active.
 825                  */
 826                 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
 827                 if (lock) {
 828                         do {
 829                                 if (BLOCKS(lock, request)) {
 830                                         if (IS_GRANTED(lock)) {
 831                                                 request_blocked_by_granted = 1;
 832                                         } else {
 833                                                 request_blocked_by_sleeping = 1;
 834                                         }
 835                                 }
 836 
 837                                 lock = lock->l_next;
 838                         } while ((lock->l_vnode == vp));
 839                         first_glock = lock->l_prev;
 840                         ASSERT(first_glock->l_vnode == vp);
 841                 }
 842 
 843                 if (request_blocked_by_granted)
 844                         goto block;
 845 
 846                 if (!request_blocked_by_sleeping) {
 847                         /*
 848                          * If the request isn't going to be blocked by a
 849                          * sleeping request, we know that it isn't going to
 850                          * be blocked; we can just execute the request --
 851                          * without performing costly deadlock detection.
 852                          */
 853                         ASSERT(!request_blocked_by_active);
 854                         return (flk_execute_request(request));
 855                 } else if (request->l_type == F_RDLCK) {
 856                         /*
 857                          * If we have a sleeping writer in the requested
 858                          * lock's range, block.
 859                          */
 860                         goto block;
 861                 }
 862 
 863                 lk[0] = request;
 864                 request->l_state |= RECOMPUTE_LOCK;
 865                 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
 866                 if (lock) {
 867                         do {
 868                                 flk_recompute_dependencies(lock, lk, 1, 0);
 869                                 lock = lock->l_next;
 870                         } while (lock->l_vnode == vp);
 871                 }
 872                 lock = first_glock;
 873                 if (lock) {
 874                         do {
 875                                 if (IS_GRANTED(lock)) {
 876                                 flk_recompute_dependencies(lock, lk, 1, 0);
 877                                 }
 878                                 lock = lock->l_prev;
 879                         } while ((lock->l_vnode == vp));
 880                 }
 881                 request->l_state &= ~RECOMPUTE_LOCK;
 882                 if (!NO_DEPENDENTS(request) && flk_check_deadlock(request))
 883                         return (EDEADLK);
 884                 return (flk_execute_request(request));
 885         }
 886 
 887 block:
 888         if (request_will_wait)
 889                 flk_graph_uncolor(gp);
 890 
 891         /* check sleeping locks */
 892 
 893         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
 894 
 895         /*
 896          * If we find a sleeping write lock that is a superset of the
 897          * region wanted by request we can be assured that by adding an
 898          * edge to this write lock we have paths to all locks in the
 899          * graph that blocks the request except in one case and that is why
 900          * another check for SAME_OWNER in the loop below. The exception
 901          * case is when this process that owns the sleeping write lock 'l1'
 902          * has other locks l2, l3, l4 that are in the system and arrived
 903          * before l1. l1 does not have path to these locks as they are from
 904          * same process. We break when we find a second covering sleeping
 905          * lock l5 owned by a process different from that owning l1, because
 906          * there cannot be any of l2, l3, l4, etc., arrived before l5, and if
 907          * it has l1 would have produced a deadlock already.
 908          */
 909 
 910         if (lock) {
 911                 do {
 912                         if (BLOCKS(lock, request)) {
 913                                 if (!request_will_wait)
 914                                         return (EAGAIN);
 915                                 if (COVERS(lock, request) &&
 916                                                 lock->l_type == F_WRLCK) {
 917                                         if (found_covering_lock &&
 918                                             !SAME_OWNER(lock, covered_by)) {
 919                                                 found_covering_lock++;
 920                                                 break;
 921                                         }
 922                                         found_covering_lock = 1;
 923                                         covered_by = lock;
 924                                 }
 925                                 if (found_covering_lock &&
 926                                         !SAME_OWNER(lock, covered_by)) {
 927                                         lock = lock->l_next;
 928                                         continue;
 929                                 }
 930                                 if ((error = flk_add_edge(request, lock,
 931                                                 !found_covering_lock, 0)))
 932                                         return (error);
 933                         }
 934                         lock = lock->l_next;
 935                 } while (lock->l_vnode == vp);
 936         }
 937 
 938 /*
 939  * found_covering_lock == 2 iff at this point 'request' has paths
 940  * to all locks that blocks 'request'. found_covering_lock == 1 iff at this
 941  * point 'request' has paths to all locks that blocks 'request' whose owners
 942  * are not same as the one that covers 'request' (covered_by above) and
 943  * we can have locks whose owner is same as covered_by in the active list.
 944  */
 945 
 946         if (request_blocked_by_active && found_covering_lock != 2) {
 947                 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
 948                 ASSERT(lock != NULL);
 949                 do {
 950                         if (BLOCKS(lock, request)) {
 951                                 if (found_covering_lock &&
 952                                         !SAME_OWNER(lock, covered_by)) {
 953                                         lock = lock->l_next;
 954                                         continue;
 955                                 }
 956                                 if ((error = flk_add_edge(request, lock,
 957                                                         CHECK_CYCLE, 0)))
 958                                         return (error);
 959                         }
 960                         lock = lock->l_next;
 961                 } while (lock->l_vnode == vp);
 962         }
 963 
 964         if (NOT_BLOCKED(request)) {
 965                 /*
 966                  * request not dependent on any other locks
 967                  * so execute this request
 968                  */
 969                 return (flk_execute_request(request));
 970         } else {
 971                 /*
 972                  * check for deadlock
 973                  */
 974                 if (flk_check_deadlock(request))
 975                         return (EDEADLK);
 976                 /*
 977                  * this thread has to sleep
 978                  */
 979                 return (flk_wait_execute_request(request));
 980         }
 981 }
 982 
 983 /*
 984  * The actual execution of the request in the simple case is only to
 985  * insert the 'request' in the list of active locks if it is not an
 986  * UNLOCK.
 987  * We have to consider the existing active locks' relation to
 988  * this 'request' if they are owned by same process. flk_relation() does
 989  * this job and sees to that the dependency graph information is maintained
 990  * properly.
 991  */
 992 
 993 int
 994 flk_execute_request(lock_descriptor_t *request)
 995 {
 996         graph_t *gp = request->l_graph;
 997         vnode_t *vp = request->l_vnode;
 998         lock_descriptor_t       *lock, *lock1;
 999         int done_searching = 0;
1000 
1001         CHECK_SLEEPING_LOCKS(gp);
1002         CHECK_ACTIVE_LOCKS(gp);
1003 
1004         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1005 
1006         flk_set_state(request, FLK_START_STATE);
1007 
1008         ASSERT(NOT_BLOCKED(request));
1009 
1010         /* IO_LOCK requests are only to check status */
1011 
1012         if (IS_IO_LOCK(request))
1013                 return (0);
1014 
1015         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1016 
1017         if (lock == NULL && request->l_type == F_UNLCK)
1018                 return (0);
1019         if (lock == NULL) {
1020                 flk_insert_active_lock(request);
1021                 return (0);
1022         }
1023 
1024         do {
1025                 lock1 = lock->l_next;
1026                 if (SAME_OWNER(request, lock)) {
1027                         done_searching = flk_relation(lock, request);
1028                 }
1029                 lock = lock1;
1030         } while (lock->l_vnode == vp && !done_searching);
1031 
1032         /*
1033          * insert in active queue
1034          */
1035 
1036         if (request->l_type != F_UNLCK)
1037                 flk_insert_active_lock(request);
1038 
1039         return (0);
1040 }
1041 
1042 /*
1043  * 'request' is blocked by some one therefore we put it into sleep queue.
1044  */
1045 static int
1046 flk_wait_execute_request(lock_descriptor_t *request)
1047 {
1048         graph_t *gp = request->l_graph;
1049         callb_cpr_t     *cprp;          /* CPR info from callback */
1050         struct flock_globals *fg;
1051         int index;
1052 
1053         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1054         ASSERT(IS_WILLING_TO_SLEEP(request));
1055 
1056         flk_insert_sleeping_lock(request);
1057 
1058         if (IS_LOCKMGR(request)) {
1059                 index = HASH_INDEX(request->l_vnode);
1060                 fg = flk_get_globals();
1061 
1062                 if (nlm_status_size == 0) {     /* not booted as a cluster */
1063                         if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP) {
1064                                 flk_cancel_sleeping_lock(request, 1);
1065                                 return (ENOLCK);
1066                         }
1067                 } else {                        /* booted as a cluster */
1068                         /*
1069                          * If the request is an NLM server lock request,
1070                          * and the NLM state of the lock request is not
1071                          * NLM_UP (because the NLM server is shutting
1072                          * down), then cancel the sleeping lock and
1073                          * return error ENOLCK that will encourage the
1074                          * client to retransmit.
1075                          */
1076                         if (!IS_NLM_UP(request)) {
1077                                 flk_cancel_sleeping_lock(request, 1);
1078                                 return (ENOLCK);
1079                         }
1080                 }
1081         }
1082 
1083         /* Clustering: For blocking PXFS locks, return */
1084         if (IS_PXFS(request)) {
1085                 /*
1086                  * PXFS locks sleep on the client side.
1087                  * The callback argument is used to wake up the sleeper
1088                  * when the lock is granted.
1089                  * We return -1 (rather than an errno value) to indicate
1090                  * the client side should sleep
1091                  */
1092                 return (PXFS_LOCK_BLOCKED);
1093         }
1094 
1095         if (request->l_callbacks != NULL) {
1096                 /*
1097                  * To make sure the shutdown code works correctly, either
1098                  * the callback must happen after putting the lock on the
1099                  * sleep list, or we must check the shutdown status after
1100                  * returning from the callback (and before sleeping).  At
1101                  * least for now, we'll use the first option.  If a
1102                  * shutdown or signal or whatever happened while the graph
1103                  * mutex was dropped, that will be detected by
1104                  * wait_for_lock().
1105                  */
1106                 mutex_exit(&gp->gp_mutex);
1107 
1108                 cprp = flk_invoke_callbacks(request->l_callbacks,
1109                                             FLK_BEFORE_SLEEP);
1110 
1111                 mutex_enter(&gp->gp_mutex);
1112 
1113                 if (cprp == NULL) {
1114                         wait_for_lock(request);
1115                 } else {
1116                         mutex_enter(cprp->cc_lockp);
1117                         CALLB_CPR_SAFE_BEGIN(cprp);
1118                         mutex_exit(cprp->cc_lockp);
1119                         wait_for_lock(request);
1120                         mutex_enter(cprp->cc_lockp);
1121                         CALLB_CPR_SAFE_END(cprp, cprp->cc_lockp);
1122                         mutex_exit(cprp->cc_lockp);
1123                 }
1124 
1125                 mutex_exit(&gp->gp_mutex);
1126                 (void) flk_invoke_callbacks(request->l_callbacks,
1127                                             FLK_AFTER_SLEEP);
1128                 mutex_enter(&gp->gp_mutex);
1129         } else {
1130                 wait_for_lock(request);
1131         }
1132 
1133         if (IS_LOCKMGR(request)) {
1134                 /*
1135                  * If the lock manager is shutting down, return an
1136                  * error that will encourage the client to retransmit.
1137                  */
1138                 if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP &&
1139                         !IS_GRANTED(request)) {
1140                         flk_cancel_sleeping_lock(request, 1);
1141                         return (ENOLCK);
1142                 }
1143         }
1144 
1145         if (IS_INTERRUPTED(request)) {
1146                 /* we got a signal, or act like we did */
1147                 flk_cancel_sleeping_lock(request, 1);
1148                 return (EINTR);
1149         }
1150 
1151         /* Cancelled if some other thread has closed the file */
1152 
1153         if (IS_CANCELLED(request)) {
1154                 flk_cancel_sleeping_lock(request, 1);
1155                 return (EBADF);
1156         }
1157 
1158         request->l_state &= ~GRANTED_LOCK;
1159         REMOVE_SLEEP_QUEUE(request);
1160         return (flk_execute_request(request));
1161 }
1162 
1163 /*
1164  * This routine adds an edge between from and to because from depends
1165  * to. If asked to check for deadlock it checks whether there are any
1166  * reachable locks from "from_lock" that is owned by the same process
1167  * as "from_lock".
1168  * NOTE: It is the caller's responsibility to make sure that the color
1169  * of the graph is consistent between the calls to flk_add_edge as done
1170  * in flk_process_request. This routine does not color and check for
1171  * deadlock explicitly.
1172  */
1173 
1174 static int
1175 flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock,
1176                         int check_cycle, int update_graph)
1177 {
1178         edge_t  *edge;
1179         edge_t  *ep;
1180         lock_descriptor_t       *vertex;
1181         lock_descriptor_t *vertex_stack;
1182 
1183         STACK_INIT(vertex_stack);
1184 
1185         /*
1186          * if to vertex already has mark_color just return
1187          * don't add an edge as it is reachable from from vertex
1188          * before itself.
1189          */
1190 
1191         if (COLORED(to_lock))
1192                 return (0);
1193 
1194         edge = flk_get_edge();
1195 
1196         /*
1197          * set the from and to vertex
1198          */
1199 
1200         edge->from_vertex = from_lock;
1201         edge->to_vertex = to_lock;
1202 
1203         /*
1204          * put in adjacency list of from vertex
1205          */
1206 
1207         from_lock->l_edge.edge_adj_next->edge_adj_prev = edge;
1208         edge->edge_adj_next = from_lock->l_edge.edge_adj_next;
1209         edge->edge_adj_prev = &from_lock->l_edge;
1210         from_lock->l_edge.edge_adj_next = edge;
1211 
1212         /*
1213          * put in in list of to vertex
1214          */
1215 
1216         to_lock->l_edge.edge_in_next->edge_in_prev = edge;
1217         edge->edge_in_next = to_lock->l_edge.edge_in_next;
1218         to_lock->l_edge.edge_in_next = edge;
1219         edge->edge_in_prev = &to_lock->l_edge;
1220 
1221 
1222         if (update_graph) {
1223                 flk_update_proc_graph(edge, 0);
1224                 return (0);
1225         }
1226         if (!check_cycle) {
1227                 return (0);
1228         }
1229 
1230         STACK_PUSH(vertex_stack, from_lock, l_stack);
1231 
1232         while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1233 
1234                 STACK_POP(vertex_stack, l_stack);
1235 
1236                 for (ep = FIRST_ADJ(vertex);
1237                         ep != HEAD(vertex);
1238                                 ep = NEXT_ADJ(ep)) {
1239                         if (COLORED(ep->to_vertex))
1240                                 continue;
1241                         COLOR(ep->to_vertex);
1242                         if (SAME_OWNER(ep->to_vertex, from_lock))
1243                                 goto dead_lock;
1244                         STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1245                 }
1246         }
1247         return (0);
1248 
1249 dead_lock:
1250 
1251         /*
1252          * remove all edges
1253          */
1254 
1255         ep = FIRST_ADJ(from_lock);
1256 
1257         while (ep != HEAD(from_lock)) {
1258                 IN_LIST_REMOVE(ep);
1259                 from_lock->l_sedge = NEXT_ADJ(ep);
1260                 ADJ_LIST_REMOVE(ep);
1261                 flk_free_edge(ep);
1262                 ep = from_lock->l_sedge;
1263         }
1264         return (EDEADLK);
1265 }
1266 
1267 /*
1268  * Get an edge structure for representing the dependency between two locks.
1269  */
1270 
1271 static edge_t *
1272 flk_get_edge()
1273 {
1274         edge_t  *ep;
1275 
1276         ASSERT(flk_edge_cache != NULL);
1277 
1278         ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP);
1279         edge_allocs++;
1280         return (ep);
1281 }
1282 
1283 /*
1284  * Free the edge structure.
1285  */
1286 
1287 static void
1288 flk_free_edge(edge_t *ep)
1289 {
1290         edge_frees++;
1291         kmem_cache_free(flk_edge_cache, (void *)ep);
1292 }
1293 
1294 /*
1295  * Check the relationship of request with lock and perform the
1296  * recomputation of dependencies, break lock if required, and return
1297  * 1 if request cannot have any more relationship with the next
1298  * active locks.
1299  * The 'lock' and 'request' are compared and in case of overlap we
1300  * delete the 'lock' and form new locks to represent the non-overlapped
1301  * portion of original 'lock'. This function has side effects such as
1302  * 'lock' will be freed, new locks will be added to the active list.
1303  */
1304 
1305 static int
1306 flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request)
1307 {
1308         int lock_effect;
1309         lock_descriptor_t *lock1, *lock2;
1310         lock_descriptor_t *topology[3];
1311         int nvertex = 0;
1312         int i;
1313         edge_t  *ep;
1314         graph_t *gp = (lock->l_graph);
1315 
1316 
1317         CHECK_SLEEPING_LOCKS(gp);
1318         CHECK_ACTIVE_LOCKS(gp);
1319 
1320         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1321 
1322         topology[0] = topology[1] = topology[2] = NULL;
1323 
1324         if (request->l_type == F_UNLCK)
1325                 lock_effect = FLK_UNLOCK;
1326         else if (request->l_type == F_RDLCK &&
1327                         lock->l_type == F_WRLCK)
1328                 lock_effect = FLK_DOWNGRADE;
1329         else if (request->l_type == F_WRLCK &&
1330                         lock->l_type == F_RDLCK)
1331                 lock_effect = FLK_UPGRADE;
1332         else
1333                 lock_effect = FLK_STAY_SAME;
1334 
1335         if (lock->l_end < request->l_start) {
1336                 if (lock->l_end == request->l_start - 1 &&
1337                                 lock_effect == FLK_STAY_SAME) {
1338                         topology[0] = request;
1339                         request->l_start = lock->l_start;
1340                         nvertex = 1;
1341                         goto recompute;
1342                 } else {
1343                         return (0);
1344                 }
1345         }
1346 
1347         if (lock->l_start > request->l_end) {
1348                 if (request->l_end == lock->l_start - 1 &&
1349                                         lock_effect == FLK_STAY_SAME) {
1350                         topology[0] = request;
1351                         request->l_end = lock->l_end;
1352                         nvertex = 1;
1353                         goto recompute;
1354                 } else {
1355                         return (1);
1356                 }
1357         }
1358 
1359         if (request->l_end < lock->l_end) {
1360                 if (request->l_start > lock->l_start) {
1361                         if (lock_effect == FLK_STAY_SAME) {
1362                                 request->l_start = lock->l_start;
1363                                 request->l_end = lock->l_end;
1364                                 topology[0] = request;
1365                                 nvertex = 1;
1366                         } else {
1367                                 lock1 = flk_get_lock();
1368                                 lock2 = flk_get_lock();
1369                                 COPY(lock1, lock);
1370                                 COPY(lock2, lock);
1371                                 lock1->l_start = lock->l_start;
1372                                 lock1->l_end = request->l_start - 1;
1373                                 lock2->l_start = request->l_end + 1;
1374                                 lock2->l_end = lock->l_end;
1375                                 topology[0] = lock1;
1376                                 topology[1] = lock2;
1377                                 topology[2] = request;
1378                                 nvertex = 3;
1379                         }
1380                 } else if (request->l_start < lock->l_start) {
1381                         if (lock_effect == FLK_STAY_SAME) {
1382                                 request->l_end = lock->l_end;
1383                                 topology[0] = request;
1384                                 nvertex = 1;
1385                         } else {
1386                                 lock1 = flk_get_lock();
1387                                 COPY(lock1, lock);
1388                                 lock1->l_start = request->l_end + 1;
1389                                 topology[0] = lock1;
1390                                 topology[1] = request;
1391                                 nvertex = 2;
1392                         }
1393                 } else  {
1394                         if (lock_effect == FLK_STAY_SAME) {
1395                                 request->l_start = lock->l_start;
1396                                 request->l_end = lock->l_end;
1397                                 topology[0] = request;
1398                                 nvertex = 1;
1399                         } else {
1400                                 lock1 = flk_get_lock();
1401                                 COPY(lock1, lock);
1402                                 lock1->l_start = request->l_end + 1;
1403                                 topology[0] = lock1;
1404                                 topology[1] = request;
1405                                 nvertex = 2;
1406                         }
1407                 }
1408         } else if (request->l_end > lock->l_end) {
1409                 if (request->l_start > lock->l_start)  {
1410                         if (lock_effect == FLK_STAY_SAME) {
1411                                 request->l_start = lock->l_start;
1412                                 topology[0] = request;
1413                                 nvertex = 1;
1414                         } else {
1415                                 lock1 = flk_get_lock();
1416                                 COPY(lock1, lock);
1417                                 lock1->l_end = request->l_start - 1;
1418                                 topology[0] = lock1;
1419                                 topology[1] = request;
1420                                 nvertex = 2;
1421                         }
1422                 } else if (request->l_start < lock->l_start)  {
1423                         topology[0] = request;
1424                         nvertex = 1;
1425                 } else {
1426                         topology[0] = request;
1427                         nvertex = 1;
1428                 }
1429         } else {
1430                 if (request->l_start > lock->l_start) {
1431                         if (lock_effect == FLK_STAY_SAME) {
1432                                 request->l_start = lock->l_start;
1433                                 topology[0] = request;
1434                                 nvertex = 1;
1435                         } else {
1436                                 lock1 = flk_get_lock();
1437                                 COPY(lock1, lock);
1438                                 lock1->l_end = request->l_start - 1;
1439                                 topology[0] = lock1;
1440                                 topology[1] = request;
1441                                 nvertex = 2;
1442                         }
1443                 } else if (request->l_start < lock->l_start) {
1444                         topology[0] = request;
1445                         nvertex = 1;
1446                 } else {
1447                         if (lock_effect !=  FLK_UNLOCK) {
1448                                 topology[0] = request;
1449                                 nvertex = 1;
1450                         } else {
1451                                 flk_delete_active_lock(lock, 0);
1452                                 flk_wakeup(lock, 1);
1453                                 flk_free_lock(lock);
1454                                 CHECK_SLEEPING_LOCKS(gp);
1455                                 CHECK_ACTIVE_LOCKS(gp);
1456                                 return (1);
1457                         }
1458                 }
1459         }
1460 
1461 recompute:
1462 
1463         /*
1464          * For unlock we don't send the 'request' to for recomputing
1465          * dependencies because no lock will add an edge to this.
1466          */
1467 
1468         if (lock_effect == FLK_UNLOCK) {
1469                 topology[nvertex-1] = NULL;
1470                 nvertex--;
1471         }
1472         for (i = 0; i < nvertex; i++) {
1473                 topology[i]->l_state |= RECOMPUTE_LOCK;
1474                 topology[i]->l_color = NO_COLOR;
1475         }
1476 
1477         ASSERT(FIRST_ADJ(lock) == HEAD(lock));
1478 
1479         /*
1480          * we remove the adjacent edges for all vertices' to this vertex
1481          * 'lock'.
1482          */
1483 
1484         ep = FIRST_IN(lock);
1485         while (ep != HEAD(lock)) {
1486                 ADJ_LIST_REMOVE(ep);
1487                 ep = NEXT_IN(ep);
1488         }
1489 
1490         flk_delete_active_lock(lock, 0);
1491 
1492         /* We are ready for recomputing the dependencies now */
1493 
1494         flk_recompute_dependencies(lock, topology, nvertex, 1);
1495 
1496         for (i = 0; i < nvertex; i++) {
1497                 topology[i]->l_state &= ~RECOMPUTE_LOCK;
1498                 topology[i]->l_color = NO_COLOR;
1499         }
1500 
1501 
1502         if (lock_effect == FLK_UNLOCK) {
1503                 nvertex++;
1504         }
1505         for (i = 0; i < nvertex - 1; i++) {
1506                 flk_insert_active_lock(topology[i]);
1507         }
1508 
1509 
1510         if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) {
1511                 flk_wakeup(lock, 0);
1512         } else {
1513                 ep = FIRST_IN(lock);
1514                 while (ep != HEAD(lock)) {
1515                         lock->l_sedge = NEXT_IN(ep);
1516                         IN_LIST_REMOVE(ep);
1517                         flk_update_proc_graph(ep, 1);
1518                         flk_free_edge(ep);
1519                         ep = lock->l_sedge;
1520                 }
1521         }
1522         flk_free_lock(lock);
1523 
1524         CHECK_SLEEPING_LOCKS(gp);
1525         CHECK_ACTIVE_LOCKS(gp);
1526         return (0);
1527 }
1528 
1529 /*
1530  * Insert a lock into the active queue.
1531  */
1532 
1533 static void
1534 flk_insert_active_lock(lock_descriptor_t *new_lock)
1535 {
1536         graph_t *gp = new_lock->l_graph;
1537         vnode_t *vp = new_lock->l_vnode;
1538         lock_descriptor_t *first_lock, *lock;
1539 
1540         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1541 
1542         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1543         first_lock = lock;
1544 
1545         if (first_lock != NULL) {
1546                 for (; (lock->l_vnode == vp &&
1547                         lock->l_start < new_lock->l_start); lock = lock->l_next)
1548                         ;
1549         } else {
1550                 lock = ACTIVE_HEAD(gp);
1551         }
1552 
1553         lock->l_prev->l_next = new_lock;
1554         new_lock->l_next = lock;
1555         new_lock->l_prev = lock->l_prev;
1556         lock->l_prev = new_lock;
1557 
1558         if (first_lock == NULL || (new_lock->l_start <= first_lock->l_start)) {
1559                 vp->v_filocks = (struct filock *)new_lock;
1560         }
1561         flk_set_state(new_lock, FLK_ACTIVE_STATE);
1562         new_lock->l_state |= ACTIVE_LOCK;
1563 
1564         CHECK_ACTIVE_LOCKS(gp);
1565         CHECK_SLEEPING_LOCKS(gp);
1566 }
1567 
1568 /*
1569  * Delete the active lock : Performs two functions depending on the
1570  * value of second parameter. One is to remove from the active lists
1571  * only and other is to both remove and free the lock.
1572  */
1573 
1574 static void
1575 flk_delete_active_lock(lock_descriptor_t *lock, int free_lock)
1576 {
1577         vnode_t *vp = lock->l_vnode;
1578         graph_t *gp = lock->l_graph;
1579 
1580         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1581         if (free_lock)
1582                 ASSERT(NO_DEPENDENTS(lock));
1583         ASSERT(NOT_BLOCKED(lock));
1584         ASSERT(IS_ACTIVE(lock));
1585 
1586         ASSERT((vp->v_filocks != NULL));
1587 
1588         if (vp->v_filocks == (struct filock *)lock) {
1589                 vp->v_filocks = (struct filock *)
1590                                 ((lock->l_next->l_vnode == vp) ? lock->l_next :
1591                                                                 NULL);
1592         }
1593         lock->l_next->l_prev = lock->l_prev;
1594         lock->l_prev->l_next = lock->l_next;
1595         lock->l_next = lock->l_prev = NULL;
1596         flk_set_state(lock, FLK_DEAD_STATE);
1597         lock->l_state &= ~ACTIVE_LOCK;
1598 
1599         if (free_lock)
1600                 flk_free_lock(lock);
1601         CHECK_ACTIVE_LOCKS(gp);
1602         CHECK_SLEEPING_LOCKS(gp);
1603 }
1604 
1605 /*
1606  * Insert into the sleep queue.
1607  */
1608 
1609 static void
1610 flk_insert_sleeping_lock(lock_descriptor_t *request)
1611 {
1612         graph_t *gp = request->l_graph;
1613         vnode_t *vp = request->l_vnode;
1614         lock_descriptor_t       *lock;
1615 
1616         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1617         ASSERT(IS_INITIAL(request));
1618 
1619         for (lock = gp->sleeping_locks.l_next; (lock != &gp->sleeping_locks &&
1620                 lock->l_vnode < vp); lock = lock->l_next)
1621                 ;
1622 
1623         lock->l_prev->l_next = request;
1624         request->l_prev = lock->l_prev;
1625         lock->l_prev = request;
1626         request->l_next = lock;
1627         flk_set_state(request, FLK_SLEEPING_STATE);
1628         request->l_state |= SLEEPING_LOCK;
1629 }
1630 
1631 /*
1632  * Cancelling a sleeping lock implies removing a vertex from the
1633  * dependency graph and therefore we should recompute the dependencies
1634  * of all vertices that have a path  to this vertex, w.r.t. all
1635  * vertices reachable from this vertex.
1636  */
1637 
1638 void
1639 flk_cancel_sleeping_lock(lock_descriptor_t *request, int remove_from_queue)
1640 {
1641         graph_t *gp = request->l_graph;
1642         vnode_t *vp = request->l_vnode;
1643         lock_descriptor_t **topology = NULL;
1644         edge_t  *ep;
1645         lock_descriptor_t *vertex, *lock;
1646         int nvertex = 0;
1647         int i;
1648         lock_descriptor_t *vertex_stack;
1649 
1650         STACK_INIT(vertex_stack);
1651 
1652         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1653         /*
1654          * count number of vertex pointers that has to be allocated
1655          * All vertices that are reachable from request.
1656          */
1657 
1658         STACK_PUSH(vertex_stack, request, l_stack);
1659 
1660         while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1661                 STACK_POP(vertex_stack, l_stack);
1662                 for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
1663                                         ep = NEXT_ADJ(ep)) {
1664                         if (IS_RECOMPUTE(ep->to_vertex))
1665                                 continue;
1666                         ep->to_vertex->l_state |= RECOMPUTE_LOCK;
1667                         STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1668                         nvertex++;
1669                 }
1670         }
1671 
1672         /*
1673          * allocate memory for holding the vertex pointers
1674          */
1675 
1676         if (nvertex) {
1677                 topology = kmem_zalloc(nvertex * sizeof (lock_descriptor_t *),
1678                                                 KM_SLEEP);
1679         }
1680 
1681         /*
1682          * one more pass to actually store the vertices in the
1683          * allocated array.
1684          * We first check sleeping locks and then active locks
1685          * so that topology array will be in a topological
1686          * order.
1687          */
1688 
1689         nvertex = 0;
1690         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1691 
1692         if (lock) {
1693                 do {
1694                         if (IS_RECOMPUTE(lock)) {
1695                                 lock->l_index = nvertex;
1696                                 topology[nvertex++] = lock;
1697                         }
1698                         lock->l_color = NO_COLOR;
1699                         lock = lock->l_next;
1700                 } while (lock->l_vnode == vp);
1701         }
1702 
1703         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1704 
1705         if (lock) {
1706                 do {
1707                         if (IS_RECOMPUTE(lock)) {
1708                                 lock->l_index = nvertex;
1709                                 topology[nvertex++] = lock;
1710                         }
1711                         lock->l_color = NO_COLOR;
1712                         lock = lock->l_next;
1713                 } while (lock->l_vnode == vp);
1714         }
1715 
1716         /*
1717          * remove in and out edges of request
1718          * They are freed after updating proc_graph below.
1719          */
1720 
1721         for (ep = FIRST_IN(request); ep != HEAD(request); ep = NEXT_IN(ep)) {
1722                 ADJ_LIST_REMOVE(ep);
1723         }
1724 
1725 
1726         if (remove_from_queue)
1727                 REMOVE_SLEEP_QUEUE(request);
1728 
1729         /* we are ready to recompute */
1730 
1731         flk_recompute_dependencies(request, topology, nvertex, 1);
1732 
1733         ep = FIRST_ADJ(request);
1734         while (ep != HEAD(request)) {
1735                 IN_LIST_REMOVE(ep);
1736                 request->l_sedge = NEXT_ADJ(ep);
1737                 ADJ_LIST_REMOVE(ep);
1738                 flk_update_proc_graph(ep, 1);
1739                 flk_free_edge(ep);
1740                 ep = request->l_sedge;
1741         }
1742 
1743 
1744         /*
1745          * unset the RECOMPUTE flag in those vertices
1746          */
1747 
1748         for (i = 0; i < nvertex; i++) {
1749                 topology[i]->l_state &= ~RECOMPUTE_LOCK;
1750         }
1751 
1752         /*
1753          * free the topology
1754          */
1755         if (nvertex)
1756                 kmem_free((void *)topology,
1757                         (nvertex * sizeof (lock_descriptor_t *)));
1758         /*
1759          * Possibility of some locks unblocked now
1760          */
1761 
1762         flk_wakeup(request, 0);
1763 
1764         /*
1765          * we expect to have a correctly recomputed graph  now.
1766          */
1767         flk_set_state(request, FLK_DEAD_STATE);
1768         flk_free_lock(request);
1769         CHECK_SLEEPING_LOCKS(gp);
1770         CHECK_ACTIVE_LOCKS(gp);
1771 
1772 }
1773 
1774 /*
1775  * Uncoloring the graph is simply to increment the mark value of the graph
1776  * And only when wrap round takes place will we color all vertices in
1777  * the graph explicitly.
1778  */
1779 
1780 static void
1781 flk_graph_uncolor(graph_t *gp)
1782 {
1783         lock_descriptor_t *lock;
1784 
1785         if (gp->mark == UINT_MAX) {
1786                 gp->mark = 1;
1787         for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
1788                                         lock = lock->l_next)
1789                         lock->l_color  = 0;
1790 
1791         for (lock = SLEEPING_HEAD(gp)->l_next; lock != SLEEPING_HEAD(gp);
1792                                         lock = lock->l_next)
1793                         lock->l_color  = 0;
1794         } else {
1795                 gp->mark++;
1796         }
1797 }
1798 
1799 /*
1800  * Wake up locks that are blocked on the given lock.
1801  */
1802 
1803 static void
1804 flk_wakeup(lock_descriptor_t *lock, int adj_list_remove)
1805 {
1806         edge_t  *ep;
1807         graph_t *gp = lock->l_graph;
1808         lock_descriptor_t       *lck;
1809 
1810         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1811         if (NO_DEPENDENTS(lock))
1812                 return;
1813         ep = FIRST_IN(lock);
1814         do {
1815                 /*
1816                  * delete the edge from the adjacency list
1817                  * of from vertex. if no more adjacent edges
1818                  * for this vertex wake this process.
1819                  */
1820                 lck = ep->from_vertex;
1821                 if (adj_list_remove)
1822                         ADJ_LIST_REMOVE(ep);
1823                 flk_update_proc_graph(ep, 1);
1824                 if (NOT_BLOCKED(lck)) {
1825                         GRANT_WAKEUP(lck);
1826                 }
1827                 lock->l_sedge = NEXT_IN(ep);
1828                 IN_LIST_REMOVE(ep);
1829                 flk_free_edge(ep);
1830                 ep = lock->l_sedge;
1831         } while (ep != HEAD(lock));
1832         ASSERT(NO_DEPENDENTS(lock));
1833 }
1834 
1835 /*
1836  * The dependents of request, is checked for its dependency against the
1837  * locks in topology (called topology because the array is and should be in
1838  * topological order for this algorithm, if not in topological order the
1839  * inner loop below might add more edges than necessary. Topological ordering
1840  * of vertices satisfies the property that all edges will be from left to
1841  * right i.e., topology[i] can have an edge to  topology[j], iff i<j)
1842  * If lock l1 in the dependent set of request is dependent (blocked by)
1843  * on lock l2 in topology but does not have a path to it, we add an edge
1844  * in the inner loop below.
1845  *
1846  * We don't want to add an edge between l1 and l2 if there exists
1847  * already a path from l1 to l2, so care has to be taken for those vertices
1848  * that  have two paths to 'request'. These vertices are referred to here
1849  * as barrier locks.
1850  *
1851  * The barriers has to be found (those vertex that originally had two paths
1852  * to request) because otherwise we may end up adding edges unnecessarily
1853  * to vertices in topology, and thus barrier vertices can have an edge
1854  * to a vertex in topology as well a path to it.
1855  */
1856 
1857 static void
1858 flk_recompute_dependencies(lock_descriptor_t *request,
1859                 lock_descriptor_t **topology,
1860                         int nvertex, int update_graph)
1861 {
1862         lock_descriptor_t *vertex, *lock;
1863         graph_t *gp = request->l_graph;
1864         int i, count;
1865         int barrier_found = 0;
1866         edge_t  *ep;
1867         lock_descriptor_t *vertex_stack;
1868 
1869         STACK_INIT(vertex_stack);
1870 
1871         ASSERT(MUTEX_HELD(&gp->gp_mutex));
1872         if (nvertex == 0)
1873                 return;
1874         flk_graph_uncolor(request->l_graph);
1875         barrier_found = flk_find_barriers(request);
1876         request->l_state |= RECOMPUTE_DONE;
1877 
1878         STACK_PUSH(vertex_stack, request, l_stack);
1879         request->l_sedge = FIRST_IN(request);
1880 
1881 
1882         while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1883                 if (vertex->l_state & RECOMPUTE_DONE) {
1884                         count = 0;
1885                         goto next_in_edge;
1886                 }
1887                 if (IS_BARRIER(vertex)) {
1888                         /* decrement the barrier count */
1889                         if (vertex->l_index) {
1890                                 vertex->l_index--;
1891                                 /* this guy will be pushed again anyway ? */
1892                                 STACK_POP(vertex_stack, l_stack);
1893                                 if (vertex->l_index == 0)  {
1894                                 /*
1895                                  * barrier is over we can recompute
1896                                  * dependencies for this lock in the
1897                                  * next stack pop
1898                                  */
1899                                         vertex->l_state &= ~BARRIER_LOCK;
1900                                 }
1901                                 continue;
1902                         }
1903                 }
1904                 vertex->l_state |= RECOMPUTE_DONE;
1905                 flk_graph_uncolor(gp);
1906                 count = flk_color_reachables(vertex);
1907                 for (i = 0; i < nvertex; i++) {
1908                         lock = topology[i];
1909                         if (COLORED(lock))
1910                                 continue;
1911                         if (BLOCKS(lock, vertex)) {
1912                                 (void) flk_add_edge(vertex, lock,
1913                                     NO_CHECK_CYCLE, update_graph);
1914                                 COLOR(lock);
1915                                 count++;
1916                                 count += flk_color_reachables(lock);
1917                         }
1918 
1919                 }
1920 
1921 next_in_edge:
1922                 if (count == nvertex ||
1923                                 vertex->l_sedge == HEAD(vertex)) {
1924                         /* prune the tree below this */
1925                         STACK_POP(vertex_stack, l_stack);
1926                         vertex->l_state &= ~RECOMPUTE_DONE;
1927                         /* update the barrier locks below this! */
1928                         if (vertex->l_sedge != HEAD(vertex) && barrier_found) {
1929                                 flk_graph_uncolor(gp);
1930                                 flk_update_barriers(vertex);
1931                         }
1932                         continue;
1933                 }
1934 
1935                 ep = vertex->l_sedge;
1936                 lock = ep->from_vertex;
1937                 STACK_PUSH(vertex_stack, lock, l_stack);
1938                 lock->l_sedge = FIRST_IN(lock);
1939                 vertex->l_sedge = NEXT_IN(ep);
1940         }
1941 
1942 }
1943 
1944 /*
1945  * Color all reachable vertices from vertex that belongs to topology (here
1946  * those that have RECOMPUTE_LOCK set in their state) and yet uncolored.
1947  *
1948  * Note: we need to use a different stack_link l_stack1 because this is
1949  * called from flk_recompute_dependencies() that already uses a stack with
1950  * l_stack as stack_link.
1951  */
1952 
1953 static int
1954 flk_color_reachables(lock_descriptor_t *vertex)
1955 {
1956         lock_descriptor_t *ver, *lock;
1957         int count;
1958         edge_t  *ep;
1959         lock_descriptor_t *vertex_stack;
1960 
1961         STACK_INIT(vertex_stack);
1962 
1963         STACK_PUSH(vertex_stack, vertex, l_stack1);
1964         count = 0;
1965         while ((ver = STACK_TOP(vertex_stack)) != NULL) {
1966 
1967                 STACK_POP(vertex_stack, l_stack1);
1968                 for (ep = FIRST_ADJ(ver); ep != HEAD(ver);
1969                                         ep = NEXT_ADJ(ep)) {
1970                         lock = ep->to_vertex;
1971                         if (COLORED(lock))
1972                                 continue;
1973                         COLOR(lock);
1974                         if (IS_RECOMPUTE(lock))
1975                                 count++;
1976                         STACK_PUSH(vertex_stack, lock, l_stack1);
1977                 }
1978 
1979         }
1980         return (count);
1981 }
1982 
1983 /*
1984  * Called from flk_recompute_dependencies() this routine decrements
1985  * the barrier count of barrier vertices that are reachable from lock.
1986  */
1987 
1988 static void
1989 flk_update_barriers(lock_descriptor_t *lock)
1990 {
1991         lock_descriptor_t *vertex, *lck;
1992         edge_t  *ep;
1993         lock_descriptor_t *vertex_stack;
1994 
1995         STACK_INIT(vertex_stack);
1996 
1997         STACK_PUSH(vertex_stack, lock, l_stack1);
1998 
1999         while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2000                 STACK_POP(vertex_stack, l_stack1);
2001                 for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2002                                                 ep = NEXT_IN(ep)) {
2003                         lck = ep->from_vertex;
2004                         if (COLORED(lck)) {
2005                                 if (IS_BARRIER(lck)) {
2006                                         ASSERT(lck->l_index > 0);
2007                                         lck->l_index--;
2008                                         if (lck->l_index == 0)
2009                                                 lck->l_state &= ~BARRIER_LOCK;
2010                                 }
2011                                 continue;
2012                         }
2013                         COLOR(lck);
2014                         if (IS_BARRIER(lck)) {
2015                                 ASSERT(lck->l_index > 0);
2016                                 lck->l_index--;
2017                                 if (lck->l_index == 0)
2018                                         lck->l_state &= ~BARRIER_LOCK;
2019                         }
2020                         STACK_PUSH(vertex_stack, lck, l_stack1);
2021                 }
2022         }
2023 }
2024 
2025 /*
2026  * Finds all vertices that are reachable from 'lock' more than once and
2027  * mark them as barrier vertices and increment their barrier count.
2028  * The barrier count is one minus the total number of paths from lock
2029  * to that vertex.
2030  */
2031 
2032 static int
2033 flk_find_barriers(lock_descriptor_t *lock)
2034 {
2035         lock_descriptor_t *vertex, *lck;
2036         int found = 0;
2037         edge_t  *ep;
2038         lock_descriptor_t *vertex_stack;
2039 
2040         STACK_INIT(vertex_stack);
2041 
2042         STACK_PUSH(vertex_stack, lock, l_stack1);
2043 
2044         while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2045                 STACK_POP(vertex_stack, l_stack1);
2046                 for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2047                                                 ep = NEXT_IN(ep)) {
2048                         lck = ep->from_vertex;
2049                         if (COLORED(lck)) {
2050                                 /* this is a barrier */
2051                                 lck->l_state |= BARRIER_LOCK;
2052                                 /* index will have barrier count */
2053                                 lck->l_index++;
2054                                 if (!found)
2055                                         found = 1;
2056                                 continue;
2057                         }
2058                         COLOR(lck);
2059                         lck->l_index = 0;
2060                         STACK_PUSH(vertex_stack, lck, l_stack1);
2061                 }
2062         }
2063         return (found);
2064 }
2065 
2066 /*
2067  * Finds the first lock that is mainly responsible for blocking this
2068  * request.  If there is no such lock, request->l_flock.l_type is set to
2069  * F_UNLCK.  Otherwise, request->l_flock is filled in with the particulars
2070  * of the blocking lock.
2071  *
2072  * Note: It is possible a request is blocked by a sleeping lock because
2073  * of the fairness policy used in flk_process_request() to construct the
2074  * dependencies. (see comments before flk_process_request()).
2075  */
2076 
2077 static void
2078 flk_get_first_blocking_lock(lock_descriptor_t *request)
2079 {
2080         graph_t *gp = request->l_graph;
2081         vnode_t *vp = request->l_vnode;
2082         lock_descriptor_t *lock, *blocker;
2083 
2084         ASSERT(MUTEX_HELD(&gp->gp_mutex));
2085         blocker = NULL;
2086         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2087 
2088         if (lock) {
2089                 do {
2090                         if (BLOCKS(lock, request)) {
2091                                 blocker = lock;
2092                                 break;
2093                         }
2094                         lock = lock->l_next;
2095                 } while (lock->l_vnode == vp);
2096         }
2097 
2098         if (blocker) {
2099                 report_blocker(blocker, request);
2100         } else
2101                 request->l_flock.l_type = F_UNLCK;
2102 }
2103 
2104 /*
2105  * Get the graph_t structure associated with a vnode.
2106  * If 'initialize' is non-zero, and the graph_t structure for this vnode has
2107  * not yet been initialized, then a new element is allocated and returned.
2108  */
2109 graph_t *
2110 flk_get_lock_graph(vnode_t *vp, int initialize)
2111 {
2112         graph_t *gp;
2113         graph_t *gp_alloc = NULL;
2114         int index = HASH_INDEX(vp);
2115 
2116         if (initialize == FLK_USE_GRAPH) {
2117                 mutex_enter(&flock_lock);
2118                 gp = lock_graph[index];
2119                 mutex_exit(&flock_lock);
2120                 return (gp);
2121         }
2122 
2123         ASSERT(initialize == FLK_INIT_GRAPH);
2124 
2125         if (lock_graph[index] == NULL) {
2126 
2127                 gp_alloc = kmem_zalloc(sizeof (graph_t), KM_SLEEP);
2128 
2129                 /* Initialize the graph */
2130 
2131                 gp_alloc->active_locks.l_next =
2132                     gp_alloc->active_locks.l_prev =
2133                     (lock_descriptor_t *)ACTIVE_HEAD(gp_alloc);
2134                 gp_alloc->sleeping_locks.l_next =
2135                     gp_alloc->sleeping_locks.l_prev =
2136                     (lock_descriptor_t *)SLEEPING_HEAD(gp_alloc);
2137                 gp_alloc->index = index;
2138                 mutex_init(&gp_alloc->gp_mutex, NULL, MUTEX_DEFAULT, NULL);
2139         }
2140 
2141         mutex_enter(&flock_lock);
2142 
2143         gp = lock_graph[index];
2144 
2145         /* Recheck the value within flock_lock */
2146         if (gp == NULL) {
2147                 struct flock_globals *fg;
2148 
2149                 /* We must have previously allocated the graph_t structure */
2150                 ASSERT(gp_alloc != NULL);
2151                 lock_graph[index] = gp = gp_alloc;
2152                 /*
2153                  * The lockmgr status is only needed if KLM is loaded.
2154                  */
2155                 if (flock_zone_key != ZONE_KEY_UNINITIALIZED) {
2156                         fg = flk_get_globals();
2157                         fg->lockmgr_status[index] = fg->flk_lockmgr_status;
2158                 }
2159         }
2160 
2161         mutex_exit(&flock_lock);
2162 
2163         if ((gp_alloc != NULL) && (gp != gp_alloc)) {
2164                 /* There was a race to allocate the graph_t and we lost */
2165                 mutex_destroy(&gp_alloc->gp_mutex);
2166                 kmem_free(gp_alloc, sizeof (graph_t));
2167         }
2168 
2169         return (gp);
2170 }
2171 
2172 /*
2173  * PSARC case 1997/292
2174  */
2175 int
2176 cl_flk_has_remote_locks_for_nlmid(vnode_t *vp, int nlmid)
2177 {
2178         lock_descriptor_t *lock;
2179         int result = 0;
2180         graph_t *gp;
2181         int                     lock_nlmid;
2182 
2183         /*
2184          * Check to see if node is booted as a cluster. If not, return.
2185          */
2186         if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2187                 return (0);
2188         }
2189 
2190         gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2191         if (gp == NULL) {
2192                 return (0);
2193         }
2194 
2195         mutex_enter(&gp->gp_mutex);
2196 
2197         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2198 
2199         if (lock) {
2200                 while (lock->l_vnode == vp) {
2201                         /* get NLM id from sysid */
2202                         lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2203 
2204                         /*
2205                          * If NLM server request _and_ nlmid of lock matches
2206                          * nlmid of argument, then we've found a remote lock.
2207                          */
2208                         if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2209                                 result = 1;
2210                                 goto done;
2211                         }
2212                         lock = lock->l_next;
2213                 }
2214         }
2215 
2216         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2217 
2218         if (lock) {
2219                 while (lock->l_vnode == vp) {
2220                         /* get NLM id from sysid */
2221                         lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2222 
2223                         /*
2224                          * If NLM server request _and_ nlmid of lock matches
2225                          * nlmid of argument, then we've found a remote lock.
2226                          */
2227                         if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2228                                 result = 1;
2229                                 goto done;
2230                         }
2231                         lock = lock->l_next;
2232                 }
2233         }
2234 
2235 done:
2236         mutex_exit(&gp->gp_mutex);
2237         return (result);
2238 }
2239 
2240 /*
2241  * Determine whether there are any locks for the given vnode with a remote
2242  * sysid.  Returns zero if not, non-zero if there are.
2243  *
2244  * Note that the return value from this function is potentially invalid
2245  * once it has been returned.  The caller is responsible for providing its
2246  * own synchronization mechanism to ensure that the return value is useful
2247  * (e.g., see nfs_lockcompletion()).
2248  */
2249 int
2250 flk_has_remote_locks(vnode_t *vp)
2251 {
2252         lock_descriptor_t *lock;
2253         int result = 0;
2254         graph_t *gp;
2255 
2256         gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2257         if (gp == NULL) {
2258                 return (0);
2259         }
2260 
2261         mutex_enter(&gp->gp_mutex);
2262 
2263         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2264 
2265         if (lock) {
2266                 while (lock->l_vnode == vp) {
2267                         if (IS_REMOTE(lock)) {
2268                                 result = 1;
2269                                 goto done;
2270                         }
2271                         lock = lock->l_next;
2272                 }
2273         }
2274 
2275         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2276 
2277         if (lock) {
2278                 while (lock->l_vnode == vp) {
2279                         if (IS_REMOTE(lock)) {
2280                                 result = 1;
2281                                 goto done;
2282                         }
2283                         lock = lock->l_next;
2284                 }
2285         }
2286 
2287 done:
2288         mutex_exit(&gp->gp_mutex);
2289         return (result);
2290 }
2291 
2292 /*
2293  * Determine if there are any locks owned by the given sysid.
2294  * Returns zero if not, non-zero if there are.  Note that this return code
2295  * could be derived from flk_get_{sleeping,active}_locks, but this routine
2296  * avoids all the memory allocations of those routines.
2297  *
2298  * This routine has the same synchronization issues as
2299  * flk_has_remote_locks.
2300  */
2301 
2302 int
2303 flk_sysid_has_locks(int sysid, int lck_type)
2304 {
2305         int             has_locks = 0;
2306         lock_descriptor_t       *lock;
2307         graph_t         *gp;
2308         int             i;
2309 
2310         for (i = 0; i < HASH_SIZE && !has_locks; i++) {
2311                 mutex_enter(&flock_lock);
2312                 gp = lock_graph[i];
2313                 mutex_exit(&flock_lock);
2314                 if (gp == NULL) {
2315                         continue;
2316                 }
2317 
2318                 mutex_enter(&gp->gp_mutex);
2319 
2320                 if (lck_type & FLK_QUERY_ACTIVE) {
2321                         for (lock = ACTIVE_HEAD(gp)->l_next;
2322                             lock != ACTIVE_HEAD(gp) && !has_locks;
2323                             lock = lock->l_next) {
2324                                 if (lock->l_flock.l_sysid == sysid)
2325                                         has_locks = 1;
2326                         }
2327                 }
2328 
2329                 if (lck_type & FLK_QUERY_SLEEPING) {
2330                         for (lock = SLEEPING_HEAD(gp)->l_next;
2331                                 lock != SLEEPING_HEAD(gp) && !has_locks;
2332                                 lock = lock->l_next) {
2333                                 if (lock->l_flock.l_sysid == sysid)
2334                                         has_locks = 1;
2335                         }
2336                 }
2337                 mutex_exit(&gp->gp_mutex);
2338         }
2339 
2340         return (has_locks);
2341 }
2342 
2343 
2344 /*
2345  * PSARC case 1997/292
2346  *
2347  * Requires: "sysid" is a pair [nlmid, sysid].  The lower half is 16-bit
2348  *  quantity, the real sysid generated by the NLM server; the upper half
2349  *  identifies the node of the cluster where the NLM server ran.
2350  *  This routine is only called by an NLM server running in a cluster.
2351  * Effects: Remove all locks held on behalf of the client identified
2352  *  by "sysid."
2353  */
2354 void
2355 cl_flk_remove_locks_by_sysid(int sysid)
2356 {
2357         graph_t *gp;
2358         int i;
2359         lock_descriptor_t *lock, *nlock;
2360 
2361         /*
2362          * Check to see if node is booted as a cluster. If not, return.
2363          */
2364         if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2365                 return;
2366         }
2367 
2368         ASSERT(sysid != 0);
2369         for (i = 0; i < HASH_SIZE; i++) {
2370                 mutex_enter(&flock_lock);
2371                 gp = lock_graph[i];
2372                 mutex_exit(&flock_lock);
2373 
2374                 if (gp == NULL)
2375                         continue;
2376 
2377                 mutex_enter(&gp->gp_mutex);      /*  get mutex on lock graph */
2378 
2379                 /* signal sleeping requests so that they bail out */
2380                 lock = SLEEPING_HEAD(gp)->l_next;
2381                 while (lock != SLEEPING_HEAD(gp)) {
2382                         nlock = lock->l_next;
2383                         if (lock->l_flock.l_sysid == sysid) {
2384                                 INTERRUPT_WAKEUP(lock);
2385                         }
2386                         lock = nlock;
2387                 }
2388 
2389                 /* delete active locks */
2390                 lock = ACTIVE_HEAD(gp)->l_next;
2391                 while (lock != ACTIVE_HEAD(gp)) {
2392                         nlock = lock->l_next;
2393                         if (lock->l_flock.l_sysid == sysid) {
2394                                 flk_delete_active_lock(lock, 0);
2395                                 flk_wakeup(lock, 1);
2396                                 flk_free_lock(lock);
2397                         }
2398                         lock = nlock;
2399                 }
2400                 mutex_exit(&gp->gp_mutex);    /* release mutex on lock graph */
2401         }
2402 }
2403 
2404 /*
2405  * Delete all locks in the system that belongs to the sysid of the request.
2406  */
2407 
2408 static void
2409 flk_delete_locks_by_sysid(lock_descriptor_t *request)
2410 {
2411         int     sysid  = request->l_flock.l_sysid;
2412         lock_descriptor_t *lock, *nlock;
2413         graph_t *gp;
2414         int i;
2415 
2416         ASSERT(MUTEX_HELD(&request->l_graph->gp_mutex));
2417         ASSERT(sysid != 0);
2418 
2419         mutex_exit(&request->l_graph->gp_mutex);
2420 
2421         for (i = 0; i < HASH_SIZE; i++) {
2422                 mutex_enter(&flock_lock);
2423                 gp = lock_graph[i];
2424                 mutex_exit(&flock_lock);
2425 
2426                 if (gp == NULL)
2427                         continue;
2428 
2429                 mutex_enter(&gp->gp_mutex);
2430 
2431                 /* signal sleeping requests so that they bail out */
2432                 lock = SLEEPING_HEAD(gp)->l_next;
2433                 while (lock != SLEEPING_HEAD(gp)) {
2434                         nlock = lock->l_next;
2435                         if (lock->l_flock.l_sysid == sysid) {
2436                                 INTERRUPT_WAKEUP(lock);
2437                         }
2438                         lock = nlock;
2439                 }
2440 
2441                 /* delete active locks */
2442                 lock = ACTIVE_HEAD(gp)->l_next;
2443                 while (lock != ACTIVE_HEAD(gp)) {
2444                         nlock = lock->l_next;
2445                         if (lock->l_flock.l_sysid == sysid) {
2446                                 flk_delete_active_lock(lock, 0);
2447                                 flk_wakeup(lock, 1);
2448                                 flk_free_lock(lock);
2449                         }
2450                         lock = nlock;
2451                 }
2452                 mutex_exit(&gp->gp_mutex);
2453         }
2454 
2455         mutex_enter(&request->l_graph->gp_mutex);
2456 }
2457 
2458 /*
2459  * Clustering: Deletes PXFS locks
2460  * Effects: Delete all locks on files in the given file system and with the
2461  *  given PXFS id.
2462  */
2463 void
2464 cl_flk_delete_pxfs_locks(struct vfs *vfsp, int pxfsid)
2465 {
2466         lock_descriptor_t *lock, *nlock;
2467         graph_t *gp;
2468         int i;
2469 
2470         for (i = 0; i < HASH_SIZE; i++) {
2471                 mutex_enter(&flock_lock);
2472                 gp = lock_graph[i];
2473                 mutex_exit(&flock_lock);
2474 
2475                 if (gp == NULL)
2476                         continue;
2477 
2478                 mutex_enter(&gp->gp_mutex);
2479 
2480                 /* signal sleeping requests so that they bail out */
2481                 lock = SLEEPING_HEAD(gp)->l_next;
2482                 while (lock != SLEEPING_HEAD(gp)) {
2483                         nlock = lock->l_next;
2484                         if (lock->l_vnode->v_vfsp == vfsp) {
2485                                 ASSERT(IS_PXFS(lock));
2486                                 if (GETPXFSID(lock->l_flock.l_sysid) ==
2487                                     pxfsid) {
2488                                         flk_set_state(lock,
2489                                             FLK_CANCELLED_STATE);
2490                                         flk_cancel_sleeping_lock(lock, 1);
2491                                 }
2492                         }
2493                         lock = nlock;
2494                 }
2495 
2496                 /* delete active locks */
2497                 lock = ACTIVE_HEAD(gp)->l_next;
2498                 while (lock != ACTIVE_HEAD(gp)) {
2499                         nlock = lock->l_next;
2500                         if (lock->l_vnode->v_vfsp == vfsp) {
2501                                 ASSERT(IS_PXFS(lock));
2502                                 if (GETPXFSID(lock->l_flock.l_sysid) ==
2503                                     pxfsid) {
2504                                         flk_delete_active_lock(lock, 0);
2505                                         flk_wakeup(lock, 1);
2506                                         flk_free_lock(lock);
2507                                 }
2508                         }
2509                         lock = nlock;
2510                 }
2511                 mutex_exit(&gp->gp_mutex);
2512         }
2513 }
2514 
2515 /*
2516  * Search for a sleeping lock manager lock which matches exactly this lock
2517  * request; if one is found, fake a signal to cancel it.
2518  *
2519  * Return 1 if a matching lock was found, 0 otherwise.
2520  */
2521 
2522 static int
2523 flk_canceled(lock_descriptor_t *request)
2524 {
2525         lock_descriptor_t *lock, *nlock;
2526         graph_t *gp = request->l_graph;
2527         vnode_t *vp = request->l_vnode;
2528 
2529         ASSERT(MUTEX_HELD(&gp->gp_mutex));
2530         ASSERT(IS_LOCKMGR(request));
2531         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2532 
2533         if (lock) {
2534                 while (lock->l_vnode == vp) {
2535                         nlock = lock->l_next;
2536                         if (SAME_OWNER(lock, request) &&
2537                                 lock->l_start == request->l_start &&
2538                                         lock->l_end == request->l_end) {
2539                                 INTERRUPT_WAKEUP(lock);
2540                                 return (1);
2541                         }
2542                         lock = nlock;
2543                 }
2544         }
2545         return (0);
2546 }
2547 
2548 /*
2549  * Remove all the locks for the vnode belonging to the given pid and sysid.
2550  */
2551 
2552 void
2553 cleanlocks(vnode_t *vp, pid_t pid, int sysid)
2554 {
2555         graph_t *gp;
2556         lock_descriptor_t *lock, *nlock;
2557         lock_descriptor_t *link_stack;
2558 
2559         STACK_INIT(link_stack);
2560 
2561         gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2562 
2563         if (gp == NULL)
2564                 return;
2565         mutex_enter(&gp->gp_mutex);
2566 
2567         CHECK_SLEEPING_LOCKS(gp);
2568         CHECK_ACTIVE_LOCKS(gp);
2569 
2570         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2571 
2572         if (lock) {
2573                 do {
2574                         nlock = lock->l_next;
2575                         if ((lock->l_flock.l_pid == pid ||
2576                                         pid == IGN_PID) &&
2577                                 lock->l_flock.l_sysid == sysid) {
2578                                 CANCEL_WAKEUP(lock);
2579                         }
2580                         lock = nlock;
2581                 } while (lock->l_vnode == vp);
2582         }
2583 
2584         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2585 
2586         if (lock) {
2587                 do {
2588                         nlock = lock->l_next;
2589                         if ((lock->l_flock.l_pid == pid ||
2590                                         pid == IGN_PID) &&
2591                                 lock->l_flock.l_sysid == sysid) {
2592                                 flk_delete_active_lock(lock, 0);
2593                                 STACK_PUSH(link_stack, lock, l_stack);
2594                         }
2595                         lock = nlock;
2596                 } while (lock->l_vnode == vp);
2597         }
2598 
2599         while ((lock = STACK_TOP(link_stack)) != NULL) {
2600                 STACK_POP(link_stack, l_stack);
2601                 flk_wakeup(lock, 1);
2602                 flk_free_lock(lock);
2603         }
2604 
2605         CHECK_SLEEPING_LOCKS(gp);
2606         CHECK_ACTIVE_LOCKS(gp);
2607         CHECK_OWNER_LOCKS(gp, pid, sysid, vp);
2608         mutex_exit(&gp->gp_mutex);
2609 }
2610 
2611 
2612 /*
2613  * Called from 'fs' read and write routines for files that have mandatory
2614  * locking enabled.
2615  */
2616 
2617 int
2618 chklock(
2619         struct vnode    *vp,
2620         int             iomode,
2621         u_offset_t      offset,
2622         ssize_t         len,
2623         int             fmode,
2624         caller_context_t *ct)
2625 {
2626         register int    i;
2627         struct flock64  bf;
2628         int             error = 0;
2629 
2630         bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK;
2631         bf.l_whence = 0;
2632         bf.l_start = offset;
2633         bf.l_len = len;
2634         if (ct == NULL) {
2635                 bf.l_pid = curproc->p_pid;
2636                 bf.l_sysid = 0;
2637         } else {
2638                 bf.l_pid = ct->cc_pid;
2639                 bf.l_sysid = ct->cc_sysid;
2640         }
2641         i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK;
2642         if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 ||
2643             bf.l_type != F_UNLCK)
2644                 error = i ? i : EAGAIN;
2645         return (error);
2646 }
2647 
2648 /*
2649  * convoff - converts the given data (start, whence) to the
2650  * given whence.
2651  */
2652 int
2653 convoff(vp, lckdat, whence, offset)
2654         struct vnode    *vp;
2655         struct flock64  *lckdat;
2656         int             whence;
2657         offset_t        offset;
2658 {
2659         int             error;
2660         struct vattr    vattr;
2661 
2662         if ((lckdat->l_whence == 2) || (whence == 2)) {
2663                 vattr.va_mask = AT_SIZE;
2664                 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
2665                         return (error);
2666         }
2667 
2668         switch (lckdat->l_whence) {
2669         case 1:
2670                 lckdat->l_start += offset;
2671                 break;
2672         case 2:
2673                 lckdat->l_start += vattr.va_size;
2674                 /* FALLTHRU */
2675         case 0:
2676                 break;
2677         default:
2678                 return (EINVAL);
2679         }
2680 
2681         if (lckdat->l_start < 0)
2682                 return (EINVAL);
2683 
2684         switch (whence) {
2685         case 1:
2686                 lckdat->l_start -= offset;
2687                 break;
2688         case 2:
2689                 lckdat->l_start -= vattr.va_size;
2690                 /* FALLTHRU */
2691         case 0:
2692                 break;
2693         default:
2694                 return (EINVAL);
2695         }
2696 
2697         lckdat->l_whence = (short)whence;
2698         return (0);
2699 }
2700 
2701 
2702 /*      proc_graph function definitions */
2703 
2704 /*
2705  * Function checks for deadlock due to the new 'lock'. If deadlock found
2706  * edges of this lock are freed and returned.
2707  */
2708 
2709 static int
2710 flk_check_deadlock(lock_descriptor_t *lock)
2711 {
2712         proc_vertex_t   *start_vertex, *pvertex;
2713         proc_vertex_t *dvertex;
2714         proc_edge_t *pep, *ppep;
2715         edge_t  *ep, *nep;
2716         proc_vertex_t *process_stack;
2717 
2718         STACK_INIT(process_stack);
2719 
2720         mutex_enter(&flock_lock);
2721         start_vertex = flk_get_proc_vertex(lock);
2722         ASSERT(start_vertex != NULL);
2723 
2724         /* construct the edges from this process to other processes */
2725 
2726         ep = FIRST_ADJ(lock);
2727         while (ep != HEAD(lock)) {
2728                 proc_vertex_t *adj_proc;
2729 
2730                 adj_proc = flk_get_proc_vertex(ep->to_vertex);
2731                 for (pep = start_vertex->edge; pep != NULL; pep = pep->next) {
2732                         if (pep->to_proc == adj_proc) {
2733                                 ASSERT(pep->refcount);
2734                                 pep->refcount++;
2735                                 break;
2736                         }
2737                 }
2738                 if (pep == NULL) {
2739                         pep = flk_get_proc_edge();
2740                         pep->to_proc = adj_proc;
2741                         pep->refcount = 1;
2742                         adj_proc->incount++;
2743                         pep->next = start_vertex->edge;
2744                         start_vertex->edge = pep;
2745                 }
2746                 ep = NEXT_ADJ(ep);
2747         }
2748 
2749         ep = FIRST_IN(lock);
2750 
2751         while (ep != HEAD(lock)) {
2752                 proc_vertex_t *in_proc;
2753 
2754                 in_proc = flk_get_proc_vertex(ep->from_vertex);
2755 
2756                 for (pep = in_proc->edge; pep != NULL; pep = pep->next) {
2757                         if (pep->to_proc == start_vertex) {
2758                                 ASSERT(pep->refcount);
2759                                 pep->refcount++;
2760                                 break;
2761                         }
2762                 }
2763                 if (pep == NULL) {
2764                         pep = flk_get_proc_edge();
2765                         pep->to_proc = start_vertex;
2766                         pep->refcount = 1;
2767                         start_vertex->incount++;
2768                         pep->next = in_proc->edge;
2769                         in_proc->edge = pep;
2770                 }
2771                 ep = NEXT_IN(ep);
2772         }
2773 
2774         if (start_vertex->incount == 0) {
2775                 mutex_exit(&flock_lock);
2776                 return (0);
2777         }
2778 
2779         flk_proc_graph_uncolor();
2780 
2781         start_vertex->p_sedge = start_vertex->edge;
2782 
2783         STACK_PUSH(process_stack, start_vertex, p_stack);
2784 
2785         while ((pvertex = STACK_TOP(process_stack)) != NULL) {
2786                 for (pep = pvertex->p_sedge; pep != NULL; pep = pep->next) {
2787                         dvertex = pep->to_proc;
2788                         if (!PROC_ARRIVED(dvertex)) {
2789                                 STACK_PUSH(process_stack, dvertex, p_stack);
2790                                 dvertex->p_sedge = dvertex->edge;
2791                                 PROC_ARRIVE(pvertex);
2792                                 pvertex->p_sedge = pep->next;
2793                                 break;
2794                         }
2795                         if (!PROC_DEPARTED(dvertex))
2796                                 goto deadlock;
2797                 }
2798                 if (pep == NULL) {
2799                         PROC_DEPART(pvertex);
2800                         STACK_POP(process_stack, p_stack);
2801                 }
2802         }
2803         mutex_exit(&flock_lock);
2804         return (0);
2805 
2806 deadlock:
2807 
2808         /* we remove all lock edges and proc edges */
2809 
2810         ep = FIRST_ADJ(lock);
2811         while (ep != HEAD(lock)) {
2812                 proc_vertex_t *adj_proc;
2813                 adj_proc = flk_get_proc_vertex(ep->to_vertex);
2814                 nep = NEXT_ADJ(ep);
2815                 IN_LIST_REMOVE(ep);
2816                 ADJ_LIST_REMOVE(ep);
2817                 flk_free_edge(ep);
2818                 ppep = start_vertex->edge;
2819                 for (pep = start_vertex->edge; pep != NULL; ppep = pep,
2820                                                 pep = ppep->next) {
2821                         if (pep->to_proc == adj_proc) {
2822                                 pep->refcount--;
2823                                 if (pep->refcount == 0) {
2824                                         if (pep == ppep) {
2825                                                 start_vertex->edge = pep->next;
2826                                         } else {
2827                                                 ppep->next = pep->next;
2828                                         }
2829                                         adj_proc->incount--;
2830                                         flk_proc_release(adj_proc);
2831                                         flk_free_proc_edge(pep);
2832                                 }
2833                                 break;
2834                         }
2835                 }
2836                 ep = nep;
2837         }
2838         ep = FIRST_IN(lock);
2839         while (ep != HEAD(lock)) {
2840                 proc_vertex_t *in_proc;
2841                 in_proc = flk_get_proc_vertex(ep->from_vertex);
2842                 nep = NEXT_IN(ep);
2843                 IN_LIST_REMOVE(ep);
2844                 ADJ_LIST_REMOVE(ep);
2845                 flk_free_edge(ep);
2846                 ppep = in_proc->edge;
2847                 for (pep = in_proc->edge; pep != NULL; ppep = pep,
2848                                                 pep = ppep->next) {
2849                         if (pep->to_proc == start_vertex) {
2850                                 pep->refcount--;
2851                                 if (pep->refcount == 0) {
2852                                         if (pep == ppep) {
2853                                                 in_proc->edge = pep->next;
2854                                         } else {
2855                                                 ppep->next = pep->next;
2856                                         }
2857                                         start_vertex->incount--;
2858                                         flk_proc_release(in_proc);
2859                                         flk_free_proc_edge(pep);
2860                                 }
2861                                 break;
2862                         }
2863                 }
2864                 ep = nep;
2865         }
2866         flk_proc_release(start_vertex);
2867         mutex_exit(&flock_lock);
2868         return (1);
2869 }
2870 
2871 /*
2872  * Get a proc vertex. If lock's pvertex value gets a correct proc vertex
2873  * from the list we return that, otherwise we allocate one. If necessary,
2874  * we grow the list of vertices also.
2875  */
2876 
2877 static proc_vertex_t *
2878 flk_get_proc_vertex(lock_descriptor_t *lock)
2879 {
2880         int i;
2881         proc_vertex_t   *pv;
2882         proc_vertex_t   **palloc;
2883 
2884         ASSERT(MUTEX_HELD(&flock_lock));
2885         if (lock->pvertex != -1) {
2886                 ASSERT(lock->pvertex >= 0);
2887                 pv = pgraph.proc[lock->pvertex];
2888                 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
2889                         return (pv);
2890                 }
2891         }
2892         for (i = 0; i < pgraph.gcount; i++) {
2893                 pv = pgraph.proc[i];
2894                 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
2895                         lock->pvertex = pv->index = i;
2896                         return (pv);
2897                 }
2898         }
2899         pv = kmem_zalloc(sizeof (struct proc_vertex), KM_SLEEP);
2900         pv->pid = lock->l_flock.l_pid;
2901         pv->sysid = lock->l_flock.l_sysid;
2902         flk_proc_vertex_allocs++;
2903         if (pgraph.free != 0) {
2904                 for (i = 0; i < pgraph.gcount; i++) {
2905                         if (pgraph.proc[i] == NULL) {
2906                                 pgraph.proc[i] = pv;
2907                                 lock->pvertex = pv->index = i;
2908                                 pgraph.free--;
2909                                 return (pv);
2910                         }
2911                 }
2912         }
2913         palloc = kmem_zalloc((pgraph.gcount + PROC_CHUNK) *
2914                                 sizeof (proc_vertex_t *), KM_SLEEP);
2915 
2916         if (pgraph.proc) {
2917                 bcopy(pgraph.proc, palloc,
2918                         pgraph.gcount * sizeof (proc_vertex_t *));
2919 
2920                 kmem_free(pgraph.proc,
2921                         pgraph.gcount * sizeof (proc_vertex_t *));
2922         }
2923         pgraph.proc = palloc;
2924         pgraph.free += (PROC_CHUNK - 1);
2925         pv->index = lock->pvertex = pgraph.gcount;
2926         pgraph.gcount += PROC_CHUNK;
2927         pgraph.proc[pv->index] = pv;
2928         return (pv);
2929 }
2930 
2931 /*
2932  * Allocate a proc edge.
2933  */
2934 
2935 static proc_edge_t *
2936 flk_get_proc_edge()
2937 {
2938         proc_edge_t *pep;
2939 
2940         pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP);
2941         flk_proc_edge_allocs++;
2942         return (pep);
2943 }
2944 
2945 /*
2946  * Free the proc edge. Called whenever its reference count goes to zero.
2947  */
2948 
2949 static void
2950 flk_free_proc_edge(proc_edge_t *pep)
2951 {
2952         ASSERT(pep->refcount == 0);
2953         kmem_free((void *)pep, sizeof (proc_edge_t));
2954         flk_proc_edge_frees++;
2955 }
2956 
2957 /*
2958  * Color the graph explicitly done only when the mark value hits max value.
2959  */
2960 
2961 static void
2962 flk_proc_graph_uncolor()
2963 {
2964         int i;
2965 
2966         if (pgraph.mark == UINT_MAX) {
2967                 for (i = 0; i < pgraph.gcount; i++)
2968                         if (pgraph.proc[i] != NULL) {
2969                                 pgraph.proc[i]->atime = 0;
2970                                 pgraph.proc[i]->dtime = 0;
2971                         }
2972                 pgraph.mark = 1;
2973         } else {
2974                 pgraph.mark++;
2975         }
2976 }
2977 
2978 /*
2979  * Release the proc vertex iff both there are no in edges and out edges
2980  */
2981 
2982 static void
2983 flk_proc_release(proc_vertex_t *proc)
2984 {
2985         ASSERT(MUTEX_HELD(&flock_lock));
2986         if (proc->edge == NULL && proc->incount == 0) {
2987                 pgraph.proc[proc->index] = NULL;
2988                 pgraph.free++;
2989                 kmem_free(proc, sizeof (proc_vertex_t));
2990                 flk_proc_vertex_frees++;
2991         }
2992 }
2993 
2994 /*
2995  * Updates process graph to reflect change in a lock_graph.
2996  * Note: We should call this function only after we have a correctly
2997  * recomputed lock graph. Otherwise we might miss a deadlock detection.
2998  * eg: in function flk_relation() we call this function after flk_recompute_
2999  * dependencies() otherwise if a process tries to lock a vnode hashed
3000  * into another graph it might sleep for ever.
3001  */
3002 
3003 static void
3004 flk_update_proc_graph(edge_t *ep, int delete)
3005 {
3006         proc_vertex_t *toproc, *fromproc;
3007         proc_edge_t *pep, *prevpep;
3008 
3009         mutex_enter(&flock_lock);
3010         toproc = flk_get_proc_vertex(ep->to_vertex);
3011         fromproc = flk_get_proc_vertex(ep->from_vertex);
3012 
3013         if (!delete)
3014                 goto add;
3015         pep = prevpep = fromproc->edge;
3016 
3017         ASSERT(pep != NULL);
3018         while (pep != NULL) {
3019                 if (pep->to_proc == toproc) {
3020                         ASSERT(pep->refcount > 0);
3021                         pep->refcount--;
3022                         if (pep->refcount == 0) {
3023                                 if (pep == prevpep) {
3024                                         fromproc->edge = pep->next;
3025                                 } else {
3026                                         prevpep->next = pep->next;
3027                                 }
3028                                 toproc->incount--;
3029                                 flk_proc_release(toproc);
3030                                 flk_free_proc_edge(pep);
3031                         }
3032                         break;
3033                 }
3034                 prevpep = pep;
3035                 pep = pep->next;
3036         }
3037         flk_proc_release(fromproc);
3038         mutex_exit(&flock_lock);
3039         return;
3040 add:
3041 
3042         pep = fromproc->edge;
3043 
3044         while (pep != NULL) {
3045                 if (pep->to_proc == toproc) {
3046                         ASSERT(pep->refcount > 0);
3047                         pep->refcount++;
3048                         break;
3049                 }
3050                 pep = pep->next;
3051         }
3052         if (pep == NULL) {
3053                 pep = flk_get_proc_edge();
3054                 pep->to_proc = toproc;
3055                 pep->refcount = 1;
3056                 toproc->incount++;
3057                 pep->next = fromproc->edge;
3058                 fromproc->edge = pep;
3059         }
3060         mutex_exit(&flock_lock);
3061 }
3062 
3063 /*
3064  * Set the control status for lock manager requests.
3065  *
3066  */
3067 
3068 /*
3069  * PSARC case 1997/292
3070  *
3071  * Requires: "nlmid" must be >= 1 and <= clconf_maximum_nodeid().
3072  * Effects: Set the state of the NLM server identified by "nlmid"
3073  *   in the NLM registry to state "nlm_state."
3074  *   Raises exception no_such_nlm if "nlmid" doesn't identify a known
3075  *   NLM server to this LLM.
3076  *   Note that when this routine is called with NLM_SHUTTING_DOWN there
3077  *   may be locks requests that have gotten started but not finished.  In
3078  *   particular, there may be blocking requests that are in the callback code
3079  *   before sleeping (so they're not holding the lock for the graph).  If
3080  *   such a thread reacquires the graph's lock (to go to sleep) after
3081  *   NLM state in the NLM registry  is set to a non-up value,
3082  *   it will notice the status and bail out.  If the request gets
3083  *   granted before the thread can check the NLM registry, let it
3084  *   continue normally.  It will get flushed when we are called with NLM_DOWN.
3085  *
3086  * Modifies: nlm_reg_obj (global)
3087  * Arguments:
3088  *    nlmid     (IN):    id uniquely identifying an NLM server
3089  *    nlm_state (IN):    NLM server state to change "nlmid" to
3090  */
3091 void
3092 cl_flk_set_nlm_status(int nlmid, flk_nlm_status_t nlm_state)
3093 {
3094         /*
3095          * Check to see if node is booted as a cluster. If not, return.
3096          */
3097         if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3098                 return;
3099         }
3100 
3101         /*
3102          * Check for development/debugging.  It is possible to boot a node
3103          * in non-cluster mode, and then run a special script, currently
3104          * available only to developers, to bring up the node as part of a
3105          * cluster.  The problem is that running such a script does not
3106          * result in the routine flk_init() being called and hence global array
3107          * nlm_reg_status is NULL.  The NLM thinks it's in cluster mode,
3108          * but the LLM needs to do an additional check to see if the global
3109          * array has been created or not. If nlm_reg_status is NULL, then
3110          * return, else continue.
3111          */
3112         if (nlm_reg_status == NULL) {
3113                 return;
3114         }
3115 
3116         ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3117         mutex_enter(&nlm_reg_lock);
3118 
3119         if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status, nlmid)) {
3120                 /*
3121                  * If the NLM server "nlmid" is unknown in the NLM registry,
3122                  * add it to the registry in the nlm shutting down state.
3123                  */
3124                 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3125                         FLK_NLM_SHUTTING_DOWN);
3126         } else {
3127                 /*
3128                  * Change the state of the NLM server identified by "nlmid"
3129                  * in the NLM registry to the argument "nlm_state."
3130                  */
3131                 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3132                         nlm_state);
3133         }
3134 
3135         /*
3136          *  The reason we must register the NLM server that is shutting down
3137          *  with an LLM that doesn't already know about it (never sent a lock
3138          *  request) is to handle correctly a race between shutdown and a new
3139          *  lock request.  Suppose that a shutdown request from the NLM server
3140          *  invokes this routine at the LLM, and a thread is spawned to
3141          *  service the request. Now suppose a new lock request is in
3142          *  progress and has already passed the first line of defense in
3143          *  reclock(), which denies new locks requests from NLM servers
3144          *  that are not in the NLM_UP state.  After the current routine
3145          *  is invoked for both phases of shutdown, the routine will return,
3146          *  having done nothing, and the lock request will proceed and
3147          *  probably be granted.  The problem is that the shutdown was ignored
3148          *  by the lock request because there was no record of that NLM server
3149          *  shutting down.   We will be in the peculiar position of thinking
3150          *  that we've shutdown the NLM server and all locks at all LLMs have
3151          *  been discarded, but in fact there's still one lock held.
3152          *  The solution is to record the existence of NLM server and change
3153          *  its state immediately to NLM_SHUTTING_DOWN.  The lock request in
3154          *  progress may proceed because the next phase NLM_DOWN will catch
3155          *  this lock and discard it.
3156          */
3157         mutex_exit(&nlm_reg_lock);
3158 
3159         switch (nlm_state) {
3160         case FLK_NLM_UP:
3161                 /*
3162                  * Change the NLM state of all locks still held on behalf of
3163                  * the NLM server identified by "nlmid" to NLM_UP.
3164                  */
3165                 cl_flk_change_nlm_state_all_locks(nlmid, FLK_NLM_UP);
3166                 break;
3167 
3168         case FLK_NLM_SHUTTING_DOWN:
3169                 /*
3170                  * Wake up all sleeping locks for the NLM server identified
3171                  * by "nlmid." Note that eventually all woken threads will
3172                  * have their lock requests cancelled and descriptors
3173                  * removed from the sleeping lock list.  Note that the NLM
3174                  * server state associated with each lock descriptor is
3175                  * changed to FLK_NLM_SHUTTING_DOWN.
3176                  */
3177                 cl_flk_wakeup_sleeping_nlm_locks(nlmid);
3178                 break;
3179 
3180         case FLK_NLM_DOWN:
3181                 /*
3182                  * Discard all active, granted locks for this NLM server
3183                  * identified by "nlmid."
3184                  */
3185                 cl_flk_unlock_nlm_granted(nlmid);
3186                 break;
3187 
3188         default:
3189                 panic("cl_set_nlm_status: bad status (%d)", nlm_state);
3190         }
3191 }
3192 
3193 /*
3194  * Set the control status for lock manager requests.
3195  *
3196  * Note that when this routine is called with FLK_WAKEUP_SLEEPERS, there
3197  * may be locks requests that have gotten started but not finished.  In
3198  * particular, there may be blocking requests that are in the callback code
3199  * before sleeping (so they're not holding the lock for the graph).  If
3200  * such a thread reacquires the graph's lock (to go to sleep) after
3201  * flk_lockmgr_status is set to a non-up value, it will notice the status
3202  * and bail out.  If the request gets granted before the thread can check
3203  * flk_lockmgr_status, let it continue normally.  It will get flushed when
3204  * we are called with FLK_LOCKMGR_DOWN.
3205  */
3206 
3207 void
3208 flk_set_lockmgr_status(flk_lockmgr_status_t status)
3209 {
3210         int i;
3211         graph_t *gp;
3212         struct flock_globals *fg;
3213 
3214         fg = flk_get_globals();
3215         ASSERT(fg != NULL);
3216 
3217         mutex_enter(&flock_lock);
3218         fg->flk_lockmgr_status = status;
3219         mutex_exit(&flock_lock);
3220 
3221         /*
3222          * If the lock manager is coming back up, all that's needed is to
3223          * propagate this information to the graphs.  If the lock manager
3224          * is going down, additional action is required, and each graph's
3225          * copy of the state is updated atomically with this other action.
3226          */
3227         switch (status) {
3228         case FLK_LOCKMGR_UP:
3229                 for (i = 0; i < HASH_SIZE; i++) {
3230                         mutex_enter(&flock_lock);
3231                         gp = lock_graph[i];
3232                         mutex_exit(&flock_lock);
3233                         if (gp == NULL)
3234                                 continue;
3235                         mutex_enter(&gp->gp_mutex);
3236                         fg->lockmgr_status[i] = status;
3237                         mutex_exit(&gp->gp_mutex);
3238                 }
3239                 break;
3240         case FLK_WAKEUP_SLEEPERS:
3241                 wakeup_sleeping_lockmgr_locks(fg);
3242                 break;
3243         case FLK_LOCKMGR_DOWN:
3244                 unlock_lockmgr_granted(fg);
3245                 break;
3246         default:
3247                 panic("flk_set_lockmgr_status: bad status (%d)", status);
3248                 break;
3249         }
3250 }
3251 
3252 /*
3253  * This routine returns all the locks that are active or sleeping and are
3254  * associated with a particular set of identifiers.  If lock_state != 0, then
3255  * only locks that match the lock_state are returned. If lock_state == 0, then
3256  * all locks are returned. If pid == NOPID, the pid is ignored.  If
3257  * use_sysid is FALSE, then the sysid is ignored.  If vp is NULL, then the
3258  * vnode pointer is ignored.
3259  *
3260  * A list containing the vnode pointer and an flock structure
3261  * describing the lock is returned.  Each element in the list is
3262  * dynamically allocated and must be freed by the caller.  The
3263  * last item in the list is denoted by a NULL value in the ll_next
3264  * field.
3265  *
3266  * The vnode pointers returned are held.  The caller is responsible
3267  * for releasing these.  Note that the returned list is only a snapshot of
3268  * the current lock information, and that it is a snapshot of a moving
3269  * target (only one graph is locked at a time).
3270  */
3271 
3272 locklist_t *
3273 get_lock_list(int list_type, int lock_state, int sysid, boolean_t use_sysid,
3274                 pid_t pid, const vnode_t *vp, zoneid_t zoneid)
3275 {
3276         lock_descriptor_t       *lock;
3277         lock_descriptor_t       *graph_head;
3278         locklist_t              listhead;
3279         locklist_t              *llheadp;
3280         locklist_t              *llp;
3281         locklist_t              *lltp;
3282         graph_t                 *gp;
3283         int                     i;
3284         int                     first_index; /* graph index */
3285         int                     num_indexes; /* graph index */
3286 
3287         ASSERT((list_type == FLK_ACTIVE_STATE) ||
3288             (list_type == FLK_SLEEPING_STATE));
3289 
3290         /*
3291          * Get a pointer to something to use as a list head while building
3292          * the rest of the list.
3293          */
3294         llheadp = &listhead;
3295         lltp = llheadp;
3296         llheadp->ll_next = (locklist_t *)NULL;
3297 
3298         /* Figure out which graphs we want to look at. */
3299         if (vp == NULL) {
3300                 first_index = 0;
3301                 num_indexes = HASH_SIZE;
3302         } else {
3303                 first_index = HASH_INDEX(vp);
3304                 num_indexes = 1;
3305         }
3306 
3307         for (i = first_index; i < first_index + num_indexes; i++) {
3308                 mutex_enter(&flock_lock);
3309                 gp = lock_graph[i];
3310                 mutex_exit(&flock_lock);
3311                 if (gp == NULL) {
3312                         continue;
3313                 }
3314 
3315                 mutex_enter(&gp->gp_mutex);
3316                 graph_head = (list_type == FLK_ACTIVE_STATE) ?
3317                         ACTIVE_HEAD(gp) : SLEEPING_HEAD(gp);
3318                 for (lock = graph_head->l_next;
3319                     lock != graph_head;
3320                     lock = lock->l_next) {
3321                         if (use_sysid && lock->l_flock.l_sysid != sysid)
3322                                 continue;
3323                         if (pid != NOPID && lock->l_flock.l_pid != pid)
3324                                 continue;
3325                         if (vp != NULL && lock->l_vnode != vp)
3326                                 continue;
3327                         if (lock_state && !(lock_state & lock->l_state))
3328                                 continue;
3329                         if (zoneid != lock->l_zoneid && zoneid != ALL_ZONES)
3330                                 continue;
3331                         /*
3332                          * A matching lock was found.  Allocate
3333                          * space for a new locklist entry and fill
3334                          * it in.
3335                          */
3336                         llp = kmem_alloc(sizeof (locklist_t), KM_SLEEP);
3337                         lltp->ll_next = llp;
3338                         VN_HOLD(lock->l_vnode);
3339                         llp->ll_vp = lock->l_vnode;
3340                         create_flock(lock, &(llp->ll_flock));
3341                         llp->ll_next = (locklist_t *)NULL;
3342                         lltp = llp;
3343                 }
3344                 mutex_exit(&gp->gp_mutex);
3345         }
3346 
3347         llp = llheadp->ll_next;
3348         return (llp);
3349 }
3350 
3351 /*
3352  * These two functions are simply interfaces to get_lock_list.  They return
3353  * a list of sleeping or active locks for the given sysid and pid.  See
3354  * get_lock_list for details.
3355  *
3356  * In either case we don't particularly care to specify the zone of interest;
3357  * the sysid-space is global across zones, so the sysid will map to exactly one
3358  * zone, and we'll return information for that zone.
3359  */
3360 
3361 locklist_t *
3362 flk_get_sleeping_locks(int sysid, pid_t pid)
3363 {
3364         return (get_lock_list(FLK_SLEEPING_STATE, 0, sysid, B_TRUE, pid, NULL,
3365                     ALL_ZONES));
3366 }
3367 
3368 locklist_t *
3369 flk_get_active_locks(int sysid, pid_t pid)
3370 {
3371         return (get_lock_list(FLK_ACTIVE_STATE, 0, sysid, B_TRUE, pid, NULL,
3372                     ALL_ZONES));
3373 }
3374 
3375 /*
3376  * Another interface to get_lock_list.  This one returns all the active
3377  * locks for a given vnode.  Again, see get_lock_list for details.
3378  *
3379  * We don't need to specify which zone's locks we're interested in.  The matter
3380  * would only be interesting if the vnode belonged to NFS, and NFS vnodes can't
3381  * be used by multiple zones, so the list of locks will all be from the right
3382  * zone.
3383  */
3384 
3385 locklist_t *
3386 flk_active_locks_for_vp(const vnode_t *vp)
3387 {
3388         return (get_lock_list(FLK_ACTIVE_STATE, 0, 0, B_FALSE, NOPID, vp,
3389                     ALL_ZONES));
3390 }
3391 
3392 /*
3393  * Another interface to get_lock_list.  This one returns all the active
3394  * nbmand locks for a given vnode.  Again, see get_lock_list for details.
3395  *
3396  * See the comment for flk_active_locks_for_vp() for why we don't care to
3397  * specify the particular zone of interest.
3398  */
3399 locklist_t *
3400 flk_active_nbmand_locks_for_vp(const vnode_t *vp)
3401 {
3402         return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3403                                 NOPID, vp, ALL_ZONES));
3404 }
3405 
3406 /*
3407  * Another interface to get_lock_list.  This one returns all the active
3408  * nbmand locks for a given pid.  Again, see get_lock_list for details.
3409  *
3410  * The zone doesn't need to be specified here; the locks held by a
3411  * particular process will either be local (ie, non-NFS) or from the zone
3412  * the process is executing in.  This is because other parts of the system
3413  * ensure that an NFS vnode can't be used in a zone other than that in
3414  * which it was opened.
3415  */
3416 locklist_t *
3417 flk_active_nbmand_locks(pid_t pid)
3418 {
3419         return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3420                                 pid, NULL, ALL_ZONES));
3421 }
3422 
3423 /*
3424  * Free up all entries in the locklist.
3425  */
3426 void
3427 flk_free_locklist(locklist_t *llp)
3428 {
3429         locklist_t *next_llp;
3430 
3431         while (llp) {
3432                 next_llp = llp->ll_next;
3433                 VN_RELE(llp->ll_vp);
3434                 kmem_free(llp, sizeof (*llp));
3435                 llp = next_llp;
3436         }
3437 }
3438 
3439 static void
3440 cl_flk_change_nlm_state_all_locks(int nlmid, flk_nlm_status_t nlm_state)
3441 {
3442         /*
3443          * For each graph "lg" in the hash table lock_graph do
3444          * a.  Get the list of sleeping locks
3445          * b.  For each lock descriptor in the list do
3446          *      i.   If the requested lock is an NLM server request AND
3447          *              the nlmid is the same as the routine argument then
3448          *              change the lock descriptor's state field to
3449          *              "nlm_state."
3450          * c.  Get the list of active locks
3451          * d.  For each lock descriptor in the list do
3452          *      i.   If the requested lock is an NLM server request AND
3453          *              the nlmid is the same as the routine argument then
3454          *              change the lock descriptor's state field to
3455          *              "nlm_state."
3456          */
3457 
3458         int                     i;
3459         graph_t                 *gp;                    /* lock graph */
3460         lock_descriptor_t       *lock;                  /* lock */
3461         lock_descriptor_t       *nlock = NULL;          /* next lock */
3462         int                     lock_nlmid;
3463 
3464         for (i = 0; i < HASH_SIZE; i++) {
3465                 mutex_enter(&flock_lock);
3466                 gp = lock_graph[i];
3467                 mutex_exit(&flock_lock);
3468                 if (gp == NULL) {
3469                         continue;
3470                 }
3471 
3472                 /* Get list of sleeping locks in current lock graph. */
3473                 mutex_enter(&gp->gp_mutex);
3474                 for (lock = SLEEPING_HEAD(gp)->l_next;
3475                     lock != SLEEPING_HEAD(gp);
3476                     lock = nlock) {
3477                         nlock = lock->l_next;
3478                         /* get NLM id */
3479                         lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3480 
3481                         /*
3482                          * If NLM server request AND nlmid of lock matches
3483                          * nlmid of argument, then set the NLM state of the
3484                          * lock to "nlm_state."
3485                          */
3486                         if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3487                                 SET_NLM_STATE(lock, nlm_state);
3488                         }
3489                 }
3490 
3491                 /* Get list of active locks in current lock graph. */
3492                 for (lock = ACTIVE_HEAD(gp)->l_next;
3493                     lock != ACTIVE_HEAD(gp);
3494                     lock = nlock) {
3495                         nlock = lock->l_next;
3496                         /* get NLM id */
3497                         lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3498 
3499                         /*
3500                          * If NLM server request AND nlmid of lock matches
3501                          * nlmid of argument, then set the NLM state of the
3502                          * lock to "nlm_state."
3503                          */
3504                         if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3505                                 ASSERT(IS_ACTIVE(lock));
3506                                 SET_NLM_STATE(lock, nlm_state);
3507                         }
3508                 }
3509                 mutex_exit(&gp->gp_mutex);
3510         }
3511 }
3512 
3513 /*
3514  * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid().
3515  * Effects: Find all sleeping lock manager requests _only_ for the NLM server
3516  *   identified by "nlmid." Poke those lock requests.
3517  */
3518 static void
3519 cl_flk_wakeup_sleeping_nlm_locks(int nlmid)
3520 {
3521         lock_descriptor_t *lock;
3522         lock_descriptor_t *nlock = NULL; /* next lock */
3523         int i;
3524         graph_t *gp;
3525         int     lock_nlmid;
3526 
3527         for (i = 0; i < HASH_SIZE; i++) {
3528                 mutex_enter(&flock_lock);
3529                 gp = lock_graph[i];
3530                 mutex_exit(&flock_lock);
3531                 if (gp == NULL) {
3532                         continue;
3533                 }
3534 
3535                 mutex_enter(&gp->gp_mutex);
3536                 for (lock = SLEEPING_HEAD(gp)->l_next;
3537                     lock != SLEEPING_HEAD(gp);
3538                     lock = nlock) {
3539                         nlock = lock->l_next;
3540                         /*
3541                          * If NLM server request _and_ nlmid of lock matches
3542                          * nlmid of argument, then set the NLM state of the
3543                          * lock to NLM_SHUTTING_DOWN, and wake up sleeping
3544                          * request.
3545                          */
3546                         if (IS_LOCKMGR(lock)) {
3547                                 /* get NLM id */
3548                                 lock_nlmid =
3549                                         GETNLMID(lock->l_flock.l_sysid);
3550                                 if (nlmid == lock_nlmid) {
3551                                         SET_NLM_STATE(lock,
3552                                                 FLK_NLM_SHUTTING_DOWN);
3553                                         INTERRUPT_WAKEUP(lock);
3554                                 }
3555                         }
3556                 }
3557                 mutex_exit(&gp->gp_mutex);
3558         }
3559 }
3560 
3561 /*
3562  * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid()
3563  * Effects:  Find all active (granted) lock manager locks _only_ for the
3564  *   NLM server identified by "nlmid" and release them.
3565  */
3566 static void
3567 cl_flk_unlock_nlm_granted(int nlmid)
3568 {
3569         lock_descriptor_t *lock;
3570         lock_descriptor_t *nlock = NULL; /* next lock */
3571         int i;
3572         graph_t *gp;
3573         int     lock_nlmid;
3574 
3575         for (i = 0; i < HASH_SIZE; i++) {
3576                 mutex_enter(&flock_lock);
3577                 gp = lock_graph[i];
3578                 mutex_exit(&flock_lock);
3579                 if (gp == NULL) {
3580                         continue;
3581                 }
3582 
3583                 mutex_enter(&gp->gp_mutex);
3584                 for (lock = ACTIVE_HEAD(gp)->l_next;
3585                     lock != ACTIVE_HEAD(gp);
3586                     lock = nlock) {
3587                         nlock = lock->l_next;
3588                         ASSERT(IS_ACTIVE(lock));
3589 
3590                         /*
3591                          * If it's an  NLM server request _and_ nlmid of
3592                          * the lock matches nlmid of argument, then
3593                          * remove the active lock the list, wakup blocked
3594                          * threads, and free the storage for the lock.
3595                          * Note that there's no need to mark the NLM state
3596                          * of this lock to NLM_DOWN because the lock will
3597                          * be deleted anyway and its storage freed.
3598                          */
3599                         if (IS_LOCKMGR(lock)) {
3600                                 /* get NLM id */
3601                                 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3602                                 if (nlmid == lock_nlmid) {
3603                                         flk_delete_active_lock(lock, 0);
3604                                         flk_wakeup(lock, 1);
3605                                         flk_free_lock(lock);
3606                                 }
3607                         }
3608                 }
3609                 mutex_exit(&gp->gp_mutex);
3610         }
3611 }
3612 
3613 /*
3614  * Find all sleeping lock manager requests and poke them.
3615  */
3616 static void
3617 wakeup_sleeping_lockmgr_locks(struct flock_globals *fg)
3618 {
3619         lock_descriptor_t *lock;
3620         lock_descriptor_t *nlock = NULL; /* next lock */
3621         int i;
3622         graph_t *gp;
3623         zoneid_t zoneid = getzoneid();
3624 
3625         for (i = 0; i < HASH_SIZE; i++) {
3626                 mutex_enter(&flock_lock);
3627                 gp = lock_graph[i];
3628                 mutex_exit(&flock_lock);
3629                 if (gp == NULL) {
3630                         continue;
3631                 }
3632 
3633                 mutex_enter(&gp->gp_mutex);
3634                 fg->lockmgr_status[i] = FLK_WAKEUP_SLEEPERS;
3635                 for (lock = SLEEPING_HEAD(gp)->l_next;
3636                     lock != SLEEPING_HEAD(gp);
3637                     lock = nlock) {
3638                         nlock = lock->l_next;
3639                         if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
3640                                 INTERRUPT_WAKEUP(lock);
3641                         }
3642                 }
3643                 mutex_exit(&gp->gp_mutex);
3644         }
3645 }
3646 
3647 
3648 /*
3649  * Find all active (granted) lock manager locks and release them.
3650  */
3651 static void
3652 unlock_lockmgr_granted(struct flock_globals *fg)
3653 {
3654         lock_descriptor_t *lock;
3655         lock_descriptor_t *nlock = NULL; /* next lock */
3656         int i;
3657         graph_t *gp;
3658         zoneid_t zoneid = getzoneid();
3659 
3660         for (i = 0; i < HASH_SIZE; i++) {
3661                 mutex_enter(&flock_lock);
3662                 gp = lock_graph[i];
3663                 mutex_exit(&flock_lock);
3664                 if (gp == NULL) {
3665                         continue;
3666                 }
3667 
3668                 mutex_enter(&gp->gp_mutex);
3669                 fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN;
3670                 for (lock = ACTIVE_HEAD(gp)->l_next;
3671                     lock != ACTIVE_HEAD(gp);
3672                     lock = nlock) {
3673                         nlock = lock->l_next;
3674                         if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
3675                                 ASSERT(IS_ACTIVE(lock));
3676                                 flk_delete_active_lock(lock, 0);
3677                                 flk_wakeup(lock, 1);
3678                                 flk_free_lock(lock);
3679                         }
3680                 }
3681                 mutex_exit(&gp->gp_mutex);
3682         }
3683 }
3684 
3685 
3686 /*
3687  * Wait until a lock is granted, cancelled, or interrupted.
3688  */
3689 
3690 static void
3691 wait_for_lock(lock_descriptor_t *request)
3692 {
3693         graph_t *gp = request->l_graph;
3694 
3695         ASSERT(MUTEX_HELD(&gp->gp_mutex));
3696 
3697         while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) &&
3698             !(IS_INTERRUPTED(request))) {
3699                 if (!cv_wait_sig(&request->l_cv, &gp->gp_mutex)) {
3700                         flk_set_state(request, FLK_INTERRUPTED_STATE);
3701                         request->l_state |= INTERRUPTED_LOCK;
3702                 }
3703         }
3704 }
3705 
3706 /*
3707  * Create an flock structure from the existing lock information
3708  *
3709  * This routine is used to create flock structures for the lock manager
3710  * to use in a reclaim request.  Since the lock was originated on this
3711  * host, it must be conforming to UNIX semantics, so no checking is
3712  * done to make sure it falls within the lower half of the 32-bit range.
3713  */
3714 
3715 static void
3716 create_flock(lock_descriptor_t *lp, flock64_t *flp)
3717 {
3718         ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND);
3719         ASSERT(lp->l_end >= lp->l_start);
3720 
3721         flp->l_type = lp->l_type;
3722         flp->l_whence = 0;
3723         flp->l_start = lp->l_start;
3724         flp->l_len = (lp->l_end == MAX_U_OFFSET_T) ? 0 :
3725                 (lp->l_end - lp->l_start + 1);
3726         flp->l_sysid = lp->l_flock.l_sysid;
3727         flp->l_pid = lp->l_flock.l_pid;
3728 }
3729 
3730 /*
3731  * Convert flock_t data describing a lock range into unsigned long starting
3732  * and ending points, which are put into lock_request.  Returns 0 or an
3733  * errno value.
3734  * Large Files: max is passed by the caller and we return EOVERFLOW
3735  * as defined by LFS API.
3736  */
3737 
3738 int
3739 flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
3740     u_offset_t *start, u_offset_t *end, offset_t offset)
3741 {
3742         struct vattr    vattr;
3743         int     error;
3744 
3745         /*
3746          * Determine the starting point of the request
3747          */
3748         switch (flp->l_whence) {
3749         case 0:         /* SEEK_SET */
3750                 *start = (u_offset_t)flp->l_start;
3751                 break;
3752         case 1:         /* SEEK_CUR */
3753                 *start = (u_offset_t)(flp->l_start + offset);
3754                 break;
3755         case 2:         /* SEEK_END */
3756                 vattr.va_mask = AT_SIZE;
3757                 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
3758                         return (error);
3759                 *start = (u_offset_t)(flp->l_start + vattr.va_size);
3760                 break;
3761         default:
3762                 return (EINVAL);
3763         }
3764 
3765         /*
3766          * Determine the range covered by the request.
3767          */
3768         if (flp->l_len == 0)
3769                 *end = MAX_U_OFFSET_T;
3770         else if ((offset_t)flp->l_len > 0) {
3771                 *end = (u_offset_t)(*start + (flp->l_len - 1));
3772         } else {
3773                 /*
3774                  * Negative length; why do we even allow this ?
3775                  * Because this allows easy specification of
3776                  * the last n bytes of the file.
3777                  */
3778                 *end = *start;
3779                 *start += (u_offset_t)flp->l_len;
3780                 (*start)++;
3781         }
3782         return (0);
3783 }
3784 
3785 /*
3786  * Check the validity of lock data.  This can used by the NFS
3787  * frlock routines to check data before contacting the server.  The
3788  * server must support semantics that aren't as restrictive as
3789  * the UNIX API, so the NFS client is required to check.
3790  * The maximum is now passed in by the caller.
3791  */
3792 
3793 int
3794 flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
3795 {
3796         /*
3797          * The end (length) for local locking should never be greater
3798          * than MAXEND. However, the representation for
3799          * the entire file is MAX_U_OFFSET_T.
3800          */
3801         if ((start > max) ||
3802             ((end > max) && (end != MAX_U_OFFSET_T))) {
3803                 return (EINVAL);
3804         }
3805         if (start > end) {
3806             return (EINVAL);
3807         }
3808         return (0);
3809 }
3810 
3811 /*
3812  * Fill in request->l_flock with information about the lock blocking the
3813  * request.  The complexity here is that lock manager requests are allowed
3814  * to see into the upper part of the 32-bit address range, whereas local
3815  * requests are only allowed to see signed values.
3816  *
3817  * What should be done when "blocker" is a lock manager lock that uses the
3818  * upper portion of the 32-bit range, but "request" is local?  Since the
3819  * request has already been determined to have been blocked by the blocker,
3820  * at least some portion of "blocker" must be in the range of the request,
3821  * or the request extends to the end of file.  For the first case, the
3822  * portion in the lower range is returned with the indication that it goes
3823  * "to EOF."  For the second case, the last byte of the lower range is
3824  * returned with the indication that it goes "to EOF."
3825  */
3826 
3827 static void
3828 report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request)
3829 {
3830         flock64_t *flrp;                        /* l_flock portion of request */
3831 
3832         ASSERT(blocker != NULL);
3833 
3834         flrp = &request->l_flock;
3835         flrp->l_whence = 0;
3836         flrp->l_type = blocker->l_type;
3837         flrp->l_pid = blocker->l_flock.l_pid;
3838         flrp->l_sysid = blocker->l_flock.l_sysid;
3839 
3840         if (IS_LOCKMGR(request)) {
3841                 flrp->l_start = blocker->l_start;
3842                 if (blocker->l_end == MAX_U_OFFSET_T)
3843                         flrp->l_len = 0;
3844                 else
3845                         flrp->l_len = blocker->l_end - blocker->l_start + 1;
3846         } else {
3847                 if (blocker->l_start > MAXEND) {
3848                         flrp->l_start = MAXEND;
3849                         flrp->l_len = 0;
3850                 } else {
3851                         flrp->l_start = blocker->l_start;
3852                         if (blocker->l_end == MAX_U_OFFSET_T)
3853                                 flrp->l_len = 0;
3854                         else
3855                                 flrp->l_len = blocker->l_end -
3856                                         blocker->l_start + 1;
3857                 }
3858         }
3859 }
3860 
3861 /*
3862  * PSARC case 1997/292
3863  */
3864 /*
3865  * This is the public routine exported by flock.h.
3866  */
3867 void
3868 cl_flk_change_nlm_state_to_unknown(int nlmid)
3869 {
3870         /*
3871          * Check to see if node is booted as a cluster. If not, return.
3872          */
3873         if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3874                 return;
3875         }
3876 
3877         /*
3878          * See comment in cl_flk_set_nlm_status().
3879          */
3880         if (nlm_reg_status == NULL) {
3881                 return;
3882         }
3883 
3884         /*
3885          * protect NLM registry state with a mutex.
3886          */
3887         ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3888         mutex_enter(&nlm_reg_lock);
3889         FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, FLK_NLM_UNKNOWN);
3890         mutex_exit(&nlm_reg_lock);
3891 }
3892 
3893 /*
3894  * Return non-zero if the given I/O request conflicts with an active NBMAND
3895  * lock.
3896  * If svmand is non-zero, it means look at all active locks, not just NBMAND
3897  * locks.
3898  */
3899 
3900 int
3901 nbl_lock_conflict(vnode_t *vp, nbl_op_t op, u_offset_t offset,
3902                 ssize_t length, int svmand, caller_context_t *ct)
3903 {
3904         int conflict = 0;
3905         graph_t                 *gp;
3906         lock_descriptor_t       *lock;
3907         pid_t pid;
3908         int sysid;
3909 
3910         if (ct == NULL) {
3911                 pid = curproc->p_pid;
3912                 sysid = 0;
3913         } else {
3914                 pid = ct->cc_pid;
3915                 sysid = ct->cc_sysid;
3916         }
3917 
3918         mutex_enter(&flock_lock);
3919         gp = lock_graph[HASH_INDEX(vp)];
3920         mutex_exit(&flock_lock);
3921         if (gp == NULL)
3922                 return (0);
3923 
3924         mutex_enter(&gp->gp_mutex);
3925         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
3926 
3927         for (; lock && lock->l_vnode == vp; lock = lock->l_next) {
3928                 if ((svmand || (lock->l_state & NBMAND_LOCK)) &&
3929                     (lock->l_flock.l_sysid != sysid ||
3930                     lock->l_flock.l_pid != pid) &&
3931                     lock_blocks_io(op, offset, length,
3932                                 lock->l_type, lock->l_start, lock->l_end)) {
3933                         conflict = 1;
3934                         break;
3935                 }
3936         }
3937         mutex_exit(&gp->gp_mutex);
3938 
3939         return (conflict);
3940 }
3941 
3942 /*
3943  * Return non-zero if the given I/O request conflicts with the given lock.
3944  */
3945 
3946 static int
3947 lock_blocks_io(nbl_op_t op, u_offset_t offset, ssize_t length,
3948             int lock_type, u_offset_t lock_start, u_offset_t lock_end)
3949 {
3950         ASSERT(op == NBL_READ || op == NBL_WRITE || op == NBL_READWRITE);
3951         ASSERT(lock_type == F_RDLCK || lock_type == F_WRLCK);
3952 
3953         if (op == NBL_READ && lock_type == F_RDLCK)
3954                 return (0);
3955 
3956         if (offset <= lock_start && lock_start < offset + length)
3957                 return (1);
3958         if (lock_start <= offset && offset <= lock_end)
3959                 return (1);
3960 
3961         return (0);
3962 }
3963 
3964 #ifdef DEBUG
3965 static void
3966 check_active_locks(graph_t *gp)
3967 {
3968         lock_descriptor_t *lock, *lock1;
3969         edge_t  *ep;
3970 
3971         for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
3972                                                 lock = lock->l_next) {
3973                 ASSERT(IS_ACTIVE(lock));
3974                 ASSERT(NOT_BLOCKED(lock));
3975                 ASSERT(!IS_BARRIER(lock));
3976 
3977                 ep = FIRST_IN(lock);
3978 
3979                 while (ep != HEAD(lock)) {
3980                         ASSERT(IS_SLEEPING(ep->from_vertex));
3981                         ASSERT(!NOT_BLOCKED(ep->from_vertex));
3982                         ep = NEXT_IN(ep);
3983                 }
3984 
3985                 for (lock1 = lock->l_next; lock1 != ACTIVE_HEAD(gp);
3986                                         lock1 = lock1->l_next) {
3987                         if (lock1->l_vnode == lock->l_vnode) {
3988                         if (BLOCKS(lock1, lock)) {
3989                                 cmn_err(CE_PANIC,
3990                                     "active lock %p blocks %p",
3991                                     (void *)lock1, (void *)lock);
3992                         } else if (BLOCKS(lock, lock1)) {
3993                                 cmn_err(CE_PANIC,
3994                                     "active lock %p blocks %p",
3995                                     (void *)lock, (void *)lock1);
3996                         }
3997                         }
3998                 }
3999         }
4000 }
4001 
4002 /*
4003  * Effect: This functions checks to see if the transition from 'old_state' to
4004  *      'new_state' is a valid one.  It returns 0 if the transition is valid
4005  *      and 1 if it is not.
4006  *      For a map of valid transitions, see sys/flock_impl.h
4007  */
4008 static int
4009 check_lock_transition(int old_state, int new_state)
4010 {
4011         switch (old_state) {
4012         case FLK_INITIAL_STATE:
4013                 if ((new_state == FLK_START_STATE) ||
4014                     (new_state == FLK_SLEEPING_STATE) ||
4015                     (new_state == FLK_ACTIVE_STATE) ||
4016                     (new_state == FLK_DEAD_STATE)) {
4017                         return (0);
4018                 } else {
4019                         return (1);
4020                 }
4021         case FLK_START_STATE:
4022                 if ((new_state == FLK_ACTIVE_STATE) ||
4023                     (new_state == FLK_DEAD_STATE)) {
4024                         return (0);
4025                 } else {
4026                         return (1);
4027                 }
4028         case FLK_ACTIVE_STATE:
4029                 if (new_state == FLK_DEAD_STATE) {
4030                         return (0);
4031                 } else {
4032                         return (1);
4033                 }
4034         case FLK_SLEEPING_STATE:
4035                 if ((new_state == FLK_GRANTED_STATE) ||
4036                     (new_state == FLK_INTERRUPTED_STATE) ||
4037                     (new_state == FLK_CANCELLED_STATE)) {
4038                         return (0);
4039                 } else {
4040                         return (1);
4041                 }
4042         case FLK_GRANTED_STATE:
4043                 if ((new_state == FLK_START_STATE) ||
4044                     (new_state == FLK_INTERRUPTED_STATE) ||
4045                     (new_state == FLK_CANCELLED_STATE)) {
4046                         return (0);
4047                 } else {
4048                         return (1);
4049                 }
4050         case FLK_CANCELLED_STATE:
4051                 if ((new_state == FLK_INTERRUPTED_STATE) ||
4052                     (new_state == FLK_DEAD_STATE)) {
4053                         return (0);
4054                 } else {
4055                         return (1);
4056                 }
4057         case FLK_INTERRUPTED_STATE:
4058                 if (new_state == FLK_DEAD_STATE) {
4059                         return (0);
4060                 } else {
4061                         return (1);
4062                 }
4063         case FLK_DEAD_STATE:
4064                 /* May be set more than once */
4065                 if (new_state == FLK_DEAD_STATE) {
4066                         return (0);
4067                 } else {
4068                         return (1);
4069                 }
4070         default:
4071                 return (1);
4072         }
4073 }
4074 
4075 static void
4076 check_sleeping_locks(graph_t *gp)
4077 {
4078         lock_descriptor_t *lock1, *lock2;
4079         edge_t *ep;
4080         for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp);
4081                                 lock1 = lock1->l_next) {
4082                                 ASSERT(!IS_BARRIER(lock1));
4083         for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp);
4084                                 lock2 = lock2->l_next) {
4085                 if (lock1->l_vnode == lock2->l_vnode) {
4086                         if (BLOCKS(lock2, lock1)) {
4087                                 ASSERT(!IS_GRANTED(lock1));
4088                                 ASSERT(!NOT_BLOCKED(lock1));
4089                                 path(lock1, lock2);
4090                         }
4091                 }
4092         }
4093 
4094         for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp);
4095                                         lock2 = lock2->l_next) {
4096                                 ASSERT(!IS_BARRIER(lock1));
4097                 if (lock1->l_vnode == lock2->l_vnode) {
4098                         if (BLOCKS(lock2, lock1)) {
4099                                 ASSERT(!IS_GRANTED(lock1));
4100                                 ASSERT(!NOT_BLOCKED(lock1));
4101                                 path(lock1, lock2);
4102                         }
4103                 }
4104         }
4105         ep = FIRST_ADJ(lock1);
4106         while (ep != HEAD(lock1)) {
4107                 ASSERT(BLOCKS(ep->to_vertex, lock1));
4108                 ep = NEXT_ADJ(ep);
4109         }
4110         }
4111 }
4112 
4113 static int
4114 level_two_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2, int no_path)
4115 {
4116         edge_t  *ep;
4117         lock_descriptor_t       *vertex;
4118         lock_descriptor_t *vertex_stack;
4119 
4120         STACK_INIT(vertex_stack);
4121 
4122         flk_graph_uncolor(lock1->l_graph);
4123         ep = FIRST_ADJ(lock1);
4124         ASSERT(ep != HEAD(lock1));
4125         while (ep != HEAD(lock1)) {
4126                 if (no_path)
4127                         ASSERT(ep->to_vertex != lock2);
4128                 STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4129                 COLOR(ep->to_vertex);
4130                 ep = NEXT_ADJ(ep);
4131         }
4132 
4133         while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
4134                 STACK_POP(vertex_stack, l_dstack);
4135                 for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
4136                                                 ep = NEXT_ADJ(ep)) {
4137                         if (COLORED(ep->to_vertex))
4138                                 continue;
4139                         COLOR(ep->to_vertex);
4140                         if (ep->to_vertex == lock2)
4141                                 return (1);
4142 
4143                         STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4144                 }
4145         }
4146         return (0);
4147 }
4148 
4149 static void
4150 check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp)
4151 {
4152         lock_descriptor_t *lock;
4153 
4154         SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
4155 
4156         if (lock) {
4157                 while (lock != ACTIVE_HEAD(gp) && (lock->l_vnode == vp)) {
4158                         if (lock->l_flock.l_pid == pid &&
4159                             lock->l_flock.l_sysid == sysid)
4160                                 cmn_err(CE_PANIC,
4161                                     "owner pid %d's lock %p in active queue",
4162                                     pid, (void *)lock);
4163                         lock = lock->l_next;
4164                 }
4165         }
4166         SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
4167 
4168         if (lock) {
4169                 while (lock != SLEEPING_HEAD(gp) && (lock->l_vnode == vp)) {
4170                         if (lock->l_flock.l_pid == pid &&
4171                             lock->l_flock.l_sysid == sysid)
4172                                 cmn_err(CE_PANIC,
4173                                     "owner pid %d's lock %p in sleep queue",
4174                                     pid, (void *)lock);
4175                         lock = lock->l_next;
4176                 }
4177         }
4178 }
4179 
4180 static int
4181 level_one_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4182 {
4183         edge_t *ep = FIRST_ADJ(lock1);
4184 
4185         while (ep != HEAD(lock1)) {
4186                 if (ep->to_vertex == lock2)
4187                         return (1);
4188                 else
4189                         ep = NEXT_ADJ(ep);
4190         }
4191         return (0);
4192 }
4193 
4194 static int
4195 no_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4196 {
4197         return (!level_two_path(lock1, lock2, 1));
4198 }
4199 
4200 static void
4201 path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4202 {
4203         if (level_one_path(lock1, lock2)) {
4204                 if (level_two_path(lock1, lock2, 0) != 0) {
4205                         cmn_err(CE_WARN,
4206                             "one edge one path from lock1 %p lock2 %p",
4207                             (void *)lock1, (void *)lock2);
4208                 }
4209         } else if (no_path(lock1, lock2)) {
4210                 cmn_err(CE_PANIC,
4211                     "No path from  lock1 %p to lock2 %p",
4212                     (void *)lock1, (void *)lock2);
4213         }
4214 }
4215 #endif /* DEBUG */