6253 F_GETLK doesn't always return lock owner
The F_GETLK fcntl doesn't return the offending lock when a read lock is
already held on the file, a write lock is waiting behind it, and another
read lock is then requested. The waiting write lock blocks that request,
but without this patch it isn't returned by F_GETLK.
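
As a hedged illustration (not part of the webrev; the file name, ranges and
timing are arbitrary), the scenario can be reproduced from user space roughly
as follows: process A holds a read lock, process B blocks waiting for a write
lock over the same range, and process C then asks F_GETLK whether a read lock
could be placed. Under the policy implemented in flk_process_request() below,
a read F_SETLK would not be granted while the writer waits, so F_GETLK should
report B's pending write lock; without the patch it reports F_UNLCK.

/*
 * Hypothetical reproduction sketch; error handling kept minimal.
 */
#include <sys/types.h>
#include <sys/wait.h>
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static void
set_lock(int fd, short type, int cmd)
{
	struct flock fl;

	fl.l_type = type;
	fl.l_whence = SEEK_SET;
	fl.l_start = 0;
	fl.l_len = 0;			/* whole file */
	if (fcntl(fd, cmd, &fl) == -1) {
		perror("fcntl");
		exit(1);
	}
}

int
main(void)
{
	int fd = open("/tmp/getlk_demo", O_RDWR | O_CREAT, 0644);
	pid_t writer, querier;
	struct flock fl;

	if (fd == -1) {
		perror("open");
		return (1);
	}

	/* A: this process takes and holds a read lock. */
	set_lock(fd, F_RDLCK, F_SETLK);

	/* B: child blocks waiting for a write lock behind A's read lock. */
	if ((writer = fork()) == 0) {
		set_lock(fd, F_WRLCK, F_SETLKW);	/* sleeps here */
		pause();
		_exit(0);
	}
	(void) sleep(1);	/* crude: give B time to reach the sleep queue */

	/* C: second child asks whether a read lock could be placed. */
	if ((querier = fork()) == 0) {
		fl.l_type = F_RDLCK;
		fl.l_whence = SEEK_SET;
		fl.l_start = 0;
		fl.l_len = 0;
		if (fcntl(fd, F_GETLK, &fl) == -1) {
			perror("F_GETLK");
			_exit(1);
		}
		if (fl.l_type == F_UNLCK)
			(void) printf("no blocker reported (buggy)\n");
		else
			(void) printf("blocked by %s lock of pid %ld\n",
			    (fl.l_type == F_WRLCK) ? "write" : "read",
			    (long)fl.l_pid);
		_exit(0);
	}
	(void) waitpid(querier, NULL, 0);
	(void) kill(writer, SIGKILL);
	(void) waitpid(writer, NULL, 0);
	return (0);
}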
--- old/usr/src/uts/common/os/flock.c
+++ new/usr/src/uts/common/os/flock.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
24 24 * Use is subject to license terms.
25 25 */
26 26
27 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 28 /* All Rights Reserved */
29 29
30 30 /*
31 31 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
32 32 */
33 33
34 34 #include <sys/flock_impl.h>
35 35 #include <sys/vfs.h>
36 36 #include <sys/t_lock.h> /* for <sys/callb.h> */
37 37 #include <sys/callb.h>
38 38 #include <sys/clconf.h>
39 39 #include <sys/cladm.h>
40 40 #include <sys/nbmlock.h>
41 41 #include <sys/cred.h>
42 42 #include <sys/policy.h>
43 43
44 44 /*
45 45 * The following four variables are for statistics purposes and they are
46 46 * not protected by locks. They may not be accurate but will at least be
47 47 * close to the actual value.
48 48 */
49 49
50 50 int flk_lock_allocs;
51 51 int flk_lock_frees;
52 52 int edge_allocs;
53 53 int edge_frees;
54 54 int flk_proc_vertex_allocs;
55 55 int flk_proc_edge_allocs;
56 56 int flk_proc_vertex_frees;
57 57 int flk_proc_edge_frees;
58 58
59 59 static kmutex_t flock_lock;
60 60
61 61 #ifdef DEBUG
62 62 int check_debug = 0;
63 63 #define CHECK_ACTIVE_LOCKS(gp) if (check_debug) \
64 64 check_active_locks(gp);
65 65 #define CHECK_SLEEPING_LOCKS(gp) if (check_debug) \
66 66 check_sleeping_locks(gp);
67 67 #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp) \
68 68 if (check_debug) \
69 69 check_owner_locks(gp, pid, sysid, vp);
70 70 #define CHECK_LOCK_TRANSITION(old_state, new_state) \
71 71 { \
72 72 if (check_lock_transition(old_state, new_state)) { \
73 73 cmn_err(CE_PANIC, "Illegal lock transition \
74 74 from %d to %d", old_state, new_state); \
75 75 } \
76 76 }
77 77 #else
78 78
79 79 #define CHECK_ACTIVE_LOCKS(gp)
80 80 #define CHECK_SLEEPING_LOCKS(gp)
81 81 #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp)
82 82 #define CHECK_LOCK_TRANSITION(old_state, new_state)
83 83
84 84 #endif /* DEBUG */
85 85
86 86 struct kmem_cache *flk_edge_cache;
87 87
88 88 graph_t *lock_graph[HASH_SIZE];
89 89 proc_graph_t pgraph;
90 90
91 91 /*
92 92 * Clustering.
93 93 *
94 94 * NLM REGISTRY TYPE IMPLEMENTATION
95 95 *
96 96 * Assumptions:
97 97 * 1. Nodes in a cluster are numbered starting at 1; always non-negative
98 98 * integers; maximum node id is returned by clconf_maximum_nodeid().
99 99 * 2. We use this node id to identify the node an NLM server runs on.
100 100 */
101 101
102 102 /*
103 103 * NLM registry object keeps track of NLM servers via their
104 104 * nlmids (which are the node ids of the node in the cluster they run on)
105 105 * that have requested locks at this LLM with which this registry is
106 106 * associated.
107 107 *
108 108 * Representation of abstraction:
109 109 * rep = record[ states: array[nlm_state],
110 110 * lock: mutex]
111 111 *
112 112 * Representation invariants:
113 113 * 1. index i of rep.states is between 0 and n - 1 where n is number
114 114 * of elements in the array, which happen to be the maximum number
115 115 * of nodes in the cluster configuration + 1.
116 116 * 2. map nlmid to index i of rep.states
117 117 * 0 -> 0
118 118 * 1 -> 1
119 119 * 2 -> 2
120 120 * n-1 -> clconf_maximum_nodeid()+1
121 121 * 3. This 1-1 mapping is quite convenient and it avoids errors resulting
122 122 * from forgetting to subtract 1 from the index.
123 123 * 4. The reason we keep the 0th index is the following. A legitimate
124 124 * cluster configuration includes making a UFS file system NFS
125 125 * exportable. The code is structured so that if you're in a cluster
126 126 * you do one thing; otherwise, you do something else. The problem
127 127 * is what to do if you think you're in a cluster with PXFS loaded,
128 128 * but you're using UFS not PXFS? The upper two bytes of the sysid
129 129 * encode the node id of the node where NLM server runs; these bytes
130 130 * are zero for UFS. Since the nodeid is used to index into the
131 131 * registry, we can record the NLM server state information at index
132 132 * 0 using the same mechanism used for PXFS file locks!
133 133 */
134 134 static flk_nlm_status_t *nlm_reg_status = NULL; /* state array 0..N-1 */
135 135 static kmutex_t nlm_reg_lock;			/* lock to protect array */
136 136 static uint_t nlm_status_size; /* size of state array */
137 137
138 138 /*
139 139 * Although we need a global lock dependency graph (and associated data
140 140 * structures), we also need a per-zone notion of whether the lock manager is
141 141 * running, and so whether to allow lock manager requests or not.
142 142 *
143 143 * Thus, on a per-zone basis we maintain a ``global'' variable
144 144 * (flk_lockmgr_status), protected by flock_lock, and set when the lock
145 145 * manager is determined to be changing state (starting or stopping).
146 146 *
147 147 * Each graph/zone pair also has a copy of this variable, which is protected by
148 148 * the graph's mutex.
149 149 *
150 150 * The per-graph copies are used to synchronize lock requests with shutdown
151 151 * requests. The global copy is used to initialize the per-graph field when a
152 152 * new graph is created.
153 153 */
154 154 struct flock_globals {
155 155 flk_lockmgr_status_t flk_lockmgr_status;
156 156 flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
157 157 };
158 158
159 159 zone_key_t flock_zone_key;
160 160
161 161 static void create_flock(lock_descriptor_t *, flock64_t *);
162 162 static lock_descriptor_t *flk_get_lock(void);
163 163 static void flk_free_lock(lock_descriptor_t *lock);
164 164 static void flk_get_first_blocking_lock(lock_descriptor_t *request);
165 165 static int flk_process_request(lock_descriptor_t *);
166 166 static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
167 167 static edge_t *flk_get_edge(void);
168 168 static int flk_wait_execute_request(lock_descriptor_t *);
169 169 static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
170 170 static void flk_insert_active_lock(lock_descriptor_t *);
171 171 static void flk_delete_active_lock(lock_descriptor_t *, int);
172 172 static void flk_insert_sleeping_lock(lock_descriptor_t *);
173 173 static void flk_graph_uncolor(graph_t *);
174 174 static void flk_wakeup(lock_descriptor_t *, int);
175 175 static void flk_free_edge(edge_t *);
176 176 static void flk_recompute_dependencies(lock_descriptor_t *,
177 177 lock_descriptor_t **, int, int);
178 178 static int flk_find_barriers(lock_descriptor_t *);
179 179 static void flk_update_barriers(lock_descriptor_t *);
180 180 static int flk_color_reachables(lock_descriptor_t *);
181 181 static int flk_canceled(lock_descriptor_t *);
182 182 static void flk_delete_locks_by_sysid(lock_descriptor_t *);
183 183 static void report_blocker(lock_descriptor_t *, lock_descriptor_t *);
184 184 static void wait_for_lock(lock_descriptor_t *);
185 185 static void unlock_lockmgr_granted(struct flock_globals *);
186 186 static void wakeup_sleeping_lockmgr_locks(struct flock_globals *);
187 187
188 188 /* Clustering hooks */
189 189 static void cl_flk_change_nlm_state_all_locks(int, flk_nlm_status_t);
190 190 static void cl_flk_wakeup_sleeping_nlm_locks(int);
191 191 static void cl_flk_unlock_nlm_granted(int);
192 192
193 193 #ifdef DEBUG
194 194 static int check_lock_transition(int, int);
195 195 static void check_sleeping_locks(graph_t *);
196 196 static void check_active_locks(graph_t *);
197 197 static int no_path(lock_descriptor_t *, lock_descriptor_t *);
198 198 static void path(lock_descriptor_t *, lock_descriptor_t *);
199 199 static void check_owner_locks(graph_t *, pid_t, int, vnode_t *);
200 200 static int level_one_path(lock_descriptor_t *, lock_descriptor_t *);
201 201 static int level_two_path(lock_descriptor_t *, lock_descriptor_t *, int);
202 202 #endif
203 203
204 204 /* proc_graph function definitions */
205 205 static int flk_check_deadlock(lock_descriptor_t *);
206 206 static void flk_proc_graph_uncolor(void);
207 207 static proc_vertex_t *flk_get_proc_vertex(lock_descriptor_t *);
208 208 static proc_edge_t *flk_get_proc_edge(void);
209 209 static void flk_proc_release(proc_vertex_t *);
210 210 static void flk_free_proc_edge(proc_edge_t *);
211 211 static void flk_update_proc_graph(edge_t *, int);
212 212
213 213 /* Non-blocking mandatory locking */
214 214 static int lock_blocks_io(nbl_op_t, u_offset_t, ssize_t, int, u_offset_t,
215 215 u_offset_t);
216 216
217 217 static struct flock_globals *
218 218 flk_get_globals(void)
219 219 {
220 220 /*
221 221 * The KLM module had better be loaded if we're attempting to handle
222 222 * lockmgr requests.
223 223 */
224 224 ASSERT(flock_zone_key != ZONE_KEY_UNINITIALIZED);
225 225 return (zone_getspecific(flock_zone_key, curproc->p_zone));
226 226 }
227 227
228 228 static flk_lockmgr_status_t
229 229 flk_get_lockmgr_status(void)
230 230 {
231 231 struct flock_globals *fg;
232 232
233 233 ASSERT(MUTEX_HELD(&flock_lock));
234 234
235 235 if (flock_zone_key == ZONE_KEY_UNINITIALIZED) {
236 236 /*
237 237 * KLM module not loaded; lock manager definitely not running.
238 238 */
239 239 return (FLK_LOCKMGR_DOWN);
240 240 }
241 241 fg = flk_get_globals();
242 242 return (fg->flk_lockmgr_status);
243 243 }
244 244
245 245 /*
246 246 * Routine called from fs_frlock in fs/fs_subr.c
247 247 */
248 248
249 249 int
250 250 reclock(vnode_t *vp,
251 251 flock64_t *lckdat,
252 252 int cmd,
253 253 int flag,
254 254 u_offset_t offset,
255 255 flk_callback_t *flk_cbp)
256 256 {
257 257 lock_descriptor_t stack_lock_request;
258 258 lock_descriptor_t *lock_request;
259 259 int error = 0;
260 260 graph_t *gp;
261 261 int nlmid;
262 262
263 263 /*
264 264 * Check access permissions
265 265 */
266 266 if ((cmd & SETFLCK) &&
267 267 ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
268 268 (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
269 269 return (EBADF);
270 270
271 271 /*
272 272 * for query and unlock we use the stack_lock_request
273 273 */
274 274
275 275 if ((lckdat->l_type == F_UNLCK) ||
276 276 !((cmd & INOFLCK) || (cmd & SETFLCK))) {
277 277 lock_request = &stack_lock_request;
278 278 (void) bzero((caddr_t)lock_request,
279 279 sizeof (lock_descriptor_t));
280 280
281 281 /*
282 282 		 * the following is added to make the assertions in
283 283 		 * flk_execute_request() pass
284 284 */
285 285
286 286 lock_request->l_edge.edge_in_next = &lock_request->l_edge;
287 287 lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
288 288 lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
289 289 lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
290 290 lock_request->l_status = FLK_INITIAL_STATE;
291 291 } else {
292 292 lock_request = flk_get_lock();
293 293 }
294 294 lock_request->l_state = 0;
295 295 lock_request->l_vnode = vp;
296 296 lock_request->l_zoneid = getzoneid();
297 297
298 298 /*
299 299 * Convert the request range into the canonical start and end
300 300 * values. The NLM protocol supports locking over the entire
301 301 * 32-bit range, so there's no range checking for remote requests,
302 302 * but we still need to verify that local requests obey the rules.
303 303 */
304 304 /* Clustering */
305 305 if ((cmd & (RCMDLCK | PCMDLCK)) != 0) {
306 306 ASSERT(lckdat->l_whence == 0);
307 307 lock_request->l_start = lckdat->l_start;
308 308 lock_request->l_end = (lckdat->l_len == 0) ? MAX_U_OFFSET_T :
309 309 lckdat->l_start + (lckdat->l_len - 1);
310 310 } else {
311 311 /* check the validity of the lock range */
312 312 error = flk_convert_lock_data(vp, lckdat,
313 313 &lock_request->l_start, &lock_request->l_end,
314 314 offset);
315 315 if (error) {
316 316 goto done;
317 317 }
318 318 error = flk_check_lock_data(lock_request->l_start,
319 319 lock_request->l_end, MAXEND);
320 320 if (error) {
321 321 goto done;
322 322 }
323 323 }
324 324
325 325 ASSERT(lock_request->l_end >= lock_request->l_start);
326 326
327 327 lock_request->l_type = lckdat->l_type;
328 328 if (cmd & INOFLCK)
329 329 lock_request->l_state |= IO_LOCK;
330 330 if (cmd & SLPFLCK)
331 331 lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
332 332 if (cmd & RCMDLCK)
333 333 lock_request->l_state |= LOCKMGR_LOCK;
334 334 if (cmd & NBMLCK)
335 335 lock_request->l_state |= NBMAND_LOCK;
336 336 /*
337 337 * Clustering: set flag for PXFS locks
338 338 * We do not _only_ check for the PCMDLCK flag because PXFS locks could
339 339 * also be of type 'RCMDLCK'.
340 340 * We do not _only_ check the GETPXFSID() macro because local PXFS
341 341 * clients use a pxfsid of zero to permit deadlock detection in the LLM.
342 342 */
343 343
344 344 if ((cmd & PCMDLCK) || (GETPXFSID(lckdat->l_sysid) != 0)) {
345 345 lock_request->l_state |= PXFS_LOCK;
346 346 }
347 347 if (!((cmd & SETFLCK) || (cmd & INOFLCK))) {
348 348 if (lock_request->l_type == F_RDLCK ||
349 349 lock_request->l_type == F_WRLCK)
350 350 lock_request->l_state |= QUERY_LOCK;
351 351 }
352 352 lock_request->l_flock = (*lckdat);
353 353 lock_request->l_callbacks = flk_cbp;
354 354
355 355 /*
356 356 * We are ready for processing the request
357 357 */
358 358 if (IS_LOCKMGR(lock_request)) {
359 359 /*
360 360 * If the lock request is an NLM server request ....
361 361 */
362 362 if (nlm_status_size == 0) { /* not booted as cluster */
363 363 mutex_enter(&flock_lock);
364 364 /*
365 365 * Bail out if this is a lock manager request and the
366 366 * lock manager is not supposed to be running.
367 367 */
368 368 if (flk_get_lockmgr_status() != FLK_LOCKMGR_UP) {
369 369 mutex_exit(&flock_lock);
370 370 error = ENOLCK;
371 371 goto done;
372 372 }
373 373 mutex_exit(&flock_lock);
374 374 } else { /* booted as a cluster */
375 375 nlmid = GETNLMID(lock_request->l_flock.l_sysid);
376 376 ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
377 377
378 378 mutex_enter(&nlm_reg_lock);
379 379 /*
380 380 * If the NLM registry does not know about this
381 381 * NLM server making the request, add its nlmid
382 382 * to the registry.
383 383 */
384 384 if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status,
385 385 nlmid)) {
386 386 FLK_REGISTRY_ADD_NLMID(nlm_reg_status, nlmid);
387 387 } else if (!FLK_REGISTRY_IS_NLM_UP(nlm_reg_status,
388 388 nlmid)) {
389 389 /*
390 390 * If the NLM server is already known (has made
391 391 * previous lock requests) and its state is
392 392 * not NLM_UP (means that NLM server is
393 393 * shutting down), then bail out with an
394 394 * error to deny the lock request.
395 395 */
396 396 mutex_exit(&nlm_reg_lock);
397 397 error = ENOLCK;
398 398 goto done;
399 399 }
400 400 mutex_exit(&nlm_reg_lock);
401 401 }
402 402 }
403 403
404 404 /* Now get the lock graph for a particular vnode */
405 405 gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);
406 406
407 407 /*
408 408 * We drop rwlock here otherwise this might end up causing a
409 409 * deadlock if this IOLOCK sleeps. (bugid # 1183392).
410 410 */
411 411
412 412 if (IS_IO_LOCK(lock_request)) {
413 413 VOP_RWUNLOCK(vp,
414 414 (lock_request->l_type == F_RDLCK) ?
415 415 V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
416 416 }
417 417 mutex_enter(&gp->gp_mutex);
418 418
419 419 lock_request->l_state |= REFERENCED_LOCK;
420 420 lock_request->l_graph = gp;
421 421
422 422 switch (lock_request->l_type) {
423 423 case F_RDLCK:
424 424 case F_WRLCK:
425 425 if (IS_QUERY_LOCK(lock_request)) {
426 426 flk_get_first_blocking_lock(lock_request);
427 427 (*lckdat) = lock_request->l_flock;
428 428 break;
429 429 }
430 430
431 431 /* process the request now */
432 432
433 433 error = flk_process_request(lock_request);
434 434 break;
435 435
436 436 case F_UNLCK:
437 437 /* unlock request will not block so execute it immediately */
438 438
439 439 if (IS_LOCKMGR(lock_request) &&
440 440 flk_canceled(lock_request)) {
441 441 error = 0;
442 442 } else {
443 443 error = flk_execute_request(lock_request);
444 444 }
445 445 break;
446 446
447 447 case F_UNLKSYS:
448 448 /*
449 449 * Recovery mechanism to release lock manager locks when
450 450 		 * the NFS client crashes and restarts. The NFS server will clear
451 451 * old locks and grant new locks.
452 452 */
453 453
454 454 if (lock_request->l_flock.l_sysid == 0) {
455 455 mutex_exit(&gp->gp_mutex);
456 456 return (EINVAL);
457 457 }
458 458 if (secpolicy_nfs(CRED()) != 0) {
459 459 mutex_exit(&gp->gp_mutex);
460 460 return (EPERM);
461 461 }
462 462 flk_delete_locks_by_sysid(lock_request);
463 463 lock_request->l_state &= ~REFERENCED_LOCK;
464 464 flk_set_state(lock_request, FLK_DEAD_STATE);
465 465 flk_free_lock(lock_request);
466 466 mutex_exit(&gp->gp_mutex);
467 467 return (0);
468 468
469 469 default:
470 470 error = EINVAL;
471 471 break;
472 472 }
473 473
474 474 /* Clustering: For blocked PXFS locks, return */
475 475 if (error == PXFS_LOCK_BLOCKED) {
476 476 lock_request->l_state &= ~REFERENCED_LOCK;
477 477 mutex_exit(&gp->gp_mutex);
478 478 return (error);
479 479 }
480 480
481 481 /*
482 482 * Now that we have seen the status of locks in the system for
483 483 * this vnode we acquire the rwlock if it is an IO_LOCK.
484 484 */
485 485
486 486 if (IS_IO_LOCK(lock_request)) {
487 487 (void) VOP_RWLOCK(vp,
488 488 (lock_request->l_type == F_RDLCK) ?
489 489 V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
490 490 if (!error) {
491 491 lckdat->l_type = F_UNLCK;
492 492
493 493 /*
494 494 * This wake up is needed otherwise
495 495 * if IO_LOCK has slept the dependents on this
496 496 * will not be woken up at all. (bugid # 1185482).
497 497 */
498 498
499 499 flk_wakeup(lock_request, 1);
500 500 flk_set_state(lock_request, FLK_DEAD_STATE);
501 501 flk_free_lock(lock_request);
502 502 }
503 503 /*
504 504 * else if error had occurred either flk_process_request()
505 505 * has returned EDEADLK in which case there will be no
506 506 * dependents for this lock or EINTR from flk_wait_execute_
507 507 * request() in which case flk_cancel_sleeping_lock()
508 508 * would have been done. same is true with EBADF.
509 509 */
510 510 }
511 511
512 512 if (lock_request == &stack_lock_request) {
513 513 flk_set_state(lock_request, FLK_DEAD_STATE);
514 514 } else {
515 515 lock_request->l_state &= ~REFERENCED_LOCK;
516 516 if ((error != 0) || IS_DELETED(lock_request)) {
517 517 flk_set_state(lock_request, FLK_DEAD_STATE);
518 518 flk_free_lock(lock_request);
519 519 }
520 520 }
521 521
522 522 mutex_exit(&gp->gp_mutex);
523 523 return (error);
524 524
525 525 done:
526 526 flk_set_state(lock_request, FLK_DEAD_STATE);
527 527 if (lock_request != &stack_lock_request)
528 528 flk_free_lock(lock_request);
529 529 return (error);
530 530 }
531 531
532 532 /*
533 533 * Invoke the callbacks in the given list. If before sleeping, invoke in
534 534 * list order. If after sleeping, invoke in reverse order.
535 535 *
536 536 * CPR (suspend/resume) support: if one of the callbacks returns a
537 537 * callb_cpr_t, return it. This will be used to make the thread CPR-safe
538 538 * while it is sleeping. There should be at most one callb_cpr_t for the
539 539 * thread.
540 540 * XXX This is unnecessarily complicated. The CPR information should just
541 541 * get passed in directly through VOP_FRLOCK and reclock, rather than
542 542 * sneaking it in via a callback.
543 543 */
544 544
545 545 callb_cpr_t *
546 546 flk_invoke_callbacks(flk_callback_t *cblist, flk_cb_when_t when)
547 547 {
548 548 callb_cpr_t *cpr_callbackp = NULL;
549 549 callb_cpr_t *one_result;
550 550 flk_callback_t *cb;
551 551
552 552 if (cblist == NULL)
553 553 return (NULL);
554 554
555 555 if (when == FLK_BEFORE_SLEEP) {
556 556 cb = cblist;
557 557 do {
558 558 one_result = (*cb->cb_callback)(when, cb->cb_data);
559 559 if (one_result != NULL) {
560 560 ASSERT(cpr_callbackp == NULL);
561 561 cpr_callbackp = one_result;
562 562 }
563 563 cb = cb->cb_next;
564 564 } while (cb != cblist);
565 565 } else {
566 566 cb = cblist->cb_prev;
567 567 do {
568 568 one_result = (*cb->cb_callback)(when, cb->cb_data);
569 569 if (one_result != NULL) {
570 570 cpr_callbackp = one_result;
571 571 }
572 572 cb = cb->cb_prev;
573 573 } while (cb != cblist->cb_prev);
574 574 }
575 575
576 576 return (cpr_callbackp);
577 577 }
578 578
579 579 /*
580 580 * Initialize a flk_callback_t to hold the given callback.
581 581 */
582 582
583 583 void
584 584 flk_init_callback(flk_callback_t *flk_cb,
585 585 callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), void *cbdata)
586 586 {
587 587 flk_cb->cb_next = flk_cb;
588 588 flk_cb->cb_prev = flk_cb;
589 589 flk_cb->cb_callback = cb_fcn;
590 590 flk_cb->cb_data = cbdata;
591 591 }
592 592
593 593 /*
594 594 * Initialize an flk_callback_t and then link it into the head of an
595 595 * existing list (which may be NULL).
596 596 */
597 597
598 598 void
599 599 flk_add_callback(flk_callback_t *newcb,
600 600 callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *),
601 601 void *cbdata, flk_callback_t *cblist)
602 602 {
603 603 flk_init_callback(newcb, cb_fcn, cbdata);
604 604
605 605 if (cblist == NULL)
606 606 return;
607 607
608 608 newcb->cb_prev = cblist->cb_prev;
609 609 newcb->cb_next = cblist;
610 610 cblist->cb_prev->cb_next = newcb;
611 611 cblist->cb_prev = newcb;
612 612 }
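
(Editor's aside, a hedged sketch rather than anything from this webrev: the
callback below and its names are hypothetical; only the flk_callback_t and
flk_init_callback() usage follows the definitions above. A caller that wants
to run code around the sleep initializes one callback and passes it through
the flk_cbp argument of reclock(); returning NULL means it supplies no CPR
information.)

static callb_cpr_t *
sample_frlock_cb(flk_cb_when_t when, void *data)
{
	/* Illustrative only: note which side of the sleep we are on. */
	cmn_err(CE_NOTE, "frlock callback %s sleep (data %p)",
	    (when == FLK_BEFORE_SLEEP) ? "before" : "after", data);
	return (NULL);		/* no CPR info to hand back */
}

	flk_callback_t flk_cb;

	flk_init_callback(&flk_cb, sample_frlock_cb, NULL);
	/* &flk_cb would then be passed as the flk_cbp argument of reclock(). */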
613 613
614 614 /*
615 615 * Initialize the flk_edge_cache data structure and create the
616 616 * nlm_reg_status array.
617 617 */
618 618
619 619 void
620 620 flk_init(void)
621 621 {
622 622 uint_t i;
623 623
624 624 flk_edge_cache = kmem_cache_create("flk_edges",
625 625 sizeof (struct edge), 0, NULL, NULL, NULL, NULL, NULL, 0);
626 626 if (flk_edge_cache == NULL) {
627 627 cmn_err(CE_PANIC, "Couldn't create flk_edge_cache\n");
628 628 }
629 629 /*
630 630 * Create the NLM registry object.
631 631 */
632 632
633 633 if (cluster_bootflags & CLUSTER_BOOTED) {
634 634 /*
635 635 * This routine tells you the maximum node id that will be used
636 636 * in the cluster. This number will be the size of the nlm
637 637 * registry status array. We add 1 because we will be using
638 638 * all entries indexed from 0 to maxnodeid; e.g., from 0
639 639 * to 64, for a total of 65 entries.
640 640 */
641 641 nlm_status_size = clconf_maximum_nodeid() + 1;
642 642 } else {
643 643 nlm_status_size = 0;
644 644 }
645 645
646 646 if (nlm_status_size != 0) { /* booted as a cluster */
647 647 nlm_reg_status = (flk_nlm_status_t *)
648 648 kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size,
649 649 KM_SLEEP);
650 650
651 651 /* initialize all NLM states in array to NLM_UNKNOWN */
652 652 for (i = 0; i < nlm_status_size; i++) {
653 653 nlm_reg_status[i] = FLK_NLM_UNKNOWN;
654 654 }
655 655 }
656 656 }
657 657
658 658 /*
659 659 * Zone constructor/destructor callbacks to be executed when a zone is
660 660 * created/destroyed.
661 661 */
662 662 /* ARGSUSED */
663 663 void *
664 664 flk_zone_init(zoneid_t zoneid)
665 665 {
666 666 struct flock_globals *fg;
667 667 uint_t i;
668 668
669 669 fg = kmem_alloc(sizeof (*fg), KM_SLEEP);
670 670 fg->flk_lockmgr_status = FLK_LOCKMGR_UP;
671 671 for (i = 0; i < HASH_SIZE; i++)
672 672 fg->lockmgr_status[i] = FLK_LOCKMGR_UP;
673 673 return (fg);
674 674 }
675 675
676 676 /* ARGSUSED */
677 677 void
678 678 flk_zone_fini(zoneid_t zoneid, void *data)
679 679 {
680 680 struct flock_globals *fg = data;
681 681
682 682 kmem_free(fg, sizeof (*fg));
683 683 }
684 684
685 685 /*
686 686 * Get a lock_descriptor structure with initialization of edge lists.
687 687 */
688 688
689 689 static lock_descriptor_t *
690 690 flk_get_lock(void)
691 691 {
692 692 lock_descriptor_t *l;
693 693
694 694 l = kmem_zalloc(sizeof (lock_descriptor_t), KM_SLEEP);
695 695
696 696 cv_init(&l->l_cv, NULL, CV_DRIVER, NULL);
697 697 l->l_edge.edge_in_next = &l->l_edge;
698 698 l->l_edge.edge_in_prev = &l->l_edge;
699 699 l->l_edge.edge_adj_next = &l->l_edge;
700 700 l->l_edge.edge_adj_prev = &l->l_edge;
701 701 l->pvertex = -1;
702 702 l->l_status = FLK_INITIAL_STATE;
703 703 flk_lock_allocs++;
704 704 return (l);
705 705 }
706 706
707 707 /*
708 708 * Free a lock_descriptor structure. Just sets the DELETED_LOCK flag
709 709 * when some thread has a reference to it as in reclock().
710 710 */
711 711
712 712 void
713 713 flk_free_lock(lock_descriptor_t *lock)
714 714 {
715 715 ASSERT(IS_DEAD(lock));
716 716 if (IS_REFERENCED(lock)) {
717 717 lock->l_state |= DELETED_LOCK;
718 718 return;
719 719 }
720 720 flk_lock_frees++;
721 721 kmem_free((void *)lock, sizeof (lock_descriptor_t));
722 722 }
723 723
724 724 void
725 725 flk_set_state(lock_descriptor_t *lock, int new_state)
726 726 {
727 727 /*
728 728 * Locks in the sleeping list may be woken up in a number of ways,
729 729 * and more than once. If a sleeping lock is signaled awake more
730 730 * than once, then it may or may not change state depending on its
731 731 * current state.
732 732 * Also note that NLM locks that are sleeping could be moved to an
733 733 * interrupted state more than once if the unlock request is
734 734 * retransmitted by the NLM client - the second time around, this is
735 735 * just a nop.
736 736 * The ordering of being signaled awake is:
737 737 * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE.
738 738 * The checks below implement this ordering.
739 739 */
740 740 if (IS_INTERRUPTED(lock)) {
741 741 if ((new_state == FLK_CANCELLED_STATE) ||
742 742 (new_state == FLK_GRANTED_STATE) ||
743 743 (new_state == FLK_INTERRUPTED_STATE)) {
744 744 return;
745 745 }
746 746 }
747 747 if (IS_CANCELLED(lock)) {
748 748 if ((new_state == FLK_GRANTED_STATE) ||
749 749 (new_state == FLK_CANCELLED_STATE)) {
750 750 return;
751 751 }
752 752 }
753 753 CHECK_LOCK_TRANSITION(lock->l_status, new_state);
754 754 if (IS_PXFS(lock)) {
755 755 cl_flk_state_transition_notify(lock, lock->l_status, new_state);
756 756 }
757 757 lock->l_status = new_state;
758 758 }
759 759
760 760 /*
761 761 * Routine that checks whether there are any blocking locks in the system.
762 762 *
763 763 * The policy followed is if a write lock is sleeping we don't allow read
764 764 * locks before this write lock even though there may not be any active
765 765 * locks corresponding to the read locks' region.
766 766 *
767 767 * flk_add_edge() function adds an edge between l1 and l2 iff there
768 768 * is no path between l1 and l2. This is done to have a "minimum
769 769 * storage representation" of the dependency graph.
770 770 *
771 771 * Another property of the graph is since only the new request throws
772 772 * edges to the existing locks in the graph, the graph is always topologically
773 773 * ordered.
774 774 */
775 775
776 776 static int
777 777 flk_process_request(lock_descriptor_t *request)
778 778 {
779 779 graph_t *gp = request->l_graph;
780 780 lock_descriptor_t *lock;
781 781 int request_blocked_by_active = 0;
782 782 int request_blocked_by_granted = 0;
783 783 int request_blocked_by_sleeping = 0;
784 784 vnode_t *vp = request->l_vnode;
785 785 int error = 0;
786 786 int request_will_wait = 0;
787 787 int found_covering_lock = 0;
788 788 lock_descriptor_t *covered_by = NULL;
789 789
790 790 ASSERT(MUTEX_HELD(&gp->gp_mutex));
791 791 request_will_wait = IS_WILLING_TO_SLEEP(request);
792 792
793 793 /*
794 794 * check active locks
795 795 */
796 796
797 797 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
798 798
799 799
800 800 if (lock) {
801 801 do {
802 802 if (BLOCKS(lock, request)) {
803 803 if (!request_will_wait)
804 804 return (EAGAIN);
805 805 request_blocked_by_active = 1;
806 806 break;
807 807 }
808 808 /*
809 809 * Grant lock if it is for the same owner holding active
810 810 * lock that covers the request.
811 811 */
812 812
813 813 if (SAME_OWNER(lock, request) &&
814 814 COVERS(lock, request) &&
815 815 (request->l_type == F_RDLCK))
816 816 return (flk_execute_request(request));
817 817 lock = lock->l_next;
818 818 } while (lock->l_vnode == vp);
819 819 }
820 820
821 821 if (!request_blocked_by_active) {
822 822 lock_descriptor_t *lk[1];
823 823 lock_descriptor_t *first_glock = NULL;
824 824 /*
825 825 * Shall we grant this?! NO!!
826 826 * What about those locks that were just granted and still
827 827 * in sleep queue. Those threads are woken up and so locks
828 828 * are almost active.
829 829 */
830 830 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
831 831 if (lock) {
832 832 do {
833 833 if (BLOCKS(lock, request)) {
834 834 if (IS_GRANTED(lock)) {
835 835 request_blocked_by_granted = 1;
836 836 } else {
837 837 request_blocked_by_sleeping = 1;
838 838 }
839 839 }
840 840
841 841 lock = lock->l_next;
842 842 } while ((lock->l_vnode == vp));
843 843 first_glock = lock->l_prev;
844 844 ASSERT(first_glock->l_vnode == vp);
845 845 }
846 846
847 847 if (request_blocked_by_granted)
848 848 goto block;
849 849
850 850 if (!request_blocked_by_sleeping) {
851 851 /*
852 852 * If the request isn't going to be blocked by a
853 853 * sleeping request, we know that it isn't going to
854 854 * be blocked; we can just execute the request --
855 855 * without performing costly deadlock detection.
856 856 */
857 857 ASSERT(!request_blocked_by_active);
858 858 return (flk_execute_request(request));
859 859 } else if (request->l_type == F_RDLCK) {
860 860 /*
861 861 * If we have a sleeping writer in the requested
862 862 * lock's range, block.
863 863 */
864 864 goto block;
865 865 }
866 866
867 867 lk[0] = request;
868 868 request->l_state |= RECOMPUTE_LOCK;
869 869 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
870 870 if (lock) {
871 871 do {
872 872 flk_recompute_dependencies(lock, lk, 1, 0);
873 873 lock = lock->l_next;
874 874 } while (lock->l_vnode == vp);
875 875 }
876 876 lock = first_glock;
877 877 if (lock) {
878 878 do {
879 879 if (IS_GRANTED(lock)) {
880 880 flk_recompute_dependencies(lock, lk, 1, 0);
881 881 }
882 882 lock = lock->l_prev;
883 883 } while ((lock->l_vnode == vp));
884 884 }
885 885 request->l_state &= ~RECOMPUTE_LOCK;
886 886 if (!NO_DEPENDENTS(request) && flk_check_deadlock(request))
887 887 return (EDEADLK);
888 888 return (flk_execute_request(request));
889 889 }
890 890
891 891 block:
892 892 if (request_will_wait)
893 893 flk_graph_uncolor(gp);
894 894
895 895 /* check sleeping locks */
896 896
897 897 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
898 898
899 899 /*
900 900 * If we find a sleeping write lock that is a superset of the
901 901 * region wanted by request we can be assured that by adding an
902 902 * edge to this write lock we have paths to all locks in the
903 903 * graph that blocks the request except in one case and that is why
904 904 * another check for SAME_OWNER in the loop below. The exception
905 905 * case is when this process that owns the sleeping write lock 'l1'
906 906 * has other locks l2, l3, l4 that are in the system and arrived
907 907 * before l1. l1 does not have path to these locks as they are from
908 908 * same process. We break when we find a second covering sleeping
909 909 * lock l5 owned by a process different from that owning l1, because
910 910 * there cannot be any of l2, l3, l4, etc., arrived before l5, and if
911 911 * it has l1 would have produced a deadlock already.
912 912 */
913 913
914 914 if (lock) {
915 915 do {
916 916 if (BLOCKS(lock, request)) {
917 917 if (!request_will_wait)
918 918 return (EAGAIN);
919 919 if (COVERS(lock, request) &&
920 920 lock->l_type == F_WRLCK) {
921 921 if (found_covering_lock &&
922 922 !SAME_OWNER(lock, covered_by)) {
923 923 found_covering_lock++;
924 924 break;
925 925 }
926 926 found_covering_lock = 1;
927 927 covered_by = lock;
928 928 }
929 929 if (found_covering_lock &&
930 930 !SAME_OWNER(lock, covered_by)) {
931 931 lock = lock->l_next;
932 932 continue;
933 933 }
934 934 if ((error = flk_add_edge(request, lock,
935 935 !found_covering_lock, 0)))
936 936 return (error);
937 937 }
938 938 lock = lock->l_next;
939 939 } while (lock->l_vnode == vp);
940 940 }
941 941
942 942 /*
943 943 * found_covering_lock == 2 iff at this point 'request' has paths
944 944 * to all locks that blocks 'request'. found_covering_lock == 1 iff at this
945 945 * point 'request' has paths to all locks that blocks 'request' whose owners
946 946 * are not same as the one that covers 'request' (covered_by above) and
947 947 * we can have locks whose owner is same as covered_by in the active list.
948 948 */
949 949
950 950 if (request_blocked_by_active && found_covering_lock != 2) {
951 951 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
952 952 ASSERT(lock != NULL);
953 953 do {
954 954 if (BLOCKS(lock, request)) {
955 955 if (found_covering_lock &&
956 956 !SAME_OWNER(lock, covered_by)) {
957 957 lock = lock->l_next;
958 958 continue;
959 959 }
960 960 if ((error = flk_add_edge(request, lock,
961 961 CHECK_CYCLE, 0)))
962 962 return (error);
963 963 }
964 964 lock = lock->l_next;
965 965 } while (lock->l_vnode == vp);
966 966 }
967 967
968 968 if (NOT_BLOCKED(request)) {
969 969 /*
970 970 * request not dependent on any other locks
971 971 * so execute this request
972 972 */
973 973 return (flk_execute_request(request));
974 974 } else {
975 975 /*
976 976 * check for deadlock
977 977 */
978 978 if (flk_check_deadlock(request))
979 979 return (EDEADLK);
980 980 /*
981 981 * this thread has to sleep
982 982 */
983 983 return (flk_wait_execute_request(request));
984 984 }
985 985 }
986 986
987 987 /*
988 988 * The actual execution of the request in the simple case is only to
989 989 * insert the 'request' in the list of active locks if it is not an
990 990 * UNLOCK.
991 991 * We have to consider the existing active locks' relation to
992 992 * this 'request' if they are owned by same process. flk_relation() does
993 993 * this job and sees to that the dependency graph information is maintained
994 994 * properly.
995 995 */
996 996
997 997 int
998 998 flk_execute_request(lock_descriptor_t *request)
999 999 {
1000 1000 graph_t *gp = request->l_graph;
1001 1001 vnode_t *vp = request->l_vnode;
1002 1002 lock_descriptor_t *lock, *lock1;
1003 1003 int done_searching = 0;
1004 1004
1005 1005 CHECK_SLEEPING_LOCKS(gp);
1006 1006 CHECK_ACTIVE_LOCKS(gp);
1007 1007
1008 1008 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1009 1009
1010 1010 flk_set_state(request, FLK_START_STATE);
1011 1011
1012 1012 ASSERT(NOT_BLOCKED(request));
1013 1013
1014 1014 /* IO_LOCK requests are only to check status */
1015 1015
1016 1016 if (IS_IO_LOCK(request))
1017 1017 return (0);
1018 1018
1019 1019 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1020 1020
1021 1021 if (lock == NULL && request->l_type == F_UNLCK)
1022 1022 return (0);
1023 1023 if (lock == NULL) {
1024 1024 flk_insert_active_lock(request);
1025 1025 return (0);
1026 1026 }
1027 1027
1028 1028 do {
1029 1029 lock1 = lock->l_next;
1030 1030 if (SAME_OWNER(request, lock)) {
1031 1031 done_searching = flk_relation(lock, request);
1032 1032 }
1033 1033 lock = lock1;
1034 1034 } while (lock->l_vnode == vp && !done_searching);
1035 1035
1036 1036 /*
1037 1037 * insert in active queue
1038 1038 */
1039 1039
1040 1040 if (request->l_type != F_UNLCK)
1041 1041 flk_insert_active_lock(request);
1042 1042
1043 1043 return (0);
1044 1044 }
1045 1045
1046 1046 /*
1047 1047 * 'request' is blocked by someone, therefore we put it into the sleep queue.
1048 1048 */
1049 1049 static int
1050 1050 flk_wait_execute_request(lock_descriptor_t *request)
1051 1051 {
1052 1052 graph_t *gp = request->l_graph;
1053 1053 callb_cpr_t *cprp; /* CPR info from callback */
1054 1054 struct flock_globals *fg;
1055 1055 int index;
1056 1056
1057 1057 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1058 1058 ASSERT(IS_WILLING_TO_SLEEP(request));
1059 1059
1060 1060 flk_insert_sleeping_lock(request);
1061 1061
1062 1062 if (IS_LOCKMGR(request)) {
1063 1063 index = HASH_INDEX(request->l_vnode);
1064 1064 fg = flk_get_globals();
1065 1065
1066 1066 if (nlm_status_size == 0) { /* not booted as a cluster */
1067 1067 if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP) {
1068 1068 flk_cancel_sleeping_lock(request, 1);
1069 1069 return (ENOLCK);
1070 1070 }
1071 1071 } else { /* booted as a cluster */
1072 1072 /*
1073 1073 * If the request is an NLM server lock request,
1074 1074 * and the NLM state of the lock request is not
1075 1075 * NLM_UP (because the NLM server is shutting
1076 1076 * down), then cancel the sleeping lock and
1077 1077 * return error ENOLCK that will encourage the
1078 1078 * client to retransmit.
1079 1079 */
1080 1080 if (!IS_NLM_UP(request)) {
1081 1081 flk_cancel_sleeping_lock(request, 1);
1082 1082 return (ENOLCK);
1083 1083 }
1084 1084 }
1085 1085 }
1086 1086
1087 1087 /* Clustering: For blocking PXFS locks, return */
1088 1088 if (IS_PXFS(request)) {
1089 1089 /*
1090 1090 * PXFS locks sleep on the client side.
1091 1091 * The callback argument is used to wake up the sleeper
1092 1092 * when the lock is granted.
1093 1093 * We return -1 (rather than an errno value) to indicate
1094 1094 * the client side should sleep
1095 1095 */
1096 1096 return (PXFS_LOCK_BLOCKED);
1097 1097 }
1098 1098
1099 1099 if (request->l_callbacks != NULL) {
1100 1100 /*
1101 1101 * To make sure the shutdown code works correctly, either
1102 1102 * the callback must happen after putting the lock on the
1103 1103 * sleep list, or we must check the shutdown status after
1104 1104 * returning from the callback (and before sleeping). At
1105 1105 * least for now, we'll use the first option. If a
1106 1106 * shutdown or signal or whatever happened while the graph
1107 1107 * mutex was dropped, that will be detected by
1108 1108 * wait_for_lock().
1109 1109 */
1110 1110 mutex_exit(&gp->gp_mutex);
1111 1111
1112 1112 cprp = flk_invoke_callbacks(request->l_callbacks,
1113 1113 FLK_BEFORE_SLEEP);
1114 1114
1115 1115 mutex_enter(&gp->gp_mutex);
1116 1116
1117 1117 if (cprp == NULL) {
1118 1118 wait_for_lock(request);
1119 1119 } else {
1120 1120 mutex_enter(cprp->cc_lockp);
1121 1121 CALLB_CPR_SAFE_BEGIN(cprp);
1122 1122 mutex_exit(cprp->cc_lockp);
1123 1123 wait_for_lock(request);
1124 1124 mutex_enter(cprp->cc_lockp);
1125 1125 CALLB_CPR_SAFE_END(cprp, cprp->cc_lockp);
1126 1126 mutex_exit(cprp->cc_lockp);
1127 1127 }
1128 1128
1129 1129 mutex_exit(&gp->gp_mutex);
1130 1130 (void) flk_invoke_callbacks(request->l_callbacks,
1131 1131 FLK_AFTER_SLEEP);
1132 1132 mutex_enter(&gp->gp_mutex);
1133 1133 } else {
1134 1134 wait_for_lock(request);
1135 1135 }
1136 1136
1137 1137 if (IS_LOCKMGR(request)) {
1138 1138 /*
1139 1139 * If the lock manager is shutting down, return an
1140 1140 * error that will encourage the client to retransmit.
1141 1141 */
1142 1142 if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP &&
1143 1143 !IS_GRANTED(request)) {
1144 1144 flk_cancel_sleeping_lock(request, 1);
1145 1145 return (ENOLCK);
1146 1146 }
1147 1147 }
1148 1148
1149 1149 if (IS_INTERRUPTED(request)) {
1150 1150 /* we got a signal, or act like we did */
1151 1151 flk_cancel_sleeping_lock(request, 1);
1152 1152 return (EINTR);
1153 1153 }
1154 1154
1155 1155 /* Cancelled if some other thread has closed the file */
1156 1156
1157 1157 if (IS_CANCELLED(request)) {
1158 1158 flk_cancel_sleeping_lock(request, 1);
1159 1159 return (EBADF);
1160 1160 }
1161 1161
1162 1162 request->l_state &= ~GRANTED_LOCK;
1163 1163 REMOVE_SLEEP_QUEUE(request);
1164 1164 return (flk_execute_request(request));
1165 1165 }
1166 1166
1167 1167 /*
1168 1168 * This routine adds an edge between from and to because from depends
1169 1169 * on to. If asked to check for deadlock it checks whether there are any
1170 1170 * reachable locks from "from_lock" that are owned by the same process
1171 1171 * as "from_lock".
1172 1172 * NOTE: It is the caller's responsibility to make sure that the color
1173 1173 * of the graph is consistent between the calls to flk_add_edge as done
1174 1174 * in flk_process_request. This routine does not color and check for
1175 1175 * deadlock explicitly.
1176 1176 */
1177 1177
1178 1178 static int
1179 1179 flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock,
1180 1180 int check_cycle, int update_graph)
1181 1181 {
1182 1182 edge_t *edge;
1183 1183 edge_t *ep;
1184 1184 lock_descriptor_t *vertex;
1185 1185 lock_descriptor_t *vertex_stack;
1186 1186
1187 1187 STACK_INIT(vertex_stack);
1188 1188
1189 1189 /*
1190 1190 * if to vertex already has mark_color just return
1191 1191 * don't add an edge as it is reachable from from vertex
1192 1192 * before itself.
1193 1193 */
1194 1194
1195 1195 if (COLORED(to_lock))
1196 1196 return (0);
1197 1197
1198 1198 edge = flk_get_edge();
1199 1199
1200 1200 /*
1201 1201 * set the from and to vertex
1202 1202 */
1203 1203
1204 1204 edge->from_vertex = from_lock;
1205 1205 edge->to_vertex = to_lock;
1206 1206
1207 1207 /*
1208 1208 * put in adjacency list of from vertex
1209 1209 */
1210 1210
1211 1211 from_lock->l_edge.edge_adj_next->edge_adj_prev = edge;
1212 1212 edge->edge_adj_next = from_lock->l_edge.edge_adj_next;
1213 1213 edge->edge_adj_prev = &from_lock->l_edge;
1214 1214 from_lock->l_edge.edge_adj_next = edge;
1215 1215
1216 1216 /*
1217 1217 	 * put in the in-list of the to vertex
1218 1218 */
1219 1219
1220 1220 to_lock->l_edge.edge_in_next->edge_in_prev = edge;
1221 1221 edge->edge_in_next = to_lock->l_edge.edge_in_next;
1222 1222 to_lock->l_edge.edge_in_next = edge;
1223 1223 edge->edge_in_prev = &to_lock->l_edge;
1224 1224
1225 1225
1226 1226 if (update_graph) {
1227 1227 flk_update_proc_graph(edge, 0);
1228 1228 return (0);
1229 1229 }
1230 1230 if (!check_cycle) {
1231 1231 return (0);
1232 1232 }
1233 1233
1234 1234 STACK_PUSH(vertex_stack, from_lock, l_stack);
1235 1235
1236 1236 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1237 1237
1238 1238 STACK_POP(vertex_stack, l_stack);
1239 1239
1240 1240 for (ep = FIRST_ADJ(vertex);
1241 1241 ep != HEAD(vertex);
1242 1242 ep = NEXT_ADJ(ep)) {
1243 1243 if (COLORED(ep->to_vertex))
1244 1244 continue;
1245 1245 COLOR(ep->to_vertex);
1246 1246 if (SAME_OWNER(ep->to_vertex, from_lock))
1247 1247 goto dead_lock;
1248 1248 STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1249 1249 }
1250 1250 }
1251 1251 return (0);
1252 1252
1253 1253 dead_lock:
1254 1254
1255 1255 /*
1256 1256 * remove all edges
1257 1257 */
1258 1258
1259 1259 ep = FIRST_ADJ(from_lock);
1260 1260
1261 1261 while (ep != HEAD(from_lock)) {
1262 1262 IN_LIST_REMOVE(ep);
1263 1263 from_lock->l_sedge = NEXT_ADJ(ep);
1264 1264 ADJ_LIST_REMOVE(ep);
1265 1265 flk_free_edge(ep);
1266 1266 ep = from_lock->l_sedge;
1267 1267 }
1268 1268 return (EDEADLK);
1269 1269 }
1270 1270
1271 1271 /*
1272 1272 * Get an edge structure for representing the dependency between two locks.
1273 1273 */
1274 1274
1275 1275 static edge_t *
1276 1276 flk_get_edge()
1277 1277 {
1278 1278 edge_t *ep;
1279 1279
1280 1280 ASSERT(flk_edge_cache != NULL);
1281 1281
1282 1282 ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP);
1283 1283 edge_allocs++;
1284 1284 return (ep);
1285 1285 }
1286 1286
1287 1287 /*
1288 1288 * Free the edge structure.
1289 1289 */
1290 1290
1291 1291 static void
1292 1292 flk_free_edge(edge_t *ep)
1293 1293 {
1294 1294 edge_frees++;
1295 1295 kmem_cache_free(flk_edge_cache, (void *)ep);
1296 1296 }
1297 1297
1298 1298 /*
1299 1299 * Check the relationship of request with lock and perform the
1300 1300 * recomputation of dependencies, break lock if required, and return
1301 1301 * 1 if request cannot have any more relationship with the next
1302 1302 * active locks.
1303 1303 * The 'lock' and 'request' are compared and in case of overlap we
1304 1304 * delete the 'lock' and form new locks to represent the non-overlapped
1305 1305 * portion of original 'lock'. This function has side effects such as
1306 1306 * 'lock' will be freed, new locks will be added to the active list.
1307 1307 */
1308 1308
1309 1309 static int
1310 1310 flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request)
1311 1311 {
1312 1312 int lock_effect;
1313 1313 lock_descriptor_t *lock1, *lock2;
1314 1314 lock_descriptor_t *topology[3];
1315 1315 int nvertex = 0;
1316 1316 int i;
1317 1317 edge_t *ep;
1318 1318 graph_t *gp = (lock->l_graph);
1319 1319
1320 1320
1321 1321 CHECK_SLEEPING_LOCKS(gp);
1322 1322 CHECK_ACTIVE_LOCKS(gp);
1323 1323
1324 1324 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1325 1325
1326 1326 topology[0] = topology[1] = topology[2] = NULL;
1327 1327
1328 1328 if (request->l_type == F_UNLCK)
1329 1329 lock_effect = FLK_UNLOCK;
1330 1330 else if (request->l_type == F_RDLCK &&
1331 1331 lock->l_type == F_WRLCK)
1332 1332 lock_effect = FLK_DOWNGRADE;
1333 1333 else if (request->l_type == F_WRLCK &&
1334 1334 lock->l_type == F_RDLCK)
1335 1335 lock_effect = FLK_UPGRADE;
1336 1336 else
1337 1337 lock_effect = FLK_STAY_SAME;
1338 1338
1339 1339 if (lock->l_end < request->l_start) {
1340 1340 if (lock->l_end == request->l_start - 1 &&
1341 1341 lock_effect == FLK_STAY_SAME) {
1342 1342 topology[0] = request;
1343 1343 request->l_start = lock->l_start;
1344 1344 nvertex = 1;
1345 1345 goto recompute;
1346 1346 } else {
1347 1347 return (0);
1348 1348 }
1349 1349 }
1350 1350
1351 1351 if (lock->l_start > request->l_end) {
1352 1352 if (request->l_end == lock->l_start - 1 &&
1353 1353 lock_effect == FLK_STAY_SAME) {
1354 1354 topology[0] = request;
1355 1355 request->l_end = lock->l_end;
1356 1356 nvertex = 1;
1357 1357 goto recompute;
1358 1358 } else {
1359 1359 return (1);
1360 1360 }
1361 1361 }
1362 1362
1363 1363 if (request->l_end < lock->l_end) {
1364 1364 if (request->l_start > lock->l_start) {
1365 1365 if (lock_effect == FLK_STAY_SAME) {
1366 1366 request->l_start = lock->l_start;
1367 1367 request->l_end = lock->l_end;
1368 1368 topology[0] = request;
1369 1369 nvertex = 1;
1370 1370 } else {
1371 1371 lock1 = flk_get_lock();
1372 1372 lock2 = flk_get_lock();
1373 1373 COPY(lock1, lock);
1374 1374 COPY(lock2, lock);
1375 1375 lock1->l_start = lock->l_start;
1376 1376 lock1->l_end = request->l_start - 1;
1377 1377 lock2->l_start = request->l_end + 1;
1378 1378 lock2->l_end = lock->l_end;
1379 1379 topology[0] = lock1;
1380 1380 topology[1] = lock2;
1381 1381 topology[2] = request;
1382 1382 nvertex = 3;
1383 1383 }
1384 1384 } else if (request->l_start < lock->l_start) {
1385 1385 if (lock_effect == FLK_STAY_SAME) {
1386 1386 request->l_end = lock->l_end;
1387 1387 topology[0] = request;
1388 1388 nvertex = 1;
1389 1389 } else {
1390 1390 lock1 = flk_get_lock();
1391 1391 COPY(lock1, lock);
1392 1392 lock1->l_start = request->l_end + 1;
1393 1393 topology[0] = lock1;
1394 1394 topology[1] = request;
1395 1395 nvertex = 2;
1396 1396 }
1397 1397 } else {
1398 1398 if (lock_effect == FLK_STAY_SAME) {
1399 1399 request->l_start = lock->l_start;
1400 1400 request->l_end = lock->l_end;
1401 1401 topology[0] = request;
1402 1402 nvertex = 1;
1403 1403 } else {
1404 1404 lock1 = flk_get_lock();
1405 1405 COPY(lock1, lock);
1406 1406 lock1->l_start = request->l_end + 1;
1407 1407 topology[0] = lock1;
1408 1408 topology[1] = request;
1409 1409 nvertex = 2;
1410 1410 }
1411 1411 }
1412 1412 } else if (request->l_end > lock->l_end) {
1413 1413 if (request->l_start > lock->l_start) {
1414 1414 if (lock_effect == FLK_STAY_SAME) {
1415 1415 request->l_start = lock->l_start;
1416 1416 topology[0] = request;
1417 1417 nvertex = 1;
1418 1418 } else {
1419 1419 lock1 = flk_get_lock();
1420 1420 COPY(lock1, lock);
1421 1421 lock1->l_end = request->l_start - 1;
1422 1422 topology[0] = lock1;
1423 1423 topology[1] = request;
1424 1424 nvertex = 2;
1425 1425 }
1426 1426 } else if (request->l_start < lock->l_start) {
1427 1427 topology[0] = request;
1428 1428 nvertex = 1;
1429 1429 } else {
1430 1430 topology[0] = request;
1431 1431 nvertex = 1;
1432 1432 }
1433 1433 } else {
1434 1434 if (request->l_start > lock->l_start) {
1435 1435 if (lock_effect == FLK_STAY_SAME) {
1436 1436 request->l_start = lock->l_start;
1437 1437 topology[0] = request;
1438 1438 nvertex = 1;
1439 1439 } else {
1440 1440 lock1 = flk_get_lock();
1441 1441 COPY(lock1, lock);
1442 1442 lock1->l_end = request->l_start - 1;
1443 1443 topology[0] = lock1;
1444 1444 topology[1] = request;
1445 1445 nvertex = 2;
1446 1446 }
1447 1447 } else if (request->l_start < lock->l_start) {
1448 1448 topology[0] = request;
1449 1449 nvertex = 1;
1450 1450 } else {
1451 1451 if (lock_effect != FLK_UNLOCK) {
1452 1452 topology[0] = request;
1453 1453 nvertex = 1;
1454 1454 } else {
1455 1455 flk_delete_active_lock(lock, 0);
1456 1456 flk_wakeup(lock, 1);
1457 1457 flk_free_lock(lock);
1458 1458 CHECK_SLEEPING_LOCKS(gp);
1459 1459 CHECK_ACTIVE_LOCKS(gp);
1460 1460 return (1);
1461 1461 }
1462 1462 }
1463 1463 }
1464 1464
1465 1465 recompute:
1466 1466
1467 1467 /*
1468 1468 	 * For unlock we don't send the 'request' for recomputing
1469 1469 * dependencies because no lock will add an edge to this.
1470 1470 */
1471 1471
1472 1472 if (lock_effect == FLK_UNLOCK) {
1473 1473 topology[nvertex-1] = NULL;
1474 1474 nvertex--;
1475 1475 }
1476 1476 for (i = 0; i < nvertex; i++) {
1477 1477 topology[i]->l_state |= RECOMPUTE_LOCK;
1478 1478 topology[i]->l_color = NO_COLOR;
1479 1479 }
1480 1480
1481 1481 ASSERT(FIRST_ADJ(lock) == HEAD(lock));
1482 1482
1483 1483 /*
1484 1484 * we remove the adjacent edges for all vertices' to this vertex
1485 1485 * 'lock'.
1486 1486 */
1487 1487
1488 1488 ep = FIRST_IN(lock);
1489 1489 while (ep != HEAD(lock)) {
1490 1490 ADJ_LIST_REMOVE(ep);
1491 1491 ep = NEXT_IN(ep);
1492 1492 }
1493 1493
1494 1494 flk_delete_active_lock(lock, 0);
1495 1495
1496 1496 /* We are ready for recomputing the dependencies now */
1497 1497
1498 1498 flk_recompute_dependencies(lock, topology, nvertex, 1);
1499 1499
1500 1500 for (i = 0; i < nvertex; i++) {
1501 1501 topology[i]->l_state &= ~RECOMPUTE_LOCK;
1502 1502 topology[i]->l_color = NO_COLOR;
1503 1503 }
1504 1504
1505 1505
1506 1506 if (lock_effect == FLK_UNLOCK) {
1507 1507 nvertex++;
1508 1508 }
1509 1509 for (i = 0; i < nvertex - 1; i++) {
1510 1510 flk_insert_active_lock(topology[i]);
1511 1511 }
1512 1512
1513 1513
1514 1514 if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) {
1515 1515 flk_wakeup(lock, 0);
1516 1516 } else {
1517 1517 ep = FIRST_IN(lock);
1518 1518 while (ep != HEAD(lock)) {
1519 1519 lock->l_sedge = NEXT_IN(ep);
1520 1520 IN_LIST_REMOVE(ep);
1521 1521 flk_update_proc_graph(ep, 1);
1522 1522 flk_free_edge(ep);
1523 1523 ep = lock->l_sedge;
1524 1524 }
1525 1525 }
1526 1526 flk_free_lock(lock);
1527 1527
1528 1528 CHECK_SLEEPING_LOCKS(gp);
1529 1529 CHECK_ACTIVE_LOCKS(gp);
1530 1530 return (0);
1531 1531 }
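
(Editor's aside, a hedged user-space illustration of the middle-overlap case
above; the file name and byte ranges are arbitrary. A same-owner unlock
strictly inside an existing lock hits the request->l_start > lock->l_start,
request->l_end < lock->l_end branch with lock_effect == FLK_UNLOCK: the
original lock is deleted and two new locks covering the non-overlapping ends
are inserted in its place.)

	struct flock fl;
	int fd = open("/tmp/split_demo", O_RDWR | O_CREAT, 0644);

	/* One active write lock covering bytes 100..199. */
	fl.l_type = F_WRLCK;
	fl.l_whence = SEEK_SET;
	fl.l_start = 100;
	fl.l_len = 100;
	(void) fcntl(fd, F_SETLK, &fl);

	/*
	 * Unlock bytes 120..149: flk_relation() replaces the single active
	 * lock with two new active locks, [100,119] and [150,199].
	 */
	fl.l_type = F_UNLCK;
	fl.l_start = 120;
	fl.l_len = 30;
	(void) fcntl(fd, F_SETLK, &fl);

Another process querying with F_GETLK would then see no conflict at, say,
offset 130, while offsets 110 or 160 would still report this process's
write lock.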
1532 1532
1533 1533 /*
1534 1534 * Insert a lock into the active queue.
1535 1535 */
1536 1536
1537 1537 static void
1538 1538 flk_insert_active_lock(lock_descriptor_t *new_lock)
1539 1539 {
1540 1540 graph_t *gp = new_lock->l_graph;
1541 1541 vnode_t *vp = new_lock->l_vnode;
1542 1542 lock_descriptor_t *first_lock, *lock;
1543 1543
1544 1544 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1545 1545
1546 1546 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1547 1547 first_lock = lock;
1548 1548
1549 1549 if (first_lock != NULL) {
1550 1550 for (; (lock->l_vnode == vp &&
1551 1551 lock->l_start < new_lock->l_start); lock = lock->l_next)
1552 1552 ;
1553 1553 } else {
1554 1554 lock = ACTIVE_HEAD(gp);
1555 1555 }
1556 1556
1557 1557 lock->l_prev->l_next = new_lock;
1558 1558 new_lock->l_next = lock;
1559 1559 new_lock->l_prev = lock->l_prev;
1560 1560 lock->l_prev = new_lock;
1561 1561
1562 1562 if (first_lock == NULL || (new_lock->l_start <= first_lock->l_start)) {
1563 1563 vp->v_filocks = (struct filock *)new_lock;
1564 1564 }
1565 1565 flk_set_state(new_lock, FLK_ACTIVE_STATE);
1566 1566 new_lock->l_state |= ACTIVE_LOCK;
1567 1567
1568 1568 CHECK_ACTIVE_LOCKS(gp);
1569 1569 CHECK_SLEEPING_LOCKS(gp);
1570 1570 }
1571 1571
1572 1572 /*
1573 1573 * Delete the active lock : Performs two functions depending on the
1574 1574 * value of second parameter. One is to remove from the active lists
1575 1575 * only and other is to both remove and free the lock.
1576 1576 */
1577 1577
1578 1578 static void
1579 1579 flk_delete_active_lock(lock_descriptor_t *lock, int free_lock)
1580 1580 {
1581 1581 vnode_t *vp = lock->l_vnode;
1582 1582 graph_t *gp = lock->l_graph;
1583 1583
1584 1584 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1585 1585 if (free_lock)
1586 1586 ASSERT(NO_DEPENDENTS(lock));
1587 1587 ASSERT(NOT_BLOCKED(lock));
1588 1588 ASSERT(IS_ACTIVE(lock));
1589 1589
1590 1590 ASSERT((vp->v_filocks != NULL));
1591 1591
1592 1592 if (vp->v_filocks == (struct filock *)lock) {
1593 1593 vp->v_filocks = (struct filock *)
1594 1594 ((lock->l_next->l_vnode == vp) ? lock->l_next :
1595 1595 NULL);
1596 1596 }
1597 1597 lock->l_next->l_prev = lock->l_prev;
1598 1598 lock->l_prev->l_next = lock->l_next;
1599 1599 lock->l_next = lock->l_prev = NULL;
1600 1600 flk_set_state(lock, FLK_DEAD_STATE);
1601 1601 lock->l_state &= ~ACTIVE_LOCK;
1602 1602
1603 1603 if (free_lock)
1604 1604 flk_free_lock(lock);
1605 1605 CHECK_ACTIVE_LOCKS(gp);
1606 1606 CHECK_SLEEPING_LOCKS(gp);
1607 1607 }
1608 1608
1609 1609 /*
1610 1610 * Insert into the sleep queue.
1611 1611 */
1612 1612
1613 1613 static void
1614 1614 flk_insert_sleeping_lock(lock_descriptor_t *request)
1615 1615 {
1616 1616 graph_t *gp = request->l_graph;
1617 1617 vnode_t *vp = request->l_vnode;
1618 1618 lock_descriptor_t *lock;
1619 1619
1620 1620 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1621 1621 ASSERT(IS_INITIAL(request));
1622 1622
1623 1623 for (lock = gp->sleeping_locks.l_next; (lock != &gp->sleeping_locks &&
1624 1624 lock->l_vnode < vp); lock = lock->l_next)
1625 1625 ;
1626 1626
1627 1627 lock->l_prev->l_next = request;
1628 1628 request->l_prev = lock->l_prev;
1629 1629 lock->l_prev = request;
1630 1630 request->l_next = lock;
1631 1631 flk_set_state(request, FLK_SLEEPING_STATE);
1632 1632 request->l_state |= SLEEPING_LOCK;
1633 1633 }
1634 1634
1635 1635 /*
1636 1636 * Cancelling a sleeping lock implies removing a vertex from the
1637 1637 * dependency graph and therefore we should recompute the dependencies
1638 1638 * of all vertices that have a path to this vertex, w.r.t. all
1639 1639 * vertices reachable from this vertex.
1640 1640 */
1641 1641
1642 1642 void
1643 1643 flk_cancel_sleeping_lock(lock_descriptor_t *request, int remove_from_queue)
1644 1644 {
1645 1645 graph_t *gp = request->l_graph;
1646 1646 vnode_t *vp = request->l_vnode;
1647 1647 lock_descriptor_t **topology = NULL;
1648 1648 edge_t *ep;
1649 1649 lock_descriptor_t *vertex, *lock;
1650 1650 int nvertex = 0;
1651 1651 int i;
1652 1652 lock_descriptor_t *vertex_stack;
1653 1653
1654 1654 STACK_INIT(vertex_stack);
1655 1655
1656 1656 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1657 1657 /*
1658 1658 	 * count the number of vertex pointers that have to be allocated
1659 1659 * All vertices that are reachable from request.
1660 1660 */
1661 1661
1662 1662 STACK_PUSH(vertex_stack, request, l_stack);
1663 1663
1664 1664 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1665 1665 STACK_POP(vertex_stack, l_stack);
1666 1666 for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
1667 1667 ep = NEXT_ADJ(ep)) {
1668 1668 if (IS_RECOMPUTE(ep->to_vertex))
1669 1669 continue;
1670 1670 ep->to_vertex->l_state |= RECOMPUTE_LOCK;
1671 1671 STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1672 1672 nvertex++;
1673 1673 }
1674 1674 }
1675 1675
1676 1676 /*
1677 1677 * allocate memory for holding the vertex pointers
1678 1678 */
1679 1679
1680 1680 if (nvertex) {
1681 1681 topology = kmem_zalloc(nvertex * sizeof (lock_descriptor_t *),
1682 1682 KM_SLEEP);
1683 1683 }
1684 1684
1685 1685 /*
1686 1686 * one more pass to actually store the vertices in the
1687 1687 * allocated array.
1688 1688 * We first check sleeping locks and then active locks
1689 1689 * so that topology array will be in a topological
1690 1690 * order.
1691 1691 */
1692 1692
1693 1693 nvertex = 0;
1694 1694 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1695 1695
1696 1696 if (lock) {
1697 1697 do {
1698 1698 if (IS_RECOMPUTE(lock)) {
1699 1699 lock->l_index = nvertex;
1700 1700 topology[nvertex++] = lock;
1701 1701 }
1702 1702 lock->l_color = NO_COLOR;
1703 1703 lock = lock->l_next;
1704 1704 } while (lock->l_vnode == vp);
1705 1705 }
1706 1706
1707 1707 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1708 1708
1709 1709 if (lock) {
1710 1710 do {
1711 1711 if (IS_RECOMPUTE(lock)) {
1712 1712 lock->l_index = nvertex;
1713 1713 topology[nvertex++] = lock;
1714 1714 }
1715 1715 lock->l_color = NO_COLOR;
1716 1716 lock = lock->l_next;
1717 1717 } while (lock->l_vnode == vp);
1718 1718 }
1719 1719
1720 1720 /*
1721 1721 	 * Remove the in- and out-edges of request.
1722 1722 	 * They are freed after updating the proc graph below.
1723 1723 */
1724 1724
1725 1725 for (ep = FIRST_IN(request); ep != HEAD(request); ep = NEXT_IN(ep)) {
1726 1726 ADJ_LIST_REMOVE(ep);
1727 1727 }
1728 1728
1729 1729
1730 1730 if (remove_from_queue)
1731 1731 REMOVE_SLEEP_QUEUE(request);
1732 1732
1733 1733 /* we are ready to recompute */
1734 1734
1735 1735 flk_recompute_dependencies(request, topology, nvertex, 1);
1736 1736
1737 1737 ep = FIRST_ADJ(request);
1738 1738 while (ep != HEAD(request)) {
1739 1739 IN_LIST_REMOVE(ep);
1740 1740 request->l_sedge = NEXT_ADJ(ep);
1741 1741 ADJ_LIST_REMOVE(ep);
1742 1742 flk_update_proc_graph(ep, 1);
1743 1743 flk_free_edge(ep);
1744 1744 ep = request->l_sedge;
1745 1745 }
1746 1746
1747 1747
1748 1748 /*
1749 1749 * unset the RECOMPUTE flag in those vertices
1750 1750 */
1751 1751
1752 1752 for (i = 0; i < nvertex; i++) {
1753 1753 topology[i]->l_state &= ~RECOMPUTE_LOCK;
1754 1754 }
1755 1755
1756 1756 /*
1757 1757 * free the topology
1758 1758 */
1759 1759 if (nvertex)
1760 1760 kmem_free((void *)topology,
1761 1761 (nvertex * sizeof (lock_descriptor_t *)));
1762 1762 /*
1763 1763 	 * Some locks may have become unblocked now.
1764 1764 */
1765 1765
1766 1766 flk_wakeup(request, 0);
1767 1767
1768 1768 /*
1769 1769 * we expect to have a correctly recomputed graph now.
1770 1770 */
1771 1771 flk_set_state(request, FLK_DEAD_STATE);
1772 1772 flk_free_lock(request);
1773 1773 CHECK_SLEEPING_LOCKS(gp);
1774 1774 CHECK_ACTIVE_LOCKS(gp);
1775 1775
1776 1776 }
1777 1777
1778 1778 /*
1779 1779  * Uncoloring the graph simply increments the graph's mark value;
1780 1780  * only when the mark wraps around do we explicitly reset the color
1781 1781  * of every vertex in the graph.
1782 1782 */
1783 1783
1784 1784 static void
1785 1785 flk_graph_uncolor(graph_t *gp)
1786 1786 {
1787 1787 lock_descriptor_t *lock;
1788 1788
1789 1789 if (gp->mark == UINT_MAX) {
1790 1790 gp->mark = 1;
1791 1791 for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
1792 1792 lock = lock->l_next)
1793 1793 lock->l_color = 0;
1794 1794
1795 1795 for (lock = SLEEPING_HEAD(gp)->l_next; lock != SLEEPING_HEAD(gp);
1796 1796 lock = lock->l_next)
1797 1797 lock->l_color = 0;
1798 1798 } else {
1799 1799 gp->mark++;
1800 1800 }
1801 1801 }
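
/*
 * A minimal stand-alone sketch of the mark-based coloring idiom assumed by
 * flk_graph_uncolor() above. All names here (ex_graph, ex_uncolor, ...) are
 * illustrative only; in the kernel the COLOR()/COLORED() macros compare a
 * lock's l_color against the graph's mark in the same way.
 */
#include <limits.h>

struct ex_vertex { unsigned int color; };
struct ex_graph { unsigned int mark; struct ex_vertex *v; int nv; };

/* "Uncolor" every vertex in O(1) by bumping the graph's mark ... */
static void
ex_uncolor(struct ex_graph *g)
{
	int i;

	if (g->mark == UINT_MAX) {	/* ... except on wraparound */
		for (i = 0; i < g->nv; i++)
			g->v[i].color = 0;
		g->mark = 1;
	} else {
		g->mark++;
	}
}

/* A vertex is colored iff its color equals the current mark. */
static int
ex_colored(struct ex_graph *g, struct ex_vertex *v)
{
	return (v->color == g->mark);
}

static void
ex_color(struct ex_graph *g, struct ex_vertex *v)
{
	v->color = g->mark;
}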
1802 1802
1803 1803 /*
1804 1804 * Wake up locks that are blocked on the given lock.
1805 1805 */
1806 1806
1807 1807 static void
1808 1808 flk_wakeup(lock_descriptor_t *lock, int adj_list_remove)
1809 1809 {
1810 1810 edge_t *ep;
1811 1811 graph_t *gp = lock->l_graph;
1812 1812 lock_descriptor_t *lck;
1813 1813
1814 1814 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1815 1815 if (NO_DEPENDENTS(lock))
1816 1816 return;
1817 1817 ep = FIRST_IN(lock);
1818 1818 do {
1819 1819 /*
1820 1820 		 * Delete the edge from the adjacency list of the
1821 1821 		 * from-vertex. If that vertex has no more adjacent
1822 1822 		 * edges, wake up its process.
1823 1823 */
1824 1824 lck = ep->from_vertex;
1825 1825 if (adj_list_remove)
1826 1826 ADJ_LIST_REMOVE(ep);
1827 1827 flk_update_proc_graph(ep, 1);
1828 1828 if (NOT_BLOCKED(lck)) {
1829 1829 GRANT_WAKEUP(lck);
1830 1830 }
1831 1831 lock->l_sedge = NEXT_IN(ep);
1832 1832 IN_LIST_REMOVE(ep);
1833 1833 flk_free_edge(ep);
1834 1834 ep = lock->l_sedge;
1835 1835 } while (ep != HEAD(lock));
1836 1836 ASSERT(NO_DEPENDENTS(lock));
1837 1837 }
1838 1838
1839 1839 /*
1840 1840  * Each dependent of request is checked for dependency against the
1841 1841  * locks in topology (so named because the array is, and must be, in
1842 1842  * topological order for this algorithm; if it is not, the inner loop
1843 1843  * below may add more edges than necessary. Topological ordering of
1844 1844  * the vertices means all edges go from left to right, i.e.,
1845 1845  * topology[i] can have an edge to topology[j] only if i < j).
1846 1846  * If lock l1 in the dependent set of request is dependent on
1847 1847  * (blocked by) lock l2 in topology but does not already have a path
1848 1848  * to it, we add an edge in the inner loop below.
1849 1849  *
1850 1850  * We don't want to add an edge between l1 and l2 if a path from l1
1851 1851  * to l2 already exists, so care has to be taken with those vertices
1852 1852  * that have two paths to 'request'. These vertices are referred to
1853 1853  * here as barrier locks.
1854 1854  *
1855 1855  * The barriers (those vertices that originally had two or more paths
1856 1856  * to request) have to be found first; otherwise we may add needless
1857 1857  * edges to vertices in topology, leaving a barrier vertex with both
1858 1858  * an edge to a vertex in topology and a path to it.
1859 1859 */
1860 1860
1861 1861 static void
1862 1862 flk_recompute_dependencies(lock_descriptor_t *request,
1863 1863 lock_descriptor_t **topology,
1864 1864 int nvertex, int update_graph)
1865 1865 {
1866 1866 lock_descriptor_t *vertex, *lock;
1867 1867 graph_t *gp = request->l_graph;
1868 1868 int i, count;
1869 1869 int barrier_found = 0;
1870 1870 edge_t *ep;
1871 1871 lock_descriptor_t *vertex_stack;
1872 1872
1873 1873 STACK_INIT(vertex_stack);
1874 1874
1875 1875 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1876 1876 if (nvertex == 0)
1877 1877 return;
1878 1878 flk_graph_uncolor(request->l_graph);
1879 1879 barrier_found = flk_find_barriers(request);
1880 1880 request->l_state |= RECOMPUTE_DONE;
1881 1881
1882 1882 STACK_PUSH(vertex_stack, request, l_stack);
1883 1883 request->l_sedge = FIRST_IN(request);
1884 1884
1885 1885
1886 1886 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1887 1887 if (vertex->l_state & RECOMPUTE_DONE) {
1888 1888 count = 0;
1889 1889 goto next_in_edge;
1890 1890 }
1891 1891 if (IS_BARRIER(vertex)) {
1892 1892 /* decrement the barrier count */
1893 1893 if (vertex->l_index) {
1894 1894 vertex->l_index--;
1895 1895 /* this guy will be pushed again anyway ? */
1896 1896 STACK_POP(vertex_stack, l_stack);
1897 1897 if (vertex->l_index == 0) {
1898 1898 /*
1899 1899 * barrier is over we can recompute
1900 1900 * dependencies for this lock in the
1901 1901 * next stack pop
1902 1902 */
1903 1903 vertex->l_state &= ~BARRIER_LOCK;
1904 1904 }
1905 1905 continue;
1906 1906 }
1907 1907 }
1908 1908 vertex->l_state |= RECOMPUTE_DONE;
1909 1909 flk_graph_uncolor(gp);
1910 1910 count = flk_color_reachables(vertex);
1911 1911 for (i = 0; i < nvertex; i++) {
1912 1912 lock = topology[i];
1913 1913 if (COLORED(lock))
1914 1914 continue;
1915 1915 if (BLOCKS(lock, vertex)) {
1916 1916 (void) flk_add_edge(vertex, lock,
1917 1917 NO_CHECK_CYCLE, update_graph);
1918 1918 COLOR(lock);
1919 1919 count++;
1920 1920 count += flk_color_reachables(lock);
1921 1921 }
1922 1922
1923 1923 }
1924 1924
1925 1925 next_in_edge:
1926 1926 if (count == nvertex ||
1927 1927 vertex->l_sedge == HEAD(vertex)) {
1928 1928 /* prune the tree below this */
1929 1929 STACK_POP(vertex_stack, l_stack);
1930 1930 vertex->l_state &= ~RECOMPUTE_DONE;
1931 1931 /* update the barrier locks below this! */
1932 1932 if (vertex->l_sedge != HEAD(vertex) && barrier_found) {
1933 1933 flk_graph_uncolor(gp);
1934 1934 flk_update_barriers(vertex);
1935 1935 }
1936 1936 continue;
1937 1937 }
1938 1938
1939 1939 ep = vertex->l_sedge;
1940 1940 lock = ep->from_vertex;
1941 1941 STACK_PUSH(vertex_stack, lock, l_stack);
1942 1942 lock->l_sedge = FIRST_IN(lock);
1943 1943 vertex->l_sedge = NEXT_IN(ep);
1944 1944 }
1945 1945
1946 1946 }
1947 1947
1948 1948 /*
1949 1949  * Color all not-yet-colored vertices reachable from 'vertex', and return
1950 1950  * the number of them that belong to topology (have RECOMPUTE_LOCK set).
1951 1951 *
1952 1952 * Note: we need to use a different stack_link l_stack1 because this is
1953 1953 * called from flk_recompute_dependencies() that already uses a stack with
1954 1954 * l_stack as stack_link.
1955 1955 */
1956 1956
1957 1957 static int
1958 1958 flk_color_reachables(lock_descriptor_t *vertex)
1959 1959 {
1960 1960 lock_descriptor_t *ver, *lock;
1961 1961 int count;
1962 1962 edge_t *ep;
1963 1963 lock_descriptor_t *vertex_stack;
1964 1964
1965 1965 STACK_INIT(vertex_stack);
1966 1966
1967 1967 STACK_PUSH(vertex_stack, vertex, l_stack1);
1968 1968 count = 0;
1969 1969 while ((ver = STACK_TOP(vertex_stack)) != NULL) {
1970 1970
1971 1971 STACK_POP(vertex_stack, l_stack1);
1972 1972 for (ep = FIRST_ADJ(ver); ep != HEAD(ver);
1973 1973 ep = NEXT_ADJ(ep)) {
1974 1974 lock = ep->to_vertex;
1975 1975 if (COLORED(lock))
1976 1976 continue;
1977 1977 COLOR(lock);
1978 1978 if (IS_RECOMPUTE(lock))
1979 1979 count++;
1980 1980 STACK_PUSH(vertex_stack, lock, l_stack1);
1981 1981 }
1982 1982
1983 1983 }
1984 1984 return (count);
1985 1985 }
1986 1986
1987 1987 /*
1988 1988  * Called from flk_recompute_dependencies(), this routine decrements
1989 1989 * the barrier count of barrier vertices that are reachable from lock.
1990 1990 */
1991 1991
1992 1992 static void
1993 1993 flk_update_barriers(lock_descriptor_t *lock)
1994 1994 {
1995 1995 lock_descriptor_t *vertex, *lck;
1996 1996 edge_t *ep;
1997 1997 lock_descriptor_t *vertex_stack;
1998 1998
1999 1999 STACK_INIT(vertex_stack);
2000 2000
2001 2001 STACK_PUSH(vertex_stack, lock, l_stack1);
2002 2002
2003 2003 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2004 2004 STACK_POP(vertex_stack, l_stack1);
2005 2005 for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2006 2006 ep = NEXT_IN(ep)) {
2007 2007 lck = ep->from_vertex;
2008 2008 if (COLORED(lck)) {
2009 2009 if (IS_BARRIER(lck)) {
2010 2010 ASSERT(lck->l_index > 0);
2011 2011 lck->l_index--;
2012 2012 if (lck->l_index == 0)
2013 2013 lck->l_state &= ~BARRIER_LOCK;
2014 2014 }
2015 2015 continue;
2016 2016 }
2017 2017 COLOR(lck);
2018 2018 if (IS_BARRIER(lck)) {
2019 2019 ASSERT(lck->l_index > 0);
2020 2020 lck->l_index--;
2021 2021 if (lck->l_index == 0)
2022 2022 lck->l_state &= ~BARRIER_LOCK;
2023 2023 }
2024 2024 STACK_PUSH(vertex_stack, lck, l_stack1);
2025 2025 }
2026 2026 }
2027 2027 }
2028 2028
2029 2029 /*
2030 2030  * Find all vertices that are reachable from 'lock' more than once,
2031 2031  * mark them as barrier vertices, and increment their barrier count.
2032 2032  * The barrier count is the total number of paths from lock to that
2033 2033  * vertex, minus one (e.g., a vertex with three paths gets count two).
2034 2034 */
2035 2035
2036 2036 static int
2037 2037 flk_find_barriers(lock_descriptor_t *lock)
2038 2038 {
2039 2039 lock_descriptor_t *vertex, *lck;
2040 2040 int found = 0;
2041 2041 edge_t *ep;
2042 2042 lock_descriptor_t *vertex_stack;
2043 2043
2044 2044 STACK_INIT(vertex_stack);
2045 2045
2046 2046 STACK_PUSH(vertex_stack, lock, l_stack1);
2047 2047
2048 2048 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2049 2049 STACK_POP(vertex_stack, l_stack1);
2050 2050 for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2051 2051 ep = NEXT_IN(ep)) {
2052 2052 lck = ep->from_vertex;
2053 2053 if (COLORED(lck)) {
2054 2054 /* this is a barrier */
2055 2055 lck->l_state |= BARRIER_LOCK;
2056 2056 /* index will have barrier count */
2057 2057 lck->l_index++;
2058 2058 if (!found)
2059 2059 found = 1;
2060 2060 continue;
2061 2061 }
2062 2062 COLOR(lck);
2063 2063 lck->l_index = 0;
2064 2064 STACK_PUSH(vertex_stack, lck, l_stack1);
2065 2065 }
2066 2066 }
2067 2067 return (found);
2068 2068 }
2069 2069
2070 2070 /*
2071 2071 * Finds the first lock that is mainly responsible for blocking this
2072 2072 * request. If there is no such lock, request->l_flock.l_type is set to
2073 2073 * F_UNLCK. Otherwise, request->l_flock is filled in with the particulars
2074 2074 * of the blocking lock.
2075 2075 *
2076 2076 * Note: It is possible a request is blocked by a sleeping lock because
2077 2077 * of the fairness policy used in flk_process_request() to construct the
2078 2078 * dependencies. (see comments before flk_process_request()).
2079 2079 */
2080 2080
2081 2081 static void
2082 2082 flk_get_first_blocking_lock(lock_descriptor_t *request)
2083 2083 {
2084 2084 graph_t *gp = request->l_graph;
2085 2085 vnode_t *vp = request->l_vnode;
2086 2086 lock_descriptor_t *lock, *blocker;
2087 2087
2088 2088 ASSERT(MUTEX_HELD(&gp->gp_mutex));
2089 2089 blocker = NULL;
2090 2090 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2091 2091
2092 2092 if (lock) {
2093 2093 do {
2094 2094 if (BLOCKS(lock, request)) {
2095 2095 blocker = lock;
2096 2096 break;
2097 2097 }
2098 2098 lock = lock->l_next;
2099 2099 } while (lock->l_vnode == vp);
2100 2100 }
2101 2101
2102 + if (blocker == NULL && request->l_flock.l_type == F_RDLCK) {
2103 + /*
2104 + * No active lock is blocking this request, but if a read
2105 + * lock is requested, it may also get blocked by a waiting
2106 + * writer. So search all sleeping locks and see if there is
2107 + * a writer waiting.
2108 + */
2109 + SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2110 + if (lock) {
2111 + do {
2112 + if (BLOCKS(lock, request)) {
2113 + blocker = lock;
2114 + break;
2115 + }
2116 + lock = lock->l_next;
2117 + } while (lock->l_vnode == vp);
2118 + }
2119 + }
2120 +
2102 2121 if (blocker) {
2103 2122 report_blocker(blocker, request);
2104 2123 } else
2105 2124 request->l_flock.l_type = F_UNLCK;
2106 2125 }
2107 2126
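/*
 * A minimal user-level sketch of the behaviour the block above provides for
 * F_GETLK. The helper name who_blocks_read() and the whole-file range are
 * illustrative only; with the sleeping-lock search in place, a queued writer
 * can be reported as the blocker, not just an active lock.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static void
who_blocks_read(int fd)
{
	struct flock fl;

	fl.l_type = F_RDLCK;		/* would a read lock be granted? */
	fl.l_whence = SEEK_SET;
	fl.l_start = 0;
	fl.l_len = 0;			/* 0 means "to end of file" */

	if (fcntl(fd, F_GETLK, &fl) == -1) {
		perror("F_GETLK");
		return;
	}
	if (fl.l_type == F_UNLCK) {
		(void) printf("read lock would be granted\n");
	} else {
		(void) printf("blocked by a %s lock held by pid %ld\n",
		    fl.l_type == F_WRLCK ? "write" : "read",
		    (long)fl.l_pid);
	}
}
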
2108 2127 /*
2109 2128 * Get the graph_t structure associated with a vnode.
2110 2129 * If 'initialize' is non-zero, and the graph_t structure for this vnode has
2111 2130 * not yet been initialized, then a new element is allocated and returned.
2112 2131 */
2113 2132 graph_t *
2114 2133 flk_get_lock_graph(vnode_t *vp, int initialize)
2115 2134 {
2116 2135 graph_t *gp;
2117 2136 graph_t *gp_alloc = NULL;
2118 2137 int index = HASH_INDEX(vp);
2119 2138
2120 2139 if (initialize == FLK_USE_GRAPH) {
2121 2140 mutex_enter(&flock_lock);
2122 2141 gp = lock_graph[index];
2123 2142 mutex_exit(&flock_lock);
2124 2143 return (gp);
2125 2144 }
2126 2145
2127 2146 ASSERT(initialize == FLK_INIT_GRAPH);
2128 2147
2129 2148 if (lock_graph[index] == NULL) {
2130 2149
2131 2150 gp_alloc = kmem_zalloc(sizeof (graph_t), KM_SLEEP);
2132 2151
2133 2152 /* Initialize the graph */
2134 2153
2135 2154 gp_alloc->active_locks.l_next =
2136 2155 gp_alloc->active_locks.l_prev =
2137 2156 (lock_descriptor_t *)ACTIVE_HEAD(gp_alloc);
2138 2157 gp_alloc->sleeping_locks.l_next =
2139 2158 gp_alloc->sleeping_locks.l_prev =
2140 2159 (lock_descriptor_t *)SLEEPING_HEAD(gp_alloc);
2141 2160 gp_alloc->index = index;
2142 2161 mutex_init(&gp_alloc->gp_mutex, NULL, MUTEX_DEFAULT, NULL);
2143 2162 }
2144 2163
2145 2164 mutex_enter(&flock_lock);
2146 2165
2147 2166 gp = lock_graph[index];
2148 2167
2149 2168 /* Recheck the value within flock_lock */
2150 2169 if (gp == NULL) {
2151 2170 struct flock_globals *fg;
2152 2171
2153 2172 /* We must have previously allocated the graph_t structure */
2154 2173 ASSERT(gp_alloc != NULL);
2155 2174 lock_graph[index] = gp = gp_alloc;
2156 2175 /*
2157 2176 * The lockmgr status is only needed if KLM is loaded.
2158 2177 */
2159 2178 if (flock_zone_key != ZONE_KEY_UNINITIALIZED) {
2160 2179 fg = flk_get_globals();
2161 2180 fg->lockmgr_status[index] = fg->flk_lockmgr_status;
2162 2181 }
2163 2182 }
2164 2183
2165 2184 mutex_exit(&flock_lock);
2166 2185
2167 2186 if ((gp_alloc != NULL) && (gp != gp_alloc)) {
2168 2187 /* There was a race to allocate the graph_t and we lost */
2169 2188 mutex_destroy(&gp_alloc->gp_mutex);
2170 2189 kmem_free(gp_alloc, sizeof (graph_t));
2171 2190 }
2172 2191
2173 2192 return (gp);
2174 2193 }
2175 2194
2176 2195 /*
2177 2196 * PSARC case 1997/292
2178 2197 */
2179 2198 int
2180 2199 cl_flk_has_remote_locks_for_nlmid(vnode_t *vp, int nlmid)
2181 2200 {
2182 2201 lock_descriptor_t *lock;
2183 2202 int result = 0;
2184 2203 graph_t *gp;
2185 2204 int lock_nlmid;
2186 2205
2187 2206 /*
2188 2207 * Check to see if node is booted as a cluster. If not, return.
2189 2208 */
2190 2209 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2191 2210 return (0);
2192 2211 }
2193 2212
2194 2213 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2195 2214 if (gp == NULL) {
2196 2215 return (0);
2197 2216 }
2198 2217
2199 2218 mutex_enter(&gp->gp_mutex);
2200 2219
2201 2220 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2202 2221
2203 2222 if (lock) {
2204 2223 while (lock->l_vnode == vp) {
2205 2224 /* get NLM id from sysid */
2206 2225 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2207 2226
2208 2227 /*
2209 2228 * If NLM server request _and_ nlmid of lock matches
2210 2229 * nlmid of argument, then we've found a remote lock.
2211 2230 */
2212 2231 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2213 2232 result = 1;
2214 2233 goto done;
2215 2234 }
2216 2235 lock = lock->l_next;
2217 2236 }
2218 2237 }
2219 2238
2220 2239 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2221 2240
2222 2241 if (lock) {
2223 2242 while (lock->l_vnode == vp) {
2224 2243 /* get NLM id from sysid */
2225 2244 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2226 2245
2227 2246 /*
2228 2247 * If NLM server request _and_ nlmid of lock matches
2229 2248 * nlmid of argument, then we've found a remote lock.
2230 2249 */
2231 2250 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2232 2251 result = 1;
2233 2252 goto done;
2234 2253 }
2235 2254 lock = lock->l_next;
2236 2255 }
2237 2256 }
2238 2257
2239 2258 done:
2240 2259 mutex_exit(&gp->gp_mutex);
2241 2260 return (result);
2242 2261 }
2243 2262
2244 2263 /*
2245 2264 * Determine whether there are any locks for the given vnode with a remote
2246 2265 * sysid. Returns zero if not, non-zero if there are.
2247 2266 *
2248 2267 * Note that the return value from this function is potentially invalid
2249 2268 * once it has been returned. The caller is responsible for providing its
2250 2269 * own synchronization mechanism to ensure that the return value is useful
2251 2270 * (e.g., see nfs_lockcompletion()).
2252 2271 */
2253 2272 int
2254 2273 flk_has_remote_locks(vnode_t *vp)
2255 2274 {
2256 2275 lock_descriptor_t *lock;
2257 2276 int result = 0;
2258 2277 graph_t *gp;
2259 2278
2260 2279 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2261 2280 if (gp == NULL) {
2262 2281 return (0);
2263 2282 }
2264 2283
2265 2284 mutex_enter(&gp->gp_mutex);
2266 2285
2267 2286 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2268 2287
2269 2288 if (lock) {
2270 2289 while (lock->l_vnode == vp) {
2271 2290 if (IS_REMOTE(lock)) {
2272 2291 result = 1;
2273 2292 goto done;
2274 2293 }
2275 2294 lock = lock->l_next;
2276 2295 }
2277 2296 }
2278 2297
2279 2298 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2280 2299
2281 2300 if (lock) {
2282 2301 while (lock->l_vnode == vp) {
2283 2302 if (IS_REMOTE(lock)) {
2284 2303 result = 1;
2285 2304 goto done;
2286 2305 }
2287 2306 lock = lock->l_next;
2288 2307 }
2289 2308 }
2290 2309
2291 2310 done:
2292 2311 mutex_exit(&gp->gp_mutex);
2293 2312 return (result);
2294 2313 }
2295 2314
2296 2315 /*
2297 2316 * Determine whether there are any locks for the given vnode with a remote
2298 2317  * sysid matching the given sysid.
2299 2318 * Used by the new (open source) NFS Lock Manager (NLM)
2300 2319 */
2301 2320 int
2302 2321 flk_has_remote_locks_for_sysid(vnode_t *vp, int sysid)
2303 2322 {
2304 2323 lock_descriptor_t *lock;
2305 2324 int result = 0;
2306 2325 graph_t *gp;
2307 2326
2308 2327 if (sysid == 0)
2309 2328 return (0);
2310 2329
2311 2330 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2312 2331 if (gp == NULL) {
2313 2332 return (0);
2314 2333 }
2315 2334
2316 2335 mutex_enter(&gp->gp_mutex);
2317 2336
2318 2337 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2319 2338
2320 2339 if (lock) {
2321 2340 while (lock->l_vnode == vp) {
2322 2341 if (lock->l_flock.l_sysid == sysid) {
2323 2342 result = 1;
2324 2343 goto done;
2325 2344 }
2326 2345 lock = lock->l_next;
2327 2346 }
2328 2347 }
2329 2348
2330 2349 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2331 2350
2332 2351 if (lock) {
2333 2352 while (lock->l_vnode == vp) {
2334 2353 if (lock->l_flock.l_sysid == sysid) {
2335 2354 result = 1;
2336 2355 goto done;
2337 2356 }
2338 2357 lock = lock->l_next;
2339 2358 }
2340 2359 }
2341 2360
2342 2361 done:
2343 2362 mutex_exit(&gp->gp_mutex);
2344 2363 return (result);
2345 2364 }
2346 2365
2347 2366 /*
2348 2367 * Determine if there are any locks owned by the given sysid.
2349 2368 * Returns zero if not, non-zero if there are. Note that this return code
2350 2369 * could be derived from flk_get_{sleeping,active}_locks, but this routine
2351 2370 * avoids all the memory allocations of those routines.
2352 2371 *
2353 2372 * This routine has the same synchronization issues as
2354 2373 * flk_has_remote_locks.
2355 2374 */
2356 2375
2357 2376 int
2358 2377 flk_sysid_has_locks(int sysid, int lck_type)
2359 2378 {
2360 2379 int has_locks = 0;
2361 2380 lock_descriptor_t *lock;
2362 2381 graph_t *gp;
2363 2382 int i;
2364 2383
2365 2384 for (i = 0; i < HASH_SIZE && !has_locks; i++) {
2366 2385 mutex_enter(&flock_lock);
2367 2386 gp = lock_graph[i];
2368 2387 mutex_exit(&flock_lock);
2369 2388 if (gp == NULL) {
2370 2389 continue;
2371 2390 }
2372 2391
2373 2392 mutex_enter(&gp->gp_mutex);
2374 2393
2375 2394 if (lck_type & FLK_QUERY_ACTIVE) {
2376 2395 for (lock = ACTIVE_HEAD(gp)->l_next;
2377 2396 lock != ACTIVE_HEAD(gp) && !has_locks;
2378 2397 lock = lock->l_next) {
2379 2398 if (lock->l_flock.l_sysid == sysid)
2380 2399 has_locks = 1;
2381 2400 }
2382 2401 }
2383 2402
2384 2403 if (lck_type & FLK_QUERY_SLEEPING) {
2385 2404 for (lock = SLEEPING_HEAD(gp)->l_next;
2386 2405 lock != SLEEPING_HEAD(gp) && !has_locks;
2387 2406 lock = lock->l_next) {
2388 2407 if (lock->l_flock.l_sysid == sysid)
2389 2408 has_locks = 1;
2390 2409 }
2391 2410 }
2392 2411 mutex_exit(&gp->gp_mutex);
2393 2412 }
2394 2413
2395 2414 return (has_locks);
2396 2415 }
2397 2416
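/*
 * A hypothetical caller sketch: before tearing down per-client state, a lock
 * manager could ask whether the client (identified by sysid) still owns or
 * is waiting for any locks. The wrapper and the surrounding teardown logic
 * are assumptions of this sketch, not taken from any real caller.
 */
static int
client_still_has_locks(int sysid)
{
	return (flk_sysid_has_locks(sysid,
	    FLK_QUERY_ACTIVE | FLK_QUERY_SLEEPING));
}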
2398 2417
2399 2418 /*
2400 2419 * PSARC case 1997/292
2401 2420 *
2402 2421  * Requires: "sysid" is a pair [nlmid, sysid]. The lower half is a 16-bit
2403 2422 * quantity, the real sysid generated by the NLM server; the upper half
2404 2423 * identifies the node of the cluster where the NLM server ran.
2405 2424 * This routine is only called by an NLM server running in a cluster.
2406 2425 * Effects: Remove all locks held on behalf of the client identified
2407 2426 * by "sysid."
2408 2427 */
2409 2428 void
2410 2429 cl_flk_remove_locks_by_sysid(int sysid)
2411 2430 {
2412 2431 graph_t *gp;
2413 2432 int i;
2414 2433 lock_descriptor_t *lock, *nlock;
2415 2434
2416 2435 /*
2417 2436 * Check to see if node is booted as a cluster. If not, return.
2418 2437 */
2419 2438 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2420 2439 return;
2421 2440 }
2422 2441
2423 2442 ASSERT(sysid != 0);
2424 2443 for (i = 0; i < HASH_SIZE; i++) {
2425 2444 mutex_enter(&flock_lock);
2426 2445 gp = lock_graph[i];
2427 2446 mutex_exit(&flock_lock);
2428 2447
2429 2448 if (gp == NULL)
2430 2449 continue;
2431 2450
2432 2451 mutex_enter(&gp->gp_mutex); /* get mutex on lock graph */
2433 2452
2434 2453 /* signal sleeping requests so that they bail out */
2435 2454 lock = SLEEPING_HEAD(gp)->l_next;
2436 2455 while (lock != SLEEPING_HEAD(gp)) {
2437 2456 nlock = lock->l_next;
2438 2457 if (lock->l_flock.l_sysid == sysid) {
2439 2458 INTERRUPT_WAKEUP(lock);
2440 2459 }
2441 2460 lock = nlock;
2442 2461 }
2443 2462
2444 2463 /* delete active locks */
2445 2464 lock = ACTIVE_HEAD(gp)->l_next;
2446 2465 while (lock != ACTIVE_HEAD(gp)) {
2447 2466 nlock = lock->l_next;
2448 2467 if (lock->l_flock.l_sysid == sysid) {
2449 2468 flk_delete_active_lock(lock, 0);
2450 2469 flk_wakeup(lock, 1);
2451 2470 flk_free_lock(lock);
2452 2471 }
2453 2472 lock = nlock;
2454 2473 }
2455 2474 mutex_exit(&gp->gp_mutex); /* release mutex on lock graph */
2456 2475 }
2457 2476 }
2458 2477
2459 2478 /*
2460 2479  * Delete all locks in the system that belong to the sysid of the request.
2461 2480 */
2462 2481
2463 2482 static void
2464 2483 flk_delete_locks_by_sysid(lock_descriptor_t *request)
2465 2484 {
2466 2485 int sysid = request->l_flock.l_sysid;
2467 2486 lock_descriptor_t *lock, *nlock;
2468 2487 graph_t *gp;
2469 2488 int i;
2470 2489
2471 2490 ASSERT(MUTEX_HELD(&request->l_graph->gp_mutex));
2472 2491 ASSERT(sysid != 0);
2473 2492
2474 2493 mutex_exit(&request->l_graph->gp_mutex);
2475 2494
2476 2495 for (i = 0; i < HASH_SIZE; i++) {
2477 2496 mutex_enter(&flock_lock);
2478 2497 gp = lock_graph[i];
2479 2498 mutex_exit(&flock_lock);
2480 2499
2481 2500 if (gp == NULL)
2482 2501 continue;
2483 2502
2484 2503 mutex_enter(&gp->gp_mutex);
2485 2504
2486 2505 /* signal sleeping requests so that they bail out */
2487 2506 lock = SLEEPING_HEAD(gp)->l_next;
2488 2507 while (lock != SLEEPING_HEAD(gp)) {
2489 2508 nlock = lock->l_next;
2490 2509 if (lock->l_flock.l_sysid == sysid) {
2491 2510 INTERRUPT_WAKEUP(lock);
2492 2511 }
2493 2512 lock = nlock;
2494 2513 }
2495 2514
2496 2515 /* delete active locks */
2497 2516 lock = ACTIVE_HEAD(gp)->l_next;
2498 2517 while (lock != ACTIVE_HEAD(gp)) {
2499 2518 nlock = lock->l_next;
2500 2519 if (lock->l_flock.l_sysid == sysid) {
2501 2520 flk_delete_active_lock(lock, 0);
2502 2521 flk_wakeup(lock, 1);
2503 2522 flk_free_lock(lock);
2504 2523 }
2505 2524 lock = nlock;
2506 2525 }
2507 2526 mutex_exit(&gp->gp_mutex);
2508 2527 }
2509 2528
2510 2529 mutex_enter(&request->l_graph->gp_mutex);
2511 2530 }
2512 2531
2513 2532 /*
2514 2533 * Clustering: Deletes PXFS locks
2515 2534 * Effects: Delete all locks on files in the given file system and with the
2516 2535 * given PXFS id.
2517 2536 */
2518 2537 void
2519 2538 cl_flk_delete_pxfs_locks(struct vfs *vfsp, int pxfsid)
2520 2539 {
2521 2540 lock_descriptor_t *lock, *nlock;
2522 2541 graph_t *gp;
2523 2542 int i;
2524 2543
2525 2544 for (i = 0; i < HASH_SIZE; i++) {
2526 2545 mutex_enter(&flock_lock);
2527 2546 gp = lock_graph[i];
2528 2547 mutex_exit(&flock_lock);
2529 2548
2530 2549 if (gp == NULL)
2531 2550 continue;
2532 2551
2533 2552 mutex_enter(&gp->gp_mutex);
2534 2553
2535 2554 /* signal sleeping requests so that they bail out */
2536 2555 lock = SLEEPING_HEAD(gp)->l_next;
2537 2556 while (lock != SLEEPING_HEAD(gp)) {
2538 2557 nlock = lock->l_next;
2539 2558 if (lock->l_vnode->v_vfsp == vfsp) {
2540 2559 ASSERT(IS_PXFS(lock));
2541 2560 if (GETPXFSID(lock->l_flock.l_sysid) ==
2542 2561 pxfsid) {
2543 2562 flk_set_state(lock,
2544 2563 FLK_CANCELLED_STATE);
2545 2564 flk_cancel_sleeping_lock(lock, 1);
2546 2565 }
2547 2566 }
2548 2567 lock = nlock;
2549 2568 }
2550 2569
2551 2570 /* delete active locks */
2552 2571 lock = ACTIVE_HEAD(gp)->l_next;
2553 2572 while (lock != ACTIVE_HEAD(gp)) {
2554 2573 nlock = lock->l_next;
2555 2574 if (lock->l_vnode->v_vfsp == vfsp) {
2556 2575 ASSERT(IS_PXFS(lock));
2557 2576 if (GETPXFSID(lock->l_flock.l_sysid) ==
2558 2577 pxfsid) {
2559 2578 flk_delete_active_lock(lock, 0);
2560 2579 flk_wakeup(lock, 1);
2561 2580 flk_free_lock(lock);
2562 2581 }
2563 2582 }
2564 2583 lock = nlock;
2565 2584 }
2566 2585 mutex_exit(&gp->gp_mutex);
2567 2586 }
2568 2587 }
2569 2588
2570 2589 /*
2571 2590 * Search for a sleeping lock manager lock which matches exactly this lock
2572 2591 * request; if one is found, fake a signal to cancel it.
2573 2592 *
2574 2593 * Return 1 if a matching lock was found, 0 otherwise.
2575 2594 */
2576 2595
2577 2596 static int
2578 2597 flk_canceled(lock_descriptor_t *request)
2579 2598 {
2580 2599 lock_descriptor_t *lock, *nlock;
2581 2600 graph_t *gp = request->l_graph;
2582 2601 vnode_t *vp = request->l_vnode;
2583 2602
2584 2603 ASSERT(MUTEX_HELD(&gp->gp_mutex));
2585 2604 ASSERT(IS_LOCKMGR(request));
2586 2605 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2587 2606
2588 2607 if (lock) {
2589 2608 while (lock->l_vnode == vp) {
2590 2609 nlock = lock->l_next;
2591 2610 if (SAME_OWNER(lock, request) &&
2592 2611 lock->l_start == request->l_start &&
2593 2612 lock->l_end == request->l_end) {
2594 2613 INTERRUPT_WAKEUP(lock);
2595 2614 return (1);
2596 2615 }
2597 2616 lock = nlock;
2598 2617 }
2599 2618 }
2600 2619 return (0);
2601 2620 }
2602 2621
2603 2622 /*
2604 2623  * Remove all locks on the vnode that belong to the given pid and sysid.
2605 2624 */
2606 2625
2607 2626 void
2608 2627 cleanlocks(vnode_t *vp, pid_t pid, int sysid)
2609 2628 {
2610 2629 graph_t *gp;
2611 2630 lock_descriptor_t *lock, *nlock;
2612 2631 lock_descriptor_t *link_stack;
2613 2632
2614 2633 STACK_INIT(link_stack);
2615 2634
2616 2635 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2617 2636
2618 2637 if (gp == NULL)
2619 2638 return;
2620 2639 mutex_enter(&gp->gp_mutex);
2621 2640
2622 2641 CHECK_SLEEPING_LOCKS(gp);
2623 2642 CHECK_ACTIVE_LOCKS(gp);
2624 2643
2625 2644 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2626 2645
2627 2646 if (lock) {
2628 2647 do {
2629 2648 nlock = lock->l_next;
2630 2649 if ((lock->l_flock.l_pid == pid ||
2631 2650 pid == IGN_PID) &&
2632 2651 lock->l_flock.l_sysid == sysid) {
2633 2652 CANCEL_WAKEUP(lock);
2634 2653 }
2635 2654 lock = nlock;
2636 2655 } while (lock->l_vnode == vp);
2637 2656 }
2638 2657
2639 2658 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2640 2659
2641 2660 if (lock) {
2642 2661 do {
2643 2662 nlock = lock->l_next;
2644 2663 if ((lock->l_flock.l_pid == pid ||
2645 2664 pid == IGN_PID) &&
2646 2665 lock->l_flock.l_sysid == sysid) {
2647 2666 flk_delete_active_lock(lock, 0);
2648 2667 STACK_PUSH(link_stack, lock, l_stack);
2649 2668 }
2650 2669 lock = nlock;
2651 2670 } while (lock->l_vnode == vp);
2652 2671 }
2653 2672
2654 2673 while ((lock = STACK_TOP(link_stack)) != NULL) {
2655 2674 STACK_POP(link_stack, l_stack);
2656 2675 flk_wakeup(lock, 1);
2657 2676 flk_free_lock(lock);
2658 2677 }
2659 2678
2660 2679 CHECK_SLEEPING_LOCKS(gp);
2661 2680 CHECK_ACTIVE_LOCKS(gp);
2662 2681 CHECK_OWNER_LOCKS(gp, pid, sysid, vp);
2663 2682 mutex_exit(&gp->gp_mutex);
2664 2683 }
2665 2684
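/*
 * A hypothetical usage sketch for cleanlocks(): a local file system's close
 * path dropping every lock the closing process holds on the vnode. The
 * wrapper name is illustrative; the sysid of 0 for local locks follows the
 * convention used by chklock() below.
 */
static void
myfs_drop_locks_on_close(vnode_t *vp)
{
	cleanlocks(vp, curproc->p_pid, 0);
}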
2666 2685
2667 2686 /*
2668 2687 * Called from 'fs' read and write routines for files that have mandatory
2669 2688 * locking enabled.
2670 2689 */
2671 2690
2672 2691 int
2673 2692 chklock(
2674 2693 struct vnode *vp,
2675 2694 int iomode,
2676 2695 u_offset_t offset,
2677 2696 ssize_t len,
2678 2697 int fmode,
2679 2698 caller_context_t *ct)
2680 2699 {
2681 2700 register int i;
2682 2701 struct flock64 bf;
2683 2702 int error = 0;
2684 2703
2685 2704 bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK;
2686 2705 bf.l_whence = 0;
2687 2706 bf.l_start = offset;
2688 2707 bf.l_len = len;
2689 2708 if (ct == NULL) {
2690 2709 bf.l_pid = curproc->p_pid;
2691 2710 bf.l_sysid = 0;
2692 2711 } else {
2693 2712 bf.l_pid = ct->cc_pid;
2694 2713 bf.l_sysid = ct->cc_sysid;
2695 2714 }
2696 2715 i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK;
2697 2716 if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 ||
2698 2717 bf.l_type != F_UNLCK)
2699 2718 error = i ? i : EAGAIN;
2700 2719 return (error);
2701 2720 }
2702 2721
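/*
 * A hypothetical caller sketch for chklock(): a write path asking whether
 * the byte range is covered by someone else's mandatory lock before doing
 * the I/O. Only the chklock() call itself is taken from this file; the
 * wrapper and the decision to check are illustrative. FNDELAY/FNONBLOCK in
 * ioflag make the underlying reclock() call non-blocking.
 */
static int
myfs_write_range(vnode_t *vp, u_offset_t off, ssize_t len, int ioflag,
    caller_context_t *ct)
{
	int error;

	error = chklock(vp, FWRITE, off, len, ioflag, ct);
	if (error != 0)
		return (error);		/* typically EAGAIN */

	/* ... safe to perform the write ... */
	return (0);
}
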
2703 2722 /*
2704 2723 * convoff - converts the given data (start, whence) to the
2705 2724 * given whence.
2706 2725 */
2707 2726 int
2708 2727 convoff(vp, lckdat, whence, offset)
2709 2728 struct vnode *vp;
2710 2729 struct flock64 *lckdat;
2711 2730 int whence;
2712 2731 offset_t offset;
2713 2732 {
2714 2733 int error;
2715 2734 struct vattr vattr;
2716 2735
2717 2736 if ((lckdat->l_whence == 2) || (whence == 2)) {
2718 2737 vattr.va_mask = AT_SIZE;
2719 2738 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
2720 2739 return (error);
2721 2740 }
2722 2741
2723 2742 switch (lckdat->l_whence) {
2724 2743 case 1:
2725 2744 lckdat->l_start += offset;
2726 2745 break;
2727 2746 case 2:
2728 2747 lckdat->l_start += vattr.va_size;
2729 2748 /* FALLTHRU */
2730 2749 case 0:
2731 2750 break;
2732 2751 default:
2733 2752 return (EINVAL);
2734 2753 }
2735 2754
2736 2755 if (lckdat->l_start < 0)
2737 2756 return (EINVAL);
2738 2757
2739 2758 switch (whence) {
2740 2759 case 1:
2741 2760 lckdat->l_start -= offset;
2742 2761 break;
2743 2762 case 2:
2744 2763 lckdat->l_start -= vattr.va_size;
2745 2764 /* FALLTHRU */
2746 2765 case 0:
2747 2766 break;
2748 2767 default:
2749 2768 return (EINVAL);
2750 2769 }
2751 2770
2752 2771 lckdat->l_whence = (short)whence;
2753 2772 return (0);
2754 2773 }
2755 2774
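/*
 * A hypothetical usage sketch for convoff(): normalize a caller-supplied
 * flock64 so that l_start is absolute (whence 0), the form the rest of the
 * lock code works with. "fileoff" stands for the requesting open file's
 * current offset and is an assumption of this sketch.
 */
static int
normalize_lock(vnode_t *vp, struct flock64 *bf, offset_t fileoff)
{
	/* May issue a VOP_GETATTR() if either whence refers to EOF (2). */
	return (convoff(vp, bf, 0, fileoff));
}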
2756 2775
2757 2776 /* proc_graph function definitions */
2758 2777
2759 2778 /*
2760 2779  * Check for a deadlock caused by the new 'lock'. If a deadlock is found,
2761 2780  * the edges of this lock are freed and 1 is returned.
2762 2781 */
2763 2782
2764 2783 static int
2765 2784 flk_check_deadlock(lock_descriptor_t *lock)
2766 2785 {
2767 2786 proc_vertex_t *start_vertex, *pvertex;
2768 2787 proc_vertex_t *dvertex;
2769 2788 proc_edge_t *pep, *ppep;
2770 2789 edge_t *ep, *nep;
2771 2790 proc_vertex_t *process_stack;
2772 2791
2773 2792 STACK_INIT(process_stack);
2774 2793
2775 2794 mutex_enter(&flock_lock);
2776 2795 start_vertex = flk_get_proc_vertex(lock);
2777 2796 ASSERT(start_vertex != NULL);
2778 2797
2779 2798 /* construct the edges from this process to other processes */
2780 2799
2781 2800 ep = FIRST_ADJ(lock);
2782 2801 while (ep != HEAD(lock)) {
2783 2802 proc_vertex_t *adj_proc;
2784 2803
2785 2804 adj_proc = flk_get_proc_vertex(ep->to_vertex);
2786 2805 for (pep = start_vertex->edge; pep != NULL; pep = pep->next) {
2787 2806 if (pep->to_proc == adj_proc) {
2788 2807 ASSERT(pep->refcount);
2789 2808 pep->refcount++;
2790 2809 break;
2791 2810 }
2792 2811 }
2793 2812 if (pep == NULL) {
2794 2813 pep = flk_get_proc_edge();
2795 2814 pep->to_proc = adj_proc;
2796 2815 pep->refcount = 1;
2797 2816 adj_proc->incount++;
2798 2817 pep->next = start_vertex->edge;
2799 2818 start_vertex->edge = pep;
2800 2819 }
2801 2820 ep = NEXT_ADJ(ep);
2802 2821 }
2803 2822
2804 2823 ep = FIRST_IN(lock);
2805 2824
2806 2825 while (ep != HEAD(lock)) {
2807 2826 proc_vertex_t *in_proc;
2808 2827
2809 2828 in_proc = flk_get_proc_vertex(ep->from_vertex);
2810 2829
2811 2830 for (pep = in_proc->edge; pep != NULL; pep = pep->next) {
2812 2831 if (pep->to_proc == start_vertex) {
2813 2832 ASSERT(pep->refcount);
2814 2833 pep->refcount++;
2815 2834 break;
2816 2835 }
2817 2836 }
2818 2837 if (pep == NULL) {
2819 2838 pep = flk_get_proc_edge();
2820 2839 pep->to_proc = start_vertex;
2821 2840 pep->refcount = 1;
2822 2841 start_vertex->incount++;
2823 2842 pep->next = in_proc->edge;
2824 2843 in_proc->edge = pep;
2825 2844 }
2826 2845 ep = NEXT_IN(ep);
2827 2846 }
2828 2847
2829 2848 if (start_vertex->incount == 0) {
2830 2849 mutex_exit(&flock_lock);
2831 2850 return (0);
2832 2851 }
2833 2852
2834 2853 flk_proc_graph_uncolor();
2835 2854
2836 2855 start_vertex->p_sedge = start_vertex->edge;
2837 2856
2838 2857 STACK_PUSH(process_stack, start_vertex, p_stack);
2839 2858
2840 2859 while ((pvertex = STACK_TOP(process_stack)) != NULL) {
2841 2860 for (pep = pvertex->p_sedge; pep != NULL; pep = pep->next) {
2842 2861 dvertex = pep->to_proc;
2843 2862 if (!PROC_ARRIVED(dvertex)) {
2844 2863 STACK_PUSH(process_stack, dvertex, p_stack);
2845 2864 dvertex->p_sedge = dvertex->edge;
2846 2865 PROC_ARRIVE(pvertex);
2847 2866 pvertex->p_sedge = pep->next;
2848 2867 break;
2849 2868 }
2850 2869 if (!PROC_DEPARTED(dvertex))
2851 2870 goto deadlock;
2852 2871 }
2853 2872 if (pep == NULL) {
2854 2873 PROC_DEPART(pvertex);
2855 2874 STACK_POP(process_stack, p_stack);
2856 2875 }
2857 2876 }
2858 2877 mutex_exit(&flock_lock);
2859 2878 return (0);
2860 2879
2861 2880 deadlock:
2862 2881
2863 2882 /* we remove all lock edges and proc edges */
2864 2883
2865 2884 ep = FIRST_ADJ(lock);
2866 2885 while (ep != HEAD(lock)) {
2867 2886 proc_vertex_t *adj_proc;
2868 2887 adj_proc = flk_get_proc_vertex(ep->to_vertex);
2869 2888 nep = NEXT_ADJ(ep);
2870 2889 IN_LIST_REMOVE(ep);
2871 2890 ADJ_LIST_REMOVE(ep);
2872 2891 flk_free_edge(ep);
2873 2892 ppep = start_vertex->edge;
2874 2893 for (pep = start_vertex->edge; pep != NULL; ppep = pep,
2875 2894 pep = ppep->next) {
2876 2895 if (pep->to_proc == adj_proc) {
2877 2896 pep->refcount--;
2878 2897 if (pep->refcount == 0) {
2879 2898 if (pep == ppep) {
2880 2899 start_vertex->edge = pep->next;
2881 2900 } else {
2882 2901 ppep->next = pep->next;
2883 2902 }
2884 2903 adj_proc->incount--;
2885 2904 flk_proc_release(adj_proc);
2886 2905 flk_free_proc_edge(pep);
2887 2906 }
2888 2907 break;
2889 2908 }
2890 2909 }
2891 2910 ep = nep;
2892 2911 }
2893 2912 ep = FIRST_IN(lock);
2894 2913 while (ep != HEAD(lock)) {
2895 2914 proc_vertex_t *in_proc;
2896 2915 in_proc = flk_get_proc_vertex(ep->from_vertex);
2897 2916 nep = NEXT_IN(ep);
2898 2917 IN_LIST_REMOVE(ep);
2899 2918 ADJ_LIST_REMOVE(ep);
2900 2919 flk_free_edge(ep);
2901 2920 ppep = in_proc->edge;
2902 2921 for (pep = in_proc->edge; pep != NULL; ppep = pep,
2903 2922 pep = ppep->next) {
2904 2923 if (pep->to_proc == start_vertex) {
2905 2924 pep->refcount--;
2906 2925 if (pep->refcount == 0) {
2907 2926 if (pep == ppep) {
2908 2927 in_proc->edge = pep->next;
2909 2928 } else {
2910 2929 ppep->next = pep->next;
2911 2930 }
2912 2931 start_vertex->incount--;
2913 2932 flk_proc_release(in_proc);
2914 2933 flk_free_proc_edge(pep);
2915 2934 }
2916 2935 break;
2917 2936 }
2918 2937 }
2919 2938 ep = nep;
2920 2939 }
2921 2940 flk_proc_release(start_vertex);
2922 2941 mutex_exit(&flock_lock);
2923 2942 return (1);
2924 2943 }
2925 2944
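/*
 * A simplified, self-contained sketch of the cycle test performed above:
 * depth-first search with "arrived"/"departed" marks, where meeting a vertex
 * that has arrived but not yet departed means a wait cycle (deadlock). The
 * fixed-size arrays and the recursion are illustrative only; the kernel
 * version is iterative, works on proc_vertex_t, and reuses the graph mark.
 */
#define	EX_NPROC	8

static int ex_waits_on[EX_NPROC][EX_NPROC];	/* [a][b]: a waits on b */
static int ex_arrived[EX_NPROC];
static int ex_departed[EX_NPROC];

static int
ex_has_cycle(int v)
{
	int w;

	ex_arrived[v] = 1;
	for (w = 0; w < EX_NPROC; w++) {
		if (!ex_waits_on[v][w])
			continue;
		if (ex_arrived[w] && !ex_departed[w])
			return (1);		/* back edge: deadlock */
		if (!ex_arrived[w] && ex_has_cycle(w))
			return (1);
	}
	ex_departed[v] = 1;
	return (0);
}
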
2926 2945 /*
2927 2946  * Get a proc vertex. If the lock's pvertex value indexes a matching proc
2928 2947  * vertex in the list, return that; otherwise allocate one, growing the
2929 2948  * list of vertices if necessary.
2930 2949 */
2931 2950
2932 2951 static proc_vertex_t *
2933 2952 flk_get_proc_vertex(lock_descriptor_t *lock)
2934 2953 {
2935 2954 int i;
2936 2955 proc_vertex_t *pv;
2937 2956 proc_vertex_t **palloc;
2938 2957
2939 2958 ASSERT(MUTEX_HELD(&flock_lock));
2940 2959 if (lock->pvertex != -1) {
2941 2960 ASSERT(lock->pvertex >= 0);
2942 2961 pv = pgraph.proc[lock->pvertex];
2943 2962 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
2944 2963 return (pv);
2945 2964 }
2946 2965 }
2947 2966 for (i = 0; i < pgraph.gcount; i++) {
2948 2967 pv = pgraph.proc[i];
2949 2968 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
2950 2969 lock->pvertex = pv->index = i;
2951 2970 return (pv);
2952 2971 }
2953 2972 }
2954 2973 pv = kmem_zalloc(sizeof (struct proc_vertex), KM_SLEEP);
2955 2974 pv->pid = lock->l_flock.l_pid;
2956 2975 pv->sysid = lock->l_flock.l_sysid;
2957 2976 flk_proc_vertex_allocs++;
2958 2977 if (pgraph.free != 0) {
2959 2978 for (i = 0; i < pgraph.gcount; i++) {
2960 2979 if (pgraph.proc[i] == NULL) {
2961 2980 pgraph.proc[i] = pv;
2962 2981 lock->pvertex = pv->index = i;
2963 2982 pgraph.free--;
2964 2983 return (pv);
2965 2984 }
2966 2985 }
2967 2986 }
2968 2987 palloc = kmem_zalloc((pgraph.gcount + PROC_CHUNK) *
2969 2988 sizeof (proc_vertex_t *), KM_SLEEP);
2970 2989
2971 2990 if (pgraph.proc) {
2972 2991 bcopy(pgraph.proc, palloc,
2973 2992 pgraph.gcount * sizeof (proc_vertex_t *));
2974 2993
2975 2994 kmem_free(pgraph.proc,
2976 2995 pgraph.gcount * sizeof (proc_vertex_t *));
2977 2996 }
2978 2997 pgraph.proc = palloc;
2979 2998 pgraph.free += (PROC_CHUNK - 1);
2980 2999 pv->index = lock->pvertex = pgraph.gcount;
2981 3000 pgraph.gcount += PROC_CHUNK;
2982 3001 pgraph.proc[pv->index] = pv;
2983 3002 return (pv);
2984 3003 }
2985 3004
2986 3005 /*
2987 3006 * Allocate a proc edge.
2988 3007 */
2989 3008
2990 3009 static proc_edge_t *
2991 3010 flk_get_proc_edge()
2992 3011 {
2993 3012 proc_edge_t *pep;
2994 3013
2995 3014 pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP);
2996 3015 flk_proc_edge_allocs++;
2997 3016 return (pep);
2998 3017 }
2999 3018
3000 3019 /*
3001 3020 * Free the proc edge. Called whenever its reference count goes to zero.
3002 3021 */
3003 3022
3004 3023 static void
3005 3024 flk_free_proc_edge(proc_edge_t *pep)
3006 3025 {
3007 3026 ASSERT(pep->refcount == 0);
3008 3027 kmem_free((void *)pep, sizeof (proc_edge_t));
3009 3028 flk_proc_edge_frees++;
3010 3029 }
3011 3030
3012 3031 /*
3013 3032  * The graph is explicitly recolored only when the mark value hits its maximum.
3014 3033 */
3015 3034
3016 3035 static void
3017 3036 flk_proc_graph_uncolor()
3018 3037 {
3019 3038 int i;
3020 3039
3021 3040 if (pgraph.mark == UINT_MAX) {
3022 3041 for (i = 0; i < pgraph.gcount; i++)
3023 3042 if (pgraph.proc[i] != NULL) {
3024 3043 pgraph.proc[i]->atime = 0;
3025 3044 pgraph.proc[i]->dtime = 0;
3026 3045 }
3027 3046 pgraph.mark = 1;
3028 3047 } else {
3029 3048 pgraph.mark++;
3030 3049 }
3031 3050 }
3032 3051
3033 3052 /*
3034 3053  * Release the proc vertex iff it has neither in-edges nor out-edges.
3035 3054 */
3036 3055
3037 3056 static void
3038 3057 flk_proc_release(proc_vertex_t *proc)
3039 3058 {
3040 3059 ASSERT(MUTEX_HELD(&flock_lock));
3041 3060 if (proc->edge == NULL && proc->incount == 0) {
3042 3061 pgraph.proc[proc->index] = NULL;
3043 3062 pgraph.free++;
3044 3063 kmem_free(proc, sizeof (proc_vertex_t));
3045 3064 flk_proc_vertex_frees++;
3046 3065 }
3047 3066 }
3048 3067
3049 3068 /*
3050 3069  * Update the process graph to reflect a change in a lock graph.
3051 3070  * Note: this function should be called only after the lock graph has been
3052 3071  * correctly recomputed; otherwise we might miss a deadlock. E.g., in
3053 3072  * flk_relation() we call this function after flk_recompute_dependencies();
3054 3073  * otherwise, if a process tries to lock a vnode hashed into another
3055 3074  * graph, it might sleep forever.
3056 3075 */
3057 3076
3058 3077 static void
3059 3078 flk_update_proc_graph(edge_t *ep, int delete)
3060 3079 {
3061 3080 proc_vertex_t *toproc, *fromproc;
3062 3081 proc_edge_t *pep, *prevpep;
3063 3082
3064 3083 mutex_enter(&flock_lock);
3065 3084 toproc = flk_get_proc_vertex(ep->to_vertex);
3066 3085 fromproc = flk_get_proc_vertex(ep->from_vertex);
3067 3086
3068 3087 if (!delete)
3069 3088 goto add;
3070 3089 pep = prevpep = fromproc->edge;
3071 3090
3072 3091 ASSERT(pep != NULL);
3073 3092 while (pep != NULL) {
3074 3093 if (pep->to_proc == toproc) {
3075 3094 ASSERT(pep->refcount > 0);
3076 3095 pep->refcount--;
3077 3096 if (pep->refcount == 0) {
3078 3097 if (pep == prevpep) {
3079 3098 fromproc->edge = pep->next;
3080 3099 } else {
3081 3100 prevpep->next = pep->next;
3082 3101 }
3083 3102 toproc->incount--;
3084 3103 flk_proc_release(toproc);
3085 3104 flk_free_proc_edge(pep);
3086 3105 }
3087 3106 break;
3088 3107 }
3089 3108 prevpep = pep;
3090 3109 pep = pep->next;
3091 3110 }
3092 3111 flk_proc_release(fromproc);
3093 3112 mutex_exit(&flock_lock);
3094 3113 return;
3095 3114 add:
3096 3115
3097 3116 pep = fromproc->edge;
3098 3117
3099 3118 while (pep != NULL) {
3100 3119 if (pep->to_proc == toproc) {
3101 3120 ASSERT(pep->refcount > 0);
3102 3121 pep->refcount++;
3103 3122 break;
3104 3123 }
3105 3124 pep = pep->next;
3106 3125 }
3107 3126 if (pep == NULL) {
3108 3127 pep = flk_get_proc_edge();
3109 3128 pep->to_proc = toproc;
3110 3129 pep->refcount = 1;
3111 3130 toproc->incount++;
3112 3131 pep->next = fromproc->edge;
3113 3132 fromproc->edge = pep;
3114 3133 }
3115 3134 mutex_exit(&flock_lock);
3116 3135 }
3117 3136
3118 3137 /*
3119 3138 * Set the control status for lock manager requests.
3120 3139 *
3121 3140 */
3122 3141
3123 3142 /*
3124 3143 * PSARC case 1997/292
3125 3144 *
3126 3145 * Requires: "nlmid" must be >= 1 and <= clconf_maximum_nodeid().
3127 3146 * Effects: Set the state of the NLM server identified by "nlmid"
3128 3147 * in the NLM registry to state "nlm_state."
3129 3148 * Raises exception no_such_nlm if "nlmid" doesn't identify a known
3130 3149 * NLM server to this LLM.
3131 3150 * Note that when this routine is called with NLM_SHUTTING_DOWN there
3132 3151  * may be lock requests that have gotten started but not finished. In
3133 3152 * particular, there may be blocking requests that are in the callback code
3134 3153 * before sleeping (so they're not holding the lock for the graph). If
3135 3154 * such a thread reacquires the graph's lock (to go to sleep) after
3136 3155 * NLM state in the NLM registry is set to a non-up value,
3137 3156 * it will notice the status and bail out. If the request gets
3138 3157 * granted before the thread can check the NLM registry, let it
3139 3158 * continue normally. It will get flushed when we are called with NLM_DOWN.
3140 3159 *
3141 3160 * Modifies: nlm_reg_obj (global)
3142 3161 * Arguments:
3143 3162 * nlmid (IN): id uniquely identifying an NLM server
3144 3163 * nlm_state (IN): NLM server state to change "nlmid" to
3145 3164 */
3146 3165 void
3147 3166 cl_flk_set_nlm_status(int nlmid, flk_nlm_status_t nlm_state)
3148 3167 {
3149 3168 /*
3150 3169 * Check to see if node is booted as a cluster. If not, return.
3151 3170 */
3152 3171 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3153 3172 return;
3154 3173 }
3155 3174
3156 3175 /*
3157 3176 * Check for development/debugging. It is possible to boot a node
3158 3177 * in non-cluster mode, and then run a special script, currently
3159 3178 * available only to developers, to bring up the node as part of a
3160 3179 * cluster. The problem is that running such a script does not
3161 3180 * result in the routine flk_init() being called and hence global array
3162 3181 * nlm_reg_status is NULL. The NLM thinks it's in cluster mode,
3163 3182 * but the LLM needs to do an additional check to see if the global
3164 3183 * array has been created or not. If nlm_reg_status is NULL, then
3165 3184 * return, else continue.
3166 3185 */
3167 3186 if (nlm_reg_status == NULL) {
3168 3187 return;
3169 3188 }
3170 3189
3171 3190 ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3172 3191 mutex_enter(&nlm_reg_lock);
3173 3192
3174 3193 if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status, nlmid)) {
3175 3194 /*
3176 3195 * If the NLM server "nlmid" is unknown in the NLM registry,
3177 3196 * add it to the registry in the nlm shutting down state.
3178 3197 */
3179 3198 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3180 3199 FLK_NLM_SHUTTING_DOWN);
3181 3200 } else {
3182 3201 /*
3183 3202 * Change the state of the NLM server identified by "nlmid"
3184 3203 * in the NLM registry to the argument "nlm_state."
3185 3204 */
3186 3205 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3187 3206 nlm_state);
3188 3207 }
3189 3208
3190 3209 /*
3191 3210 * The reason we must register the NLM server that is shutting down
3192 3211 * with an LLM that doesn't already know about it (never sent a lock
3193 3212 * request) is to handle correctly a race between shutdown and a new
3194 3213 * lock request. Suppose that a shutdown request from the NLM server
3195 3214 * invokes this routine at the LLM, and a thread is spawned to
3196 3215 * service the request. Now suppose a new lock request is in
3197 3216 * progress and has already passed the first line of defense in
3198 3217  * reclock(), which denies new lock requests from NLM servers
3199 3218 * that are not in the NLM_UP state. After the current routine
3200 3219 * is invoked for both phases of shutdown, the routine will return,
3201 3220 * having done nothing, and the lock request will proceed and
3202 3221 * probably be granted. The problem is that the shutdown was ignored
3203 3222 * by the lock request because there was no record of that NLM server
3204 3223 * shutting down. We will be in the peculiar position of thinking
3205 3224  * that we've shut down the NLM server and all locks at all LLMs have
3206 3225 * been discarded, but in fact there's still one lock held.
3207 3226 * The solution is to record the existence of NLM server and change
3208 3227 * its state immediately to NLM_SHUTTING_DOWN. The lock request in
3209 3228 * progress may proceed because the next phase NLM_DOWN will catch
3210 3229 * this lock and discard it.
3211 3230 */
3212 3231 mutex_exit(&nlm_reg_lock);
3213 3232
3214 3233 switch (nlm_state) {
3215 3234 case FLK_NLM_UP:
3216 3235 /*
3217 3236 * Change the NLM state of all locks still held on behalf of
3218 3237 * the NLM server identified by "nlmid" to NLM_UP.
3219 3238 */
3220 3239 cl_flk_change_nlm_state_all_locks(nlmid, FLK_NLM_UP);
3221 3240 break;
3222 3241
3223 3242 case FLK_NLM_SHUTTING_DOWN:
3224 3243 /*
3225 3244 * Wake up all sleeping locks for the NLM server identified
3226 3245 * by "nlmid." Note that eventually all woken threads will
3227 3246 * have their lock requests cancelled and descriptors
3228 3247 * removed from the sleeping lock list. Note that the NLM
3229 3248 * server state associated with each lock descriptor is
3230 3249 * changed to FLK_NLM_SHUTTING_DOWN.
3231 3250 */
3232 3251 cl_flk_wakeup_sleeping_nlm_locks(nlmid);
3233 3252 break;
3234 3253
3235 3254 case FLK_NLM_DOWN:
3236 3255 /*
3237 3256 * Discard all active, granted locks for this NLM server
3238 3257 * identified by "nlmid."
3239 3258 */
3240 3259 cl_flk_unlock_nlm_granted(nlmid);
3241 3260 break;
3242 3261
3243 3262 default:
3244 3263 panic("cl_set_nlm_status: bad status (%d)", nlm_state);
3245 3264 }
3246 3265 }
3247 3266
3248 3267 /*
3249 3268 * Set the control status for lock manager requests.
3250 3269 *
3251 3270 * Note that when this routine is called with FLK_WAKEUP_SLEEPERS, there
3252 3271  * may be lock requests that have gotten started but not finished. In
3253 3272 * particular, there may be blocking requests that are in the callback code
3254 3273 * before sleeping (so they're not holding the lock for the graph). If
3255 3274 * such a thread reacquires the graph's lock (to go to sleep) after
3256 3275 * flk_lockmgr_status is set to a non-up value, it will notice the status
3257 3276 * and bail out. If the request gets granted before the thread can check
3258 3277 * flk_lockmgr_status, let it continue normally. It will get flushed when
3259 3278 * we are called with FLK_LOCKMGR_DOWN.
3260 3279 */
3261 3280
3262 3281 void
3263 3282 flk_set_lockmgr_status(flk_lockmgr_status_t status)
3264 3283 {
3265 3284 int i;
3266 3285 graph_t *gp;
3267 3286 struct flock_globals *fg;
3268 3287
3269 3288 fg = flk_get_globals();
3270 3289 ASSERT(fg != NULL);
3271 3290
3272 3291 mutex_enter(&flock_lock);
3273 3292 fg->flk_lockmgr_status = status;
3274 3293 mutex_exit(&flock_lock);
3275 3294
3276 3295 /*
3277 3296 * If the lock manager is coming back up, all that's needed is to
3278 3297 * propagate this information to the graphs. If the lock manager
3279 3298 * is going down, additional action is required, and each graph's
3280 3299 * copy of the state is updated atomically with this other action.
3281 3300 */
3282 3301 switch (status) {
3283 3302 case FLK_LOCKMGR_UP:
3284 3303 for (i = 0; i < HASH_SIZE; i++) {
3285 3304 mutex_enter(&flock_lock);
3286 3305 gp = lock_graph[i];
3287 3306 mutex_exit(&flock_lock);
3288 3307 if (gp == NULL)
3289 3308 continue;
3290 3309 mutex_enter(&gp->gp_mutex);
3291 3310 fg->lockmgr_status[i] = status;
3292 3311 mutex_exit(&gp->gp_mutex);
3293 3312 }
3294 3313 break;
3295 3314 case FLK_WAKEUP_SLEEPERS:
3296 3315 wakeup_sleeping_lockmgr_locks(fg);
3297 3316 break;
3298 3317 case FLK_LOCKMGR_DOWN:
3299 3318 unlock_lockmgr_granted(fg);
3300 3319 break;
3301 3320 default:
3302 3321 panic("flk_set_lockmgr_status: bad status (%d)", status);
3303 3322 break;
3304 3323 }
3305 3324 }
3306 3325
3307 3326 /*
3308 3327 * This routine returns all the locks that are active or sleeping and are
3309 3328 * associated with a particular set of identifiers. If lock_state != 0, then
3310 3329 * only locks that match the lock_state are returned. If lock_state == 0, then
3311 3330 * all locks are returned. If pid == NOPID, the pid is ignored. If
3312 3331 * use_sysid is FALSE, then the sysid is ignored. If vp is NULL, then the
3313 3332 * vnode pointer is ignored.
3314 3333 *
3315 3334 * A list containing the vnode pointer and an flock structure
3316 3335 * describing the lock is returned. Each element in the list is
3317 3336 * dynamically allocated and must be freed by the caller. The
3318 3337 * last item in the list is denoted by a NULL value in the ll_next
3319 3338 * field.
3320 3339 *
3321 3340 * The vnode pointers returned are held. The caller is responsible
3322 3341 * for releasing these. Note that the returned list is only a snapshot of
3323 3342 * the current lock information, and that it is a snapshot of a moving
3324 3343 * target (only one graph is locked at a time).
3325 3344 */
3326 3345
3327 3346 locklist_t *
3328 3347 get_lock_list(int list_type, int lock_state, int sysid, boolean_t use_sysid,
3329 3348 pid_t pid, const vnode_t *vp, zoneid_t zoneid)
3330 3349 {
3331 3350 lock_descriptor_t *lock;
3332 3351 lock_descriptor_t *graph_head;
3333 3352 locklist_t listhead;
3334 3353 locklist_t *llheadp;
3335 3354 locklist_t *llp;
3336 3355 locklist_t *lltp;
3337 3356 graph_t *gp;
3338 3357 int i;
3339 3358 int first_index; /* graph index */
3340 3359 int num_indexes; /* graph index */
3341 3360
3342 3361 ASSERT((list_type == FLK_ACTIVE_STATE) ||
3343 3362 (list_type == FLK_SLEEPING_STATE));
3344 3363
3345 3364 /*
3346 3365 * Get a pointer to something to use as a list head while building
3347 3366 * the rest of the list.
3348 3367 */
3349 3368 llheadp = &listhead;
3350 3369 lltp = llheadp;
3351 3370 llheadp->ll_next = (locklist_t *)NULL;
3352 3371
3353 3372 /* Figure out which graphs we want to look at. */
3354 3373 if (vp == NULL) {
3355 3374 first_index = 0;
3356 3375 num_indexes = HASH_SIZE;
3357 3376 } else {
3358 3377 first_index = HASH_INDEX(vp);
3359 3378 num_indexes = 1;
3360 3379 }
3361 3380
3362 3381 for (i = first_index; i < first_index + num_indexes; i++) {
3363 3382 mutex_enter(&flock_lock);
3364 3383 gp = lock_graph[i];
3365 3384 mutex_exit(&flock_lock);
3366 3385 if (gp == NULL) {
3367 3386 continue;
3368 3387 }
3369 3388
3370 3389 mutex_enter(&gp->gp_mutex);
3371 3390 graph_head = (list_type == FLK_ACTIVE_STATE) ?
3372 3391 ACTIVE_HEAD(gp) : SLEEPING_HEAD(gp);
3373 3392 for (lock = graph_head->l_next;
3374 3393 lock != graph_head;
3375 3394 lock = lock->l_next) {
3376 3395 if (use_sysid && lock->l_flock.l_sysid != sysid)
3377 3396 continue;
3378 3397 if (pid != NOPID && lock->l_flock.l_pid != pid)
3379 3398 continue;
3380 3399 if (vp != NULL && lock->l_vnode != vp)
3381 3400 continue;
3382 3401 if (lock_state && !(lock_state & lock->l_state))
3383 3402 continue;
3384 3403 if (zoneid != lock->l_zoneid && zoneid != ALL_ZONES)
3385 3404 continue;
3386 3405 /*
3387 3406 * A matching lock was found. Allocate
3388 3407 * space for a new locklist entry and fill
3389 3408 * it in.
3390 3409 */
3391 3410 llp = kmem_alloc(sizeof (locklist_t), KM_SLEEP);
3392 3411 lltp->ll_next = llp;
3393 3412 VN_HOLD(lock->l_vnode);
3394 3413 llp->ll_vp = lock->l_vnode;
3395 3414 create_flock(lock, &(llp->ll_flock));
3396 3415 llp->ll_next = (locklist_t *)NULL;
3397 3416 lltp = llp;
3398 3417 }
3399 3418 mutex_exit(&gp->gp_mutex);
3400 3419 }
3401 3420
3402 3421 llp = llheadp->ll_next;
3403 3422 return (llp);
3404 3423 }
3405 3424
3406 3425 /*
3407 3426 * These two functions are simply interfaces to get_lock_list. They return
3408 3427 * a list of sleeping or active locks for the given sysid and pid. See
3409 3428 * get_lock_list for details.
3410 3429 *
3411 3430 * In either case we don't particularly care to specify the zone of interest;
3412 3431 * the sysid-space is global across zones, so the sysid will map to exactly one
3413 3432 * zone, and we'll return information for that zone.
3414 3433 */
3415 3434
3416 3435 locklist_t *
3417 3436 flk_get_sleeping_locks(int sysid, pid_t pid)
3418 3437 {
3419 3438 return (get_lock_list(FLK_SLEEPING_STATE, 0, sysid, B_TRUE, pid, NULL,
3420 3439 ALL_ZONES));
3421 3440 }
3422 3441
3423 3442 locklist_t *
3424 3443 flk_get_active_locks(int sysid, pid_t pid)
3425 3444 {
3426 3445 return (get_lock_list(FLK_ACTIVE_STATE, 0, sysid, B_TRUE, pid, NULL,
3427 3446 ALL_ZONES));
3428 3447 }
3429 3448
3430 3449 /*
3431 3450 * Another interface to get_lock_list. This one returns all the active
3432 3451 * locks for a given vnode. Again, see get_lock_list for details.
3433 3452 *
3434 3453 * We don't need to specify which zone's locks we're interested in. The matter
3435 3454 * would only be interesting if the vnode belonged to NFS, and NFS vnodes can't
3436 3455 * be used by multiple zones, so the list of locks will all be from the right
3437 3456 * zone.
3438 3457 */
3439 3458
3440 3459 locklist_t *
3441 3460 flk_active_locks_for_vp(const vnode_t *vp)
3442 3461 {
3443 3462 return (get_lock_list(FLK_ACTIVE_STATE, 0, 0, B_FALSE, NOPID, vp,
3444 3463 ALL_ZONES));
3445 3464 }
3446 3465
3447 3466 /*
3448 3467 * Another interface to get_lock_list. This one returns all the active
3449 3468 * nbmand locks for a given vnode. Again, see get_lock_list for details.
3450 3469 *
3451 3470 * See the comment for flk_active_locks_for_vp() for why we don't care to
3452 3471 * specify the particular zone of interest.
3453 3472 */
3454 3473 locklist_t *
3455 3474 flk_active_nbmand_locks_for_vp(const vnode_t *vp)
3456 3475 {
3457 3476 return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3458 3477 NOPID, vp, ALL_ZONES));
3459 3478 }
3460 3479
3461 3480 /*
3462 3481 * Another interface to get_lock_list. This one returns all the active
3463 3482 * nbmand locks for a given pid. Again, see get_lock_list for details.
3464 3483 *
3465 3484 * The zone doesn't need to be specified here; the locks held by a
3466 3485 * particular process will either be local (ie, non-NFS) or from the zone
3467 3486 * the process is executing in. This is because other parts of the system
3468 3487 * ensure that an NFS vnode can't be used in a zone other than that in
3469 3488 * which it was opened.
3470 3489 */
3471 3490 locklist_t *
3472 3491 flk_active_nbmand_locks(pid_t pid)
3473 3492 {
3474 3493 return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3475 3494 pid, NULL, ALL_ZONES));
3476 3495 }
3477 3496
3478 3497 /*
3479 3498 * Free up all entries in the locklist.
3480 3499 */
3481 3500 void
3482 3501 flk_free_locklist(locklist_t *llp)
3483 3502 {
3484 3503 locklist_t *next_llp;
3485 3504
3486 3505 while (llp) {
3487 3506 next_llp = llp->ll_next;
3488 3507 VN_RELE(llp->ll_vp);
3489 3508 kmem_free(llp, sizeof (*llp));
3490 3509 llp = next_llp;
3491 3510 }
3492 3511 }
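
For reference, a consumer of these list interfaces walks the returned chain through ll_next and then hands the whole list back to flk_free_locklist(), which releases the VN_HOLD taken on each ll_vp. A minimal sketch, assuming only the functions and locklist_t fields shown above (dump_active_locks() and the cmn_err() reporting are hypothetical, purely for illustration):

static void
dump_active_locks(vnode_t *vp)
{
	locklist_t *head, *llp;

	head = flk_active_locks_for_vp(vp);
	for (llp = head; llp != NULL; llp = llp->ll_next) {
		cmn_err(CE_CONT, "type %d start %lld len %lld pid %d\n",
		    llp->ll_flock.l_type,
		    (longlong_t)llp->ll_flock.l_start,
		    (longlong_t)llp->ll_flock.l_len,
		    (int)llp->ll_flock.l_pid);
	}
	flk_free_locklist(head);	/* drops the holds taken by get_lock_list() */
}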
3493 3512
3494 3513 static void
3495 3514 cl_flk_change_nlm_state_all_locks(int nlmid, flk_nlm_status_t nlm_state)
3496 3515 {
3497 3516 /*
3498 3517 * For each graph "lg" in the hash table lock_graph do
3499 3518 * a. Get the list of sleeping locks
3500 3519 * b. For each lock descriptor in the list do
3501 3520 * i. If the requested lock is an NLM server request AND
3502 3521 * the nlmid is the same as the routine argument then
3503 3522 * change the lock descriptor's state field to
3504 3523 * "nlm_state."
3505 3524 * c. Get the list of active locks
3506 3525 * d. For each lock descriptor in the list do
3507 3526 * i. If the requested lock is an NLM server request AND
3508 3527 * the nlmid is the same as the routine argument then
3509 3528 * change the lock descriptor's state field to
3510 3529 * "nlm_state."
3511 3530 */
3512 3531
3513 3532 int i;
3514 3533 graph_t *gp; /* lock graph */
3515 3534 lock_descriptor_t *lock; /* lock */
3516 3535 lock_descriptor_t *nlock = NULL; /* next lock */
3517 3536 int lock_nlmid;
3518 3537
3519 3538 for (i = 0; i < HASH_SIZE; i++) {
3520 3539 mutex_enter(&flock_lock);
3521 3540 gp = lock_graph[i];
3522 3541 mutex_exit(&flock_lock);
3523 3542 if (gp == NULL) {
3524 3543 continue;
3525 3544 }
3526 3545
3527 3546 /* Get list of sleeping locks in current lock graph. */
3528 3547 mutex_enter(&gp->gp_mutex);
3529 3548 for (lock = SLEEPING_HEAD(gp)->l_next;
3530 3549 lock != SLEEPING_HEAD(gp);
3531 3550 lock = nlock) {
3532 3551 nlock = lock->l_next;
3533 3552 /* get NLM id */
3534 3553 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3535 3554
3536 3555 /*
3537 3556 * If NLM server request AND nlmid of lock matches
3538 3557 * nlmid of argument, then set the NLM state of the
3539 3558 * lock to "nlm_state."
3540 3559 */
3541 3560 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3542 3561 SET_NLM_STATE(lock, nlm_state);
3543 3562 }
3544 3563 }
3545 3564
3546 3565 /* Get list of active locks in current lock graph. */
3547 3566 for (lock = ACTIVE_HEAD(gp)->l_next;
3548 3567 lock != ACTIVE_HEAD(gp);
3549 3568 lock = nlock) {
3550 3569 nlock = lock->l_next;
3551 3570 /* get NLM id */
3552 3571 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3553 3572
3554 3573 /*
3555 3574 * If NLM server request AND nlmid of lock matches
3556 3575 * nlmid of argument, then set the NLM state of the
3557 3576 * lock to "nlm_state."
3558 3577 */
3559 3578 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3560 3579 ASSERT(IS_ACTIVE(lock));
3561 3580 SET_NLM_STATE(lock, nlm_state);
3562 3581 }
3563 3582 }
3564 3583 mutex_exit(&gp->gp_mutex);
3565 3584 }
3566 3585 }
3567 3586
3568 3587 /*
3569 3588 * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid().
3570 3589 * Effects: Find all sleeping lock manager requests _only_ for the NLM server
3571 3590 * identified by "nlmid." Poke those lock requests.
3572 3591 */
3573 3592 static void
3574 3593 cl_flk_wakeup_sleeping_nlm_locks(int nlmid)
3575 3594 {
3576 3595 lock_descriptor_t *lock;
3577 3596 lock_descriptor_t *nlock = NULL; /* next lock */
3578 3597 int i;
3579 3598 graph_t *gp;
3580 3599 int lock_nlmid;
3581 3600
3582 3601 for (i = 0; i < HASH_SIZE; i++) {
3583 3602 mutex_enter(&flock_lock);
3584 3603 gp = lock_graph[i];
3585 3604 mutex_exit(&flock_lock);
3586 3605 if (gp == NULL) {
3587 3606 continue;
3588 3607 }
3589 3608
3590 3609 mutex_enter(&gp->gp_mutex);
3591 3610 for (lock = SLEEPING_HEAD(gp)->l_next;
3592 3611 lock != SLEEPING_HEAD(gp);
3593 3612 lock = nlock) {
3594 3613 nlock = lock->l_next;
3595 3614 /*
3596 3615 * If NLM server request _and_ nlmid of lock matches
3597 3616 * nlmid of argument, then set the NLM state of the
3598 3617 * lock to NLM_SHUTTING_DOWN, and wake up sleeping
3599 3618 * request.
3600 3619 */
3601 3620 if (IS_LOCKMGR(lock)) {
3602 3621 /* get NLM id */
3603 3622 lock_nlmid =
3604 3623 GETNLMID(lock->l_flock.l_sysid);
3605 3624 if (nlmid == lock_nlmid) {
3606 3625 SET_NLM_STATE(lock,
3607 3626 FLK_NLM_SHUTTING_DOWN);
3608 3627 INTERRUPT_WAKEUP(lock);
3609 3628 }
3610 3629 }
3611 3630 }
3612 3631 mutex_exit(&gp->gp_mutex);
3613 3632 }
3614 3633 }
3615 3634
3616 3635 /*
3617 3636 * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid()
3618 3637 * Effects: Find all active (granted) lock manager locks _only_ for the
3619 3638 * NLM server identified by "nlmid" and release them.
3620 3639 */
3621 3640 static void
3622 3641 cl_flk_unlock_nlm_granted(int nlmid)
3623 3642 {
3624 3643 lock_descriptor_t *lock;
3625 3644 lock_descriptor_t *nlock = NULL; /* next lock */
3626 3645 int i;
3627 3646 graph_t *gp;
3628 3647 int lock_nlmid;
3629 3648
3630 3649 for (i = 0; i < HASH_SIZE; i++) {
3631 3650 mutex_enter(&flock_lock);
3632 3651 gp = lock_graph[i];
3633 3652 mutex_exit(&flock_lock);
3634 3653 if (gp == NULL) {
3635 3654 continue;
3636 3655 }
3637 3656
3638 3657 mutex_enter(&gp->gp_mutex);
3639 3658 for (lock = ACTIVE_HEAD(gp)->l_next;
3640 3659 lock != ACTIVE_HEAD(gp);
3641 3660 lock = nlock) {
3642 3661 nlock = lock->l_next;
3643 3662 ASSERT(IS_ACTIVE(lock));
3644 3663
3645 3664 /*
3646 3665 * If it's an NLM server request _and_ nlmid of
3647 3666 * the lock matches nlmid of argument, then
 3648  3667 			 * remove the active lock from the list, wake up
 3649  3668 			 * blocked threads, and free the storage for the lock.
 3650  3669 			 * Note that there's no need to set the NLM state
 3651  3670 			 * of this lock to NLM_DOWN because the lock will
3652 3671 * be deleted anyway and its storage freed.
3653 3672 */
3654 3673 if (IS_LOCKMGR(lock)) {
3655 3674 /* get NLM id */
3656 3675 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3657 3676 if (nlmid == lock_nlmid) {
3658 3677 flk_delete_active_lock(lock, 0);
3659 3678 flk_wakeup(lock, 1);
3660 3679 flk_free_lock(lock);
3661 3680 }
3662 3681 }
3663 3682 }
3664 3683 mutex_exit(&gp->gp_mutex);
3665 3684 }
3666 3685 }
3667 3686
3668 3687 /*
3669 3688 * Find all sleeping lock manager requests and poke them.
3670 3689 */
3671 3690 static void
3672 3691 wakeup_sleeping_lockmgr_locks(struct flock_globals *fg)
3673 3692 {
3674 3693 lock_descriptor_t *lock;
3675 3694 lock_descriptor_t *nlock = NULL; /* next lock */
3676 3695 int i;
3677 3696 graph_t *gp;
3678 3697 zoneid_t zoneid = getzoneid();
3679 3698
3680 3699 for (i = 0; i < HASH_SIZE; i++) {
3681 3700 mutex_enter(&flock_lock);
3682 3701 gp = lock_graph[i];
3683 3702 mutex_exit(&flock_lock);
3684 3703 if (gp == NULL) {
3685 3704 continue;
3686 3705 }
3687 3706
3688 3707 mutex_enter(&gp->gp_mutex);
3689 3708 fg->lockmgr_status[i] = FLK_WAKEUP_SLEEPERS;
3690 3709 for (lock = SLEEPING_HEAD(gp)->l_next;
3691 3710 lock != SLEEPING_HEAD(gp);
3692 3711 lock = nlock) {
3693 3712 nlock = lock->l_next;
3694 3713 if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
3695 3714 INTERRUPT_WAKEUP(lock);
3696 3715 }
3697 3716 }
3698 3717 mutex_exit(&gp->gp_mutex);
3699 3718 }
3700 3719 }
3701 3720
3702 3721
3703 3722 /*
3704 3723 * Find all active (granted) lock manager locks and release them.
3705 3724 */
3706 3725 static void
3707 3726 unlock_lockmgr_granted(struct flock_globals *fg)
3708 3727 {
3709 3728 lock_descriptor_t *lock;
3710 3729 lock_descriptor_t *nlock = NULL; /* next lock */
3711 3730 int i;
3712 3731 graph_t *gp;
3713 3732 zoneid_t zoneid = getzoneid();
3714 3733
3715 3734 for (i = 0; i < HASH_SIZE; i++) {
3716 3735 mutex_enter(&flock_lock);
3717 3736 gp = lock_graph[i];
3718 3737 mutex_exit(&flock_lock);
3719 3738 if (gp == NULL) {
3720 3739 continue;
3721 3740 }
3722 3741
3723 3742 mutex_enter(&gp->gp_mutex);
3724 3743 fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN;
3725 3744 for (lock = ACTIVE_HEAD(gp)->l_next;
3726 3745 lock != ACTIVE_HEAD(gp);
3727 3746 lock = nlock) {
3728 3747 nlock = lock->l_next;
3729 3748 if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
3730 3749 ASSERT(IS_ACTIVE(lock));
3731 3750 flk_delete_active_lock(lock, 0);
3732 3751 flk_wakeup(lock, 1);
3733 3752 flk_free_lock(lock);
3734 3753 }
3735 3754 }
3736 3755 mutex_exit(&gp->gp_mutex);
3737 3756 }
3738 3757 }
3739 3758
3740 3759
3741 3760 /*
3742 3761 * Wait until a lock is granted, cancelled, or interrupted.
3743 3762 */
3744 3763
3745 3764 static void
3746 3765 wait_for_lock(lock_descriptor_t *request)
3747 3766 {
3748 3767 graph_t *gp = request->l_graph;
3749 3768
3750 3769 ASSERT(MUTEX_HELD(&gp->gp_mutex));
3751 3770
3752 3771 while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) &&
3753 3772 !(IS_INTERRUPTED(request))) {
3754 3773 if (!cv_wait_sig(&request->l_cv, &gp->gp_mutex)) {
3755 3774 flk_set_state(request, FLK_INTERRUPTED_STATE);
3756 3775 request->l_state |= INTERRUPTED_LOCK;
3757 3776 }
3758 3777 }
3759 3778 }
3760 3779
3761 3780 /*
3762 3781 * Create an flock structure from the existing lock information
3763 3782 *
3764 3783 * This routine is used to create flock structures for the lock manager
 3765  3784  * to use in a reclaim request. Since the lock originated on this
 3766  3785  * host, it must conform to UNIX semantics, so no checking is
3767 3786 * done to make sure it falls within the lower half of the 32-bit range.
3768 3787 */
3769 3788
3770 3789 static void
3771 3790 create_flock(lock_descriptor_t *lp, flock64_t *flp)
3772 3791 {
3773 3792 ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND);
3774 3793 ASSERT(lp->l_end >= lp->l_start);
3775 3794
3776 3795 flp->l_type = lp->l_type;
3777 3796 flp->l_whence = 0;
3778 3797 flp->l_start = lp->l_start;
3779 3798 flp->l_len = (lp->l_end == MAX_U_OFFSET_T) ? 0 :
3780 3799 (lp->l_end - lp->l_start + 1);
3781 3800 flp->l_sysid = lp->l_flock.l_sysid;
3782 3801 flp->l_pid = lp->l_flock.l_pid;
3783 3802 }
3784 3803
3785 3804 /*
3786 3805 * Convert flock_t data describing a lock range into unsigned long starting
3787 3806 * and ending points, which are put into lock_request. Returns 0 or an
3788 3807 * errno value.
3789 3808 * Large Files: max is passed by the caller and we return EOVERFLOW
3790 3809 * as defined by LFS API.
3791 3810 */
3792 3811
3793 3812 int
3794 3813 flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
3795 3814 u_offset_t *start, u_offset_t *end, offset_t offset)
3796 3815 {
3797 3816 struct vattr vattr;
3798 3817 int error;
3799 3818
3800 3819 /*
3801 3820 * Determine the starting point of the request
3802 3821 */
3803 3822 switch (flp->l_whence) {
3804 3823 case 0: /* SEEK_SET */
3805 3824 *start = (u_offset_t)flp->l_start;
3806 3825 break;
3807 3826 case 1: /* SEEK_CUR */
3808 3827 *start = (u_offset_t)(flp->l_start + offset);
3809 3828 break;
3810 3829 case 2: /* SEEK_END */
3811 3830 vattr.va_mask = AT_SIZE;
3812 3831 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
3813 3832 return (error);
3814 3833 *start = (u_offset_t)(flp->l_start + vattr.va_size);
3815 3834 break;
3816 3835 default:
3817 3836 return (EINVAL);
3818 3837 }
3819 3838
3820 3839 /*
3821 3840 * Determine the range covered by the request.
3822 3841 */
3823 3842 if (flp->l_len == 0)
3824 3843 *end = MAX_U_OFFSET_T;
3825 3844 else if ((offset_t)flp->l_len > 0) {
3826 3845 *end = (u_offset_t)(*start + (flp->l_len - 1));
3827 3846 } else {
3828 3847 /*
 3829  3848 		 * Negative length; why do we even allow this?
3830 3849 * Because this allows easy specification of
3831 3850 * the last n bytes of the file.
3832 3851 */
3833 3852 *end = *start;
3834 3853 *start += (u_offset_t)flp->l_len;
3835 3854 (*start)++;
3836 3855 }
3837 3856 return (0);
3838 3857 }
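
To make the negative-length branch concrete, here is a hedged sketch (convert_example() and its values are hypothetical): with l_whence of SEEK_CUR, l_start of 0, l_len of -10 and a current offset of 100, the code above first resolves *start to 100, then sets *end = 100 and *start = 100 - 10 + 1 = 91, so the request covers the inclusive byte range [91, 100].

static void
convert_example(vnode_t *vp)
{
	flock64_t fl;
	u_offset_t start, end;

	bzero(&fl, sizeof (fl));
	fl.l_type = F_RDLCK;
	fl.l_whence = 1;	/* SEEK_CUR */
	fl.l_start = 0;
	fl.l_len = -10;		/* the 10 bytes ending at the current offset */

	/* With a current offset of 100 this yields start == 91, end == 100. */
	if (flk_convert_lock_data(vp, &fl, &start, &end, (offset_t)100) == 0)
		ASSERT(start == 91 && end == 100);
}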
3839 3858
3840 3859 /*
3841  3860  * Check the validity of lock data. This can be used by the NFS
3842 3861 * frlock routines to check data before contacting the server. The
3843 3862 * server must support semantics that aren't as restrictive as
3844 3863 * the UNIX API, so the NFS client is required to check.
3845 3864 * The maximum is now passed in by the caller.
3846 3865 */
3847 3866
3848 3867 int
3849 3868 flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
3850 3869 {
3851 3870 /*
3852 3871 * The end (length) for local locking should never be greater
3853 3872 * than MAXEND. However, the representation for
3854 3873 * the entire file is MAX_U_OFFSET_T.
3855 3874 */
3856 3875 if ((start > max) ||
3857 3876 ((end > max) && (end != MAX_U_OFFSET_T))) {
3858 3877 return (EINVAL);
3859 3878 }
3860 3879 if (start > end) {
3861 3880 return (EINVAL);
3862 3881 }
3863 3882 return (0);
3864 3883 }
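
A hedged illustration of what these checks accept and reject (check_lock_data_example() is hypothetical; MAXEND is used as the caller-supplied maximum because that is what local callers in this file use):

static void
check_lock_data_example(void)
{
	/* A whole-file lock is allowed even though MAX_U_OFFSET_T > MAXEND. */
	ASSERT(flk_check_lock_data(0, MAX_U_OFFSET_T, MAXEND) == 0);
	/* An end past the maximum that isn't "whole file" is rejected... */
	ASSERT(flk_check_lock_data(0, (u_offset_t)MAXEND + 1, MAXEND) == EINVAL);
	/* ...as is a start past the maximum, or a start beyond the end. */
	ASSERT(flk_check_lock_data((u_offset_t)MAXEND + 1, MAX_U_OFFSET_T,
	    MAXEND) == EINVAL);
	ASSERT(flk_check_lock_data(10, 5, MAXEND) == EINVAL);
}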
3865 3884
3866 3885 /*
3867 3886 * Fill in request->l_flock with information about the lock blocking the
3868 3887 * request. The complexity here is that lock manager requests are allowed
3869 3888 * to see into the upper part of the 32-bit address range, whereas local
3870 3889 * requests are only allowed to see signed values.
3871 3890 *
3872 3891 * What should be done when "blocker" is a lock manager lock that uses the
3873 3892 * upper portion of the 32-bit range, but "request" is local? Since the
3874 3893 * request has already been determined to have been blocked by the blocker,
3875 3894 * at least some portion of "blocker" must be in the range of the request,
3876 3895 * or the request extends to the end of file. For the first case, the
3877 3896 * portion in the lower range is returned with the indication that it goes
3878 3897 * "to EOF." For the second case, the last byte of the lower range is
3879 3898 * returned with the indication that it goes "to EOF."
3880 3899 */
3881 3900
3882 3901 static void
3883 3902 report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request)
3884 3903 {
3885 3904 flock64_t *flrp; /* l_flock portion of request */
3886 3905
3887 3906 ASSERT(blocker != NULL);
3888 3907
3889 3908 flrp = &request->l_flock;
3890 3909 flrp->l_whence = 0;
3891 3910 flrp->l_type = blocker->l_type;
3892 3911 flrp->l_pid = blocker->l_flock.l_pid;
3893 3912 flrp->l_sysid = blocker->l_flock.l_sysid;
3894 3913
3895 3914 if (IS_LOCKMGR(request)) {
3896 3915 flrp->l_start = blocker->l_start;
3897 3916 if (blocker->l_end == MAX_U_OFFSET_T)
3898 3917 flrp->l_len = 0;
3899 3918 else
3900 3919 flrp->l_len = blocker->l_end - blocker->l_start + 1;
3901 3920 } else {
3902 3921 if (blocker->l_start > MAXEND) {
3903 3922 flrp->l_start = MAXEND;
3904 3923 flrp->l_len = 0;
3905 3924 } else {
3906 3925 flrp->l_start = blocker->l_start;
3907 3926 if (blocker->l_end == MAX_U_OFFSET_T)
3908 3927 flrp->l_len = 0;
3909 3928 else
3910 3929 flrp->l_len = blocker->l_end -
3911 3930 blocker->l_start + 1;
3912 3931 }
3913 3932 }
3914 3933 }
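
As a concrete reading of the rules above (illustrative numbers only): if a lock-manager blocker covers [MAXEND - 99, MAX_U_OFFSET_T] and the blocked request is local, the request's l_flock is filled in with l_start = MAXEND - 99 and l_len = 0, i.e. "to EOF"; if the blocker lies entirely above MAXEND, the report collapses to l_start = MAXEND, l_len = 0. A lock-manager requester, by contrast, always sees the blocker's true l_start, with l_len = 0 only when the blocker extends to MAX_U_OFFSET_T.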
3915 3934
3916 3935 /*
3917 3936 * PSARC case 1997/292
3918 3937 */
3919 3938 /*
3920 3939 * This is the public routine exported by flock.h.
3921 3940 */
3922 3941 void
3923 3942 cl_flk_change_nlm_state_to_unknown(int nlmid)
3924 3943 {
3925 3944 /*
3926 3945 * Check to see if node is booted as a cluster. If not, return.
3927 3946 */
3928 3947 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3929 3948 return;
3930 3949 }
3931 3950
3932 3951 /*
3933 3952 * See comment in cl_flk_set_nlm_status().
3934 3953 */
3935 3954 if (nlm_reg_status == NULL) {
3936 3955 return;
3937 3956 }
3938 3957
3939 3958 /*
3940 3959 * protect NLM registry state with a mutex.
3941 3960 */
3942 3961 ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3943 3962 mutex_enter(&nlm_reg_lock);
3944 3963 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, FLK_NLM_UNKNOWN);
3945 3964 mutex_exit(&nlm_reg_lock);
3946 3965 }
3947 3966
3948 3967 /*
3949 3968 * Return non-zero if the given I/O request conflicts with an active NBMAND
3950 3969 * lock.
3951 3970 * If svmand is non-zero, it means look at all active locks, not just NBMAND
3952 3971 * locks.
3953 3972 */
3954 3973
3955 3974 int
3956 3975 nbl_lock_conflict(vnode_t *vp, nbl_op_t op, u_offset_t offset,
3957 3976 ssize_t length, int svmand, caller_context_t *ct)
3958 3977 {
3959 3978 int conflict = 0;
3960 3979 graph_t *gp;
3961 3980 lock_descriptor_t *lock;
3962 3981 pid_t pid;
3963 3982 int sysid;
3964 3983
3965 3984 if (ct == NULL) {
3966 3985 pid = curproc->p_pid;
3967 3986 sysid = 0;
3968 3987 } else {
3969 3988 pid = ct->cc_pid;
3970 3989 sysid = ct->cc_sysid;
3971 3990 }
3972 3991
3973 3992 mutex_enter(&flock_lock);
3974 3993 gp = lock_graph[HASH_INDEX(vp)];
3975 3994 mutex_exit(&flock_lock);
3976 3995 if (gp == NULL)
3977 3996 return (0);
3978 3997
3979 3998 mutex_enter(&gp->gp_mutex);
3980 3999 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
3981 4000
3982 4001 for (; lock && lock->l_vnode == vp; lock = lock->l_next) {
3983 4002 if ((svmand || (lock->l_state & NBMAND_LOCK)) &&
3984 4003 (lock->l_flock.l_sysid != sysid ||
3985 4004 lock->l_flock.l_pid != pid) &&
3986 4005 lock_blocks_io(op, offset, length,
3987 4006 lock->l_type, lock->l_start, lock->l_end)) {
3988 4007 conflict = 1;
3989 4008 break;
3990 4009 }
3991 4010 }
3992 4011 mutex_exit(&gp->gp_mutex);
3993 4012
3994 4013 return (conflict);
3995 4014 }
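
A hedged sketch of a caller using this check before doing local I/O (write_would_conflict() is hypothetical; passing ct == NULL makes nbl_lock_conflict() attribute the I/O to the current process, and svmand == 0 assumes System V mandatory locking is not in effect, something real callers determine from the file's mode):

static int
write_would_conflict(vnode_t *vp, u_offset_t off, ssize_t len)
{
	/* Non-zero means an NBMAND lock held by someone else blocks the write. */
	return (nbl_lock_conflict(vp, NBL_WRITE, off, len, 0, NULL));
}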
3996 4015
3997 4016 /*
3998 4017 * Return non-zero if the given I/O request conflicts with the given lock.
3999 4018 */
4000 4019
4001 4020 static int
4002 4021 lock_blocks_io(nbl_op_t op, u_offset_t offset, ssize_t length,
4003 4022 int lock_type, u_offset_t lock_start, u_offset_t lock_end)
4004 4023 {
4005 4024 ASSERT(op == NBL_READ || op == NBL_WRITE || op == NBL_READWRITE);
4006 4025 ASSERT(lock_type == F_RDLCK || lock_type == F_WRLCK);
4007 4026
4008 4027 if (op == NBL_READ && lock_type == F_RDLCK)
4009 4028 return (0);
4010 4029
4011 4030 if (offset <= lock_start && lock_start < offset + length)
4012 4031 return (1);
4013 4032 if (lock_start <= offset && offset <= lock_end)
4014 4033 return (1);
4015 4034
4016 4035 return (0);
4017 4036 }
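
To make the range test concrete: a write of length 10 at offset 150 against a lock covering [100, 199] conflicts through the second check (lock_start <= offset <= lock_end), a write of length 20 at offset 90 conflicts through the first (offset <= lock_start < offset + length), and a read against an F_RDLCK lock never conflicts regardless of overlap.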
4018 4037
4019 4038 #ifdef DEBUG
4020 4039 static void
4021 4040 check_active_locks(graph_t *gp)
4022 4041 {
4023 4042 lock_descriptor_t *lock, *lock1;
4024 4043 edge_t *ep;
4025 4044
4026 4045 for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
4027 4046 lock = lock->l_next) {
4028 4047 ASSERT(IS_ACTIVE(lock));
4029 4048 ASSERT(NOT_BLOCKED(lock));
4030 4049 ASSERT(!IS_BARRIER(lock));
4031 4050
4032 4051 ep = FIRST_IN(lock);
4033 4052
4034 4053 while (ep != HEAD(lock)) {
4035 4054 ASSERT(IS_SLEEPING(ep->from_vertex));
4036 4055 ASSERT(!NOT_BLOCKED(ep->from_vertex));
4037 4056 ep = NEXT_IN(ep);
4038 4057 }
4039 4058
4040 4059 for (lock1 = lock->l_next; lock1 != ACTIVE_HEAD(gp);
4041 4060 lock1 = lock1->l_next) {
4042 4061 if (lock1->l_vnode == lock->l_vnode) {
4043 4062 if (BLOCKS(lock1, lock)) {
4044 4063 cmn_err(CE_PANIC,
4045 4064 "active lock %p blocks %p",
4046 4065 (void *)lock1, (void *)lock);
4047 4066 } else if (BLOCKS(lock, lock1)) {
4048 4067 cmn_err(CE_PANIC,
4049 4068 "active lock %p blocks %p",
4050 4069 (void *)lock, (void *)lock1);
4051 4070 }
4052 4071 }
4053 4072 }
4054 4073 }
4055 4074 }
4056 4075
4057 4076 /*
4058  4077  * Effect: This function checks to see if the transition from 'old_state' to
4059 4078 * 'new_state' is a valid one. It returns 0 if the transition is valid
4060 4079 * and 1 if it is not.
4061 4080 * For a map of valid transitions, see sys/flock_impl.h
4062 4081 */
4063 4082 static int
4064 4083 check_lock_transition(int old_state, int new_state)
4065 4084 {
4066 4085 switch (old_state) {
4067 4086 case FLK_INITIAL_STATE:
4068 4087 if ((new_state == FLK_START_STATE) ||
4069 4088 (new_state == FLK_SLEEPING_STATE) ||
4070 4089 (new_state == FLK_ACTIVE_STATE) ||
4071 4090 (new_state == FLK_DEAD_STATE)) {
4072 4091 return (0);
4073 4092 } else {
4074 4093 return (1);
4075 4094 }
4076 4095 case FLK_START_STATE:
4077 4096 if ((new_state == FLK_ACTIVE_STATE) ||
4078 4097 (new_state == FLK_DEAD_STATE)) {
4079 4098 return (0);
4080 4099 } else {
4081 4100 return (1);
4082 4101 }
4083 4102 case FLK_ACTIVE_STATE:
4084 4103 if (new_state == FLK_DEAD_STATE) {
4085 4104 return (0);
4086 4105 } else {
4087 4106 return (1);
4088 4107 }
4089 4108 case FLK_SLEEPING_STATE:
4090 4109 if ((new_state == FLK_GRANTED_STATE) ||
4091 4110 (new_state == FLK_INTERRUPTED_STATE) ||
4092 4111 (new_state == FLK_CANCELLED_STATE)) {
4093 4112 return (0);
4094 4113 } else {
4095 4114 return (1);
4096 4115 }
4097 4116 case FLK_GRANTED_STATE:
4098 4117 if ((new_state == FLK_START_STATE) ||
4099 4118 (new_state == FLK_INTERRUPTED_STATE) ||
4100 4119 (new_state == FLK_CANCELLED_STATE)) {
4101 4120 return (0);
4102 4121 } else {
4103 4122 return (1);
4104 4123 }
4105 4124 case FLK_CANCELLED_STATE:
4106 4125 if ((new_state == FLK_INTERRUPTED_STATE) ||
4107 4126 (new_state == FLK_DEAD_STATE)) {
4108 4127 return (0);
4109 4128 } else {
4110 4129 return (1);
4111 4130 }
4112 4131 case FLK_INTERRUPTED_STATE:
4113 4132 if (new_state == FLK_DEAD_STATE) {
4114 4133 return (0);
4115 4134 } else {
4116 4135 return (1);
4117 4136 }
4118 4137 case FLK_DEAD_STATE:
4119 4138 /* May be set more than once */
4120 4139 if (new_state == FLK_DEAD_STATE) {
4121 4140 return (0);
4122 4141 } else {
4123 4142 return (1);
4124 4143 }
4125 4144 default:
4126 4145 return (1);
4127 4146 }
4128 4147 }
4129 4148
4130 4149 static void
4131 4150 check_sleeping_locks(graph_t *gp)
4132 4151 {
4133 4152 lock_descriptor_t *lock1, *lock2;
4134 4153 edge_t *ep;
4135 4154 for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp);
4136 4155 lock1 = lock1->l_next) {
4137 4156 ASSERT(!IS_BARRIER(lock1));
4138 4157 for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp);
4139 4158 lock2 = lock2->l_next) {
4140 4159 if (lock1->l_vnode == lock2->l_vnode) {
4141 4160 if (BLOCKS(lock2, lock1)) {
4142 4161 ASSERT(!IS_GRANTED(lock1));
4143 4162 ASSERT(!NOT_BLOCKED(lock1));
4144 4163 path(lock1, lock2);
4145 4164 }
4146 4165 }
4147 4166 }
4148 4167
4149 4168 for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp);
4150 4169 lock2 = lock2->l_next) {
4151 4170 ASSERT(!IS_BARRIER(lock1));
4152 4171 if (lock1->l_vnode == lock2->l_vnode) {
4153 4172 if (BLOCKS(lock2, lock1)) {
4154 4173 ASSERT(!IS_GRANTED(lock1));
4155 4174 ASSERT(!NOT_BLOCKED(lock1));
4156 4175 path(lock1, lock2);
4157 4176 }
4158 4177 }
4159 4178 }
4160 4179 ep = FIRST_ADJ(lock1);
4161 4180 while (ep != HEAD(lock1)) {
4162 4181 ASSERT(BLOCKS(ep->to_vertex, lock1));
4163 4182 ep = NEXT_ADJ(ep);
4164 4183 }
4165 4184 }
4166 4185 }
4167 4186
4168 4187 static int
4169 4188 level_two_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2, int no_path)
4170 4189 {
4171 4190 edge_t *ep;
4172 4191 lock_descriptor_t *vertex;
4173 4192 lock_descriptor_t *vertex_stack;
4174 4193
4175 4194 STACK_INIT(vertex_stack);
4176 4195
4177 4196 flk_graph_uncolor(lock1->l_graph);
4178 4197 ep = FIRST_ADJ(lock1);
4179 4198 ASSERT(ep != HEAD(lock1));
4180 4199 while (ep != HEAD(lock1)) {
4181 4200 if (no_path)
4182 4201 ASSERT(ep->to_vertex != lock2);
4183 4202 STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4184 4203 COLOR(ep->to_vertex);
4185 4204 ep = NEXT_ADJ(ep);
4186 4205 }
4187 4206
4188 4207 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
4189 4208 STACK_POP(vertex_stack, l_dstack);
4190 4209 for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
4191 4210 ep = NEXT_ADJ(ep)) {
4192 4211 if (COLORED(ep->to_vertex))
4193 4212 continue;
4194 4213 COLOR(ep->to_vertex);
4195 4214 if (ep->to_vertex == lock2)
4196 4215 return (1);
4197 4216
4198 4217 STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4199 4218 }
4200 4219 }
4201 4220 return (0);
4202 4221 }
4203 4222
4204 4223 static void
4205 4224 check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp)
4206 4225 {
4207 4226 lock_descriptor_t *lock;
4208 4227
4209 4228 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
4210 4229
4211 4230 if (lock) {
4212 4231 while (lock != ACTIVE_HEAD(gp) && (lock->l_vnode == vp)) {
4213 4232 if (lock->l_flock.l_pid == pid &&
4214 4233 lock->l_flock.l_sysid == sysid)
4215 4234 cmn_err(CE_PANIC,
4216 4235 "owner pid %d's lock %p in active queue",
4217 4236 pid, (void *)lock);
4218 4237 lock = lock->l_next;
4219 4238 }
4220 4239 }
4221 4240 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
4222 4241
4223 4242 if (lock) {
4224 4243 while (lock != SLEEPING_HEAD(gp) && (lock->l_vnode == vp)) {
4225 4244 if (lock->l_flock.l_pid == pid &&
4226 4245 lock->l_flock.l_sysid == sysid)
4227 4246 cmn_err(CE_PANIC,
4228 4247 "owner pid %d's lock %p in sleep queue",
4229 4248 pid, (void *)lock);
4230 4249 lock = lock->l_next;
4231 4250 }
4232 4251 }
4233 4252 }
4234 4253
4235 4254 static int
4236 4255 level_one_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4237 4256 {
4238 4257 edge_t *ep = FIRST_ADJ(lock1);
4239 4258
4240 4259 while (ep != HEAD(lock1)) {
4241 4260 if (ep->to_vertex == lock2)
4242 4261 return (1);
4243 4262 else
4244 4263 ep = NEXT_ADJ(ep);
4245 4264 }
4246 4265 return (0);
4247 4266 }
4248 4267
4249 4268 static int
4250 4269 no_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4251 4270 {
4252 4271 return (!level_two_path(lock1, lock2, 1));
4253 4272 }
4254 4273
4255 4274 static void
4256 4275 path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4257 4276 {
4258 4277 if (level_one_path(lock1, lock2)) {
4259 4278 if (level_two_path(lock1, lock2, 0) != 0) {
4260 4279 cmn_err(CE_WARN,
4261 4280 "one edge one path from lock1 %p lock2 %p",
4262 4281 (void *)lock1, (void *)lock2);
4263 4282 }
4264 4283 } else if (no_path(lock1, lock2)) {
4265 4284 cmn_err(CE_PANIC,
4266 4285 "No path from lock1 %p to lock2 %p",
4267 4286 (void *)lock1, (void *)lock2);
4268 4287 }
4269 4288 }
4270 4289 #endif /* DEBUG */