195 Need replacement for nfs/lockd+klm
--- old/usr/src/uts/common/os/flock.c
+++ new/usr/src/uts/common/os/flock.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
24 24 * Use is subject to license terms.
25 25 */
26 26
27 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 28 /* All Rights Reserved */
29 29
30 +/*
31 + * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
32 + */
33 +
30 34 #include <sys/flock_impl.h>
31 35 #include <sys/vfs.h>
32 36 #include <sys/t_lock.h> /* for <sys/callb.h> */
33 37 #include <sys/callb.h>
34 38 #include <sys/clconf.h>
35 39 #include <sys/cladm.h>
36 40 #include <sys/nbmlock.h>
37 41 #include <sys/cred.h>
38 42 #include <sys/policy.h>
39 43
40 44 /*
41 45 * The following four variables are for statistics purposes and they are
42 46 * not protected by locks. They may not be accurate but will at least be
43 47 * close to the actual value.
44 48 */
45 49
46 50 int flk_lock_allocs;
47 51 int flk_lock_frees;
48 52 int edge_allocs;
49 53 int edge_frees;
50 54 int flk_proc_vertex_allocs;
51 55 int flk_proc_edge_allocs;
52 56 int flk_proc_vertex_frees;
53 57 int flk_proc_edge_frees;
54 58
55 59 static kmutex_t flock_lock;
56 60
57 61 #ifdef DEBUG
58 62 int check_debug = 0;
59 63 #define CHECK_ACTIVE_LOCKS(gp) if (check_debug) \
60 64 check_active_locks(gp);
61 65 #define CHECK_SLEEPING_LOCKS(gp) if (check_debug) \
62 66 check_sleeping_locks(gp);
63 67 #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp) \
64 68 if (check_debug) \
65 69 check_owner_locks(gp, pid, sysid, vp);
66 70 #define CHECK_LOCK_TRANSITION(old_state, new_state) \
67 71 { \
68 72 if (check_lock_transition(old_state, new_state)) { \
69 73 cmn_err(CE_PANIC, "Illegal lock transition \
70 74 from %d to %d", old_state, new_state); \
71 75 } \
72 76 }
73 77 #else
74 78
75 79 #define CHECK_ACTIVE_LOCKS(gp)
76 80 #define CHECK_SLEEPING_LOCKS(gp)
77 81 #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp)
78 82 #define CHECK_LOCK_TRANSITION(old_state, new_state)
79 83
80 84 #endif /* DEBUG */
81 85
82 86 struct kmem_cache *flk_edge_cache;
83 87
84 88 graph_t *lock_graph[HASH_SIZE];
85 89 proc_graph_t pgraph;
86 90
87 91 /*
88 92 * Clustering.
89 93 *
90 94 * NLM REGISTRY TYPE IMPLEMENTATION
91 95 *
92 96 * Assumptions:
93 97 * 1. Nodes in a cluster are numbered starting at 1; always non-negative
94 98 * integers; maximum node id is returned by clconf_maximum_nodeid().
95 99 * 2. We use this node id to identify the node an NLM server runs on.
96 100 */
97 101
98 102 /*
99 103 * NLM registry object keeps track of NLM servers via their
100 104 * nlmids (which are the node ids of the node in the cluster they run on)
101 105 * that have requested locks at this LLM with which this registry is
102 106 * associated.
103 107 *
104 108 * Representation of abstraction:
105 109 * rep = record[ states: array[nlm_state],
106 110 * lock: mutex]
107 111 *
108 112 * Representation invariants:
109 113 * 1. index i of rep.states is between 0 and n - 1 where n is number
110 114 * of elements in the array, which happens to be the maximum number
111 115 * of nodes in the cluster configuration + 1.
112 116 * 2. map nlmid to index i of rep.states
113 117 * 0 -> 0
114 118 * 1 -> 1
115 119 * 2 -> 2
116 120 * n-1 -> clconf_maximum_nodeid()+1
117 121 * 3. This 1-1 mapping is quite convenient and it avoids errors resulting
118 122 * from forgetting to subtract 1 from the index.
119 123 * 4. The reason we keep the 0th index is the following. A legitimate
120 124 * cluster configuration includes making a UFS file system NFS
121 125 * exportable. The code is structured so that if you're in a cluster
122 126 * you do one thing; otherwise, you do something else. The problem
123 127 * is what to do if you think you're in a cluster with PXFS loaded,
124 128 * but you're using UFS not PXFS? The upper two bytes of the sysid
125 129 * encode the node id of the node where NLM server runs; these bytes
126 130 * are zero for UFS. Since the nodeid is used to index into the
127 131 * registry, we can record the NLM server state information at index
128 132 * 0 using the same mechanism used for PXFS file locks!
129 133 */
130 134 static flk_nlm_status_t *nlm_reg_status = NULL; /* state array 0..N-1 */
131 135 static kmutex_t nlm_reg_lock; /* lock to protect array */
132 136 static uint_t nlm_status_size; /* size of state array */
133 137
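As a reading aid, here is a minimal sketch of how a lock's sysid maps onto this registry, assuming (per the block comment above) that the node id lives in the upper two bytes of the sysid and that GETNLMID() extracts it; nlm_reg_slot() is a hypothetical helper, not part of this file:

static flk_nlm_status_t *
nlm_reg_slot(int sysid)
{
	/* 0 for local/UFS locks, the NLM server's node id for PXFS */
	int nlmid = GETNLMID(sysid);

	ASSERT(nlmid >= 0 && nlmid < nlm_status_size);
	return (&nlm_reg_status[nlmid]);
}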
134 138 /*
135 139 * Although we need a global lock dependency graph (and associated data
136 140 * structures), we also need a per-zone notion of whether the lock manager is
137 141 * running, and so whether to allow lock manager requests or not.
138 142 *
139 143 * Thus, on a per-zone basis we maintain a ``global'' variable
140 144 * (flk_lockmgr_status), protected by flock_lock, and set when the lock
141 145 * manager is determined to be changing state (starting or stopping).
142 146 *
143 147 * Each graph/zone pair also has a copy of this variable, which is protected by
144 148 * the graph's mutex.
145 149 *
146 150 * The per-graph copies are used to synchronize lock requests with shutdown
147 151 * requests. The global copy is used to initialize the per-graph field when a
148 152 * new graph is created.
149 153 */
150 154 struct flock_globals {
151 155 flk_lockmgr_status_t flk_lockmgr_status;
152 156 flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
153 157 };
154 158
155 159 zone_key_t flock_zone_key;
156 160
157 161 static void create_flock(lock_descriptor_t *, flock64_t *);
158 162 static lock_descriptor_t *flk_get_lock(void);
159 163 static void flk_free_lock(lock_descriptor_t *lock);
160 164 static void flk_get_first_blocking_lock(lock_descriptor_t *request);
161 165 static int flk_process_request(lock_descriptor_t *);
162 166 static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
163 167 static edge_t *flk_get_edge(void);
164 168 static int flk_wait_execute_request(lock_descriptor_t *);
165 169 static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
166 170 static void flk_insert_active_lock(lock_descriptor_t *);
167 171 static void flk_delete_active_lock(lock_descriptor_t *, int);
168 172 static void flk_insert_sleeping_lock(lock_descriptor_t *);
169 173 static void flk_graph_uncolor(graph_t *);
170 174 static void flk_wakeup(lock_descriptor_t *, int);
171 175 static void flk_free_edge(edge_t *);
172 176 static void flk_recompute_dependencies(lock_descriptor_t *,
173 177 lock_descriptor_t **, int, int);
174 178 static int flk_find_barriers(lock_descriptor_t *);
175 179 static void flk_update_barriers(lock_descriptor_t *);
176 180 static int flk_color_reachables(lock_descriptor_t *);
177 181 static int flk_canceled(lock_descriptor_t *);
178 182 static void flk_delete_locks_by_sysid(lock_descriptor_t *);
179 183 static void report_blocker(lock_descriptor_t *, lock_descriptor_t *);
180 184 static void wait_for_lock(lock_descriptor_t *);
181 185 static void unlock_lockmgr_granted(struct flock_globals *);
182 186 static void wakeup_sleeping_lockmgr_locks(struct flock_globals *);
183 187
184 188 /* Clustering hooks */
185 189 static void cl_flk_change_nlm_state_all_locks(int, flk_nlm_status_t);
186 190 static void cl_flk_wakeup_sleeping_nlm_locks(int);
187 191 static void cl_flk_unlock_nlm_granted(int);
188 192
189 193 #ifdef DEBUG
190 194 static int check_lock_transition(int, int);
191 195 static void check_sleeping_locks(graph_t *);
192 196 static void check_active_locks(graph_t *);
193 197 static int no_path(lock_descriptor_t *, lock_descriptor_t *);
194 198 static void path(lock_descriptor_t *, lock_descriptor_t *);
195 199 static void check_owner_locks(graph_t *, pid_t, int, vnode_t *);
196 200 static int level_one_path(lock_descriptor_t *, lock_descriptor_t *);
197 201 static int level_two_path(lock_descriptor_t *, lock_descriptor_t *, int);
198 202 #endif
199 203
200 204 /* proc_graph function definitions */
201 205 static int flk_check_deadlock(lock_descriptor_t *);
202 206 static void flk_proc_graph_uncolor(void);
203 207 static proc_vertex_t *flk_get_proc_vertex(lock_descriptor_t *);
204 208 static proc_edge_t *flk_get_proc_edge(void);
205 209 static void flk_proc_release(proc_vertex_t *);
206 210 static void flk_free_proc_edge(proc_edge_t *);
207 211 static void flk_update_proc_graph(edge_t *, int);
208 212
209 213 /* Non-blocking mandatory locking */
210 214 static int lock_blocks_io(nbl_op_t, u_offset_t, ssize_t, int, u_offset_t,
211 215 u_offset_t);
212 216
213 217 static struct flock_globals *
214 218 flk_get_globals(void)
215 219 {
216 220 /*
217 221 * The KLM module had better be loaded if we're attempting to handle
218 222 * lockmgr requests.
219 223 */
220 224 ASSERT(flock_zone_key != ZONE_KEY_UNINITIALIZED);
221 225 return (zone_getspecific(flock_zone_key, curproc->p_zone));
222 226 }
223 227
224 228 static flk_lockmgr_status_t
225 229 flk_get_lockmgr_status(void)
226 230 {
227 231 struct flock_globals *fg;
228 232
229 233 ASSERT(MUTEX_HELD(&flock_lock));
230 234
231 235 if (flock_zone_key == ZONE_KEY_UNINITIALIZED) {
232 236 /*
233 237 * KLM module not loaded; lock manager definitely not running.
234 238 */
235 239 return (FLK_LOCKMGR_DOWN);
236 240 }
237 241 fg = flk_get_globals();
238 242 return (fg->flk_lockmgr_status);
239 243 }
240 244
241 245 /*
242 246 * Routine called from fs_frlock in fs/fs_subr.c
243 247 */
244 248
245 249 int
246 250 reclock(vnode_t *vp,
247 251 flock64_t *lckdat,
248 252 int cmd,
249 253 int flag,
250 254 u_offset_t offset,
251 255 flk_callback_t *flk_cbp)
252 256 {
253 257 lock_descriptor_t stack_lock_request;
254 258 lock_descriptor_t *lock_request;
255 259 int error = 0;
256 260 graph_t *gp;
257 261 int nlmid;
258 262
259 263 /*
260 264 * Check access permissions
261 265 */
262 266 if ((cmd & SETFLCK) &&
263 267 ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
264 268 (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
265 269 return (EBADF);
266 270
267 271 /*
268 272 * for query and unlock we use the stack_lock_request
269 273 */
270 274
271 275 if ((lckdat->l_type == F_UNLCK) ||
272 276 !((cmd & INOFLCK) || (cmd & SETFLCK))) {
273 277 lock_request = &stack_lock_request;
274 278 (void) bzero((caddr_t)lock_request,
275 279 sizeof (lock_descriptor_t));
276 280
277 281 /*
278 282 * the following is added so that the assertions in
279 283 * flk_execute_request() pass
280 284 */
281 285
282 286 lock_request->l_edge.edge_in_next = &lock_request->l_edge;
283 287 lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
284 288 lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
285 289 lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
286 290 lock_request->l_status = FLK_INITIAL_STATE;
287 291 } else {
288 292 lock_request = flk_get_lock();
289 293 }
290 294 lock_request->l_state = 0;
291 295 lock_request->l_vnode = vp;
292 296 lock_request->l_zoneid = getzoneid();
293 297
294 298 /*
295 299 * Convert the request range into the canonical start and end
296 300 * values. The NLM protocol supports locking over the entire
297 301 * 32-bit range, so there's no range checking for remote requests,
298 302 * but we still need to verify that local requests obey the rules.
299 303 */
300 304 /* Clustering */
301 305 if ((cmd & (RCMDLCK | PCMDLCK)) != 0) {
302 306 ASSERT(lckdat->l_whence == 0);
303 307 lock_request->l_start = lckdat->l_start;
304 308 lock_request->l_end = (lckdat->l_len == 0) ? MAX_U_OFFSET_T :
305 309 lckdat->l_start + (lckdat->l_len - 1);
306 310 } else {
307 311 /* check the validity of the lock range */
308 312 error = flk_convert_lock_data(vp, lckdat,
309 313 &lock_request->l_start, &lock_request->l_end,
310 314 offset);
311 315 if (error) {
312 316 goto done;
313 317 }
314 318 error = flk_check_lock_data(lock_request->l_start,
315 319 lock_request->l_end, MAXEND);
316 320 if (error) {
317 321 goto done;
318 322 }
319 323 }
320 324
321 325 ASSERT(lock_request->l_end >= lock_request->l_start);
322 326
323 327 lock_request->l_type = lckdat->l_type;
324 328 if (cmd & INOFLCK)
325 329 lock_request->l_state |= IO_LOCK;
326 330 if (cmd & SLPFLCK)
327 331 lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
328 332 if (cmd & RCMDLCK)
329 333 lock_request->l_state |= LOCKMGR_LOCK;
330 334 if (cmd & NBMLCK)
331 335 lock_request->l_state |= NBMAND_LOCK;
332 336 /*
333 337 * Clustering: set flag for PXFS locks
334 338 * We do not _only_ check for the PCMDLCK flag because PXFS locks could
335 339 * also be of type 'RCMDLCK'.
336 340 * We do not _only_ check the GETPXFSID() macro because local PXFS
337 341 * clients use a pxfsid of zero to permit deadlock detection in the LLM.
338 342 */
339 343
340 344 if ((cmd & PCMDLCK) || (GETPXFSID(lckdat->l_sysid) != 0)) {
341 345 lock_request->l_state |= PXFS_LOCK;
342 346 }
343 347 if (!((cmd & SETFLCK) || (cmd & INOFLCK))) {
344 348 if (lock_request->l_type == F_RDLCK ||
345 349 lock_request->l_type == F_WRLCK)
346 350 lock_request->l_state |= QUERY_LOCK;
347 351 }
348 352 lock_request->l_flock = (*lckdat);
349 353 lock_request->l_callbacks = flk_cbp;
350 354
351 355 /*
352 356 * We are ready for processing the request
353 357 */
354 358 if (IS_LOCKMGR(lock_request)) {
355 359 /*
356 360 * If the lock request is an NLM server request ....
357 361 */
358 362 if (nlm_status_size == 0) { /* not booted as cluster */
359 363 mutex_enter(&flock_lock);
360 364 /*
361 365 * Bail out if this is a lock manager request and the
362 366 * lock manager is not supposed to be running.
363 367 */
364 368 if (flk_get_lockmgr_status() != FLK_LOCKMGR_UP) {
365 369 mutex_exit(&flock_lock);
366 370 error = ENOLCK;
367 371 goto done;
368 372 }
369 373 mutex_exit(&flock_lock);
370 374 } else { /* booted as a cluster */
371 375 nlmid = GETNLMID(lock_request->l_flock.l_sysid);
372 376 ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
373 377
374 378 mutex_enter(&nlm_reg_lock);
375 379 /*
376 380 * If the NLM registry does not know about this
377 381 * NLM server making the request, add its nlmid
378 382 * to the registry.
379 383 */
380 384 if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status,
381 385 nlmid)) {
382 386 FLK_REGISTRY_ADD_NLMID(nlm_reg_status, nlmid);
383 387 } else if (!FLK_REGISTRY_IS_NLM_UP(nlm_reg_status,
384 388 nlmid)) {
385 389 /*
386 390 * If the NLM server is already known (has made
387 391 * previous lock requests) and its state is
388 392 * not NLM_UP (means that NLM server is
389 393 * shutting down), then bail out with an
390 394 * error to deny the lock request.
391 395 */
392 396 mutex_exit(&nlm_reg_lock);
393 397 error = ENOLCK;
394 398 goto done;
395 399 }
396 400 mutex_exit(&nlm_reg_lock);
397 401 }
398 402 }
399 403
400 404 /* Now get the lock graph for a particular vnode */
401 405 gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);
402 406
403 407 /*
404 408 * We drop rwlock here otherwise this might end up causing a
405 409 * deadlock if this IOLOCK sleeps. (bugid # 1183392).
406 410 */
407 411
408 412 if (IS_IO_LOCK(lock_request)) {
409 413 VOP_RWUNLOCK(vp,
410 414 (lock_request->l_type == F_RDLCK) ?
411 415 V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
412 416 }
413 417 mutex_enter(&gp->gp_mutex);
414 418
415 419 lock_request->l_state |= REFERENCED_LOCK;
416 420 lock_request->l_graph = gp;
417 421
418 422 switch (lock_request->l_type) {
419 423 case F_RDLCK:
420 424 case F_WRLCK:
421 425 if (IS_QUERY_LOCK(lock_request)) {
422 426 flk_get_first_blocking_lock(lock_request);
423 427 (*lckdat) = lock_request->l_flock;
424 428 break;
425 429 }
426 430
427 431 /* process the request now */
428 432
429 433 error = flk_process_request(lock_request);
430 434 break;
431 435
432 436 case F_UNLCK:
433 437 /* unlock request will not block so execute it immediately */
434 438
435 439 if (IS_LOCKMGR(lock_request) &&
436 440 flk_canceled(lock_request)) {
437 441 error = 0;
438 442 } else {
439 443 error = flk_execute_request(lock_request);
440 444 }
441 445 break;
442 446
443 447 case F_UNLKSYS:
444 448 /*
445 449 * Recovery mechanism to release lock manager locks when
446 450 * an NFS client crashes and restarts. The NFS server will clear
447 451 * old locks and grant new locks.
448 452 */
449 453
450 454 if (lock_request->l_flock.l_sysid == 0) {
451 455 mutex_exit(&gp->gp_mutex);
452 456 return (EINVAL);
453 457 }
454 458 if (secpolicy_nfs(CRED()) != 0) {
455 459 mutex_exit(&gp->gp_mutex);
456 460 return (EPERM);
457 461 }
458 462 flk_delete_locks_by_sysid(lock_request);
459 463 lock_request->l_state &= ~REFERENCED_LOCK;
460 464 flk_set_state(lock_request, FLK_DEAD_STATE);
461 465 flk_free_lock(lock_request);
462 466 mutex_exit(&gp->gp_mutex);
463 467 return (0);
464 468
465 469 default:
466 470 error = EINVAL;
467 471 break;
468 472 }
469 473
470 474 /* Clustering: For blocked PXFS locks, return */
471 475 if (error == PXFS_LOCK_BLOCKED) {
472 476 lock_request->l_state &= ~REFERENCED_LOCK;
473 477 mutex_exit(&gp->gp_mutex);
474 478 return (error);
475 479 }
476 480
477 481 /*
478 482 * Now that we have seen the status of locks in the system for
479 483 * this vnode we acquire the rwlock if it is an IO_LOCK.
480 484 */
481 485
482 486 if (IS_IO_LOCK(lock_request)) {
483 487 (void) VOP_RWLOCK(vp,
484 488 (lock_request->l_type == F_RDLCK) ?
485 489 V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
486 490 if (!error) {
487 491 lckdat->l_type = F_UNLCK;
488 492
489 493 /*
490 494 * This wake up is needed otherwise
491 495 * if IO_LOCK has slept the dependents on this
492 496 * will not be woken up at all. (bugid # 1185482).
493 497 */
494 498
495 499 flk_wakeup(lock_request, 1);
496 500 flk_set_state(lock_request, FLK_DEAD_STATE);
497 501 flk_free_lock(lock_request);
498 502 }
499 503 /*
500 504 * else if an error occurred, either flk_process_request()
501 505 * returned EDEADLK, in which case there are no dependents
502 506 * for this lock, or flk_wait_execute_request() returned
503 507 * EINTR, in which case flk_cancel_sleeping_lock() has
504 508 * already been done. The same is true for EBADF.
505 509 */
506 510 }
507 511
508 512 if (lock_request == &stack_lock_request) {
509 513 flk_set_state(lock_request, FLK_DEAD_STATE);
510 514 } else {
511 515 lock_request->l_state &= ~REFERENCED_LOCK;
512 516 if ((error != 0) || IS_DELETED(lock_request)) {
513 517 flk_set_state(lock_request, FLK_DEAD_STATE);
514 518 flk_free_lock(lock_request);
515 519 }
516 520 }
517 521
518 522 mutex_exit(&gp->gp_mutex);
519 523 return (error);
520 524
521 525 done:
522 526 flk_set_state(lock_request, FLK_DEAD_STATE);
523 527 if (lock_request != &stack_lock_request)
524 528 flk_free_lock(lock_request);
525 529 return (error);
526 530 }
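For orientation, a hedged sketch of a caller using reclock() purely as a query (cmd carries neither SETFLCK nor INOFLCK, so the stack_lock_request/QUERY_LOCK path above is taken); query_whole_file() and its arguments are illustrative only, not part of this file:

static int
query_whole_file(vnode_t *vp, u_offset_t offset)
{
	flock64_t bf;
	int error;

	bzero(&bf, sizeof (bf));
	bf.l_type = F_RDLCK;		/* would a read lock be blocked? */
	bf.l_whence = 0;
	bf.l_start = 0;
	bf.l_len = 0;			/* zero length means "to EOF" */
	bf.l_pid = curproc->p_pid;

	error = reclock(vp, &bf, 0, FREAD, offset, NULL);
	/*
	 * On success, bf either has l_type == F_UNLCK (nothing blocks)
	 * or describes the first blocking lock, as with fcntl F_GETLK.
	 */
	return (error);
}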
527 531
528 532 /*
529 533 * Invoke the callbacks in the given list. If before sleeping, invoke in
530 534 * list order. If after sleeping, invoke in reverse order.
531 535 *
532 536 * CPR (suspend/resume) support: if one of the callbacks returns a
533 537 * callb_cpr_t, return it. This will be used to make the thread CPR-safe
534 538 * while it is sleeping. There should be at most one callb_cpr_t for the
535 539 * thread.
536 540 * XXX This is unnecessarily complicated. The CPR information should just
537 541 * get passed in directly through VOP_FRLOCK and reclock, rather than
538 542 * sneaking it in via a callback.
539 543 */
540 544
541 545 callb_cpr_t *
542 546 flk_invoke_callbacks(flk_callback_t *cblist, flk_cb_when_t when)
543 547 {
544 548 callb_cpr_t *cpr_callbackp = NULL;
545 549 callb_cpr_t *one_result;
546 550 flk_callback_t *cb;
547 551
548 552 if (cblist == NULL)
549 553 return (NULL);
550 554
551 555 if (when == FLK_BEFORE_SLEEP) {
552 556 cb = cblist;
553 557 do {
554 558 one_result = (*cb->cb_callback)(when, cb->cb_data);
555 559 if (one_result != NULL) {
556 560 ASSERT(cpr_callbackp == NULL);
557 561 cpr_callbackp = one_result;
558 562 }
559 563 cb = cb->cb_next;
560 564 } while (cb != cblist);
561 565 } else {
562 566 cb = cblist->cb_prev;
563 567 do {
564 568 one_result = (*cb->cb_callback)(when, cb->cb_data);
565 569 if (one_result != NULL) {
566 570 cpr_callbackp = one_result;
567 571 }
568 572 cb = cb->cb_prev;
569 573 } while (cb != cblist->cb_prev);
570 574 }
571 575
572 576 return (cpr_callbackp);
573 577 }
574 578
575 579 /*
576 580 * Initialize a flk_callback_t to hold the given callback.
577 581 */
578 582
579 583 void
580 584 flk_init_callback(flk_callback_t *flk_cb,
581 585 callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), void *cbdata)
582 586 {
583 587 flk_cb->cb_next = flk_cb;
584 588 flk_cb->cb_prev = flk_cb;
585 589 flk_cb->cb_callback = cb_fcn;
586 590 flk_cb->cb_data = cbdata;
587 591 }
588 592
589 593 /*
590 594 * Initialize an flk_callback_t and then link it into the head of an
591 595 * existing list (which may be NULL).
592 596 */
593 597
594 598 void
595 599 flk_add_callback(flk_callback_t *newcb,
596 600 callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *),
597 601 void *cbdata, flk_callback_t *cblist)
598 602 {
599 603 flk_init_callback(newcb, cb_fcn, cbdata);
600 604
601 605 if (cblist == NULL)
602 606 return;
603 607
604 608 newcb->cb_prev = cblist->cb_prev;
605 609 newcb->cb_next = cblist;
606 610 cblist->cb_prev->cb_next = newcb;
607 611 cblist->cb_prev = newcb;
608 612 }
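To illustrate the CPR hand-off described above flk_invoke_callbacks(), a minimal sketch of a callback a blocking caller might register; the function name and the per-caller callb_cpr_t are assumptions, not part of this file:

static callb_cpr_t *
my_frlock_callback(flk_cb_when_t when, void *arg)
{
	callb_cpr_t *cprp = arg;	/* caller's CPR info */

	/*
	 * Hand the callb_cpr_t back only before sleeping, so that
	 * flk_wait_execute_request() can mark the thread CPR-safe
	 * while it waits; nothing needs to be undone afterwards.
	 */
	return (when == FLK_BEFORE_SLEEP ? cprp : NULL);
}

The caller would then do flk_init_callback(&cb, my_frlock_callback, &cpr_info) and pass &cb as the flk_cbp argument to reclock().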
609 613
610 614 /*
611 615 * Initialize the flk_edge_cache data structure and create the
612 616 * nlm_reg_status array.
613 617 */
614 618
615 619 void
616 620 flk_init(void)
617 621 {
618 622 uint_t i;
619 623
620 624 flk_edge_cache = kmem_cache_create("flk_edges",
621 625 sizeof (struct edge), 0, NULL, NULL, NULL, NULL, NULL, 0);
622 626 if (flk_edge_cache == NULL) {
623 627 cmn_err(CE_PANIC, "Couldn't create flk_edge_cache\n");
624 628 }
625 629 /*
626 630 * Create the NLM registry object.
627 631 */
628 632
629 633 if (cluster_bootflags & CLUSTER_BOOTED) {
630 634 /*
631 635 * This routine tells you the maximum node id that will be used
632 636 * in the cluster. This number will be the size of the nlm
633 637 * registry status array. We add 1 because we will be using
634 638 * all entries indexed from 0 to maxnodeid; e.g., from 0
635 639 * to 64, for a total of 65 entries.
636 640 */
637 641 nlm_status_size = clconf_maximum_nodeid() + 1;
638 642 } else {
639 643 nlm_status_size = 0;
640 644 }
641 645
642 646 if (nlm_status_size != 0) { /* booted as a cluster */
643 647 nlm_reg_status = (flk_nlm_status_t *)
644 648 kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size,
645 649 KM_SLEEP);
646 650
647 651 /* initialize all NLM states in array to NLM_UNKNOWN */
648 652 for (i = 0; i < nlm_status_size; i++) {
649 653 nlm_reg_status[i] = FLK_NLM_UNKNOWN;
650 654 }
651 655 }
652 656 }
653 657
654 658 /*
655 659 * Zone constructor/destructor callbacks to be executed when a zone is
656 660 * created/destroyed.
657 661 */
658 662 /* ARGSUSED */
659 663 void *
660 664 flk_zone_init(zoneid_t zoneid)
661 665 {
662 666 struct flock_globals *fg;
663 667 uint_t i;
664 668
665 669 fg = kmem_alloc(sizeof (*fg), KM_SLEEP);
666 670 fg->flk_lockmgr_status = FLK_LOCKMGR_UP;
667 671 for (i = 0; i < HASH_SIZE; i++)
668 672 fg->lockmgr_status[i] = FLK_LOCKMGR_UP;
669 673 return (fg);
670 674 }
671 675
672 676 /* ARGSUSED */
673 677 void
674 678 flk_zone_fini(zoneid_t zoneid, void *data)
675 679 {
676 680 struct flock_globals *fg = data;
677 681
678 682 kmem_free(fg, sizeof (*fg));
679 683 }
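For context only: these constructor/destructor callbacks are registered elsewhere (by the lock-manager module, which is what leaves flock_zone_key initialized). A sketch of what that registration plausibly looks like, assuming the standard zone_key_create() interface:

/* Sketch: done in the KLM module's init path, not in flock.c itself. */
zone_key_create(&flock_zone_key, flk_zone_init, NULL, flk_zone_fini);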
680 684
681 685 /*
682 686 * Get a lock_descriptor structure with initialization of edge lists.
683 687 */
684 688
685 689 static lock_descriptor_t *
686 690 flk_get_lock(void)
687 691 {
688 692 lock_descriptor_t *l;
689 693
690 694 l = kmem_zalloc(sizeof (lock_descriptor_t), KM_SLEEP);
691 695
692 696 cv_init(&l->l_cv, NULL, CV_DRIVER, NULL);
693 697 l->l_edge.edge_in_next = &l->l_edge;
694 698 l->l_edge.edge_in_prev = &l->l_edge;
695 699 l->l_edge.edge_adj_next = &l->l_edge;
696 700 l->l_edge.edge_adj_prev = &l->l_edge;
697 701 l->pvertex = -1;
698 702 l->l_status = FLK_INITIAL_STATE;
699 703 flk_lock_allocs++;
700 704 return (l);
701 705 }
702 706
703 707 /*
704 708 * Free a lock_descriptor structure. Just sets the DELETED_LOCK flag
705 709 * when some thread has a reference to it as in reclock().
706 710 */
707 711
708 712 void
709 713 flk_free_lock(lock_descriptor_t *lock)
710 714 {
711 715 ASSERT(IS_DEAD(lock));
712 716 if (IS_REFERENCED(lock)) {
713 717 lock->l_state |= DELETED_LOCK;
714 718 return;
715 719 }
716 720 flk_lock_frees++;
717 721 kmem_free((void *)lock, sizeof (lock_descriptor_t));
718 722 }
719 723
720 724 void
721 725 flk_set_state(lock_descriptor_t *lock, int new_state)
722 726 {
723 727 /*
724 728 * Locks in the sleeping list may be woken up in a number of ways,
725 729 * and more than once. If a sleeping lock is signaled awake more
726 730 * than once, then it may or may not change state depending on its
727 731 * current state.
728 732 * Also note that NLM locks that are sleeping could be moved to an
729 733 * interrupted state more than once if the unlock request is
730 734 * retransmitted by the NLM client - the second time around, this is
731 735 * just a nop.
732 736 * The ordering of being signaled awake is:
733 737 * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE.
734 738 * The checks below implement this ordering.
735 739 */
736 740 if (IS_INTERRUPTED(lock)) {
737 741 if ((new_state == FLK_CANCELLED_STATE) ||
738 742 (new_state == FLK_GRANTED_STATE) ||
739 743 (new_state == FLK_INTERRUPTED_STATE)) {
740 744 return;
741 745 }
742 746 }
743 747 if (IS_CANCELLED(lock)) {
744 748 if ((new_state == FLK_GRANTED_STATE) ||
745 749 (new_state == FLK_CANCELLED_STATE)) {
746 750 return;
747 751 }
748 752 }
749 753 CHECK_LOCK_TRANSITION(lock->l_status, new_state);
750 754 if (IS_PXFS(lock)) {
751 755 cl_flk_state_transition_notify(lock, lock->l_status, new_state);
752 756 }
753 757 lock->l_status = new_state;
754 758 }
755 759
756 760 /*
757 761 * Routine that checks whether there are any blocking locks in the system.
758 762 *
759 763 * The policy followed is if a write lock is sleeping we don't allow read
760 764 * locks before this write lock even though there may not be any active
761 765 * locks corresponding to the read locks' region.
762 766 *
763 767 * flk_add_edge() function adds an edge between l1 and l2 iff there
764 768 * is no path between l1 and l2. This is done to have a "minimum
765 769 * storage representation" of the dependency graph.
766 770 *
767 771 * Another property of the graph is since only the new request throws
768 772 * edges to the existing locks in the graph, the graph is always topologically
769 773 * ordered.
770 774 */
771 775
772 776 static int
773 777 flk_process_request(lock_descriptor_t *request)
774 778 {
775 779 graph_t *gp = request->l_graph;
776 780 lock_descriptor_t *lock;
777 781 int request_blocked_by_active = 0;
778 782 int request_blocked_by_granted = 0;
779 783 int request_blocked_by_sleeping = 0;
780 784 vnode_t *vp = request->l_vnode;
781 785 int error = 0;
782 786 int request_will_wait = 0;
783 787 int found_covering_lock = 0;
784 788 lock_descriptor_t *covered_by = NULL;
785 789
786 790 ASSERT(MUTEX_HELD(&gp->gp_mutex));
787 791 request_will_wait = IS_WILLING_TO_SLEEP(request);
788 792
789 793 /*
790 794 * check active locks
791 795 */
792 796
793 797 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
794 798
795 799
796 800 if (lock) {
797 801 do {
798 802 if (BLOCKS(lock, request)) {
799 803 if (!request_will_wait)
800 804 return (EAGAIN);
801 805 request_blocked_by_active = 1;
802 806 break;
803 807 }
804 808 /*
805 809 * Grant lock if it is for the same owner holding active
806 810 * lock that covers the request.
807 811 */
808 812
809 813 if (SAME_OWNER(lock, request) &&
810 814 COVERS(lock, request) &&
811 815 (request->l_type == F_RDLCK))
812 816 return (flk_execute_request(request));
813 817 lock = lock->l_next;
814 818 } while (lock->l_vnode == vp);
815 819 }
816 820
817 821 if (!request_blocked_by_active) {
818 822 lock_descriptor_t *lk[1];
819 823 lock_descriptor_t *first_glock = NULL;
820 824 /*
821 825 * Shall we grant this?! NO!!
822 826 * What about those locks that were just granted and still
823 827 * in sleep queue. Those threads are woken up and so locks
824 828 * are almost active.
825 829 */
826 830 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
827 831 if (lock) {
828 832 do {
829 833 if (BLOCKS(lock, request)) {
830 834 if (IS_GRANTED(lock)) {
831 835 request_blocked_by_granted = 1;
832 836 } else {
833 837 request_blocked_by_sleeping = 1;
834 838 }
835 839 }
836 840
837 841 lock = lock->l_next;
838 842 } while ((lock->l_vnode == vp));
839 843 first_glock = lock->l_prev;
840 844 ASSERT(first_glock->l_vnode == vp);
841 845 }
842 846
843 847 if (request_blocked_by_granted)
844 848 goto block;
845 849
846 850 if (!request_blocked_by_sleeping) {
847 851 /*
848 852 * If the request isn't going to be blocked by a
849 853 * sleeping request, we know that it isn't going to
850 854 * be blocked; we can just execute the request --
851 855 * without performing costly deadlock detection.
852 856 */
853 857 ASSERT(!request_blocked_by_active);
854 858 return (flk_execute_request(request));
855 859 } else if (request->l_type == F_RDLCK) {
856 860 /*
857 861 * If we have a sleeping writer in the requested
858 862 * lock's range, block.
859 863 */
860 864 goto block;
861 865 }
862 866
863 867 lk[0] = request;
864 868 request->l_state |= RECOMPUTE_LOCK;
865 869 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
866 870 if (lock) {
867 871 do {
868 872 flk_recompute_dependencies(lock, lk, 1, 0);
869 873 lock = lock->l_next;
870 874 } while (lock->l_vnode == vp);
871 875 }
872 876 lock = first_glock;
873 877 if (lock) {
874 878 do {
875 879 if (IS_GRANTED(lock)) {
876 880 flk_recompute_dependencies(lock, lk, 1, 0);
877 881 }
878 882 lock = lock->l_prev;
879 883 } while ((lock->l_vnode == vp));
880 884 }
881 885 request->l_state &= ~RECOMPUTE_LOCK;
882 886 if (!NO_DEPENDENTS(request) && flk_check_deadlock(request))
883 887 return (EDEADLK);
884 888 return (flk_execute_request(request));
885 889 }
886 890
887 891 block:
888 892 if (request_will_wait)
889 893 flk_graph_uncolor(gp);
890 894
891 895 /* check sleeping locks */
892 896
893 897 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
894 898
895 899 /*
896 900 * If we find a sleeping write lock that is a superset of the
897 901 * region wanted by request we can be assured that by adding an
898 902 * edge to this write lock we have paths to all locks in the
899 903 * graph that blocks the request except in one case, which is why
900 904 * there is another check for SAME_OWNER in the loop below. The exception
901 905 * case is when this process that owns the sleeping write lock 'l1'
902 906 * has other locks l2, l3, l4 that are in the system and arrived
903 907 * before l1. l1 does not have path to these locks as they are from
904 908 * same process. We break when we find a second covering sleeping
905 909 * lock l5 owned by a process different from that owning l1, because
906 910 * there cannot be any of l2, l3, l4, etc., arrived before l5, and if
907 911 * it has l1 would have produced a deadlock already.
908 912 */
909 913
910 914 if (lock) {
911 915 do {
912 916 if (BLOCKS(lock, request)) {
913 917 if (!request_will_wait)
914 918 return (EAGAIN);
915 919 if (COVERS(lock, request) &&
916 920 lock->l_type == F_WRLCK) {
917 921 if (found_covering_lock &&
918 922 !SAME_OWNER(lock, covered_by)) {
919 923 found_covering_lock++;
920 924 break;
921 925 }
922 926 found_covering_lock = 1;
923 927 covered_by = lock;
924 928 }
925 929 if (found_covering_lock &&
926 930 !SAME_OWNER(lock, covered_by)) {
927 931 lock = lock->l_next;
928 932 continue;
929 933 }
930 934 if ((error = flk_add_edge(request, lock,
931 935 !found_covering_lock, 0)))
932 936 return (error);
933 937 }
934 938 lock = lock->l_next;
935 939 } while (lock->l_vnode == vp);
936 940 }
937 941
938 942 /*
939 943 * found_covering_lock == 2 iff at this point 'request' has paths
940 944 * to all locks that blocks 'request'. found_covering_lock == 1 iff at this
941 945 * point 'request' has paths to all locks that blocks 'request' whose owners
942 946 * are not same as the one that covers 'request' (covered_by above) and
943 947 * we can have locks whose owner is same as covered_by in the active list.
944 948 */
945 949
946 950 if (request_blocked_by_active && found_covering_lock != 2) {
947 951 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
948 952 ASSERT(lock != NULL);
949 953 do {
950 954 if (BLOCKS(lock, request)) {
951 955 if (found_covering_lock &&
952 956 !SAME_OWNER(lock, covered_by)) {
953 957 lock = lock->l_next;
954 958 continue;
955 959 }
956 960 if ((error = flk_add_edge(request, lock,
957 961 CHECK_CYCLE, 0)))
958 962 return (error);
959 963 }
960 964 lock = lock->l_next;
961 965 } while (lock->l_vnode == vp);
962 966 }
963 967
964 968 if (NOT_BLOCKED(request)) {
965 969 /*
966 970 * request not dependent on any other locks
967 971 * so execute this request
968 972 */
969 973 return (flk_execute_request(request));
970 974 } else {
971 975 /*
972 976 * check for deadlock
973 977 */
974 978 if (flk_check_deadlock(request))
975 979 return (EDEADLK);
976 980 /*
977 981 * this thread has to sleep
978 982 */
979 983 return (flk_wait_execute_request(request));
980 984 }
981 985 }
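A small worked example of the ordering policy stated in the comment above flk_process_request(); the byte ranges are hypothetical:

/*
 * Suppose a writer W from process A is already sleeping on bytes
 * 0-99 (blocked by some active lock), and a read request R for
 * bytes 0-9 arrives from process B.  Even if R conflicts with no
 * active lock, the sleeping-lock scan finds BLOCKS(W, R), so R
 * either fails with EAGAIN (if not willing to sleep) or gets an
 * edge to W and sleeps; readers are not allowed to jump ahead of
 * a sleeping writer on an overlapping range.
 */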
982 986
983 987 /*
984 988 * The actual execution of the request in the simple case is only to
985 989 * insert the 'request' in the list of active locks if it is not an
986 990 * UNLOCK.
987 991 * We have to consider the existing active locks' relation to
988 992 * this 'request' if they are owned by same process. flk_relation() does
989 993 * this job and sees to that the dependency graph information is maintained
990 994 * properly.
991 995 */
992 996
993 997 int
994 998 flk_execute_request(lock_descriptor_t *request)
995 999 {
996 1000 graph_t *gp = request->l_graph;
997 1001 vnode_t *vp = request->l_vnode;
998 1002 lock_descriptor_t *lock, *lock1;
999 1003 int done_searching = 0;
1000 1004
1001 1005 CHECK_SLEEPING_LOCKS(gp);
1002 1006 CHECK_ACTIVE_LOCKS(gp);
1003 1007
1004 1008 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1005 1009
1006 1010 flk_set_state(request, FLK_START_STATE);
1007 1011
1008 1012 ASSERT(NOT_BLOCKED(request));
1009 1013
1010 1014 /* IO_LOCK requests are only to check status */
1011 1015
1012 1016 if (IS_IO_LOCK(request))
1013 1017 return (0);
1014 1018
1015 1019 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1016 1020
1017 1021 if (lock == NULL && request->l_type == F_UNLCK)
1018 1022 return (0);
1019 1023 if (lock == NULL) {
1020 1024 flk_insert_active_lock(request);
1021 1025 return (0);
1022 1026 }
1023 1027
1024 1028 do {
1025 1029 lock1 = lock->l_next;
1026 1030 if (SAME_OWNER(request, lock)) {
1027 1031 done_searching = flk_relation(lock, request);
1028 1032 }
1029 1033 lock = lock1;
1030 1034 } while (lock->l_vnode == vp && !done_searching);
1031 1035
1032 1036 /*
1033 1037 * insert in active queue
1034 1038 */
1035 1039
1036 1040 if (request->l_type != F_UNLCK)
1037 1041 flk_insert_active_lock(request);
1038 1042
1039 1043 return (0);
1040 1044 }
1041 1045
1042 1046 /*
1043 1047 * 'request' is blocked by someone, therefore we put it into the sleep queue.
1044 1048 */
1045 1049 static int
1046 1050 flk_wait_execute_request(lock_descriptor_t *request)
1047 1051 {
1048 1052 graph_t *gp = request->l_graph;
1049 1053 callb_cpr_t *cprp; /* CPR info from callback */
1050 1054 struct flock_globals *fg;
1051 1055 int index;
1052 1056
1053 1057 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1054 1058 ASSERT(IS_WILLING_TO_SLEEP(request));
1055 1059
1056 1060 flk_insert_sleeping_lock(request);
1057 1061
1058 1062 if (IS_LOCKMGR(request)) {
1059 1063 index = HASH_INDEX(request->l_vnode);
1060 1064 fg = flk_get_globals();
1061 1065
1062 1066 if (nlm_status_size == 0) { /* not booted as a cluster */
1063 1067 if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP) {
1064 1068 flk_cancel_sleeping_lock(request, 1);
1065 1069 return (ENOLCK);
1066 1070 }
1067 1071 } else { /* booted as a cluster */
1068 1072 /*
1069 1073 * If the request is an NLM server lock request,
1070 1074 * and the NLM state of the lock request is not
1071 1075 * NLM_UP (because the NLM server is shutting
1072 1076 * down), then cancel the sleeping lock and
1073 1077 * return error ENOLCK that will encourage the
1074 1078 * client to retransmit.
1075 1079 */
1076 1080 if (!IS_NLM_UP(request)) {
1077 1081 flk_cancel_sleeping_lock(request, 1);
1078 1082 return (ENOLCK);
1079 1083 }
1080 1084 }
1081 1085 }
1082 1086
1083 1087 /* Clustering: For blocking PXFS locks, return */
1084 1088 if (IS_PXFS(request)) {
1085 1089 /*
1086 1090 * PXFS locks sleep on the client side.
1087 1091 * The callback argument is used to wake up the sleeper
1088 1092 * when the lock is granted.
1089 1093 * We return -1 (rather than an errno value) to indicate
1090 1094 * the client side should sleep
1091 1095 */
1092 1096 return (PXFS_LOCK_BLOCKED);
1093 1097 }
1094 1098
1095 1099 if (request->l_callbacks != NULL) {
1096 1100 /*
1097 1101 * To make sure the shutdown code works correctly, either
1098 1102 * the callback must happen after putting the lock on the
1099 1103 * sleep list, or we must check the shutdown status after
1100 1104 * returning from the callback (and before sleeping). At
1101 1105 * least for now, we'll use the first option. If a
1102 1106 * shutdown or signal or whatever happened while the graph
1103 1107 * mutex was dropped, that will be detected by
1104 1108 * wait_for_lock().
1105 1109 */
1106 1110 mutex_exit(&gp->gp_mutex);
1107 1111
1108 1112 cprp = flk_invoke_callbacks(request->l_callbacks,
1109 1113 FLK_BEFORE_SLEEP);
1110 1114
1111 1115 mutex_enter(&gp->gp_mutex);
1112 1116
1113 1117 if (cprp == NULL) {
1114 1118 wait_for_lock(request);
1115 1119 } else {
1116 1120 mutex_enter(cprp->cc_lockp);
1117 1121 CALLB_CPR_SAFE_BEGIN(cprp);
1118 1122 mutex_exit(cprp->cc_lockp);
1119 1123 wait_for_lock(request);
1120 1124 mutex_enter(cprp->cc_lockp);
1121 1125 CALLB_CPR_SAFE_END(cprp, cprp->cc_lockp);
1122 1126 mutex_exit(cprp->cc_lockp);
1123 1127 }
1124 1128
1125 1129 mutex_exit(&gp->gp_mutex);
1126 1130 (void) flk_invoke_callbacks(request->l_callbacks,
1127 1131 FLK_AFTER_SLEEP);
1128 1132 mutex_enter(&gp->gp_mutex);
1129 1133 } else {
1130 1134 wait_for_lock(request);
1131 1135 }
1132 1136
1133 1137 if (IS_LOCKMGR(request)) {
1134 1138 /*
1135 1139 * If the lock manager is shutting down, return an
1136 1140 * error that will encourage the client to retransmit.
1137 1141 */
1138 1142 if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP &&
1139 1143 !IS_GRANTED(request)) {
1140 1144 flk_cancel_sleeping_lock(request, 1);
1141 1145 return (ENOLCK);
1142 1146 }
1143 1147 }
1144 1148
1145 1149 if (IS_INTERRUPTED(request)) {
1146 1150 /* we got a signal, or act like we did */
1147 1151 flk_cancel_sleeping_lock(request, 1);
1148 1152 return (EINTR);
1149 1153 }
1150 1154
1151 1155 /* Cancelled if some other thread has closed the file */
1152 1156
1153 1157 if (IS_CANCELLED(request)) {
1154 1158 flk_cancel_sleeping_lock(request, 1);
1155 1159 return (EBADF);
1156 1160 }
1157 1161
1158 1162 request->l_state &= ~GRANTED_LOCK;
1159 1163 REMOVE_SLEEP_QUEUE(request);
1160 1164 return (flk_execute_request(request));
1161 1165 }
1162 1166
1163 1167 /*
1164 1168 * This routine adds an edge between from and to because from depends
1165 1169 * on to. If asked to check for deadlock it checks whether there are any
1166 1170 * locks reachable from "from_lock" that are owned by the same process
1167 1171 * as "from_lock".
1168 1172 * NOTE: It is the caller's responsibility to make sure that the color
1169 1173 * of the graph is consistent between the calls to flk_add_edge as done
1170 1174 * in flk_process_request. This routine does not color and check for
1171 1175 * deadlock explicitly.
1172 1176 */
1173 1177
1174 1178 static int
1175 1179 flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock,
1176 1180 int check_cycle, int update_graph)
1177 1181 {
1178 1182 edge_t *edge;
1179 1183 edge_t *ep;
1180 1184 lock_descriptor_t *vertex;
1181 1185 lock_descriptor_t *vertex_stack;
1182 1186
1183 1187 STACK_INIT(vertex_stack);
1184 1188
1185 1189 /*
1186 1190 * If the to vertex is already colored, just return without
1187 1191 * adding an edge, as it is already reachable from the from
1188 1192 * vertex.
1189 1193 */
1190 1194
1191 1195 if (COLORED(to_lock))
1192 1196 return (0);
1193 1197
1194 1198 edge = flk_get_edge();
1195 1199
1196 1200 /*
1197 1201 * set the from and to vertex
1198 1202 */
1199 1203
1200 1204 edge->from_vertex = from_lock;
1201 1205 edge->to_vertex = to_lock;
1202 1206
1203 1207 /*
1204 1208 * put in adjacency list of from vertex
1205 1209 */
1206 1210
1207 1211 from_lock->l_edge.edge_adj_next->edge_adj_prev = edge;
1208 1212 edge->edge_adj_next = from_lock->l_edge.edge_adj_next;
1209 1213 edge->edge_adj_prev = &from_lock->l_edge;
1210 1214 from_lock->l_edge.edge_adj_next = edge;
1211 1215
1212 1216 /*
1213 1217 * put in the in-list (incoming edges) of the to vertex
1214 1218 */
1215 1219
1216 1220 to_lock->l_edge.edge_in_next->edge_in_prev = edge;
1217 1221 edge->edge_in_next = to_lock->l_edge.edge_in_next;
1218 1222 to_lock->l_edge.edge_in_next = edge;
1219 1223 edge->edge_in_prev = &to_lock->l_edge;
1220 1224
1221 1225
1222 1226 if (update_graph) {
1223 1227 flk_update_proc_graph(edge, 0);
1224 1228 return (0);
1225 1229 }
1226 1230 if (!check_cycle) {
1227 1231 return (0);
1228 1232 }
1229 1233
1230 1234 STACK_PUSH(vertex_stack, from_lock, l_stack);
1231 1235
1232 1236 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1233 1237
1234 1238 STACK_POP(vertex_stack, l_stack);
1235 1239
1236 1240 for (ep = FIRST_ADJ(vertex);
1237 1241 ep != HEAD(vertex);
1238 1242 ep = NEXT_ADJ(ep)) {
1239 1243 if (COLORED(ep->to_vertex))
1240 1244 continue;
1241 1245 COLOR(ep->to_vertex);
1242 1246 if (SAME_OWNER(ep->to_vertex, from_lock))
1243 1247 goto dead_lock;
1244 1248 STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1245 1249 }
1246 1250 }
1247 1251 return (0);
1248 1252
1249 1253 dead_lock:
1250 1254
1251 1255 /*
1252 1256 * remove all edges
1253 1257 */
1254 1258
1255 1259 ep = FIRST_ADJ(from_lock);
1256 1260
1257 1261 while (ep != HEAD(from_lock)) {
1258 1262 IN_LIST_REMOVE(ep);
1259 1263 from_lock->l_sedge = NEXT_ADJ(ep);
1260 1264 ADJ_LIST_REMOVE(ep);
1261 1265 flk_free_edge(ep);
1262 1266 ep = from_lock->l_sedge;
1263 1267 }
1264 1268 return (EDEADLK);
1265 1269 }
1266 1270
1267 1271 /*
1268 1272 * Get an edge structure for representing the dependency between two locks.
1269 1273 */
1270 1274
1271 1275 static edge_t *
1272 1276 flk_get_edge()
1273 1277 {
1274 1278 edge_t *ep;
1275 1279
1276 1280 ASSERT(flk_edge_cache != NULL);
1277 1281
1278 1282 ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP);
1279 1283 edge_allocs++;
1280 1284 return (ep);
1281 1285 }
1282 1286
1283 1287 /*
1284 1288 * Free the edge structure.
1285 1289 */
1286 1290
1287 1291 static void
1288 1292 flk_free_edge(edge_t *ep)
1289 1293 {
1290 1294 edge_frees++;
1291 1295 kmem_cache_free(flk_edge_cache, (void *)ep);
1292 1296 }
1293 1297
1294 1298 /*
1295 1299 * Check the relationship of request with lock and perform the
1296 1300 * recomputation of dependencies, break lock if required, and return
1297 1301 * 1 if request cannot have any more relationship with the next
1298 1302 * active locks.
1299 1303 * The 'lock' and 'request' are compared and in case of overlap we
1300 1304 * delete the 'lock' and form new locks to represent the non-overlapped
1301 1305 * portion of original 'lock'. This function has side effects such as
1302 1306 * 'lock' will be freed, new locks will be added to the active list.
1303 1307 */
1304 1308
1305 1309 static int
1306 1310 flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request)
1307 1311 {
1308 1312 int lock_effect;
1309 1313 lock_descriptor_t *lock1, *lock2;
1310 1314 lock_descriptor_t *topology[3];
1311 1315 int nvertex = 0;
1312 1316 int i;
1313 1317 edge_t *ep;
1314 1318 graph_t *gp = (lock->l_graph);
1315 1319
1316 1320
1317 1321 CHECK_SLEEPING_LOCKS(gp);
1318 1322 CHECK_ACTIVE_LOCKS(gp);
1319 1323
1320 1324 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1321 1325
1322 1326 topology[0] = topology[1] = topology[2] = NULL;
1323 1327
1324 1328 if (request->l_type == F_UNLCK)
1325 1329 lock_effect = FLK_UNLOCK;
1326 1330 else if (request->l_type == F_RDLCK &&
1327 1331 lock->l_type == F_WRLCK)
1328 1332 lock_effect = FLK_DOWNGRADE;
1329 1333 else if (request->l_type == F_WRLCK &&
1330 1334 lock->l_type == F_RDLCK)
1331 1335 lock_effect = FLK_UPGRADE;
1332 1336 else
1333 1337 lock_effect = FLK_STAY_SAME;
1334 1338
1335 1339 if (lock->l_end < request->l_start) {
1336 1340 if (lock->l_end == request->l_start - 1 &&
1337 1341 lock_effect == FLK_STAY_SAME) {
1338 1342 topology[0] = request;
1339 1343 request->l_start = lock->l_start;
1340 1344 nvertex = 1;
1341 1345 goto recompute;
1342 1346 } else {
1343 1347 return (0);
1344 1348 }
1345 1349 }
1346 1350
1347 1351 if (lock->l_start > request->l_end) {
1348 1352 if (request->l_end == lock->l_start - 1 &&
1349 1353 lock_effect == FLK_STAY_SAME) {
1350 1354 topology[0] = request;
1351 1355 request->l_end = lock->l_end;
1352 1356 nvertex = 1;
1353 1357 goto recompute;
1354 1358 } else {
1355 1359 return (1);
1356 1360 }
1357 1361 }
1358 1362
1359 1363 if (request->l_end < lock->l_end) {
1360 1364 if (request->l_start > lock->l_start) {
1361 1365 if (lock_effect == FLK_STAY_SAME) {
1362 1366 request->l_start = lock->l_start;
1363 1367 request->l_end = lock->l_end;
1364 1368 topology[0] = request;
1365 1369 nvertex = 1;
1366 1370 } else {
1367 1371 lock1 = flk_get_lock();
1368 1372 lock2 = flk_get_lock();
1369 1373 COPY(lock1, lock);
1370 1374 COPY(lock2, lock);
1371 1375 lock1->l_start = lock->l_start;
1372 1376 lock1->l_end = request->l_start - 1;
1373 1377 lock2->l_start = request->l_end + 1;
1374 1378 lock2->l_end = lock->l_end;
1375 1379 topology[0] = lock1;
1376 1380 topology[1] = lock2;
1377 1381 topology[2] = request;
1378 1382 nvertex = 3;
1379 1383 }
1380 1384 } else if (request->l_start < lock->l_start) {
1381 1385 if (lock_effect == FLK_STAY_SAME) {
1382 1386 request->l_end = lock->l_end;
1383 1387 topology[0] = request;
1384 1388 nvertex = 1;
1385 1389 } else {
1386 1390 lock1 = flk_get_lock();
1387 1391 COPY(lock1, lock);
1388 1392 lock1->l_start = request->l_end + 1;
1389 1393 topology[0] = lock1;
1390 1394 topology[1] = request;
1391 1395 nvertex = 2;
1392 1396 }
1393 1397 } else {
1394 1398 if (lock_effect == FLK_STAY_SAME) {
1395 1399 request->l_start = lock->l_start;
1396 1400 request->l_end = lock->l_end;
1397 1401 topology[0] = request;
1398 1402 nvertex = 1;
1399 1403 } else {
1400 1404 lock1 = flk_get_lock();
1401 1405 COPY(lock1, lock);
1402 1406 lock1->l_start = request->l_end + 1;
1403 1407 topology[0] = lock1;
1404 1408 topology[1] = request;
1405 1409 nvertex = 2;
1406 1410 }
1407 1411 }
1408 1412 } else if (request->l_end > lock->l_end) {
1409 1413 if (request->l_start > lock->l_start) {
1410 1414 if (lock_effect == FLK_STAY_SAME) {
1411 1415 request->l_start = lock->l_start;
1412 1416 topology[0] = request;
1413 1417 nvertex = 1;
1414 1418 } else {
1415 1419 lock1 = flk_get_lock();
1416 1420 COPY(lock1, lock);
1417 1421 lock1->l_end = request->l_start - 1;
1418 1422 topology[0] = lock1;
1419 1423 topology[1] = request;
1420 1424 nvertex = 2;
1421 1425 }
1422 1426 } else if (request->l_start < lock->l_start) {
1423 1427 topology[0] = request;
1424 1428 nvertex = 1;
1425 1429 } else {
1426 1430 topology[0] = request;
1427 1431 nvertex = 1;
1428 1432 }
1429 1433 } else {
1430 1434 if (request->l_start > lock->l_start) {
1431 1435 if (lock_effect == FLK_STAY_SAME) {
1432 1436 request->l_start = lock->l_start;
1433 1437 topology[0] = request;
1434 1438 nvertex = 1;
1435 1439 } else {
1436 1440 lock1 = flk_get_lock();
1437 1441 COPY(lock1, lock);
1438 1442 lock1->l_end = request->l_start - 1;
1439 1443 topology[0] = lock1;
1440 1444 topology[1] = request;
1441 1445 nvertex = 2;
1442 1446 }
1443 1447 } else if (request->l_start < lock->l_start) {
1444 1448 topology[0] = request;
1445 1449 nvertex = 1;
1446 1450 } else {
1447 1451 if (lock_effect != FLK_UNLOCK) {
1448 1452 topology[0] = request;
1449 1453 nvertex = 1;
1450 1454 } else {
1451 1455 flk_delete_active_lock(lock, 0);
1452 1456 flk_wakeup(lock, 1);
1453 1457 flk_free_lock(lock);
1454 1458 CHECK_SLEEPING_LOCKS(gp);
1455 1459 CHECK_ACTIVE_LOCKS(gp);
1456 1460 return (1);
1457 1461 }
1458 1462 }
1459 1463 }
1460 1464
1461 1465 recompute:
1462 1466
1463 1467 /*
1464 1468 * For unlock we don't send the 'request' for recomputing
1465 1469 * dependencies because no lock will add an edge to it.
1466 1470 */
1467 1471
1468 1472 if (lock_effect == FLK_UNLOCK) {
1469 1473 topology[nvertex-1] = NULL;
1470 1474 nvertex--;
1471 1475 }
1472 1476 for (i = 0; i < nvertex; i++) {
1473 1477 topology[i]->l_state |= RECOMPUTE_LOCK;
1474 1478 topology[i]->l_color = NO_COLOR;
1475 1479 }
1476 1480
1477 1481 ASSERT(FIRST_ADJ(lock) == HEAD(lock));
1478 1482
1479 1483 /*
1480 1484 * we remove the adjacent edges for all vertices' to this vertex
1481 1485 * 'lock'.
1482 1486 */
1483 1487
1484 1488 ep = FIRST_IN(lock);
1485 1489 while (ep != HEAD(lock)) {
1486 1490 ADJ_LIST_REMOVE(ep);
1487 1491 ep = NEXT_IN(ep);
1488 1492 }
1489 1493
1490 1494 flk_delete_active_lock(lock, 0);
1491 1495
1492 1496 /* We are ready for recomputing the dependencies now */
1493 1497
1494 1498 flk_recompute_dependencies(lock, topology, nvertex, 1);
1495 1499
1496 1500 for (i = 0; i < nvertex; i++) {
1497 1501 topology[i]->l_state &= ~RECOMPUTE_LOCK;
1498 1502 topology[i]->l_color = NO_COLOR;
1499 1503 }
1500 1504
1501 1505
1502 1506 if (lock_effect == FLK_UNLOCK) {
1503 1507 nvertex++;
1504 1508 }
1505 1509 for (i = 0; i < nvertex - 1; i++) {
1506 1510 flk_insert_active_lock(topology[i]);
1507 1511 }
1508 1512
1509 1513
1510 1514 if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) {
1511 1515 flk_wakeup(lock, 0);
1512 1516 } else {
1513 1517 ep = FIRST_IN(lock);
1514 1518 while (ep != HEAD(lock)) {
1515 1519 lock->l_sedge = NEXT_IN(ep);
1516 1520 IN_LIST_REMOVE(ep);
1517 1521 flk_update_proc_graph(ep, 1);
1518 1522 flk_free_edge(ep);
1519 1523 ep = lock->l_sedge;
1520 1524 }
1521 1525 }
1522 1526 flk_free_lock(lock);
1523 1527
1524 1528 CHECK_SLEEPING_LOCKS(gp);
1525 1529 CHECK_ACTIVE_LOCKS(gp);
1526 1530 return (0);
1527 1531 }
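To make the overlap cases above concrete, a worked sketch of the three-way split (request strictly inside the old lock, lock_effect != FLK_STAY_SAME); the byte ranges are hypothetical:

/*
 * An active write lock covers bytes 0-99 and the same owner now
 * requests a read lock over bytes 10-19 (FLK_DOWNGRADE).  The code
 * above builds:
 *
 *	lock1:   bytes  0-9	(copy of the old lock)
 *	lock2:   bytes 20-99	(copy of the old lock)
 *	request: bytes 10-19	(the new read lock)
 *
 * with topology[] = { lock1, lock2, request } and nvertex = 3.  The
 * old lock is deleted, dependencies are recomputed over these three
 * vertices, lock1 and lock2 are reinserted as active locks, sleepers
 * blocked by the old write lock are woken via flk_wakeup(), and the
 * request itself is inserted into the active list afterwards by
 * flk_execute_request().
 */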
1528 1532
1529 1533 /*
1530 1534 * Insert a lock into the active queue.
1531 1535 */
1532 1536
1533 1537 static void
1534 1538 flk_insert_active_lock(lock_descriptor_t *new_lock)
1535 1539 {
1536 1540 graph_t *gp = new_lock->l_graph;
1537 1541 vnode_t *vp = new_lock->l_vnode;
1538 1542 lock_descriptor_t *first_lock, *lock;
1539 1543
1540 1544 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1541 1545
1542 1546 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1543 1547 first_lock = lock;
1544 1548
1545 1549 if (first_lock != NULL) {
1546 1550 for (; (lock->l_vnode == vp &&
1547 1551 lock->l_start < new_lock->l_start); lock = lock->l_next)
1548 1552 ;
1549 1553 } else {
1550 1554 lock = ACTIVE_HEAD(gp);
1551 1555 }
1552 1556
1553 1557 lock->l_prev->l_next = new_lock;
1554 1558 new_lock->l_next = lock;
1555 1559 new_lock->l_prev = lock->l_prev;
1556 1560 lock->l_prev = new_lock;
1557 1561
1558 1562 if (first_lock == NULL || (new_lock->l_start <= first_lock->l_start)) {
1559 1563 vp->v_filocks = (struct filock *)new_lock;
1560 1564 }
1561 1565 flk_set_state(new_lock, FLK_ACTIVE_STATE);
1562 1566 new_lock->l_state |= ACTIVE_LOCK;
1563 1567
1564 1568 CHECK_ACTIVE_LOCKS(gp);
1565 1569 CHECK_SLEEPING_LOCKS(gp);
1566 1570 }
1567 1571
1568 1572 /*
1569 1573 * Delete the active lock : Performs two functions depending on the
1570 1574 * value of second parameter. One is to remove from the active lists
1571 1575 * only and other is to both remove and free the lock.
1572 1576 */
1573 1577
1574 1578 static void
1575 1579 flk_delete_active_lock(lock_descriptor_t *lock, int free_lock)
1576 1580 {
1577 1581 vnode_t *vp = lock->l_vnode;
1578 1582 graph_t *gp = lock->l_graph;
1579 1583
1580 1584 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1581 1585 if (free_lock)
1582 1586 ASSERT(NO_DEPENDENTS(lock));
1583 1587 ASSERT(NOT_BLOCKED(lock));
1584 1588 ASSERT(IS_ACTIVE(lock));
1585 1589
1586 1590 ASSERT((vp->v_filocks != NULL));
1587 1591
1588 1592 if (vp->v_filocks == (struct filock *)lock) {
1589 1593 vp->v_filocks = (struct filock *)
1590 1594 ((lock->l_next->l_vnode == vp) ? lock->l_next :
1591 1595 NULL);
1592 1596 }
1593 1597 lock->l_next->l_prev = lock->l_prev;
1594 1598 lock->l_prev->l_next = lock->l_next;
1595 1599 lock->l_next = lock->l_prev = NULL;
1596 1600 flk_set_state(lock, FLK_DEAD_STATE);
1597 1601 lock->l_state &= ~ACTIVE_LOCK;
1598 1602
1599 1603 if (free_lock)
1600 1604 flk_free_lock(lock);
1601 1605 CHECK_ACTIVE_LOCKS(gp);
1602 1606 CHECK_SLEEPING_LOCKS(gp);
1603 1607 }
1604 1608
1605 1609 /*
1606 1610 * Insert into the sleep queue.
1607 1611 */
1608 1612
1609 1613 static void
1610 1614 flk_insert_sleeping_lock(lock_descriptor_t *request)
1611 1615 {
1612 1616 graph_t *gp = request->l_graph;
1613 1617 vnode_t *vp = request->l_vnode;
1614 1618 lock_descriptor_t *lock;
1615 1619
1616 1620 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1617 1621 ASSERT(IS_INITIAL(request));
1618 1622
1619 1623 for (lock = gp->sleeping_locks.l_next; (lock != &gp->sleeping_locks &&
1620 1624 lock->l_vnode < vp); lock = lock->l_next)
1621 1625 ;
1622 1626
1623 1627 lock->l_prev->l_next = request;
1624 1628 request->l_prev = lock->l_prev;
1625 1629 lock->l_prev = request;
1626 1630 request->l_next = lock;
1627 1631 flk_set_state(request, FLK_SLEEPING_STATE);
1628 1632 request->l_state |= SLEEPING_LOCK;
1629 1633 }
1630 1634
1631 1635 /*
1632 1636 * Cancelling a sleeping lock implies removing a vertex from the
1633 1637 * dependency graph and therefore we should recompute the dependencies
1634 1638 * of all vertices that have a path to this vertex, w.r.t. all
1635 1639 * vertices reachable from this vertex.
1636 1640 */
1637 1641
1638 1642 void
1639 1643 flk_cancel_sleeping_lock(lock_descriptor_t *request, int remove_from_queue)
1640 1644 {
1641 1645 graph_t *gp = request->l_graph;
1642 1646 vnode_t *vp = request->l_vnode;
1643 1647 lock_descriptor_t **topology = NULL;
1644 1648 edge_t *ep;
1645 1649 lock_descriptor_t *vertex, *lock;
1646 1650 int nvertex = 0;
1647 1651 int i;
1648 1652 lock_descriptor_t *vertex_stack;
1649 1653
1650 1654 STACK_INIT(vertex_stack);
1651 1655
1652 1656 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1653 1657 /*
1654 1658 * count the number of vertex pointers that have to be allocated:
1655 1659 * all vertices that are reachable from request.
1656 1660 */
1657 1661
1658 1662 STACK_PUSH(vertex_stack, request, l_stack);
1659 1663
1660 1664 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1661 1665 STACK_POP(vertex_stack, l_stack);
1662 1666 for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
1663 1667 ep = NEXT_ADJ(ep)) {
1664 1668 if (IS_RECOMPUTE(ep->to_vertex))
1665 1669 continue;
1666 1670 ep->to_vertex->l_state |= RECOMPUTE_LOCK;
1667 1671 STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1668 1672 nvertex++;
1669 1673 }
1670 1674 }
1671 1675
1672 1676 /*
1673 1677 * allocate memory for holding the vertex pointers
1674 1678 */
1675 1679
1676 1680 if (nvertex) {
1677 1681 topology = kmem_zalloc(nvertex * sizeof (lock_descriptor_t *),
1678 1682 KM_SLEEP);
1679 1683 }
1680 1684
1681 1685 /*
1682 1686 * one more pass to actually store the vertices in the
1683 1687 * allocated array.
1684 1688 * We first check sleeping locks and then active locks
1685 1689 * so that topology array will be in a topological
1686 1690 * order.
1687 1691 */
1688 1692
1689 1693 nvertex = 0;
1690 1694 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1691 1695
1692 1696 if (lock) {
1693 1697 do {
1694 1698 if (IS_RECOMPUTE(lock)) {
1695 1699 lock->l_index = nvertex;
1696 1700 topology[nvertex++] = lock;
1697 1701 }
1698 1702 lock->l_color = NO_COLOR;
1699 1703 lock = lock->l_next;
1700 1704 } while (lock->l_vnode == vp);
1701 1705 }
1702 1706
1703 1707 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1704 1708
1705 1709 if (lock) {
1706 1710 do {
1707 1711 if (IS_RECOMPUTE(lock)) {
1708 1712 lock->l_index = nvertex;
1709 1713 topology[nvertex++] = lock;
1710 1714 }
1711 1715 lock->l_color = NO_COLOR;
1712 1716 lock = lock->l_next;
1713 1717 } while (lock->l_vnode == vp);
1714 1718 }
1715 1719
1716 1720 /*
1717 1721 * remove in and out edges of request
1718 1722 * They are freed after updating proc_graph below.
1719 1723 */
1720 1724
1721 1725 for (ep = FIRST_IN(request); ep != HEAD(request); ep = NEXT_IN(ep)) {
1722 1726 ADJ_LIST_REMOVE(ep);
1723 1727 }
1724 1728
1725 1729
1726 1730 if (remove_from_queue)
1727 1731 REMOVE_SLEEP_QUEUE(request);
1728 1732
1729 1733 /* we are ready to recompute */
1730 1734
1731 1735 flk_recompute_dependencies(request, topology, nvertex, 1);
1732 1736
1733 1737 ep = FIRST_ADJ(request);
1734 1738 while (ep != HEAD(request)) {
1735 1739 IN_LIST_REMOVE(ep);
1736 1740 request->l_sedge = NEXT_ADJ(ep);
1737 1741 ADJ_LIST_REMOVE(ep);
1738 1742 flk_update_proc_graph(ep, 1);
1739 1743 flk_free_edge(ep);
1740 1744 ep = request->l_sedge;
1741 1745 }
1742 1746
1743 1747
1744 1748 /*
1745 1749 * unset the RECOMPUTE flag in those vertices
1746 1750 */
1747 1751
1748 1752 for (i = 0; i < nvertex; i++) {
1749 1753 topology[i]->l_state &= ~RECOMPUTE_LOCK;
1750 1754 }
1751 1755
1752 1756 /*
1753 1757 * free the topology
1754 1758 */
1755 1759 if (nvertex)
1756 1760 kmem_free((void *)topology,
1757 1761 (nvertex * sizeof (lock_descriptor_t *)));
1758 1762 /*
1759 1763 	 * Some locks may have become unblocked now.
1760 1764 */
1761 1765
1762 1766 flk_wakeup(request, 0);
1763 1767
1764 1768 /*
1765 1769 * we expect to have a correctly recomputed graph now.
1766 1770 */
1767 1771 flk_set_state(request, FLK_DEAD_STATE);
1768 1772 flk_free_lock(request);
1769 1773 CHECK_SLEEPING_LOCKS(gp);
1770 1774 CHECK_ACTIVE_LOCKS(gp);
1771 1775
1772 1776 }
1773 1777
1774 1778 /*
1775 1779  * Uncoloring the graph simply increments the graph's mark value; only when
1776 1780  * the mark wraps around do we explicitly reset the color of every vertex
1777 1781  * in the graph.
1778 1782 */
1779 1783
1780 1784 static void
1781 1785 flk_graph_uncolor(graph_t *gp)
1782 1786 {
1783 1787 lock_descriptor_t *lock;
1784 1788
1785 1789 if (gp->mark == UINT_MAX) {
1786 1790 gp->mark = 1;
1787 1791 for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
1788 1792 lock = lock->l_next)
1789 1793 lock->l_color = 0;
1790 1794
1791 1795 for (lock = SLEEPING_HEAD(gp)->l_next; lock != SLEEPING_HEAD(gp);
1792 1796 lock = lock->l_next)
1793 1797 lock->l_color = 0;
1794 1798 } else {
1795 1799 gp->mark++;
1796 1800 }
1797 1801 }
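
The trick above is generic. A minimal user-level sketch of the same idea, with invented names rather than the kernel's COLOR()/COLORED() macros, shows why uncoloring is O(1) except on wrap-around:

/* Illustrative only; not part of flock.c. */
#include <limits.h>
#include <stdio.h>

struct node { unsigned color; };
struct graph { unsigned mark; struct node nodes[4]; };

/* A node is "colored" iff its color equals the graph's current mark. */
static int
colored(struct graph *g, struct node *n)
{
	return (n->color == g->mark);
}

static void
color(struct graph *g, struct node *n)
{
	n->color = g->mark;
}

/* Uncoloring everything costs O(1), except when the mark wraps around. */
static void
uncolor_all(struct graph *g)
{
	int i;

	if (g->mark == UINT_MAX) {
		for (i = 0; i < 4; i++)
			g->nodes[i].color = 0;
		g->mark = 1;
	} else {
		g->mark++;
	}
}

int
main(void)
{
	struct graph g = { .mark = 1 };

	color(&g, &g.nodes[0]);
	uncolor_all(&g);
	(void) printf("%d\n", colored(&g, &g.nodes[0]));	/* prints 0 */
	return (0);
}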
1798 1802
1799 1803 /*
1800 1804 * Wake up locks that are blocked on the given lock.
1801 1805 */
1802 1806
1803 1807 static void
1804 1808 flk_wakeup(lock_descriptor_t *lock, int adj_list_remove)
1805 1809 {
1806 1810 edge_t *ep;
1807 1811 graph_t *gp = lock->l_graph;
1808 1812 lock_descriptor_t *lck;
1809 1813
1810 1814 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1811 1815 if (NO_DEPENDENTS(lock))
1812 1816 return;
1813 1817 ep = FIRST_IN(lock);
1814 1818 do {
1815 1819 /*
1816 1820 		 * Delete the edge from the adjacency list of the
1817 1821 		 * 'from' vertex. If that vertex has no more adjacent
1818 1822 		 * edges, wake its process.
1819 1823 */
1820 1824 lck = ep->from_vertex;
1821 1825 if (adj_list_remove)
1822 1826 ADJ_LIST_REMOVE(ep);
1823 1827 flk_update_proc_graph(ep, 1);
1824 1828 if (NOT_BLOCKED(lck)) {
1825 1829 GRANT_WAKEUP(lck);
1826 1830 }
1827 1831 lock->l_sedge = NEXT_IN(ep);
1828 1832 IN_LIST_REMOVE(ep);
1829 1833 flk_free_edge(ep);
1830 1834 ep = lock->l_sedge;
1831 1835 } while (ep != HEAD(lock));
1832 1836 ASSERT(NO_DEPENDENTS(lock));
1833 1837 }
1834 1838
1835 1839 /*
1836 1840  * Each dependent of request is checked for its dependency against the
1837 1841  * locks in topology (called topology because the array is, and should be,
1838 1842  * in topological order for this algorithm; if it is not in topological
1839 1843  * order the inner loop below might add more edges than necessary.
1840 1844  * Topological ordering of vertices satisfies the property that all edges
1841 1845  * go from left to right, i.e., topology[i] can have an edge to topology[j]
1842 1846  * iff i < j). If lock l1 in the dependent set of request is dependent on
1843 1847  * (blocked by) lock l2 in topology but does not already have a path to it,
1844 1848  * we add an edge in the inner loop below.
1845 1849  *
1846 1850  * We don't want to add an edge between l1 and l2 if there already exists
1847 1851  * a path from l1 to l2, so care has to be taken for those vertices
1848 1852  * that have two paths to 'request'. These vertices are referred to here
1849 1853  * as barrier locks.
1850 1854  *
1851 1855  * The barriers have to be found (those vertices that originally had two
1852 1856  * paths to request) because otherwise we may end up adding edges
1853 1857  * unnecessarily to vertices in topology; a barrier vertex could otherwise
1854 1858  * end up with both an edge to a vertex in topology and a path to it.
1855 1859 */
1856 1860
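/*
 * A small worked example of the barrier count (illustrative only; an
 * arrow means "is blocked by"):
 *
 *	l3 ---> l1 ---> request
 *	  \               ^
 *	   \              |
 *	    +----> l2 ----+
 *
 * l3 is blocked by l1 and by l2, both of which are blocked by request,
 * so l3 has two paths to request. flk_find_barriers() therefore marks l3
 * as a barrier with l_index = 1 (number of paths minus one). The walk in
 * flk_recompute_dependencies() decrements that count each time it reaches
 * l3 and recomputes l3's edges only after the count drops to zero, i.e.,
 * once both paths have been explored.
 */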
1857 1861 static void
1858 1862 flk_recompute_dependencies(lock_descriptor_t *request,
1859 1863 lock_descriptor_t **topology,
1860 1864 int nvertex, int update_graph)
1861 1865 {
1862 1866 lock_descriptor_t *vertex, *lock;
1863 1867 graph_t *gp = request->l_graph;
1864 1868 int i, count;
1865 1869 int barrier_found = 0;
1866 1870 edge_t *ep;
1867 1871 lock_descriptor_t *vertex_stack;
1868 1872
1869 1873 STACK_INIT(vertex_stack);
1870 1874
1871 1875 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1872 1876 if (nvertex == 0)
1873 1877 return;
1874 1878 flk_graph_uncolor(request->l_graph);
1875 1879 barrier_found = flk_find_barriers(request);
1876 1880 request->l_state |= RECOMPUTE_DONE;
1877 1881
1878 1882 STACK_PUSH(vertex_stack, request, l_stack);
1879 1883 request->l_sedge = FIRST_IN(request);
1880 1884
1881 1885
1882 1886 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1883 1887 if (vertex->l_state & RECOMPUTE_DONE) {
1884 1888 count = 0;
1885 1889 goto next_in_edge;
1886 1890 }
1887 1891 if (IS_BARRIER(vertex)) {
1888 1892 /* decrement the barrier count */
1889 1893 if (vertex->l_index) {
1890 1894 vertex->l_index--;
1891 1895 /* this guy will be pushed again anyway ? */
1892 1896 STACK_POP(vertex_stack, l_stack);
1893 1897 if (vertex->l_index == 0) {
1894 1898 /*
1895 1899 * barrier is over we can recompute
1896 1900 * dependencies for this lock in the
1897 1901 * next stack pop
1898 1902 */
1899 1903 vertex->l_state &= ~BARRIER_LOCK;
1900 1904 }
1901 1905 continue;
1902 1906 }
1903 1907 }
1904 1908 vertex->l_state |= RECOMPUTE_DONE;
1905 1909 flk_graph_uncolor(gp);
1906 1910 count = flk_color_reachables(vertex);
1907 1911 for (i = 0; i < nvertex; i++) {
1908 1912 lock = topology[i];
1909 1913 if (COLORED(lock))
1910 1914 continue;
1911 1915 if (BLOCKS(lock, vertex)) {
1912 1916 (void) flk_add_edge(vertex, lock,
1913 1917 NO_CHECK_CYCLE, update_graph);
1914 1918 COLOR(lock);
1915 1919 count++;
1916 1920 count += flk_color_reachables(lock);
1917 1921 }
1918 1922
1919 1923 }
1920 1924
1921 1925 next_in_edge:
1922 1926 if (count == nvertex ||
1923 1927 vertex->l_sedge == HEAD(vertex)) {
1924 1928 /* prune the tree below this */
1925 1929 STACK_POP(vertex_stack, l_stack);
1926 1930 vertex->l_state &= ~RECOMPUTE_DONE;
1927 1931 /* update the barrier locks below this! */
1928 1932 if (vertex->l_sedge != HEAD(vertex) && barrier_found) {
1929 1933 flk_graph_uncolor(gp);
1930 1934 flk_update_barriers(vertex);
1931 1935 }
1932 1936 continue;
1933 1937 }
1934 1938
1935 1939 ep = vertex->l_sedge;
1936 1940 lock = ep->from_vertex;
1937 1941 STACK_PUSH(vertex_stack, lock, l_stack);
1938 1942 lock->l_sedge = FIRST_IN(lock);
1939 1943 vertex->l_sedge = NEXT_IN(ep);
1940 1944 }
1941 1945
1942 1946 }
1943 1947
1944 1948 /*
1945 1949  * Color all vertices reachable from 'vertex', counting those that belong to
1946 1950  * topology (i.e., have RECOMPUTE_LOCK set in their state) and are not yet colored.
1947 1951 *
1948 1952 * Note: we need to use a different stack_link l_stack1 because this is
1949 1953 * called from flk_recompute_dependencies() that already uses a stack with
1950 1954 * l_stack as stack_link.
1951 1955 */
1952 1956
1953 1957 static int
1954 1958 flk_color_reachables(lock_descriptor_t *vertex)
1955 1959 {
1956 1960 lock_descriptor_t *ver, *lock;
1957 1961 int count;
1958 1962 edge_t *ep;
1959 1963 lock_descriptor_t *vertex_stack;
1960 1964
1961 1965 STACK_INIT(vertex_stack);
1962 1966
1963 1967 STACK_PUSH(vertex_stack, vertex, l_stack1);
1964 1968 count = 0;
1965 1969 while ((ver = STACK_TOP(vertex_stack)) != NULL) {
1966 1970
1967 1971 STACK_POP(vertex_stack, l_stack1);
1968 1972 for (ep = FIRST_ADJ(ver); ep != HEAD(ver);
1969 1973 ep = NEXT_ADJ(ep)) {
1970 1974 lock = ep->to_vertex;
1971 1975 if (COLORED(lock))
1972 1976 continue;
1973 1977 COLOR(lock);
1974 1978 if (IS_RECOMPUTE(lock))
1975 1979 count++;
1976 1980 STACK_PUSH(vertex_stack, lock, l_stack1);
1977 1981 }
1978 1982
1979 1983 }
1980 1984 return (count);
1981 1985 }
1982 1986
1983 1987 /*
1984 1988 * Called from flk_recompute_dependencies() this routine decrements
1985 1989 * the barrier count of barrier vertices that are reachable from lock.
1986 1990 */
1987 1991
1988 1992 static void
1989 1993 flk_update_barriers(lock_descriptor_t *lock)
1990 1994 {
1991 1995 lock_descriptor_t *vertex, *lck;
1992 1996 edge_t *ep;
1993 1997 lock_descriptor_t *vertex_stack;
1994 1998
1995 1999 STACK_INIT(vertex_stack);
1996 2000
1997 2001 STACK_PUSH(vertex_stack, lock, l_stack1);
1998 2002
1999 2003 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2000 2004 STACK_POP(vertex_stack, l_stack1);
2001 2005 for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2002 2006 ep = NEXT_IN(ep)) {
2003 2007 lck = ep->from_vertex;
2004 2008 if (COLORED(lck)) {
2005 2009 if (IS_BARRIER(lck)) {
2006 2010 ASSERT(lck->l_index > 0);
2007 2011 lck->l_index--;
2008 2012 if (lck->l_index == 0)
2009 2013 lck->l_state &= ~BARRIER_LOCK;
2010 2014 }
2011 2015 continue;
2012 2016 }
2013 2017 COLOR(lck);
2014 2018 if (IS_BARRIER(lck)) {
2015 2019 ASSERT(lck->l_index > 0);
2016 2020 lck->l_index--;
2017 2021 if (lck->l_index == 0)
2018 2022 lck->l_state &= ~BARRIER_LOCK;
2019 2023 }
2020 2024 STACK_PUSH(vertex_stack, lck, l_stack1);
2021 2025 }
2022 2026 }
2023 2027 }
2024 2028
2025 2029 /*
2026 2030  * Find all vertices that are reachable from 'lock' via more than one path,
2027 2031  * mark them as barrier vertices, and increment their barrier count.
2028 2032  * The barrier count is one less than the total number of paths from lock
2029 2033 * to that vertex.
2030 2034 */
2031 2035
2032 2036 static int
2033 2037 flk_find_barriers(lock_descriptor_t *lock)
2034 2038 {
2035 2039 lock_descriptor_t *vertex, *lck;
2036 2040 int found = 0;
2037 2041 edge_t *ep;
2038 2042 lock_descriptor_t *vertex_stack;
2039 2043
2040 2044 STACK_INIT(vertex_stack);
2041 2045
2042 2046 STACK_PUSH(vertex_stack, lock, l_stack1);
2043 2047
2044 2048 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2045 2049 STACK_POP(vertex_stack, l_stack1);
2046 2050 for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2047 2051 ep = NEXT_IN(ep)) {
2048 2052 lck = ep->from_vertex;
2049 2053 if (COLORED(lck)) {
2050 2054 /* this is a barrier */
2051 2055 lck->l_state |= BARRIER_LOCK;
2052 2056 /* index will have barrier count */
2053 2057 lck->l_index++;
2054 2058 if (!found)
2055 2059 found = 1;
2056 2060 continue;
2057 2061 }
2058 2062 COLOR(lck);
2059 2063 lck->l_index = 0;
2060 2064 STACK_PUSH(vertex_stack, lck, l_stack1);
2061 2065 }
2062 2066 }
2063 2067 return (found);
2064 2068 }
2065 2069
2066 2070 /*
2067 2071 * Finds the first lock that is mainly responsible for blocking this
2068 2072 * request. If there is no such lock, request->l_flock.l_type is set to
2069 2073 * F_UNLCK. Otherwise, request->l_flock is filled in with the particulars
2070 2074 * of the blocking lock.
2071 2075 *
2072 2076 * Note: It is possible a request is blocked by a sleeping lock because
2073 2077 * of the fairness policy used in flk_process_request() to construct the
2074 2078 * dependencies. (see comments before flk_process_request()).
2075 2079 */
2076 2080
2077 2081 static void
2078 2082 flk_get_first_blocking_lock(lock_descriptor_t *request)
2079 2083 {
2080 2084 graph_t *gp = request->l_graph;
2081 2085 vnode_t *vp = request->l_vnode;
2082 2086 lock_descriptor_t *lock, *blocker;
2083 2087
2084 2088 ASSERT(MUTEX_HELD(&gp->gp_mutex));
2085 2089 blocker = NULL;
2086 2090 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2087 2091
2088 2092 if (lock) {
2089 2093 do {
2090 2094 if (BLOCKS(lock, request)) {
2091 2095 blocker = lock;
2092 2096 break;
2093 2097 }
2094 2098 lock = lock->l_next;
2095 2099 } while (lock->l_vnode == vp);
2096 2100 }
2097 2101
2098 2102 if (blocker) {
2099 2103 report_blocker(blocker, request);
2100 2104 } else
2101 2105 request->l_flock.l_type = F_UNLCK;
2102 2106 }
2103 2107
2104 2108 /*
2105 2109 * Get the graph_t structure associated with a vnode.
2106 2110 * If 'initialize' is non-zero, and the graph_t structure for this vnode has
2107 2111 * not yet been initialized, then a new element is allocated and returned.
2108 2112 */
2109 2113 graph_t *
2110 2114 flk_get_lock_graph(vnode_t *vp, int initialize)
2111 2115 {
2112 2116 graph_t *gp;
2113 2117 graph_t *gp_alloc = NULL;
2114 2118 int index = HASH_INDEX(vp);
2115 2119
2116 2120 if (initialize == FLK_USE_GRAPH) {
2117 2121 mutex_enter(&flock_lock);
2118 2122 gp = lock_graph[index];
2119 2123 mutex_exit(&flock_lock);
2120 2124 return (gp);
2121 2125 }
2122 2126
2123 2127 ASSERT(initialize == FLK_INIT_GRAPH);
2124 2128
2125 2129 if (lock_graph[index] == NULL) {
2126 2130
2127 2131 gp_alloc = kmem_zalloc(sizeof (graph_t), KM_SLEEP);
2128 2132
2129 2133 /* Initialize the graph */
2130 2134
2131 2135 gp_alloc->active_locks.l_next =
2132 2136 gp_alloc->active_locks.l_prev =
2133 2137 (lock_descriptor_t *)ACTIVE_HEAD(gp_alloc);
2134 2138 gp_alloc->sleeping_locks.l_next =
2135 2139 gp_alloc->sleeping_locks.l_prev =
2136 2140 (lock_descriptor_t *)SLEEPING_HEAD(gp_alloc);
2137 2141 gp_alloc->index = index;
2138 2142 mutex_init(&gp_alloc->gp_mutex, NULL, MUTEX_DEFAULT, NULL);
2139 2143 }
2140 2144
2141 2145 mutex_enter(&flock_lock);
2142 2146
2143 2147 gp = lock_graph[index];
2144 2148
2145 2149 /* Recheck the value within flock_lock */
2146 2150 if (gp == NULL) {
2147 2151 struct flock_globals *fg;
2148 2152
2149 2153 /* We must have previously allocated the graph_t structure */
2150 2154 ASSERT(gp_alloc != NULL);
2151 2155 lock_graph[index] = gp = gp_alloc;
2152 2156 /*
2153 2157 * The lockmgr status is only needed if KLM is loaded.
2154 2158 */
2155 2159 if (flock_zone_key != ZONE_KEY_UNINITIALIZED) {
2156 2160 fg = flk_get_globals();
2157 2161 fg->lockmgr_status[index] = fg->flk_lockmgr_status;
2158 2162 }
2159 2163 }
2160 2164
2161 2165 mutex_exit(&flock_lock);
2162 2166
2163 2167 if ((gp_alloc != NULL) && (gp != gp_alloc)) {
2164 2168 /* There was a race to allocate the graph_t and we lost */
2165 2169 mutex_destroy(&gp_alloc->gp_mutex);
2166 2170 kmem_free(gp_alloc, sizeof (graph_t));
2167 2171 }
2168 2172
2169 2173 return (gp);
2170 2174 }
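
The allocation dance above follows a common pattern: allocate optimistically before taking the lock, recheck under the lock, and discard the allocation if another thread won the race. A minimal user-level sketch of the same shape (pthreads and invented names, purely illustrative):

/* Illustrative only; not part of flock.c. */
#include <pthread.h>
#include <stdlib.h>

struct graph {
	int index;
};

static pthread_mutex_t tbl_lock = PTHREAD_MUTEX_INITIALIZER;
static struct graph *table[64];

static struct graph *
get_graph(int index)
{
	struct graph *gp, *gp_alloc = NULL;

	if (table[index] == NULL)
		gp_alloc = calloc(1, sizeof (*gp_alloc));

	(void) pthread_mutex_lock(&tbl_lock);
	gp = table[index];
	if (gp == NULL && gp_alloc != NULL) {
		gp_alloc->index = index;
		table[index] = gp = gp_alloc;
		gp_alloc = NULL;		/* ownership transferred */
	}
	(void) pthread_mutex_unlock(&tbl_lock);

	free(gp_alloc);				/* NULL, or the losing copy */
	return (gp);
}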
2171 2175
2172 2176 /*
2173 2177 * PSARC case 1997/292
2174 2178 */
2175 2179 int
2176 2180 cl_flk_has_remote_locks_for_nlmid(vnode_t *vp, int nlmid)
2177 2181 {
2178 2182 lock_descriptor_t *lock;
2179 2183 int result = 0;
2180 2184 graph_t *gp;
2181 2185 int lock_nlmid;
2182 2186
2183 2187 /*
2184 2188 * Check to see if node is booted as a cluster. If not, return.
2185 2189 */
2186 2190 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2187 2191 return (0);
2188 2192 }
2189 2193
2190 2194 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2191 2195 if (gp == NULL) {
2192 2196 return (0);
2193 2197 }
2194 2198
2195 2199 mutex_enter(&gp->gp_mutex);
2196 2200
2197 2201 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2198 2202
2199 2203 if (lock) {
2200 2204 while (lock->l_vnode == vp) {
2201 2205 /* get NLM id from sysid */
2202 2206 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2203 2207
2204 2208 /*
2205 2209 * If NLM server request _and_ nlmid of lock matches
2206 2210 * nlmid of argument, then we've found a remote lock.
2207 2211 */
2208 2212 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2209 2213 result = 1;
2210 2214 goto done;
2211 2215 }
2212 2216 lock = lock->l_next;
2213 2217 }
2214 2218 }
2215 2219
2216 2220 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2217 2221
2218 2222 if (lock) {
2219 2223 while (lock->l_vnode == vp) {
2220 2224 /* get NLM id from sysid */
2221 2225 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2222 2226
2223 2227 /*
2224 2228 * If NLM server request _and_ nlmid of lock matches
2225 2229 * nlmid of argument, then we've found a remote lock.
2226 2230 */
2227 2231 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2228 2232 result = 1;
2229 2233 goto done;
2230 2234 }
2231 2235 lock = lock->l_next;
2232 2236 }
2233 2237 }
2234 2238
2235 2239 done:
2236 2240 mutex_exit(&gp->gp_mutex);
2237 2241 return (result);
2238 2242 }
2239 2243
2240 2244 /*
2241 2245 * Determine whether there are any locks for the given vnode with a remote
2242 2246 * sysid. Returns zero if not, non-zero if there are.
2243 2247 *
2244 2248 * Note that the return value from this function is potentially invalid
2245 2249 * once it has been returned. The caller is responsible for providing its
2246 2250 * own synchronization mechanism to ensure that the return value is useful
2247 2251 * (e.g., see nfs_lockcompletion()).
2248 2252 */
2249 2253 int
2250 2254 flk_has_remote_locks(vnode_t *vp)
2251 2255 {
2252 2256 lock_descriptor_t *lock;
2253 2257 int result = 0;
2254 2258 graph_t *gp;
2255 2259
2256 2260 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2257 2261 if (gp == NULL) {
2258 2262 return (0);
2259 2263 }
2260 2264
2261 2265 mutex_enter(&gp->gp_mutex);
2262 2266
2263 2267 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2264 2268
2265 2269 if (lock) {
2266 2270 while (lock->l_vnode == vp) {
2267 2271 if (IS_REMOTE(lock)) {
2268 2272 result = 1;
2269 2273 goto done;
2270 2274 }
2271 2275 lock = lock->l_next;
2272 2276 }
2273 2277 }
2274 2278
2275 2279 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2276 2280
2277 2281 if (lock) {
2278 2282 while (lock->l_vnode == vp) {
2279 2283 if (IS_REMOTE(lock)) {
2280 2284 result = 1;
2281 2285 goto done;
2286 + }
2287 + lock = lock->l_next;
2288 + }
2289 + }
2290 +
2291 +done:
2292 + mutex_exit(&gp->gp_mutex);
2293 + return (result);
2294 +}
2295 +
2296 +/*
2297 + * Determine whether there are any locks for the given vnode with a remote
2298 + * sysid matching given sysid.
2299 + * Used by the new (open source) NFS Lock Manager (NLM)
2300 + */
2301 +int
2302 +flk_has_remote_locks_for_sysid(vnode_t *vp, int sysid)
2303 +{
2304 + lock_descriptor_t *lock;
2305 + int result = 0;
2306 + graph_t *gp;
2307 +
2308 + if (sysid == 0)
2309 + return (0);
2310 +
2311 + gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2312 + if (gp == NULL) {
2313 + return (0);
2314 + }
2315 +
2316 + mutex_enter(&gp->gp_mutex);
2317 +
2318 + SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2319 +
2320 + if (lock) {
2321 + while (lock->l_vnode == vp) {
2322 + if (lock->l_flock.l_sysid == sysid) {
2323 + result = 1;
2324 + goto done;
2325 + }
2326 + lock = lock->l_next;
2327 + }
2328 + }
2329 +
2330 + SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2331 +
2332 + if (lock) {
2333 + while (lock->l_vnode == vp) {
2334 + if (lock->l_flock.l_sysid == sysid) {
2335 + result = 1;
2336 + goto done;
2282 2337 }
2283 2338 lock = lock->l_next;
2284 2339 }
2285 2340 }
2286 2341
2287 2342 done:
2288 2343 mutex_exit(&gp->gp_mutex);
2289 2344 return (result);
2290 2345 }
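
A hypothetical caller, to show the intended use of the new interface: the NLM can ask whether a client, identified by its sysid, still owns or is waiting on locks for a vnode before tearing down per-host state. The wrapper name below is invented for illustration; the NLM side is not part of this change, and the prototype is assumed to be exported from <sys/flock.h> alongside flk_has_remote_locks().

/* Illustrative only; nlm_client_holds_locks() is an invented name. */
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/flock.h>

static boolean_t
nlm_client_holds_locks(vnode_t *vp, int sysid)
{
	/* Non-zero means an active or sleeping lock with this sysid exists. */
	return (flk_has_remote_locks_for_sysid(vp, sysid) != 0 ?
	    B_TRUE : B_FALSE);
}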
2291 2346
2292 2347 /*
2293 2348 * Determine if there are any locks owned by the given sysid.
2294 2349 * Returns zero if not, non-zero if there are. Note that this return code
2295 2350 * could be derived from flk_get_{sleeping,active}_locks, but this routine
2296 2351 * avoids all the memory allocations of those routines.
2297 2352 *
2298 2353 * This routine has the same synchronization issues as
2299 2354 * flk_has_remote_locks.
2300 2355 */
2301 2356
2302 2357 int
2303 2358 flk_sysid_has_locks(int sysid, int lck_type)
2304 2359 {
2305 2360 int has_locks = 0;
2306 2361 lock_descriptor_t *lock;
2307 2362 graph_t *gp;
2308 2363 int i;
2309 2364
2310 2365 for (i = 0; i < HASH_SIZE && !has_locks; i++) {
2311 2366 mutex_enter(&flock_lock);
2312 2367 gp = lock_graph[i];
2313 2368 mutex_exit(&flock_lock);
2314 2369 if (gp == NULL) {
2315 2370 continue;
2316 2371 }
2317 2372
2318 2373 mutex_enter(&gp->gp_mutex);
2319 2374
2320 2375 if (lck_type & FLK_QUERY_ACTIVE) {
2321 2376 for (lock = ACTIVE_HEAD(gp)->l_next;
2322 2377 lock != ACTIVE_HEAD(gp) && !has_locks;
2323 2378 lock = lock->l_next) {
2324 2379 if (lock->l_flock.l_sysid == sysid)
2325 2380 has_locks = 1;
2326 2381 }
2327 2382 }
2328 2383
2329 2384 if (lck_type & FLK_QUERY_SLEEPING) {
2330 2385 for (lock = SLEEPING_HEAD(gp)->l_next;
2331 2386 lock != SLEEPING_HEAD(gp) && !has_locks;
2332 2387 lock = lock->l_next) {
2333 2388 if (lock->l_flock.l_sysid == sysid)
2334 2389 has_locks = 1;
2335 2390 }
2336 2391 }
2337 2392 mutex_exit(&gp->gp_mutex);
2338 2393 }
2339 2394
2340 2395 return (has_locks);
2341 2396 }
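
For instance, a caller that only cares whether a client has any state at all, active or sleeping, passes both query flags (fragment; surrounding context assumed):

	if (flk_sysid_has_locks(sysid,
	    FLK_QUERY_ACTIVE | FLK_QUERY_SLEEPING)) {
		/* the client still owns or is waiting on locks */
	}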
2342 2397
2343 2398
2344 2399 /*
2345 2400 * PSARC case 1997/292
2346 2401 *
2347 2402  * Requires: "sysid" is a pair [nlmid, sysid]. The lower half is a 16-bit
2348 2403 * quantity, the real sysid generated by the NLM server; the upper half
2349 2404 * identifies the node of the cluster where the NLM server ran.
2350 2405 * This routine is only called by an NLM server running in a cluster.
2351 2406 * Effects: Remove all locks held on behalf of the client identified
2352 2407 * by "sysid."
2353 2408 */
2354 2409 void
2355 2410 cl_flk_remove_locks_by_sysid(int sysid)
2356 2411 {
2357 2412 graph_t *gp;
2358 2413 int i;
2359 2414 lock_descriptor_t *lock, *nlock;
2360 2415
2361 2416 /*
2362 2417 * Check to see if node is booted as a cluster. If not, return.
2363 2418 */
2364 2419 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2365 2420 return;
2366 2421 }
2367 2422
2368 2423 ASSERT(sysid != 0);
2369 2424 for (i = 0; i < HASH_SIZE; i++) {
2370 2425 mutex_enter(&flock_lock);
2371 2426 gp = lock_graph[i];
2372 2427 mutex_exit(&flock_lock);
2373 2428
2374 2429 if (gp == NULL)
2375 2430 continue;
2376 2431
2377 2432 mutex_enter(&gp->gp_mutex); /* get mutex on lock graph */
2378 2433
2379 2434 /* signal sleeping requests so that they bail out */
2380 2435 lock = SLEEPING_HEAD(gp)->l_next;
2381 2436 while (lock != SLEEPING_HEAD(gp)) {
2382 2437 nlock = lock->l_next;
2383 2438 if (lock->l_flock.l_sysid == sysid) {
2384 2439 INTERRUPT_WAKEUP(lock);
2385 2440 }
2386 2441 lock = nlock;
2387 2442 }
2388 2443
2389 2444 /* delete active locks */
2390 2445 lock = ACTIVE_HEAD(gp)->l_next;
2391 2446 while (lock != ACTIVE_HEAD(gp)) {
2392 2447 nlock = lock->l_next;
2393 2448 if (lock->l_flock.l_sysid == sysid) {
2394 2449 flk_delete_active_lock(lock, 0);
2395 2450 flk_wakeup(lock, 1);
2396 2451 flk_free_lock(lock);
2397 2452 }
2398 2453 lock = nlock;
2399 2454 }
2400 2455 mutex_exit(&gp->gp_mutex); /* release mutex on lock graph */
2401 2456 }
2402 2457 }
2403 2458
2404 2459 /*
2405 2460  * Delete all locks in the system that belong to the sysid of the request.
2406 2461 */
2407 2462
2408 2463 static void
2409 2464 flk_delete_locks_by_sysid(lock_descriptor_t *request)
2410 2465 {
2411 2466 int sysid = request->l_flock.l_sysid;
2412 2467 lock_descriptor_t *lock, *nlock;
2413 2468 graph_t *gp;
2414 2469 int i;
2415 2470
2416 2471 ASSERT(MUTEX_HELD(&request->l_graph->gp_mutex));
2417 2472 ASSERT(sysid != 0);
2418 2473
2419 2474 mutex_exit(&request->l_graph->gp_mutex);
2420 2475
2421 2476 for (i = 0; i < HASH_SIZE; i++) {
2422 2477 mutex_enter(&flock_lock);
2423 2478 gp = lock_graph[i];
2424 2479 mutex_exit(&flock_lock);
2425 2480
2426 2481 if (gp == NULL)
2427 2482 continue;
2428 2483
2429 2484 mutex_enter(&gp->gp_mutex);
2430 2485
2431 2486 /* signal sleeping requests so that they bail out */
2432 2487 lock = SLEEPING_HEAD(gp)->l_next;
2433 2488 while (lock != SLEEPING_HEAD(gp)) {
2434 2489 nlock = lock->l_next;
2435 2490 if (lock->l_flock.l_sysid == sysid) {
2436 2491 INTERRUPT_WAKEUP(lock);
2437 2492 }
2438 2493 lock = nlock;
2439 2494 }
2440 2495
2441 2496 /* delete active locks */
2442 2497 lock = ACTIVE_HEAD(gp)->l_next;
2443 2498 while (lock != ACTIVE_HEAD(gp)) {
2444 2499 nlock = lock->l_next;
2445 2500 if (lock->l_flock.l_sysid == sysid) {
2446 2501 flk_delete_active_lock(lock, 0);
2447 2502 flk_wakeup(lock, 1);
2448 2503 flk_free_lock(lock);
2449 2504 }
2450 2505 lock = nlock;
2451 2506 }
2452 2507 mutex_exit(&gp->gp_mutex);
2453 2508 }
2454 2509
2455 2510 mutex_enter(&request->l_graph->gp_mutex);
2456 2511 }
2457 2512
2458 2513 /*
2459 2514 * Clustering: Deletes PXFS locks
2460 2515 * Effects: Delete all locks on files in the given file system and with the
2461 2516 * given PXFS id.
2462 2517 */
2463 2518 void
2464 2519 cl_flk_delete_pxfs_locks(struct vfs *vfsp, int pxfsid)
2465 2520 {
2466 2521 lock_descriptor_t *lock, *nlock;
2467 2522 graph_t *gp;
2468 2523 int i;
2469 2524
2470 2525 for (i = 0; i < HASH_SIZE; i++) {
2471 2526 mutex_enter(&flock_lock);
2472 2527 gp = lock_graph[i];
2473 2528 mutex_exit(&flock_lock);
2474 2529
2475 2530 if (gp == NULL)
2476 2531 continue;
2477 2532
2478 2533 mutex_enter(&gp->gp_mutex);
2479 2534
2480 2535 /* signal sleeping requests so that they bail out */
2481 2536 lock = SLEEPING_HEAD(gp)->l_next;
2482 2537 while (lock != SLEEPING_HEAD(gp)) {
2483 2538 nlock = lock->l_next;
2484 2539 if (lock->l_vnode->v_vfsp == vfsp) {
2485 2540 ASSERT(IS_PXFS(lock));
2486 2541 if (GETPXFSID(lock->l_flock.l_sysid) ==
2487 2542 pxfsid) {
2488 2543 flk_set_state(lock,
2489 2544 FLK_CANCELLED_STATE);
2490 2545 flk_cancel_sleeping_lock(lock, 1);
2491 2546 }
2492 2547 }
2493 2548 lock = nlock;
2494 2549 }
2495 2550
2496 2551 /* delete active locks */
2497 2552 lock = ACTIVE_HEAD(gp)->l_next;
2498 2553 while (lock != ACTIVE_HEAD(gp)) {
2499 2554 nlock = lock->l_next;
2500 2555 if (lock->l_vnode->v_vfsp == vfsp) {
2501 2556 ASSERT(IS_PXFS(lock));
2502 2557 if (GETPXFSID(lock->l_flock.l_sysid) ==
2503 2558 pxfsid) {
2504 2559 flk_delete_active_lock(lock, 0);
2505 2560 flk_wakeup(lock, 1);
2506 2561 flk_free_lock(lock);
2507 2562 }
2508 2563 }
2509 2564 lock = nlock;
2510 2565 }
2511 2566 mutex_exit(&gp->gp_mutex);
2512 2567 }
2513 2568 }
2514 2569
2515 2570 /*
2516 2571 * Search for a sleeping lock manager lock which matches exactly this lock
2517 2572 * request; if one is found, fake a signal to cancel it.
2518 2573 *
2519 2574 * Return 1 if a matching lock was found, 0 otherwise.
2520 2575 */
2521 2576
2522 2577 static int
2523 2578 flk_canceled(lock_descriptor_t *request)
2524 2579 {
2525 2580 lock_descriptor_t *lock, *nlock;
2526 2581 graph_t *gp = request->l_graph;
2527 2582 vnode_t *vp = request->l_vnode;
2528 2583
2529 2584 ASSERT(MUTEX_HELD(&gp->gp_mutex));
2530 2585 ASSERT(IS_LOCKMGR(request));
2531 2586 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2532 2587
2533 2588 if (lock) {
2534 2589 while (lock->l_vnode == vp) {
2535 2590 nlock = lock->l_next;
2536 2591 if (SAME_OWNER(lock, request) &&
2537 2592 lock->l_start == request->l_start &&
2538 2593 lock->l_end == request->l_end) {
2539 2594 INTERRUPT_WAKEUP(lock);
2540 2595 return (1);
2541 2596 }
2542 2597 lock = nlock;
2543 2598 }
2544 2599 }
2545 2600 return (0);
2546 2601 }
2547 2602
2548 2603 /*
2549 2604 * Remove all the locks for the vnode belonging to the given pid and sysid.
2550 2605 */
2551 2606
2552 2607 void
2553 2608 cleanlocks(vnode_t *vp, pid_t pid, int sysid)
2554 2609 {
2555 2610 graph_t *gp;
2556 2611 lock_descriptor_t *lock, *nlock;
2557 2612 lock_descriptor_t *link_stack;
2558 2613
2559 2614 STACK_INIT(link_stack);
2560 2615
2561 2616 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2562 2617
2563 2618 if (gp == NULL)
2564 2619 return;
2565 2620 mutex_enter(&gp->gp_mutex);
2566 2621
2567 2622 CHECK_SLEEPING_LOCKS(gp);
2568 2623 CHECK_ACTIVE_LOCKS(gp);
2569 2624
2570 2625 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2571 2626
2572 2627 if (lock) {
2573 2628 do {
2574 2629 nlock = lock->l_next;
2575 2630 if ((lock->l_flock.l_pid == pid ||
2576 2631 pid == IGN_PID) &&
2577 2632 lock->l_flock.l_sysid == sysid) {
2578 2633 CANCEL_WAKEUP(lock);
2579 2634 }
2580 2635 lock = nlock;
2581 2636 } while (lock->l_vnode == vp);
2582 2637 }
2583 2638
2584 2639 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2585 2640
2586 2641 if (lock) {
2587 2642 do {
2588 2643 nlock = lock->l_next;
2589 2644 if ((lock->l_flock.l_pid == pid ||
2590 2645 pid == IGN_PID) &&
2591 2646 lock->l_flock.l_sysid == sysid) {
2592 2647 flk_delete_active_lock(lock, 0);
2593 2648 STACK_PUSH(link_stack, lock, l_stack);
2594 2649 }
2595 2650 lock = nlock;
2596 2651 } while (lock->l_vnode == vp);
2597 2652 }
2598 2653
2599 2654 while ((lock = STACK_TOP(link_stack)) != NULL) {
2600 2655 STACK_POP(link_stack, l_stack);
2601 2656 flk_wakeup(lock, 1);
2602 2657 flk_free_lock(lock);
2603 2658 }
2604 2659
2605 2660 CHECK_SLEEPING_LOCKS(gp);
2606 2661 CHECK_ACTIVE_LOCKS(gp);
2607 2662 CHECK_OWNER_LOCKS(gp, pid, sysid, vp);
2608 2663 mutex_exit(&gp->gp_mutex);
2609 2664 }
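
Illustrative call shapes, to make the IGN_PID convention concrete (sketched, not copied from any particular caller; client_sysid is a stand-in): a local close cleans up a single process's local locks, while a remote-client teardown passes IGN_PID to sweep every pid under that sysid.

	/* local file close: this process, local (sysid 0) locks only */
	cleanlocks(vp, curproc->p_pid, 0);

	/* remote client went away: every pid that used client_sysid */
	cleanlocks(vp, IGN_PID, client_sysid);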
2610 2665
2611 2666
2612 2667 /*
2613 2668 * Called from 'fs' read and write routines for files that have mandatory
2614 2669 * locking enabled.
2615 2670 */
2616 2671
2617 2672 int
2618 2673 chklock(
2619 2674 struct vnode *vp,
2620 2675 int iomode,
2621 2676 u_offset_t offset,
2622 2677 ssize_t len,
2623 2678 int fmode,
2624 2679 caller_context_t *ct)
2625 2680 {
2626 2681 register int i;
2627 2682 struct flock64 bf;
2628 2683 int error = 0;
2629 2684
2630 2685 bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK;
2631 2686 bf.l_whence = 0;
2632 2687 bf.l_start = offset;
2633 2688 bf.l_len = len;
2634 2689 if (ct == NULL) {
2635 2690 bf.l_pid = curproc->p_pid;
2636 2691 bf.l_sysid = 0;
2637 2692 } else {
2638 2693 bf.l_pid = ct->cc_pid;
2639 2694 bf.l_sysid = ct->cc_sysid;
2640 2695 }
2641 2696 i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK;
2642 2697 if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 ||
2643 2698 bf.l_type != F_UNLCK)
2644 2699 error = i ? i : EAGAIN;
2645 2700 return (error);
2646 2701 }
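
A sketch of the typical call site in a file system read path (modeled on how local file systems gate mandatory-locked I/O; node_mode stands in for the file's cached mode bits):

	if (MANDLOCK(vp, node_mode)) {
		error = chklock(vp, FREAD, uiop->uio_loffset,
		    uiop->uio_resid, uiop->uio_fmode, ct);
		if (error != 0)
			return (error);
	}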
2647 2702
2648 2703 /*
2649 2704 * convoff - converts the given data (start, whence) to the
2650 2705 * given whence.
2651 2706 */
2652 2707 int
2653 2708 convoff(vp, lckdat, whence, offset)
2654 2709 struct vnode *vp;
2655 2710 struct flock64 *lckdat;
2656 2711 int whence;
2657 2712 offset_t offset;
2658 2713 {
2659 2714 int error;
2660 2715 struct vattr vattr;
2661 2716
2662 2717 if ((lckdat->l_whence == 2) || (whence == 2)) {
2663 2718 vattr.va_mask = AT_SIZE;
2664 2719 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
2665 2720 return (error);
2666 2721 }
2667 2722
2668 2723 switch (lckdat->l_whence) {
2669 2724 case 1:
2670 2725 lckdat->l_start += offset;
2671 2726 break;
2672 2727 case 2:
2673 2728 lckdat->l_start += vattr.va_size;
2674 2729 /* FALLTHRU */
2675 2730 case 0:
2676 2731 break;
2677 2732 default:
2678 2733 return (EINVAL);
2679 2734 }
2680 2735
2681 2736 if (lckdat->l_start < 0)
2682 2737 return (EINVAL);
2683 2738
2684 2739 switch (whence) {
2685 2740 case 1:
2686 2741 lckdat->l_start -= offset;
2687 2742 break;
2688 2743 case 2:
2689 2744 lckdat->l_start -= vattr.va_size;
2690 2745 /* FALLTHRU */
2691 2746 case 0:
2692 2747 break;
2693 2748 default:
2694 2749 return (EINVAL);
2695 2750 }
2696 2751
2697 2752 lckdat->l_whence = (short)whence;
2698 2753 return (0);
2699 2754 }
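
For example, the fcntl() path normalizes a caller-supplied flock64 to absolute (whence 0) form before handing it to reclock(); a fragment of that shape, where fileoff is the file's current offset:

	/* rewrite bf.l_start/bf.l_whence so the lock range is absolute */
	if ((error = convoff(vp, &bf, 0, fileoff)) != 0)
		return (error);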
2700 2755
2701 2756
2702 2757 /* proc_graph function definitions */
2703 2758
2704 2759 /*
2705 2760  * Check for deadlock caused by the new 'lock'. If a deadlock is found, the
2706 2761  * edges of this lock are freed and 1 is returned.
2707 2762 */
2708 2763
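/*
 * Illustrative note: the walk below detects a cycle in the standard DFS
 * way. Each proc vertex is stamped when it is first reached (PROC_ARRIVE)
 * and again once all of its out edges have been explored (PROC_DEPART).
 * Reaching a vertex that has arrived but not yet departed means the edge
 * leads back into the current DFS path, i.e., a wait-for cycle, which is
 * reported as a deadlock.
 */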
2709 2764 static int
2710 2765 flk_check_deadlock(lock_descriptor_t *lock)
2711 2766 {
2712 2767 proc_vertex_t *start_vertex, *pvertex;
2713 2768 proc_vertex_t *dvertex;
2714 2769 proc_edge_t *pep, *ppep;
2715 2770 edge_t *ep, *nep;
2716 2771 proc_vertex_t *process_stack;
2717 2772
2718 2773 STACK_INIT(process_stack);
2719 2774
2720 2775 mutex_enter(&flock_lock);
2721 2776 start_vertex = flk_get_proc_vertex(lock);
2722 2777 ASSERT(start_vertex != NULL);
2723 2778
2724 2779 /* construct the edges from this process to other processes */
2725 2780
2726 2781 ep = FIRST_ADJ(lock);
2727 2782 while (ep != HEAD(lock)) {
2728 2783 proc_vertex_t *adj_proc;
2729 2784
2730 2785 adj_proc = flk_get_proc_vertex(ep->to_vertex);
2731 2786 for (pep = start_vertex->edge; pep != NULL; pep = pep->next) {
2732 2787 if (pep->to_proc == adj_proc) {
2733 2788 ASSERT(pep->refcount);
2734 2789 pep->refcount++;
2735 2790 break;
2736 2791 }
2737 2792 }
2738 2793 if (pep == NULL) {
2739 2794 pep = flk_get_proc_edge();
2740 2795 pep->to_proc = adj_proc;
2741 2796 pep->refcount = 1;
2742 2797 adj_proc->incount++;
2743 2798 pep->next = start_vertex->edge;
2744 2799 start_vertex->edge = pep;
2745 2800 }
2746 2801 ep = NEXT_ADJ(ep);
2747 2802 }
2748 2803
2749 2804 ep = FIRST_IN(lock);
2750 2805
2751 2806 while (ep != HEAD(lock)) {
2752 2807 proc_vertex_t *in_proc;
2753 2808
2754 2809 in_proc = flk_get_proc_vertex(ep->from_vertex);
2755 2810
2756 2811 for (pep = in_proc->edge; pep != NULL; pep = pep->next) {
2757 2812 if (pep->to_proc == start_vertex) {
2758 2813 ASSERT(pep->refcount);
2759 2814 pep->refcount++;
2760 2815 break;
2761 2816 }
2762 2817 }
2763 2818 if (pep == NULL) {
2764 2819 pep = flk_get_proc_edge();
2765 2820 pep->to_proc = start_vertex;
2766 2821 pep->refcount = 1;
2767 2822 start_vertex->incount++;
2768 2823 pep->next = in_proc->edge;
2769 2824 in_proc->edge = pep;
2770 2825 }
2771 2826 ep = NEXT_IN(ep);
2772 2827 }
2773 2828
2774 2829 if (start_vertex->incount == 0) {
2775 2830 mutex_exit(&flock_lock);
2776 2831 return (0);
2777 2832 }
2778 2833
2779 2834 flk_proc_graph_uncolor();
2780 2835
2781 2836 start_vertex->p_sedge = start_vertex->edge;
2782 2837
2783 2838 STACK_PUSH(process_stack, start_vertex, p_stack);
2784 2839
2785 2840 while ((pvertex = STACK_TOP(process_stack)) != NULL) {
2786 2841 for (pep = pvertex->p_sedge; pep != NULL; pep = pep->next) {
2787 2842 dvertex = pep->to_proc;
2788 2843 if (!PROC_ARRIVED(dvertex)) {
2789 2844 STACK_PUSH(process_stack, dvertex, p_stack);
2790 2845 dvertex->p_sedge = dvertex->edge;
2791 2846 PROC_ARRIVE(pvertex);
2792 2847 pvertex->p_sedge = pep->next;
2793 2848 break;
2794 2849 }
2795 2850 if (!PROC_DEPARTED(dvertex))
2796 2851 goto deadlock;
2797 2852 }
2798 2853 if (pep == NULL) {
2799 2854 PROC_DEPART(pvertex);
2800 2855 STACK_POP(process_stack, p_stack);
2801 2856 }
2802 2857 }
2803 2858 mutex_exit(&flock_lock);
2804 2859 return (0);
2805 2860
2806 2861 deadlock:
2807 2862
2808 2863 /* we remove all lock edges and proc edges */
2809 2864
2810 2865 ep = FIRST_ADJ(lock);
2811 2866 while (ep != HEAD(lock)) {
2812 2867 proc_vertex_t *adj_proc;
2813 2868 adj_proc = flk_get_proc_vertex(ep->to_vertex);
2814 2869 nep = NEXT_ADJ(ep);
2815 2870 IN_LIST_REMOVE(ep);
2816 2871 ADJ_LIST_REMOVE(ep);
2817 2872 flk_free_edge(ep);
2818 2873 ppep = start_vertex->edge;
2819 2874 for (pep = start_vertex->edge; pep != NULL; ppep = pep,
2820 2875 pep = ppep->next) {
2821 2876 if (pep->to_proc == adj_proc) {
2822 2877 pep->refcount--;
2823 2878 if (pep->refcount == 0) {
2824 2879 if (pep == ppep) {
2825 2880 start_vertex->edge = pep->next;
2826 2881 } else {
2827 2882 ppep->next = pep->next;
2828 2883 }
2829 2884 adj_proc->incount--;
2830 2885 flk_proc_release(adj_proc);
2831 2886 flk_free_proc_edge(pep);
2832 2887 }
2833 2888 break;
2834 2889 }
2835 2890 }
2836 2891 ep = nep;
2837 2892 }
2838 2893 ep = FIRST_IN(lock);
2839 2894 while (ep != HEAD(lock)) {
2840 2895 proc_vertex_t *in_proc;
2841 2896 in_proc = flk_get_proc_vertex(ep->from_vertex);
2842 2897 nep = NEXT_IN(ep);
2843 2898 IN_LIST_REMOVE(ep);
2844 2899 ADJ_LIST_REMOVE(ep);
2845 2900 flk_free_edge(ep);
2846 2901 ppep = in_proc->edge;
2847 2902 for (pep = in_proc->edge; pep != NULL; ppep = pep,
2848 2903 pep = ppep->next) {
2849 2904 if (pep->to_proc == start_vertex) {
2850 2905 pep->refcount--;
2851 2906 if (pep->refcount == 0) {
2852 2907 if (pep == ppep) {
2853 2908 in_proc->edge = pep->next;
2854 2909 } else {
2855 2910 ppep->next = pep->next;
2856 2911 }
2857 2912 start_vertex->incount--;
2858 2913 flk_proc_release(in_proc);
2859 2914 flk_free_proc_edge(pep);
2860 2915 }
2861 2916 break;
2862 2917 }
2863 2918 }
2864 2919 ep = nep;
2865 2920 }
2866 2921 flk_proc_release(start_vertex);
2867 2922 mutex_exit(&flock_lock);
2868 2923 return (1);
2869 2924 }
2870 2925
2871 2926 /*
2872 2927 * Get a proc vertex. If lock's pvertex value gets a correct proc vertex
2873 2928 * from the list we return that, otherwise we allocate one. If necessary,
2874 2929 * we grow the list of vertices also.
2875 2930 */
2876 2931
2877 2932 static proc_vertex_t *
2878 2933 flk_get_proc_vertex(lock_descriptor_t *lock)
2879 2934 {
2880 2935 int i;
2881 2936 proc_vertex_t *pv;
2882 2937 proc_vertex_t **palloc;
2883 2938
2884 2939 ASSERT(MUTEX_HELD(&flock_lock));
2885 2940 if (lock->pvertex != -1) {
2886 2941 ASSERT(lock->pvertex >= 0);
2887 2942 pv = pgraph.proc[lock->pvertex];
2888 2943 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
2889 2944 return (pv);
2890 2945 }
2891 2946 }
2892 2947 for (i = 0; i < pgraph.gcount; i++) {
2893 2948 pv = pgraph.proc[i];
2894 2949 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
2895 2950 lock->pvertex = pv->index = i;
2896 2951 return (pv);
2897 2952 }
2898 2953 }
2899 2954 pv = kmem_zalloc(sizeof (struct proc_vertex), KM_SLEEP);
2900 2955 pv->pid = lock->l_flock.l_pid;
2901 2956 pv->sysid = lock->l_flock.l_sysid;
2902 2957 flk_proc_vertex_allocs++;
2903 2958 if (pgraph.free != 0) {
2904 2959 for (i = 0; i < pgraph.gcount; i++) {
2905 2960 if (pgraph.proc[i] == NULL) {
2906 2961 pgraph.proc[i] = pv;
2907 2962 lock->pvertex = pv->index = i;
2908 2963 pgraph.free--;
2909 2964 return (pv);
2910 2965 }
2911 2966 }
2912 2967 }
2913 2968 palloc = kmem_zalloc((pgraph.gcount + PROC_CHUNK) *
2914 2969 sizeof (proc_vertex_t *), KM_SLEEP);
2915 2970
2916 2971 if (pgraph.proc) {
2917 2972 bcopy(pgraph.proc, palloc,
2918 2973 pgraph.gcount * sizeof (proc_vertex_t *));
2919 2974
2920 2975 kmem_free(pgraph.proc,
2921 2976 pgraph.gcount * sizeof (proc_vertex_t *));
2922 2977 }
2923 2978 pgraph.proc = palloc;
2924 2979 pgraph.free += (PROC_CHUNK - 1);
2925 2980 pv->index = lock->pvertex = pgraph.gcount;
2926 2981 pgraph.gcount += PROC_CHUNK;
2927 2982 pgraph.proc[pv->index] = pv;
2928 2983 return (pv);
2929 2984 }
2930 2985
2931 2986 /*
2932 2987 * Allocate a proc edge.
2933 2988 */
2934 2989
2935 2990 static proc_edge_t *
2936 2991 flk_get_proc_edge()
2937 2992 {
2938 2993 proc_edge_t *pep;
2939 2994
2940 2995 pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP);
2941 2996 flk_proc_edge_allocs++;
2942 2997 return (pep);
2943 2998 }
2944 2999
2945 3000 /*
2946 3001 * Free the proc edge. Called whenever its reference count goes to zero.
2947 3002 */
2948 3003
2949 3004 static void
2950 3005 flk_free_proc_edge(proc_edge_t *pep)
2951 3006 {
2952 3007 ASSERT(pep->refcount == 0);
2953 3008 kmem_free((void *)pep, sizeof (proc_edge_t));
2954 3009 flk_proc_edge_frees++;
2955 3010 }
2956 3011
2957 3012 /*
2958 3013  * Uncolor the proc graph; an explicit pass over all vertices is needed only when the mark value hits its maximum.
2959 3014 */
2960 3015
2961 3016 static void
2962 3017 flk_proc_graph_uncolor()
2963 3018 {
2964 3019 int i;
2965 3020
2966 3021 if (pgraph.mark == UINT_MAX) {
2967 3022 for (i = 0; i < pgraph.gcount; i++)
2968 3023 if (pgraph.proc[i] != NULL) {
2969 3024 pgraph.proc[i]->atime = 0;
2970 3025 pgraph.proc[i]->dtime = 0;
2971 3026 }
2972 3027 pgraph.mark = 1;
2973 3028 } else {
2974 3029 pgraph.mark++;
2975 3030 }
2976 3031 }
2977 3032
2978 3033 /*
2979 3034  * Release the proc vertex iff it has neither in edges nor out edges.
2980 3035 */
2981 3036
2982 3037 static void
2983 3038 flk_proc_release(proc_vertex_t *proc)
2984 3039 {
2985 3040 ASSERT(MUTEX_HELD(&flock_lock));
2986 3041 if (proc->edge == NULL && proc->incount == 0) {
2987 3042 pgraph.proc[proc->index] = NULL;
2988 3043 pgraph.free++;
2989 3044 kmem_free(proc, sizeof (proc_vertex_t));
2990 3045 flk_proc_vertex_frees++;
2991 3046 }
2992 3047 }
2993 3048
2994 3049 /*
2995 3050  * Update the process graph to reflect a change in a lock graph.
2996 3051  * Note: We should call this function only after we have a correctly
2997 3052  * recomputed lock graph; otherwise we might miss a deadlock detection.
2998 3053  * E.g., in flk_relation() we call this function after
2999 3054  * flk_recompute_dependencies(); otherwise, if a process tries to lock a
3000 3055  * vnode hashed into another graph, it might sleep forever.
3001 3056 */
3002 3057
3003 3058 static void
3004 3059 flk_update_proc_graph(edge_t *ep, int delete)
3005 3060 {
3006 3061 proc_vertex_t *toproc, *fromproc;
3007 3062 proc_edge_t *pep, *prevpep;
3008 3063
3009 3064 mutex_enter(&flock_lock);
3010 3065 toproc = flk_get_proc_vertex(ep->to_vertex);
3011 3066 fromproc = flk_get_proc_vertex(ep->from_vertex);
3012 3067
3013 3068 if (!delete)
3014 3069 goto add;
3015 3070 pep = prevpep = fromproc->edge;
3016 3071
3017 3072 ASSERT(pep != NULL);
3018 3073 while (pep != NULL) {
3019 3074 if (pep->to_proc == toproc) {
3020 3075 ASSERT(pep->refcount > 0);
3021 3076 pep->refcount--;
3022 3077 if (pep->refcount == 0) {
3023 3078 if (pep == prevpep) {
3024 3079 fromproc->edge = pep->next;
3025 3080 } else {
3026 3081 prevpep->next = pep->next;
3027 3082 }
3028 3083 toproc->incount--;
3029 3084 flk_proc_release(toproc);
3030 3085 flk_free_proc_edge(pep);
3031 3086 }
3032 3087 break;
3033 3088 }
3034 3089 prevpep = pep;
3035 3090 pep = pep->next;
3036 3091 }
3037 3092 flk_proc_release(fromproc);
3038 3093 mutex_exit(&flock_lock);
3039 3094 return;
3040 3095 add:
3041 3096
3042 3097 pep = fromproc->edge;
3043 3098
3044 3099 while (pep != NULL) {
3045 3100 if (pep->to_proc == toproc) {
3046 3101 ASSERT(pep->refcount > 0);
3047 3102 pep->refcount++;
3048 3103 break;
3049 3104 }
3050 3105 pep = pep->next;
3051 3106 }
3052 3107 if (pep == NULL) {
3053 3108 pep = flk_get_proc_edge();
3054 3109 pep->to_proc = toproc;
3055 3110 pep->refcount = 1;
3056 3111 toproc->incount++;
3057 3112 pep->next = fromproc->edge;
3058 3113 fromproc->edge = pep;
3059 3114 }
3060 3115 mutex_exit(&flock_lock);
3061 3116 }
3062 3117
3063 3118 /*
3064 3119 * Set the control status for lock manager requests.
3065 3120 *
3066 3121 */
3067 3122
3068 3123 /*
3069 3124 * PSARC case 1997/292
3070 3125 *
3071 3126 * Requires: "nlmid" must be >= 1 and <= clconf_maximum_nodeid().
3072 3127 * Effects: Set the state of the NLM server identified by "nlmid"
3073 3128 * in the NLM registry to state "nlm_state."
3074 3129 * Raises exception no_such_nlm if "nlmid" doesn't identify a known
3075 3130 * NLM server to this LLM.
3076 3131 * Note that when this routine is called with NLM_SHUTTING_DOWN there
3077 3132  * may be lock requests that have gotten started but not finished. In
3078 3133 * particular, there may be blocking requests that are in the callback code
3079 3134 * before sleeping (so they're not holding the lock for the graph). If
3080 3135 * such a thread reacquires the graph's lock (to go to sleep) after
3081 3136 * NLM state in the NLM registry is set to a non-up value,
3082 3137 * it will notice the status and bail out. If the request gets
3083 3138 * granted before the thread can check the NLM registry, let it
3084 3139 * continue normally. It will get flushed when we are called with NLM_DOWN.
3085 3140 *
3086 3141 * Modifies: nlm_reg_obj (global)
3087 3142 * Arguments:
3088 3143 * nlmid (IN): id uniquely identifying an NLM server
3089 3144 * nlm_state (IN): NLM server state to change "nlmid" to
3090 3145 */
3091 3146 void
3092 3147 cl_flk_set_nlm_status(int nlmid, flk_nlm_status_t nlm_state)
3093 3148 {
3094 3149 /*
3095 3150 * Check to see if node is booted as a cluster. If not, return.
3096 3151 */
3097 3152 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3098 3153 return;
3099 3154 }
3100 3155
3101 3156 /*
3102 3157 * Check for development/debugging. It is possible to boot a node
3103 3158 * in non-cluster mode, and then run a special script, currently
3104 3159 * available only to developers, to bring up the node as part of a
3105 3160 * cluster. The problem is that running such a script does not
3106 3161 * result in the routine flk_init() being called and hence global array
3107 3162 * nlm_reg_status is NULL. The NLM thinks it's in cluster mode,
3108 3163 * but the LLM needs to do an additional check to see if the global
3109 3164 * array has been created or not. If nlm_reg_status is NULL, then
3110 3165 * return, else continue.
3111 3166 */
3112 3167 if (nlm_reg_status == NULL) {
3113 3168 return;
3114 3169 }
3115 3170
3116 3171 ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3117 3172 mutex_enter(&nlm_reg_lock);
3118 3173
3119 3174 if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status, nlmid)) {
3120 3175 /*
3121 3176 * If the NLM server "nlmid" is unknown in the NLM registry,
3122 3177 * add it to the registry in the nlm shutting down state.
3123 3178 */
3124 3179 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3125 3180 FLK_NLM_SHUTTING_DOWN);
3126 3181 } else {
3127 3182 /*
3128 3183 * Change the state of the NLM server identified by "nlmid"
3129 3184 * in the NLM registry to the argument "nlm_state."
3130 3185 */
3131 3186 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3132 3187 nlm_state);
3133 3188 }
3134 3189
3135 3190 /*
3136 3191 * The reason we must register the NLM server that is shutting down
3137 3192 * with an LLM that doesn't already know about it (never sent a lock
3138 3193 * request) is to handle correctly a race between shutdown and a new
3139 3194 * lock request. Suppose that a shutdown request from the NLM server
3140 3195 * invokes this routine at the LLM, and a thread is spawned to
3141 3196 * service the request. Now suppose a new lock request is in
3142 3197 * progress and has already passed the first line of defense in
3143 3198  * reclock(), which denies new lock requests from NLM servers
3144 3199 * that are not in the NLM_UP state. After the current routine
3145 3200 * is invoked for both phases of shutdown, the routine will return,
3146 3201 * having done nothing, and the lock request will proceed and
3147 3202 * probably be granted. The problem is that the shutdown was ignored
3148 3203 * by the lock request because there was no record of that NLM server
3149 3204 * shutting down. We will be in the peculiar position of thinking
3150 3205 * that we've shutdown the NLM server and all locks at all LLMs have
3151 3206 * been discarded, but in fact there's still one lock held.
3152 3207 * The solution is to record the existence of NLM server and change
3153 3208 * its state immediately to NLM_SHUTTING_DOWN. The lock request in
3154 3209 * progress may proceed because the next phase NLM_DOWN will catch
3155 3210 * this lock and discard it.
3156 3211 */
3157 3212 mutex_exit(&nlm_reg_lock);
3158 3213
3159 3214 switch (nlm_state) {
3160 3215 case FLK_NLM_UP:
3161 3216 /*
3162 3217 * Change the NLM state of all locks still held on behalf of
3163 3218 * the NLM server identified by "nlmid" to NLM_UP.
3164 3219 */
3165 3220 cl_flk_change_nlm_state_all_locks(nlmid, FLK_NLM_UP);
3166 3221 break;
3167 3222
3168 3223 case FLK_NLM_SHUTTING_DOWN:
3169 3224 /*
3170 3225 * Wake up all sleeping locks for the NLM server identified
3171 3226 * by "nlmid." Note that eventually all woken threads will
3172 3227 * have their lock requests cancelled and descriptors
3173 3228 * removed from the sleeping lock list. Note that the NLM
3174 3229 * server state associated with each lock descriptor is
3175 3230 * changed to FLK_NLM_SHUTTING_DOWN.
3176 3231 */
3177 3232 cl_flk_wakeup_sleeping_nlm_locks(nlmid);
3178 3233 break;
3179 3234
3180 3235 case FLK_NLM_DOWN:
3181 3236 /*
3182 3237 * Discard all active, granted locks for this NLM server
3183 3238 * identified by "nlmid."
3184 3239 */
3185 3240 cl_flk_unlock_nlm_granted(nlmid);
3186 3241 break;
3187 3242
3188 3243 default:
3189 3244 panic("cl_set_nlm_status: bad status (%d)", nlm_state);
3190 3245 }
3191 3246 }
3192 3247
3193 3248 /*
3194 3249 * Set the control status for lock manager requests.
3195 3250 *
3196 3251 * Note that when this routine is called with FLK_WAKEUP_SLEEPERS, there
3197 3252  * may be lock requests that have gotten started but not finished. In
3198 3253 * particular, there may be blocking requests that are in the callback code
3199 3254 * before sleeping (so they're not holding the lock for the graph). If
3200 3255 * such a thread reacquires the graph's lock (to go to sleep) after
3201 3256 * flk_lockmgr_status is set to a non-up value, it will notice the status
3202 3257 * and bail out. If the request gets granted before the thread can check
3203 3258 * flk_lockmgr_status, let it continue normally. It will get flushed when
3204 3259 * we are called with FLK_LOCKMGR_DOWN.
3205 3260 */
3206 3261
3207 3262 void
3208 3263 flk_set_lockmgr_status(flk_lockmgr_status_t status)
3209 3264 {
3210 3265 int i;
3211 3266 graph_t *gp;
3212 3267 struct flock_globals *fg;
3213 3268
3214 3269 fg = flk_get_globals();
3215 3270 ASSERT(fg != NULL);
3216 3271
3217 3272 mutex_enter(&flock_lock);
3218 3273 fg->flk_lockmgr_status = status;
3219 3274 mutex_exit(&flock_lock);
3220 3275
3221 3276 /*
3222 3277 * If the lock manager is coming back up, all that's needed is to
3223 3278 * propagate this information to the graphs. If the lock manager
3224 3279 * is going down, additional action is required, and each graph's
3225 3280 * copy of the state is updated atomically with this other action.
3226 3281 */
3227 3282 switch (status) {
3228 3283 case FLK_LOCKMGR_UP:
3229 3284 for (i = 0; i < HASH_SIZE; i++) {
3230 3285 mutex_enter(&flock_lock);
3231 3286 gp = lock_graph[i];
3232 3287 mutex_exit(&flock_lock);
3233 3288 if (gp == NULL)
3234 3289 continue;
3235 3290 mutex_enter(&gp->gp_mutex);
3236 3291 fg->lockmgr_status[i] = status;
3237 3292 mutex_exit(&gp->gp_mutex);
3238 3293 }
3239 3294 break;
3240 3295 case FLK_WAKEUP_SLEEPERS:
3241 3296 wakeup_sleeping_lockmgr_locks(fg);
3242 3297 break;
3243 3298 case FLK_LOCKMGR_DOWN:
3244 3299 unlock_lockmgr_granted(fg);
3245 3300 break;
3246 3301 default:
3247 3302 panic("flk_set_lockmgr_status: bad status (%d)", status);
3248 3303 break;
3249 3304 }
3250 3305 }
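
A hypothetical teardown sequence, to show how the states are meant to be used together (the actual lock manager shutdown code is not part of this diff): wake the sleepers first so blocked requests bail out, then flush the granted locks.

	flk_set_lockmgr_status(FLK_WAKEUP_SLEEPERS);
	/* ... allow the woken lock requests to unwind ... */
	flk_set_lockmgr_status(FLK_LOCKMGR_DOWN);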
3251 3306
3252 3307 /*
3253 3308 * This routine returns all the locks that are active or sleeping and are
3254 3309 * associated with a particular set of identifiers. If lock_state != 0, then
3255 3310 * only locks that match the lock_state are returned. If lock_state == 0, then
3256 3311 * all locks are returned. If pid == NOPID, the pid is ignored. If
3257 3312 * use_sysid is FALSE, then the sysid is ignored. If vp is NULL, then the
3258 3313 * vnode pointer is ignored.
3259 3314 *
3260 3315 * A list containing the vnode pointer and an flock structure
3261 3316 * describing the lock is returned. Each element in the list is
3262 3317 * dynamically allocated and must be freed by the caller. The
3263 3318 * last item in the list is denoted by a NULL value in the ll_next
3264 3319 * field.
3265 3320 *
3266 3321 * The vnode pointers returned are held. The caller is responsible
3267 3322 * for releasing these. Note that the returned list is only a snapshot of
3268 3323 * the current lock information, and that it is a snapshot of a moving
3269 3324 * target (only one graph is locked at a time).
3270 3325 */
3271 3326
3272 3327 locklist_t *
3273 3328 get_lock_list(int list_type, int lock_state, int sysid, boolean_t use_sysid,
3274 3329 pid_t pid, const vnode_t *vp, zoneid_t zoneid)
3275 3330 {
3276 3331 lock_descriptor_t *lock;
3277 3332 lock_descriptor_t *graph_head;
3278 3333 locklist_t listhead;
3279 3334 locklist_t *llheadp;
3280 3335 locklist_t *llp;
3281 3336 locklist_t *lltp;
3282 3337 graph_t *gp;
3283 3338 int i;
3284 3339 int first_index; /* graph index */
3285 3340 int num_indexes; /* graph index */
3286 3341
3287 3342 ASSERT((list_type == FLK_ACTIVE_STATE) ||
3288 3343 (list_type == FLK_SLEEPING_STATE));
3289 3344
3290 3345 /*
3291 3346 * Get a pointer to something to use as a list head while building
3292 3347 * the rest of the list.
3293 3348 */
3294 3349 llheadp = &listhead;
3295 3350 lltp = llheadp;
3296 3351 llheadp->ll_next = (locklist_t *)NULL;
3297 3352
3298 3353 /* Figure out which graphs we want to look at. */
3299 3354 if (vp == NULL) {
3300 3355 first_index = 0;
3301 3356 num_indexes = HASH_SIZE;
3302 3357 } else {
3303 3358 first_index = HASH_INDEX(vp);
3304 3359 num_indexes = 1;
3305 3360 }
3306 3361
3307 3362 for (i = first_index; i < first_index + num_indexes; i++) {
3308 3363 mutex_enter(&flock_lock);
3309 3364 gp = lock_graph[i];
3310 3365 mutex_exit(&flock_lock);
3311 3366 if (gp == NULL) {
3312 3367 continue;
3313 3368 }
3314 3369
3315 3370 mutex_enter(&gp->gp_mutex);
3316 3371 graph_head = (list_type == FLK_ACTIVE_STATE) ?
3317 3372 ACTIVE_HEAD(gp) : SLEEPING_HEAD(gp);
3318 3373 for (lock = graph_head->l_next;
3319 3374 lock != graph_head;
3320 3375 lock = lock->l_next) {
3321 3376 if (use_sysid && lock->l_flock.l_sysid != sysid)
3322 3377 continue;
3323 3378 if (pid != NOPID && lock->l_flock.l_pid != pid)
3324 3379 continue;
3325 3380 if (vp != NULL && lock->l_vnode != vp)
3326 3381 continue;
3327 3382 if (lock_state && !(lock_state & lock->l_state))
3328 3383 continue;
3329 3384 if (zoneid != lock->l_zoneid && zoneid != ALL_ZONES)
3330 3385 continue;
3331 3386 /*
3332 3387 * A matching lock was found. Allocate
3333 3388 * space for a new locklist entry and fill
3334 3389 * it in.
3335 3390 */
3336 3391 llp = kmem_alloc(sizeof (locklist_t), KM_SLEEP);
3337 3392 lltp->ll_next = llp;
3338 3393 VN_HOLD(lock->l_vnode);
3339 3394 llp->ll_vp = lock->l_vnode;
3340 3395 create_flock(lock, &(llp->ll_flock));
3341 3396 llp->ll_next = (locklist_t *)NULL;
3342 3397 lltp = llp;
3343 3398 }
3344 3399 mutex_exit(&gp->gp_mutex);
3345 3400 }
3346 3401
3347 3402 llp = llheadp->ll_next;
3348 3403 return (llp);
3349 3404 }
3350 3405
3351 3406 /*
3352 3407 * These two functions are simply interfaces to get_lock_list. They return
3353 3408 * a list of sleeping or active locks for the given sysid and pid. See
3354 3409 * get_lock_list for details.
3355 3410 *
3356 3411 * In either case we don't particularly care to specify the zone of interest;
3357 3412 * the sysid-space is global across zones, so the sysid will map to exactly one
3358 3413 * zone, and we'll return information for that zone.
3359 3414 */
3360 3415
3361 3416 locklist_t *
3362 3417 flk_get_sleeping_locks(int sysid, pid_t pid)
3363 3418 {
3364 3419 return (get_lock_list(FLK_SLEEPING_STATE, 0, sysid, B_TRUE, pid, NULL,
3365 3420 ALL_ZONES));
3366 3421 }
3367 3422
3368 3423 locklist_t *
3369 3424 flk_get_active_locks(int sysid, pid_t pid)
3370 3425 {
3371 3426 return (get_lock_list(FLK_ACTIVE_STATE, 0, sysid, B_TRUE, pid, NULL,
3372 3427 ALL_ZONES));
3373 3428 }
3374 3429
3375 3430 /*
3376 3431 * Another interface to get_lock_list. This one returns all the active
3377 3432 * locks for a given vnode. Again, see get_lock_list for details.
3378 3433 *
3379 3434 * We don't need to specify which zone's locks we're interested in. The matter
3380 3435 * would only be interesting if the vnode belonged to NFS, and NFS vnodes can't
3381 3436 * be used by multiple zones, so the list of locks will all be from the right
3382 3437 * zone.
3383 3438 */
3384 3439
3385 3440 locklist_t *
3386 3441 flk_active_locks_for_vp(const vnode_t *vp)
3387 3442 {
3388 3443 return (get_lock_list(FLK_ACTIVE_STATE, 0, 0, B_FALSE, NOPID, vp,
3389 3444 ALL_ZONES));
3390 3445 }
3391 3446
3392 3447 /*
3393 3448 * Another interface to get_lock_list. This one returns all the active
3394 3449 * nbmand locks for a given vnode. Again, see get_lock_list for details.
3395 3450 *
3396 3451 * See the comment for flk_active_locks_for_vp() for why we don't care to
3397 3452 * specify the particular zone of interest.
3398 3453 */
3399 3454 locklist_t *
3400 3455 flk_active_nbmand_locks_for_vp(const vnode_t *vp)
3401 3456 {
3402 3457 return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3403 3458 NOPID, vp, ALL_ZONES));
3404 3459 }
3405 3460
3406 3461 /*
3407 3462 * Another interface to get_lock_list. This one returns all the active
3408 3463 * nbmand locks for a given pid. Again, see get_lock_list for details.
3409 3464 *
3410 3465 * The zone doesn't need to be specified here; the locks held by a
3411 3466 * particular process will either be local (ie, non-NFS) or from the zone
3412 3467 * the process is executing in. This is because other parts of the system
3413 3468 * ensure that an NFS vnode can't be used in a zone other than that in
3414 3469 * which it was opened.
3415 3470 */
3416 3471 locklist_t *
3417 3472 flk_active_nbmand_locks(pid_t pid)
3418 3473 {
3419 3474 return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3420 3475 pid, NULL, ALL_ZONES));
3421 3476 }
3422 3477
3423 3478 /*
3424 3479 * Free up all entries in the locklist.
3425 3480 */
3426 3481 void
3427 3482 flk_free_locklist(locklist_t *llp)
3428 3483 {
3429 3484 locklist_t *next_llp;
3430 3485
3431 3486 while (llp) {
3432 3487 next_llp = llp->ll_next;
3433 3488 VN_RELE(llp->ll_vp);
3434 3489 kmem_free(llp, sizeof (*llp));
3435 3490 llp = next_llp;
3436 3491 }
3437 3492 }
3438 3493
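/*
 * Illustrative sketch, not part of flock.c: how a caller might consume the
 * lists built by get_lock_list(), assuming kernel context.  The helper name
 * dump_active_locks_for() is hypothetical; the body only uses interfaces
 * shown above (flk_get_active_locks(), flk_free_locklist()) and the
 * locklist_t fields ll_next and ll_flock.
 */
static void
dump_active_locks_for(int sysid, pid_t pid)
{
	locklist_t *llp, *head;

	/* Snapshot the active locks held by (sysid, pid) across all zones. */
	head = flk_get_active_locks(sysid, pid);

	for (llp = head; llp != NULL; llp = llp->ll_next) {
		cmn_err(CE_CONT, "lock: start=%lld len=%lld type=%d\n",
		    (longlong_t)llp->ll_flock.l_start,
		    (longlong_t)llp->ll_flock.l_len,
		    llp->ll_flock.l_type);
	}

	/* Each entry holds a vnode reference; this releases them all. */
	flk_free_locklist(head);
}
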
3439 3494 static void
3440 3495 cl_flk_change_nlm_state_all_locks(int nlmid, flk_nlm_status_t nlm_state)
3441 3496 {
3442 3497 /*
3443 3498 * For each graph "lg" in the hash table lock_graph do
3444 3499 * a. Get the list of sleeping locks
3445 3500 * b. For each lock descriptor in the list do
3446 3501 * i. If the requested lock is an NLM server request AND
3447 3502 * the nlmid is the same as the routine argument then
3448 3503 * change the lock descriptor's state field to
3449 3504 * "nlm_state."
3450 3505 * c. Get the list of active locks
3451 3506 * d. For each lock descriptor in the list do
3452 3507 * i. If the requested lock is an NLM server request AND
3453 3508 * the nlmid is the same as the routine argument then
3454 3509 * change the lock descriptor's state field to
3455 3510 * "nlm_state."
3456 3511 */
3457 3512
3458 3513 int i;
3459 3514 graph_t *gp; /* lock graph */
3460 3515 lock_descriptor_t *lock; /* lock */
3461 3516 lock_descriptor_t *nlock = NULL; /* next lock */
3462 3517 int lock_nlmid;
3463 3518
3464 3519 for (i = 0; i < HASH_SIZE; i++) {
3465 3520 mutex_enter(&flock_lock);
3466 3521 gp = lock_graph[i];
3467 3522 mutex_exit(&flock_lock);
3468 3523 if (gp == NULL) {
3469 3524 continue;
3470 3525 }
3471 3526
3472 3527 /* Get list of sleeping locks in current lock graph. */
3473 3528 mutex_enter(&gp->gp_mutex);
3474 3529 for (lock = SLEEPING_HEAD(gp)->l_next;
3475 3530 lock != SLEEPING_HEAD(gp);
3476 3531 lock = nlock) {
3477 3532 nlock = lock->l_next;
3478 3533 /* get NLM id */
3479 3534 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3480 3535
3481 3536 /*
3482 3537 * If NLM server request AND nlmid of lock matches
3483 3538 * nlmid of argument, then set the NLM state of the
3484 3539 * lock to "nlm_state."
3485 3540 */
3486 3541 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3487 3542 SET_NLM_STATE(lock, nlm_state);
3488 3543 }
3489 3544 }
3490 3545
3491 3546 /* Get list of active locks in current lock graph. */
3492 3547 for (lock = ACTIVE_HEAD(gp)->l_next;
3493 3548 lock != ACTIVE_HEAD(gp);
3494 3549 lock = nlock) {
3495 3550 nlock = lock->l_next;
3496 3551 /* get NLM id */
3497 3552 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3498 3553
3499 3554 /*
3500 3555 * If NLM server request AND nlmid of lock matches
3501 3556 * nlmid of argument, then set the NLM state of the
3502 3557 * lock to "nlm_state."
3503 3558 */
3504 3559 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3505 3560 ASSERT(IS_ACTIVE(lock));
3506 3561 SET_NLM_STATE(lock, nlm_state);
3507 3562 }
3508 3563 }
3509 3564 mutex_exit(&gp->gp_mutex);
3510 3565 }
3511 3566 }
3512 3567
3513 3568 /*
3514 3569 * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid().
3515 3570 * Effects: Find all sleeping lock manager requests _only_ for the NLM server
3516 3571 * identified by "nlmid." Poke those lock requests.
3517 3572 */
3518 3573 static void
3519 3574 cl_flk_wakeup_sleeping_nlm_locks(int nlmid)
3520 3575 {
3521 3576 lock_descriptor_t *lock;
3522 3577 lock_descriptor_t *nlock = NULL; /* next lock */
3523 3578 int i;
3524 3579 graph_t *gp;
3525 3580 int lock_nlmid;
3526 3581
3527 3582 for (i = 0; i < HASH_SIZE; i++) {
3528 3583 mutex_enter(&flock_lock);
3529 3584 gp = lock_graph[i];
3530 3585 mutex_exit(&flock_lock);
3531 3586 if (gp == NULL) {
3532 3587 continue;
3533 3588 }
3534 3589
3535 3590 mutex_enter(&gp->gp_mutex);
3536 3591 for (lock = SLEEPING_HEAD(gp)->l_next;
3537 3592 lock != SLEEPING_HEAD(gp);
3538 3593 lock = nlock) {
3539 3594 nlock = lock->l_next;
3540 3595 /*
3541 3596 * If NLM server request _and_ nlmid of lock matches
3542 3597 * nlmid of argument, then set the NLM state of the
3543 3598 * lock to NLM_SHUTTING_DOWN, and wake up sleeping
3544 3599 * request.
3545 3600 */
3546 3601 if (IS_LOCKMGR(lock)) {
3547 3602 /* get NLM id */
3548 3603 lock_nlmid =
3549 3604 GETNLMID(lock->l_flock.l_sysid);
3550 3605 if (nlmid == lock_nlmid) {
3551 3606 SET_NLM_STATE(lock,
3552 3607 FLK_NLM_SHUTTING_DOWN);
3553 3608 INTERRUPT_WAKEUP(lock);
3554 3609 }
3555 3610 }
3556 3611 }
3557 3612 mutex_exit(&gp->gp_mutex);
3558 3613 }
3559 3614 }
3560 3615
3561 3616 /*
3562 3617 * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid()
3563 3618 * Effects: Find all active (granted) lock manager locks _only_ for the
3564 3619 * NLM server identified by "nlmid" and release them.
3565 3620 */
3566 3621 static void
3567 3622 cl_flk_unlock_nlm_granted(int nlmid)
3568 3623 {
3569 3624 lock_descriptor_t *lock;
3570 3625 lock_descriptor_t *nlock = NULL; /* next lock */
3571 3626 int i;
3572 3627 graph_t *gp;
3573 3628 int lock_nlmid;
3574 3629
3575 3630 for (i = 0; i < HASH_SIZE; i++) {
3576 3631 mutex_enter(&flock_lock);
3577 3632 gp = lock_graph[i];
3578 3633 mutex_exit(&flock_lock);
3579 3634 if (gp == NULL) {
3580 3635 continue;
3581 3636 }
3582 3637
3583 3638 mutex_enter(&gp->gp_mutex);
3584 3639 for (lock = ACTIVE_HEAD(gp)->l_next;
3585 3640 lock != ACTIVE_HEAD(gp);
3586 3641 lock = nlock) {
3587 3642 nlock = lock->l_next;
3588 3643 ASSERT(IS_ACTIVE(lock));
3589 3644
3590 3645 /*
3591 3646 * If it's an NLM server request _and_ nlmid of
3592 3647 * the lock matches nlmid of argument, then
3593 3648 			 * remove the active lock from the list, wake up blocked
3594 3649 * threads, and free the storage for the lock.
3595 3650 * Note that there's no need to mark the NLM state
3596 3651 * of this lock to NLM_DOWN because the lock will
3597 3652 * be deleted anyway and its storage freed.
3598 3653 */
3599 3654 if (IS_LOCKMGR(lock)) {
3600 3655 /* get NLM id */
3601 3656 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3602 3657 if (nlmid == lock_nlmid) {
3603 3658 flk_delete_active_lock(lock, 0);
3604 3659 flk_wakeup(lock, 1);
3605 3660 flk_free_lock(lock);
3606 3661 }
3607 3662 }
3608 3663 }
3609 3664 mutex_exit(&gp->gp_mutex);
3610 3665 }
3611 3666 }
3612 3667
3613 3668 /*
3614 3669 * Find all sleeping lock manager requests and poke them.
3615 3670 */
3616 3671 static void
3617 3672 wakeup_sleeping_lockmgr_locks(struct flock_globals *fg)
3618 3673 {
3619 3674 lock_descriptor_t *lock;
3620 3675 lock_descriptor_t *nlock = NULL; /* next lock */
3621 3676 int i;
3622 3677 graph_t *gp;
3623 3678 zoneid_t zoneid = getzoneid();
3624 3679
3625 3680 for (i = 0; i < HASH_SIZE; i++) {
3626 3681 mutex_enter(&flock_lock);
3627 3682 gp = lock_graph[i];
3628 3683 mutex_exit(&flock_lock);
3629 3684 if (gp == NULL) {
3630 3685 continue;
3631 3686 }
3632 3687
3633 3688 mutex_enter(&gp->gp_mutex);
3634 3689 fg->lockmgr_status[i] = FLK_WAKEUP_SLEEPERS;
3635 3690 for (lock = SLEEPING_HEAD(gp)->l_next;
3636 3691 lock != SLEEPING_HEAD(gp);
3637 3692 lock = nlock) {
3638 3693 nlock = lock->l_next;
3639 3694 if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
3640 3695 INTERRUPT_WAKEUP(lock);
3641 3696 }
3642 3697 }
3643 3698 mutex_exit(&gp->gp_mutex);
3644 3699 }
3645 3700 }
3646 3701
3647 3702
3648 3703 /*
3649 3704 * Find all active (granted) lock manager locks and release them.
3650 3705 */
3651 3706 static void
3652 3707 unlock_lockmgr_granted(struct flock_globals *fg)
3653 3708 {
3654 3709 lock_descriptor_t *lock;
3655 3710 lock_descriptor_t *nlock = NULL; /* next lock */
3656 3711 int i;
3657 3712 graph_t *gp;
3658 3713 zoneid_t zoneid = getzoneid();
3659 3714
3660 3715 for (i = 0; i < HASH_SIZE; i++) {
3661 3716 mutex_enter(&flock_lock);
3662 3717 gp = lock_graph[i];
3663 3718 mutex_exit(&flock_lock);
3664 3719 if (gp == NULL) {
3665 3720 continue;
3666 3721 }
3667 3722
3668 3723 mutex_enter(&gp->gp_mutex);
3669 3724 fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN;
3670 3725 for (lock = ACTIVE_HEAD(gp)->l_next;
3671 3726 lock != ACTIVE_HEAD(gp);
3672 3727 lock = nlock) {
3673 3728 nlock = lock->l_next;
3674 3729 if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
3675 3730 ASSERT(IS_ACTIVE(lock));
3676 3731 flk_delete_active_lock(lock, 0);
3677 3732 flk_wakeup(lock, 1);
3678 3733 flk_free_lock(lock);
3679 3734 }
3680 3735 }
3681 3736 mutex_exit(&gp->gp_mutex);
3682 3737 }
3683 3738 }
3684 3739
3685 3740
3686 3741 /*
3687 3742 * Wait until a lock is granted, cancelled, or interrupted.
3688 3743 */
3689 3744
3690 3745 static void
3691 3746 wait_for_lock(lock_descriptor_t *request)
3692 3747 {
3693 3748 graph_t *gp = request->l_graph;
3694 3749
3695 3750 ASSERT(MUTEX_HELD(&gp->gp_mutex));
3696 3751
3697 3752 while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) &&
3698 3753 !(IS_INTERRUPTED(request))) {
3699 3754 if (!cv_wait_sig(&request->l_cv, &gp->gp_mutex)) {
3700 3755 flk_set_state(request, FLK_INTERRUPTED_STATE);
3701 3756 request->l_state |= INTERRUPTED_LOCK;
3702 3757 }
3703 3758 }
3704 3759 }
3705 3760
3706 3761 /*
3707 3762 * Create an flock structure from the existing lock information
3708 3763 *
3709 3764 * This routine is used to create flock structures for the lock manager
3710 3765 * to use in a reclaim request. Since the lock was originated on this
3711 3766 * host, it must be conforming to UNIX semantics, so no checking is
3712 3767 * done to make sure it falls within the lower half of the 32-bit range.
3713 3768 */
3714 3769
3715 3770 static void
3716 3771 create_flock(lock_descriptor_t *lp, flock64_t *flp)
3717 3772 {
3718 3773 ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND);
3719 3774 ASSERT(lp->l_end >= lp->l_start);
3720 3775
3721 3776 flp->l_type = lp->l_type;
3722 3777 flp->l_whence = 0;
3723 3778 flp->l_start = lp->l_start;
3724 3779 flp->l_len = (lp->l_end == MAX_U_OFFSET_T) ? 0 :
3725 3780 (lp->l_end - lp->l_start + 1);
3726 3781 flp->l_sysid = lp->l_flock.l_sysid;
3727 3782 flp->l_pid = lp->l_flock.l_pid;
3728 3783 }
3729 3784
3730 3785 /*
3731 3786 * Convert flock_t data describing a lock range into unsigned long starting
3732 3787 * and ending points, which are put into lock_request. Returns 0 or an
3733 3788 * errno value.
3734 3789 * Large Files: max is passed by the caller and we return EOVERFLOW
3735 3790 * as defined by LFS API.
3736 3791 */
3737 3792
3738 3793 int
3739 3794 flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
3740 3795 u_offset_t *start, u_offset_t *end, offset_t offset)
3741 3796 {
3742 3797 struct vattr vattr;
3743 3798 int error;
3744 3799
3745 3800 /*
3746 3801 * Determine the starting point of the request
3747 3802 */
3748 3803 switch (flp->l_whence) {
3749 3804 case 0: /* SEEK_SET */
3750 3805 *start = (u_offset_t)flp->l_start;
3751 3806 break;
3752 3807 case 1: /* SEEK_CUR */
3753 3808 *start = (u_offset_t)(flp->l_start + offset);
3754 3809 break;
3755 3810 case 2: /* SEEK_END */
3756 3811 vattr.va_mask = AT_SIZE;
3757 3812 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
3758 3813 return (error);
3759 3814 *start = (u_offset_t)(flp->l_start + vattr.va_size);
3760 3815 break;
3761 3816 default:
3762 3817 return (EINVAL);
3763 3818 }
3764 3819
3765 3820 /*
3766 3821 * Determine the range covered by the request.
3767 3822 */
3768 3823 if (flp->l_len == 0)
3769 3824 *end = MAX_U_OFFSET_T;
3770 3825 else if ((offset_t)flp->l_len > 0) {
3771 3826 *end = (u_offset_t)(*start + (flp->l_len - 1));
3772 3827 } else {
3773 3828 /*
3774 3829 			 * Negative length; why do we even allow this?
3775 3830 * Because this allows easy specification of
3776 3831 * the last n bytes of the file.
3777 3832 */
3778 3833 *end = *start;
3779 3834 *start += (u_offset_t)flp->l_len;
3780 3835 (*start)++;
3781 3836 }
3782 3837 return (0);
3783 3838 }
3784 3839
3785 3840 /*
3786 3841 	 * Check the validity of lock data. This can be used by the NFS
3787 3842 * frlock routines to check data before contacting the server. The
3788 3843 * server must support semantics that aren't as restrictive as
3789 3844 * the UNIX API, so the NFS client is required to check.
3790 3845 * The maximum is now passed in by the caller.
3791 3846 */
3792 3847
3793 3848 int
3794 3849 flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
3795 3850 {
3796 3851 /*
3797 3852 * The end (length) for local locking should never be greater
3798 3853 * than MAXEND. However, the representation for
3799 3854 * the entire file is MAX_U_OFFSET_T.
3800 3855 */
3801 3856 if ((start > max) ||
3802 3857 ((end > max) && (end != MAX_U_OFFSET_T))) {
3803 3858 return (EINVAL);
3804 3859 }
3805 3860 if (start > end) {
3806 3861 return (EINVAL);
3807 3862 }
3808 3863 return (0);
3809 3864 }
3810 3865
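/*
 * Illustrative sketch, not part of flock.c: the conversion/validation pair
 * above used together, assuming kernel context.  The helper name
 * convert_and_check() is hypothetical, and the use of MAXEND as the limit is
 * illustrative; real callers (e.g. the NFS client) pass the maximum that
 * matches the protocol they speak.
 */
static int
convert_and_check(vnode_t *vp, flock64_t *flp, offset_t fileoff,
    u_offset_t *startp, u_offset_t *endp)
{
	int error;

	/*
	 * Resolve l_whence/l_start/l_len into an absolute [start, end]
	 * range; a negative l_len counts backward from l_start (see the
	 * comment in flk_convert_lock_data()), and l_len == 0 means
	 * "to end of file" (end == MAX_U_OFFSET_T).
	 */
	error = flk_convert_lock_data(vp, flp, startp, endp, fileoff);
	if (error != 0)
		return (error);

	/* Reject ranges the local lock code cannot represent. */
	return (flk_check_lock_data(*startp, *endp, MAXEND));
}
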
3811 3866 /*
3812 3867 * Fill in request->l_flock with information about the lock blocking the
3813 3868 * request. The complexity here is that lock manager requests are allowed
3814 3869 * to see into the upper part of the 32-bit address range, whereas local
3815 3870 * requests are only allowed to see signed values.
3816 3871 *
3817 3872 * What should be done when "blocker" is a lock manager lock that uses the
3818 3873 * upper portion of the 32-bit range, but "request" is local? Since the
3819 3874 * request has already been determined to have been blocked by the blocker,
3820 3875 * at least some portion of "blocker" must be in the range of the request,
3821 3876 * or the request extends to the end of file. For the first case, the
3822 3877 * portion in the lower range is returned with the indication that it goes
3823 3878 * "to EOF." For the second case, the last byte of the lower range is
3824 3879 * returned with the indication that it goes "to EOF."
3825 3880 */
3826 3881
3827 3882 static void
3828 3883 report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request)
3829 3884 {
3830 3885 flock64_t *flrp; /* l_flock portion of request */
3831 3886
3832 3887 ASSERT(blocker != NULL);
3833 3888
3834 3889 flrp = &request->l_flock;
3835 3890 flrp->l_whence = 0;
3836 3891 flrp->l_type = blocker->l_type;
3837 3892 flrp->l_pid = blocker->l_flock.l_pid;
3838 3893 flrp->l_sysid = blocker->l_flock.l_sysid;
3839 3894
3840 3895 if (IS_LOCKMGR(request)) {
3841 3896 flrp->l_start = blocker->l_start;
3842 3897 if (blocker->l_end == MAX_U_OFFSET_T)
3843 3898 flrp->l_len = 0;
3844 3899 else
3845 3900 flrp->l_len = blocker->l_end - blocker->l_start + 1;
3846 3901 } else {
3847 3902 if (blocker->l_start > MAXEND) {
3848 3903 flrp->l_start = MAXEND;
3849 3904 flrp->l_len = 0;
3850 3905 } else {
3851 3906 flrp->l_start = blocker->l_start;
3852 3907 if (blocker->l_end == MAX_U_OFFSET_T)
3853 3908 flrp->l_len = 0;
3854 3909 else
3855 3910 flrp->l_len = blocker->l_end -
3856 3911 blocker->l_start + 1;
3857 3912 }
3858 3913 }
3859 3914 }
3860 3915
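/*
 * Illustrative summary, not part of flock.c: how the clamping described
 * above plays out for report_blocker(), assuming MAXEND is the largest
 * offset a local (non-lock-manager) request may see.
 *
 *   blocker->l_start > MAXEND, local request:
 *       reported as l_start = MAXEND, l_len = 0        ("to EOF")
 *   blocker->l_start <= MAXEND, blocker->l_end == MAX_U_OFFSET_T:
 *       reported as l_start = blocker->l_start, l_len = 0   ("to EOF")
 *   blocker->l_start <= MAXEND, blocker->l_end <= MAXEND:
 *       reported with l_len = blocker->l_end - blocker->l_start + 1
 */
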
3861 3916 /*
3862 3917 * PSARC case 1997/292
3863 3918 */
3864 3919 /*
3865 3920 * This is the public routine exported by flock.h.
3866 3921 */
3867 3922 void
3868 3923 cl_flk_change_nlm_state_to_unknown(int nlmid)
3869 3924 {
3870 3925 /*
3871 3926 * Check to see if node is booted as a cluster. If not, return.
3872 3927 */
3873 3928 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3874 3929 return;
3875 3930 }
3876 3931
3877 3932 /*
3878 3933 * See comment in cl_flk_set_nlm_status().
3879 3934 */
3880 3935 if (nlm_reg_status == NULL) {
3881 3936 return;
3882 3937 }
3883 3938
3884 3939 /*
3885 3940 * protect NLM registry state with a mutex.
3886 3941 */
3887 3942 ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3888 3943 mutex_enter(&nlm_reg_lock);
3889 3944 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, FLK_NLM_UNKNOWN);
3890 3945 mutex_exit(&nlm_reg_lock);
3891 3946 }
3892 3947
3893 3948 /*
3894 3949 * Return non-zero if the given I/O request conflicts with an active NBMAND
3895 3950 * lock.
3896 3951 * If svmand is non-zero, it means look at all active locks, not just NBMAND
3897 3952 * locks.
3898 3953 */
3899 3954
3900 3955 int
3901 3956 nbl_lock_conflict(vnode_t *vp, nbl_op_t op, u_offset_t offset,
3902 3957 ssize_t length, int svmand, caller_context_t *ct)
3903 3958 {
3904 3959 int conflict = 0;
3905 3960 graph_t *gp;
3906 3961 lock_descriptor_t *lock;
3907 3962 pid_t pid;
3908 3963 int sysid;
3909 3964
3910 3965 if (ct == NULL) {
3911 3966 pid = curproc->p_pid;
3912 3967 sysid = 0;
3913 3968 } else {
3914 3969 pid = ct->cc_pid;
3915 3970 sysid = ct->cc_sysid;
3916 3971 }
3917 3972
3918 3973 mutex_enter(&flock_lock);
3919 3974 gp = lock_graph[HASH_INDEX(vp)];
3920 3975 mutex_exit(&flock_lock);
3921 3976 if (gp == NULL)
3922 3977 return (0);
3923 3978
3924 3979 mutex_enter(&gp->gp_mutex);
3925 3980 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
3926 3981
3927 3982 for (; lock && lock->l_vnode == vp; lock = lock->l_next) {
3928 3983 if ((svmand || (lock->l_state & NBMAND_LOCK)) &&
3929 3984 (lock->l_flock.l_sysid != sysid ||
3930 3985 lock->l_flock.l_pid != pid) &&
3931 3986 lock_blocks_io(op, offset, length,
3932 3987 lock->l_type, lock->l_start, lock->l_end)) {
3933 3988 conflict = 1;
3934 3989 break;
3935 3990 }
3936 3991 }
3937 3992 mutex_exit(&gp->gp_mutex);
3938 3993
3939 3994 return (conflict);
3940 3995 }
3941 3996
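/*
 * Illustrative sketch, not part of flock.c: how a file system might consult
 * nbl_lock_conflict() before servicing a write, assuming kernel context.
 * The helper name check_write_allowed() is hypothetical, and the svmand
 * handling (share-reservation/mandatory-lock policy) is reduced to a plain
 * flag supplied by the caller.
 */
static int
check_write_allowed(vnode_t *vp, u_offset_t off, ssize_t len,
    int svmand, caller_context_t *ct)
{
	/*
	 * Non-zero means an active NBMAND lock (or, with svmand set,
	 * any active lock) held by another owner overlaps the range.
	 */
	if (nbl_lock_conflict(vp, NBL_WRITE, off, len, svmand, ct))
		return (EACCES);
	return (0);
}
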
3942 3997 /*
3943 3998 * Return non-zero if the given I/O request conflicts with the given lock.
3944 3999 */
3945 4000
3946 4001 static int
3947 4002 lock_blocks_io(nbl_op_t op, u_offset_t offset, ssize_t length,
3948 4003 int lock_type, u_offset_t lock_start, u_offset_t lock_end)
3949 4004 {
3950 4005 ASSERT(op == NBL_READ || op == NBL_WRITE || op == NBL_READWRITE);
3951 4006 ASSERT(lock_type == F_RDLCK || lock_type == F_WRLCK);
3952 4007
3953 4008 if (op == NBL_READ && lock_type == F_RDLCK)
3954 4009 return (0);
3955 4010
3956 4011 if (offset <= lock_start && lock_start < offset + length)
3957 4012 return (1);
3958 4013 if (lock_start <= offset && offset <= lock_end)
3959 4014 return (1);
3960 4015
3961 4016 return (0);
3962 4017 }
3963 4018
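/*
 * Illustrative values, not part of flock.c: sample evaluations of
 * lock_blocks_io() for an active F_WRLCK covering bytes [100, 199].
 *
 *   lock_blocks_io(NBL_WRITE, 150, 10, F_WRLCK, 100, 199) -> 1  (I/O starts
 *       inside the locked range)
 *   lock_blocks_io(NBL_WRITE,  50, 100, F_WRLCK, 100, 199) -> 1 (I/O spans
 *       the start of the locked range)
 *   lock_blocks_io(NBL_READ,    0,  50, F_WRLCK, 100, 199) -> 0 (no overlap)
 *   lock_blocks_io(NBL_READ,  150,  10, F_RDLCK, 100, 199) -> 0 (read never
 *       conflicts with a read lock)
 */
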
3964 4019 #ifdef DEBUG
3965 4020 static void
3966 4021 check_active_locks(graph_t *gp)
3967 4022 {
3968 4023 lock_descriptor_t *lock, *lock1;
3969 4024 edge_t *ep;
3970 4025
3971 4026 for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
3972 4027 lock = lock->l_next) {
3973 4028 ASSERT(IS_ACTIVE(lock));
3974 4029 ASSERT(NOT_BLOCKED(lock));
3975 4030 ASSERT(!IS_BARRIER(lock));
3976 4031
3977 4032 ep = FIRST_IN(lock);
3978 4033
3979 4034 while (ep != HEAD(lock)) {
3980 4035 ASSERT(IS_SLEEPING(ep->from_vertex));
3981 4036 ASSERT(!NOT_BLOCKED(ep->from_vertex));
3982 4037 ep = NEXT_IN(ep);
3983 4038 }
3984 4039
3985 4040 for (lock1 = lock->l_next; lock1 != ACTIVE_HEAD(gp);
3986 4041 lock1 = lock1->l_next) {
3987 4042 if (lock1->l_vnode == lock->l_vnode) {
3988 4043 if (BLOCKS(lock1, lock)) {
3989 4044 cmn_err(CE_PANIC,
3990 4045 "active lock %p blocks %p",
3991 4046 (void *)lock1, (void *)lock);
3992 4047 } else if (BLOCKS(lock, lock1)) {
3993 4048 cmn_err(CE_PANIC,
3994 4049 "active lock %p blocks %p",
3995 4050 (void *)lock, (void *)lock1);
3996 4051 }
3997 4052 }
3998 4053 }
3999 4054 }
4000 4055 }
4001 4056
4002 4057 /*
4003 4058 	 * Effect: This function checks to see if the transition from 'old_state' to
4004 4059 * 'new_state' is a valid one. It returns 0 if the transition is valid
4005 4060 * and 1 if it is not.
4006 4061 * For a map of valid transitions, see sys/flock_impl.h
4007 4062 */
4008 4063 static int
4009 4064 check_lock_transition(int old_state, int new_state)
4010 4065 {
4011 4066 switch (old_state) {
4012 4067 case FLK_INITIAL_STATE:
4013 4068 if ((new_state == FLK_START_STATE) ||
4014 4069 (new_state == FLK_SLEEPING_STATE) ||
4015 4070 (new_state == FLK_ACTIVE_STATE) ||
4016 4071 (new_state == FLK_DEAD_STATE)) {
4017 4072 return (0);
4018 4073 } else {
4019 4074 return (1);
4020 4075 }
4021 4076 case FLK_START_STATE:
4022 4077 if ((new_state == FLK_ACTIVE_STATE) ||
4023 4078 (new_state == FLK_DEAD_STATE)) {
4024 4079 return (0);
4025 4080 } else {
4026 4081 return (1);
4027 4082 }
4028 4083 case FLK_ACTIVE_STATE:
4029 4084 if (new_state == FLK_DEAD_STATE) {
4030 4085 return (0);
4031 4086 } else {
4032 4087 return (1);
4033 4088 }
4034 4089 case FLK_SLEEPING_STATE:
4035 4090 if ((new_state == FLK_GRANTED_STATE) ||
4036 4091 (new_state == FLK_INTERRUPTED_STATE) ||
4037 4092 (new_state == FLK_CANCELLED_STATE)) {
4038 4093 return (0);
4039 4094 } else {
4040 4095 return (1);
4041 4096 }
4042 4097 case FLK_GRANTED_STATE:
4043 4098 if ((new_state == FLK_START_STATE) ||
4044 4099 (new_state == FLK_INTERRUPTED_STATE) ||
4045 4100 (new_state == FLK_CANCELLED_STATE)) {
4046 4101 return (0);
4047 4102 } else {
4048 4103 return (1);
4049 4104 }
4050 4105 case FLK_CANCELLED_STATE:
4051 4106 if ((new_state == FLK_INTERRUPTED_STATE) ||
4052 4107 (new_state == FLK_DEAD_STATE)) {
4053 4108 return (0);
4054 4109 } else {
4055 4110 return (1);
4056 4111 }
4057 4112 case FLK_INTERRUPTED_STATE:
4058 4113 if (new_state == FLK_DEAD_STATE) {
4059 4114 return (0);
4060 4115 } else {
4061 4116 return (1);
4062 4117 }
4063 4118 case FLK_DEAD_STATE:
4064 4119 /* May be set more than once */
4065 4120 if (new_state == FLK_DEAD_STATE) {
4066 4121 return (0);
4067 4122 } else {
4068 4123 return (1);
4069 4124 }
4070 4125 default:
4071 4126 return (1);
4072 4127 }
4073 4128 }
4074 4129
4075 4130 static void
4076 4131 check_sleeping_locks(graph_t *gp)
4077 4132 {
4078 4133 lock_descriptor_t *lock1, *lock2;
4079 4134 edge_t *ep;
4080 4135 for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp);
4081 4136 lock1 = lock1->l_next) {
4082 4137 ASSERT(!IS_BARRIER(lock1));
4083 4138 for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp);
4084 4139 lock2 = lock2->l_next) {
4085 4140 if (lock1->l_vnode == lock2->l_vnode) {
4086 4141 if (BLOCKS(lock2, lock1)) {
4087 4142 ASSERT(!IS_GRANTED(lock1));
4088 4143 ASSERT(!NOT_BLOCKED(lock1));
4089 4144 path(lock1, lock2);
4090 4145 }
4091 4146 }
4092 4147 }
4093 4148
4094 4149 for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp);
4095 4150 lock2 = lock2->l_next) {
4096 4151 ASSERT(!IS_BARRIER(lock1));
4097 4152 if (lock1->l_vnode == lock2->l_vnode) {
4098 4153 if (BLOCKS(lock2, lock1)) {
4099 4154 ASSERT(!IS_GRANTED(lock1));
4100 4155 ASSERT(!NOT_BLOCKED(lock1));
4101 4156 path(lock1, lock2);
4102 4157 }
4103 4158 }
4104 4159 }
4105 4160 ep = FIRST_ADJ(lock1);
4106 4161 while (ep != HEAD(lock1)) {
4107 4162 ASSERT(BLOCKS(ep->to_vertex, lock1));
4108 4163 ep = NEXT_ADJ(ep);
4109 4164 }
4110 4165 }
4111 4166 }
4112 4167
4113 4168 static int
4114 4169 level_two_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2, int no_path)
4115 4170 {
4116 4171 edge_t *ep;
4117 4172 lock_descriptor_t *vertex;
4118 4173 lock_descriptor_t *vertex_stack;
4119 4174
4120 4175 STACK_INIT(vertex_stack);
4121 4176
4122 4177 flk_graph_uncolor(lock1->l_graph);
4123 4178 ep = FIRST_ADJ(lock1);
4124 4179 ASSERT(ep != HEAD(lock1));
4125 4180 while (ep != HEAD(lock1)) {
4126 4181 if (no_path)
4127 4182 ASSERT(ep->to_vertex != lock2);
4128 4183 STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4129 4184 COLOR(ep->to_vertex);
4130 4185 ep = NEXT_ADJ(ep);
4131 4186 }
4132 4187
4133 4188 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
4134 4189 STACK_POP(vertex_stack, l_dstack);
4135 4190 for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
4136 4191 ep = NEXT_ADJ(ep)) {
4137 4192 if (COLORED(ep->to_vertex))
4138 4193 continue;
4139 4194 COLOR(ep->to_vertex);
4140 4195 if (ep->to_vertex == lock2)
4141 4196 return (1);
4142 4197
4143 4198 STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4144 4199 }
4145 4200 }
4146 4201 return (0);
4147 4202 }
4148 4203
4149 4204 static void
4150 4205 check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp)
4151 4206 {
4152 4207 lock_descriptor_t *lock;
4153 4208
4154 4209 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
4155 4210
4156 4211 if (lock) {
4157 4212 while (lock != ACTIVE_HEAD(gp) && (lock->l_vnode == vp)) {
4158 4213 if (lock->l_flock.l_pid == pid &&
4159 4214 lock->l_flock.l_sysid == sysid)
4160 4215 cmn_err(CE_PANIC,
4161 4216 "owner pid %d's lock %p in active queue",
4162 4217 pid, (void *)lock);
4163 4218 lock = lock->l_next;
4164 4219 }
4165 4220 }
4166 4221 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
4167 4222
4168 4223 if (lock) {
4169 4224 while (lock != SLEEPING_HEAD(gp) && (lock->l_vnode == vp)) {
4170 4225 if (lock->l_flock.l_pid == pid &&
4171 4226 lock->l_flock.l_sysid == sysid)
4172 4227 cmn_err(CE_PANIC,
4173 4228 "owner pid %d's lock %p in sleep queue",
4174 4229 pid, (void *)lock);
4175 4230 lock = lock->l_next;
4176 4231 }
4177 4232 }
4178 4233 }
4179 4234
4180 4235 static int
4181 4236 level_one_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4182 4237 {
4183 4238 edge_t *ep = FIRST_ADJ(lock1);
4184 4239
4185 4240 while (ep != HEAD(lock1)) {
4186 4241 if (ep->to_vertex == lock2)
4187 4242 return (1);
4188 4243 else
4189 4244 ep = NEXT_ADJ(ep);
4190 4245 }
4191 4246 return (0);
4192 4247 }
4193 4248
4194 4249 static int
4195 4250 no_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4196 4251 {
4197 4252 return (!level_two_path(lock1, lock2, 1));
4198 4253 }
4199 4254
4200 4255 static void
4201 4256 path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4202 4257 {
4203 4258 if (level_one_path(lock1, lock2)) {
4204 4259 if (level_two_path(lock1, lock2, 0) != 0) {
4205 4260 cmn_err(CE_WARN,
4206 4261 "one edge one path from lock1 %p lock2 %p",
4207 4262 (void *)lock1, (void *)lock2);
4208 4263 }
4209 4264 } else if (no_path(lock1, lock2)) {
4210 4265 cmn_err(CE_PANIC,
4211 4266 "No path from lock1 %p to lock2 %p",
4212 4267 (void *)lock1, (void *)lock2);
4213 4268 }
4214 4269 }
4215 4270 #endif /* DEBUG */