illumos-gate Wdiff usr/src/uts/common/fs/nfs/nfs4_rnode.c

Print this page

4827 nfs4: slow file locking
4837 NFSv4 client lock retry delay upper limit should be shorter

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/nfs/nfs4_rnode.c
          +++ new/usr/src/uts/common/fs/nfs/nfs4_rnode.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
  24   24   */
  25   25  
  26   26  /*
  27   27   *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  28   28   *      All Rights Reserved
  29   29   */
  30   30  
  31   31  /*
  32   32   * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  33   33   */
  34   34  
  35   35  #include <sys/param.h>
  36   36  #include <sys/types.h>
  37   37  #include <sys/systm.h>
  38   38  #include <sys/cred.h>
  39   39  #include <sys/proc.h>
  40   40  #include <sys/user.h>
  41   41  #include <sys/time.h>
  42   42  #include <sys/buf.h>
  43   43  #include <sys/vfs.h>
  44   44  #include <sys/vnode.h>
  45   45  #include <sys/socket.h>
  46   46  #include <sys/uio.h>
  47   47  #include <sys/tiuser.h>
  48   48  #include <sys/swap.h>
  49   49  #include <sys/errno.h>
  50   50  #include <sys/debug.h>
  51   51  #include <sys/kmem.h>
  52   52  #include <sys/kstat.h>
  53   53  #include <sys/cmn_err.h>
  54   54  #include <sys/vtrace.h>
  55   55  #include <sys/session.h>
  56   56  #include <sys/dnlc.h>
  57   57  #include <sys/bitmap.h>
  58   58  #include <sys/acl.h>
  59   59  #include <sys/ddi.h>
  60   60  #include <sys/pathname.h>
  61   61  #include <sys/flock.h>
  62   62  #include <sys/dirent.h>
  63   63  #include <sys/flock.h>
  64   64  #include <sys/callb.h>
  65   65  #include <sys/sdt.h>
  66   66  
  67   67  #include <vm/pvn.h>
  68   68  
  69   69  #include <rpc/types.h>
  70   70  #include <rpc/xdr.h>
  71   71  #include <rpc/auth.h>
  72   72  #include <rpc/rpcsec_gss.h>
  73   73  #include <rpc/clnt.h>
  74   74  
  75   75  #include <nfs/nfs.h>
  76   76  #include <nfs/nfs_clnt.h>
  77   77  #include <nfs/nfs_acl.h>
  78   78  
  79   79  #include <nfs/nfs4.h>
  80   80  #include <nfs/rnode4.h>
  81   81  #include <nfs/nfs4_clnt.h>
  82   82  
  83   83  /*
  84   84   * The hash queues for the access to active and cached rnodes
  85   85   * are organized as doubly linked lists.  A reader/writer lock
  86   86   * for each hash bucket is used to control access and to synchronize
  87   87   * lookups, additions, and deletions from the hash queue.
  88   88   *
  89   89   * The rnode freelist is organized as a doubly linked list with
  90   90   * a head pointer.  Additions and deletions are synchronized via
  91   91   * a single mutex.
  92   92   *
  93   93   * In order to add an rnode to the free list, it must be hashed into
  94   94   * a hash queue and the exclusive lock to the hash queue be held.
  95   95   * If an rnode is not hashed into a hash queue, then it is destroyed
  96   96   * because it represents no valuable information that can be reused
  97   97   * about the file.  The exclusive lock to the hash queue must be
  98   98   * held in order to prevent a lookup in the hash queue from finding
  99   99   * the rnode and using it and assuming that the rnode is not on the
 100  100   * freelist.  The lookup in the hash queue will have the hash queue
 101  101   * locked, either exclusive or shared.
 102  102   *
 103  103   * The vnode reference count for each rnode is not allowed to drop
 104  104   * below 1.  This prevents external entities, such as the VM
 105  105   * subsystem, from acquiring references to vnodes already on the
 106  106   * freelist and then trying to place them back on the freelist
 107  107   * when their reference is released.  This means that when an
 108  108   * rnode is looked up in the hash queues, then either the rnode
 109  109   * is removed from the freelist and that reference is transferred to
 110  110   * the new reference or the vnode reference count must be incremented
 111  111   * accordingly.  The mutex for the freelist must be held in order to
 112  112   * accurately test to see if the rnode is on the freelist or not.
 113  113   * The hash queue lock might be held shared and it is possible that
 114  114   * two different threads may race to remove the rnode from the
 115  115   * freelist.  This race can be resolved by holding the mutex for the
 116  116   * freelist.  Please note that the mutex for the freelist does not
 117  117   * need to be held if the rnode is not on the freelist.  It can not be
 118  118   * placed on the freelist due to the requirement that the thread
 119  119   * putting the rnode on the freelist must hold the exclusive lock
 120  120   * to the hash queue and the thread doing the lookup in the hash
 121  121   * queue is holding either a shared or exclusive lock to the hash
 122  122   * queue.
 123  123   *
 124  124   * The lock ordering is:
 125  125   *
 126  126   *      hash bucket lock -> vnode lock
 127  127   *      hash bucket lock -> freelist lock -> r_statelock
 128  128   */
 129  129  r4hashq_t *rtable4;     /* hash buckets, indexed by rtable4hash() */
 130  130  
 131  131  static kmutex_t rp4freelist_lock;       /* protects rp4freelist */
 132  132  static rnode4_t *rp4freelist = NULL;    /* head of the rnode freelist */
 133  133  static long rnode4_new = 0;     /* rnodes allocated from rnode4_cache */
 134  134  int rtable4size;        /* presumably the bucket count -- TODO confirm */
 135  135  static int rtable4mask;         /* mask applied by rtable4hash() */
 136  136  static struct kmem_cache *rnode4_cache; /* source of new rnode4_t's */
 137  137  static int rnode4_hashlen = 4;  /* presumably avg chain len -- TODO confirm */
 138  138  
           /* Forward declarations. */
 139  139  static void     r4inactive(rnode4_t *, cred_t *);
 140  140  static vnode_t  *make_rnode4(nfs4_sharedfh_t *, r4hashq_t *, struct vfs *,
 141  141                      struct vnodeops *,
 142  142                      int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
 143  143                      cred_t *),
 144  144                      int *, cred_t *);
 145  145  static void     rp4_rmfree(rnode4_t *);
 146  146  int             nfs4_free_data_reclaim(rnode4_t *);
 147  147  static int      nfs4_active_data_reclaim(rnode4_t *);
 148  148  static int      nfs4_free_reclaim(void);
 149  149  static int      nfs4_active_reclaim(void);
 150  150  static int      nfs4_rnode_reclaim(void);
 151  151  static void     nfs4_reclaim(void *);
 152  152  static int      isrootfh(nfs4_sharedfh_t *, rnode4_t *);
 153  153  static void     uninit_rnode4(rnode4_t *);
 154  154  static void     destroy_rnode4(rnode4_t *);
 155  155  static void     r4_stub_set(rnode4_t *, nfs4_stub_type_t);
 156  156  
           /* Debug-only tunables and helpers; compiled in DEBUG kernels only. */
 157  157  #ifdef DEBUG
 158  158  static int r4_check_for_dups = 0; /* Flag to enable dup rnode detection. */
 159  159  static int nfs4_rnode_debug = 0;
 160  160  /* if nonzero, kmem_cache_free() rnodes rather than place on freelist */
 161  161  static int nfs4_rnode_nofreelist = 0;
 162  162  /* give messages on colliding shared filehandles */
 163  163  static void     r4_dup_check(rnode4_t *, vfs_t *);
 164  164  #endif
 165  165  
 166  166  /*
 167  167   * If the vnode has pages, run the list and check for any that are
 168  168   * still dangling.  We call this routine before putting an rnode on
 169  169   * the free list.
            *
            * Returns 1 as soon as a page is found whose p_hash is not
            * PVN_VPLIST_HASH_TAG and whose p_fsdata is not C_NOCOMMIT; returns 0
            * if no such page is found or the vnode has no pages.  The list is
            * walked with the mutex returned by page_vnode_mutex(vp) held, and
            * that mutex is released on both return paths.
 170  170   */
 171  171  static int
 172  172  nfs4_dross_pages(vnode_t *vp)
 173  173  {
 174  174          page_t *pp;
 175  175          kmutex_t *vphm;
 176  176  
 177  177          vphm = page_vnode_mutex(vp);
 178  178          mutex_enter(vphm);
 179  179          if ((pp = vp->v_pages) != NULL) {
                           /*
                            * Follow p_vpnext until it leads back to the list
                            * head, i.e. make exactly one pass over the pages.
                            */
 180  180                  do {
 181  181                          if (pp->p_hash != PVN_VPLIST_HASH_TAG &&
 182  182                              pp->p_fsdata != C_NOCOMMIT) {
 183  183                                  mutex_exit(vphm);
 184  184                                  return (1);
 185  185                          }
 186  186                  } while ((pp = pp->p_vpnext) != vp->v_pages);
 187  187          }
 188  188          mutex_exit(vphm);
 189  189  
 190  190          return (0);
 191  191  }
 192  192  
 193  193  /*
 194  194   * Flush any pages left on this rnode.
            *
            * Blocks until rp->r_count has dropped to zero.  Then, if the vnode has
            * pages, VOP_PUTPAGE() is called with offset 0 and length 0 (only when
            * R4DIRTY is set and r_error is clear) and the pages are invalidated
            * starting at offset 0.  An ENOSPC or EDQUOT result from VOP_PUTPAGE()
            * is stored in r_error unless r_error is already set; any other error
            * is ignored.
            *
            *      rp      rnode whose pages are flushed
            *      cr      credentials passed to VOP_PUTPAGE() and
            *              nfs4_invalidate_pages()
 195  195   */
 196  196  static void
 197  197  r4flushpages(rnode4_t *rp, cred_t *cr)
 198  198  {
 199  199          vnode_t *vp;
 200  200          int error;
 201  201  
 202  202          /*
 203  203           * Before freeing anything, wait until all asynchronous
 204  204           * activity is done on this rnode.  This will allow all
 205  205           * asynchronous read ahead and write behind i/o's to
 206  206           * finish.
 207  207           */
 208  208          mutex_enter(&rp->r_statelock);
 209  209          while (rp->r_count > 0)
 210  210                  cv_wait(&rp->r_cv, &rp->r_statelock);
 211  211          mutex_exit(&rp->r_statelock);
 212  212  
 213  213          /*
 214  214           * Flush and invalidate all pages associated with the vnode.
 215  215           */
 216  216          vp = RTOV4(rp);
 217  217          if (nfs4_has_pages(vp)) {
 218  218                  ASSERT(vp->v_type != VCHR);
 219  219                  if ((rp->r_flags & R4DIRTY) && !rp->r_error) {
 220  220                          error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL);
                                   /*
                                    * Only out-of-space style errors are kept, and
                                    * the first error recorded wins.
                                    */
 221  221                          if (error && (error == ENOSPC || error == EDQUOT)) {
 222  222                                  mutex_enter(&rp->r_statelock);
 223  223                                  if (!rp->r_error)
 224  224                                          rp->r_error = error;
 225  225                                  mutex_exit(&rp->r_statelock);
 226  226                          }
 227  227                  }
 228  228                  nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
 229  229          }
 230  230  }
 231  231  
 232  232  /*
 233  233   * Free the resources associated with an rnode.
            *
            * Pages are flushed first via r4flushpages().  The cached symlink
            * contents, cached ACL (r_secattr) and cached xattr directory vnode are
            * detached from the rnode while r_statelock is held and are freed or
            * released only after the lock has been dropped.  The access cache and
            * the readdir cache are purged as well.
            *
            *      rp      rnode being cleaned
            *      cr      credentials handed to r4flushpages()
 234  234   */
 235  235  static void
 236  236  r4inactive(rnode4_t *rp, cred_t *cr)
 237  237  {
 238  238          vnode_t *vp;
 239  239          char *contents;
 240  240          int size;
 241  241          vsecattr_t *vsp;
 242  242          vnode_t *xattr;
 243  243  
 244  244          r4flushpages(rp, cr);
 245  245  
 246  246          vp = RTOV4(rp);
 247  247  
 248  248          /*
 249  249           * Free any held caches which may be
 250  250           * associated with this rnode.
 251  251           */
                   /*
                    * Take private copies and clear the rnode fields under the
                    * lock; the actual freeing below is done without it.
                    */
 252  252          mutex_enter(&rp->r_statelock);
 253  253          contents = rp->r_symlink.contents;
 254  254          size = rp->r_symlink.size;
 255  255          rp->r_symlink.contents = NULL;
 256  256          vsp = rp->r_secattr;
 257  257          rp->r_secattr = NULL;
 258  258          xattr = rp->r_xattr_dir;
 259  259          rp->r_xattr_dir = NULL;
 260  260          mutex_exit(&rp->r_statelock);
 261  261  
 262  262          /*
 263  263           * Free the access cache entries.
 264  264           */
 265  265          (void) nfs4_access_purge_rp(rp);
 266  266  
 267  267          /*
 268  268           * Free the readdir cache entries.
 269  269           */
 270  270          nfs4_purge_rddir_cache(vp);
 271  271  
 272  272          /*
 273  273           * Free the symbolic link cache.
 274  274           */
 275  275          if (contents != NULL) {
 276  276  
 277  277                  kmem_free((void *)contents, size);
 278  278          }
 279  279  
 280  280          /*
 281  281           * Free any cached ACL.
 282  282           */
 283  283          if (vsp != NULL)
 284  284                  nfs4_acl_free_cache(vsp);
 285  285  
 286  286          /*
 287  287           * Release the cached xattr_dir
 288  288           */
 289  289          if (xattr != NULL)
 290  290                  VN_RELE(xattr);
 291  291  }
 292  292  
 293  293  /*
 294  294   * We have seen a case that the fh passed in is for "." which
 295  295   * should be a VROOT node, however, the fh is different from the
 296  296   * root fh stored in the mntinfo4_t. The invalid fh might be
 297  297   * from a misbehaved server and will panic the client system at
 298  298   * a later time. To avoid the panic, we drop the bad fh, use
 299  299   * the root fh from mntinfo4_t, and print an error message
 300  300   * for attention.
            *
            *      fh      filehandle returned for nm; may be NULL
            *      nm      name being looked up; must not be ".." (asserted)
            *      mi      mount info supplying mi_rootfh
            *      wasbad  out: set to 1 if fh was replaced, 0 otherwise
            *
            * Returns mi->mi_rootfh when nm is "." and fh is non-NULL but differs
            * from mi->mi_rootfh; otherwise returns fh unchanged.  No hold is
            * taken here on the returned filehandle.  The warning and the two
            * filehandle dumps are emitted in DEBUG kernels only.
 301  301   */
 302  302  nfs4_sharedfh_t *
 303  303  badrootfh_check(nfs4_sharedfh_t *fh, nfs4_fname_t *nm, mntinfo4_t *mi,
 304  304      int *wasbad)
 305  305  {
 306  306          char *s;
 307  307  
 308  308          *wasbad = 0;
                   /* s is released with kmem_free(s, MAXNAMELEN) before returning */
 309  309          s = fn_name(nm);
 310  310          ASSERT(strcmp(s, "..") != 0);
 311  311  
 312  312          if ((s[0] == '.' && s[1] == '\0') && fh &&
 313  313              !SFH4_SAME(mi->mi_rootfh, fh)) {
 314  314  #ifdef DEBUG
 315  315                  nfs4_fhandle_t fhandle;
 316  316  
 317  317                  zcmn_err(mi->mi_zone->zone_id, CE_WARN,
 318  318                      "Server %s returns a different "
 319  319                      "root filehandle for the path %s:",
 320  320                      mi->mi_curr_serv->sv_hostname,
 321  321                      mi->mi_curr_serv->sv_path);
 322  322  
 323  323                  /* print the bad fh */
 324  324                  fhandle.fh_len = fh->sfh_fh.nfs_fh4_len;
 325  325                  bcopy(fh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
 326  326                      fhandle.fh_len);
 327  327                  nfs4_printfhandle(&fhandle);
 328  328  
 329  329                  /* print mi_rootfh */
 330  330                  fhandle.fh_len = mi->mi_rootfh->sfh_fh.nfs_fh4_len;
 331  331                  bcopy(mi->mi_rootfh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
 332  332                      fhandle.fh_len);
 333  333                  nfs4_printfhandle(&fhandle);
 334  334  #endif
 335  335                  /* use mi_rootfh instead; fh will be rele by the caller */
 336  336                  fh = mi->mi_rootfh;
 337  337                  *wasbad = 1;
 338  338          }
 339  339  
 340  340          kmem_free(s, MAXNAMELEN);
 341  341          return (fh);
 342  342  }
 343  343  
           /*
            * Cache or purge the attributes of vp after a lookup in the rnode hash,
            * and release rtable4[index].r_lock.  The caller holds that lock on
            * entry; every path through this function drops it exactly once.
            *
            *      vp       vnode found or created by make_rnode4()
            *      garp     getattr results, or NULL if none are available
            *      newnode  non-zero if vp was just created
            *      t        time value handed on to the attribute cache routines
            *      cr       credentials handed on to nfs4_attr_cache()
            *      index    hash bucket index of vp in rtable4
            *
            * Existing node with attributes: the lock is dropped first and
            * nfs4_attr_cache() is then called.  New node with attributes: v_type
            * and v_rdev are filled in, the stub type is chosen, the attributes
            * are cached only if va_mask is AT_ALL (purged otherwise), and the
            * lock is dropped last.  No attributes: the attribute cache of a new
            * node is purged and the lock is dropped.
            */
 344  344  void
 345  345  r4_do_attrcache(vnode_t *vp, nfs4_ga_res_t *garp, int newnode,
 346  346      hrtime_t t, cred_t *cr, int index)
 347  347  {
 348  348          int is_stub;
 349  349          vattr_t *attr;
 350  350          /*
 351  351           * Don't add to attrcache if time overflow, but
 352  352           * no need to check because either attr is null or the time
 353  353           * values in it were processed by nfs4_time_ntov(), which checks
 354  354           * for time overflows.
 355  355           */
 356  356          attr = garp ? &garp->n4g_va : NULL;
 357  357  
 358  358          if (attr) {
 359  359                  if (!newnode) {
 360  360                          rw_exit(&rtable4[index].r_lock);
 361  361  #ifdef DEBUG
 362  362                          if (vp->v_type != attr->va_type &&
 363  363                              vp->v_type != VNON && attr->va_type != VNON) {
 364  364                                  zcmn_err(VTOMI4(vp)->mi_zone->zone_id, CE_WARN,
 365  365                                      "makenfs4node: type (%d) doesn't "
 366  366                                      "match type of found node at %p (%d)",
 367  367                                      attr->va_type, (void *)vp, vp->v_type);
 368  368                          }
 369  369  #endif
 370  370                          nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
 371  371                  } else {
 372  372                          rnode4_t *rp = VTOR4(vp);
 373  373  
 374  374                          vp->v_type = attr->va_type;
 375  375                          vp->v_rdev = attr->va_rdev;
 376  376  
 377  377                          /*
 378  378                           * Turn this object into a "stub" object if we
 379  379                           * crossed an underlying server fs boundary.
 380  380                           * To make this check, during mount we save the
 381  381                           * fsid of the server object being mounted.
 382  382                           * Here we compare this object's server fsid
 383  383                           * with the fsid we saved at mount.  If they
 384  384                           * are different, we crossed server fs boundary.
 385  385                           *
 386  386                           * The stub type is set (or not) at rnode
 387  387                           * creation time and it never changes for life
 388  388                           * of the rnode.
 389  389                           *
 390  390                           * This stub will be for a mirror-mount, rather than
 391  391                           * a referral (the latter also sets R4SRVSTUB).
 392  392                           *
 393  393                           * The stub type is also set during RO failover,
 394  394                           * nfs4_remap_file().
 395  395                           *
 396  396                           * We don't bother with taking r_statelock to
 397  397                           * set the stub type because this is a new rnode
 398  398                           * and we're holding the hash bucket r_lock RW_WRITER.
 399  399                           * No other thread could have obtained access
 400  400                           * to this rnode.
 401  401                           */
 402  402                          is_stub = 0;
 403  403                          if (garp->n4g_fsid_valid) {
 404  404                                  fattr4_fsid ga_fsid = garp->n4g_fsid;
 405  405                                  servinfo4_t *svp = rp->r_server;
 406  406  
 407  407                                  rp->r_srv_fsid = ga_fsid;
 408  408  
                                           /* sv_fsid is compared under sv_lock */
 409  409                                  (void) nfs_rw_enter_sig(&svp->sv_lock,
 410  410                                      RW_READER, 0);
 411  411                                  if (!FATTR4_FSID_EQ(&ga_fsid, &svp->sv_fsid))
 412  412                                          is_stub = 1;
 413  413                                  nfs_rw_exit(&svp->sv_lock);
 414  414                          }
 415  415  
 416  416                          if (is_stub)
 417  417                                  r4_stub_mirrormount(rp);
 418  418                          else
 419  419                                  r4_stub_none(rp);
 420  420  
 421  421                          /* Can not cache partial attr */
 422  422                          if (attr->va_mask == AT_ALL)
 423  423                                  nfs4_attrcache_noinval(vp, garp, t);
 424  424                          else
 425  425                                  PURGE_ATTRCACHE4(vp);
 426  426  
 427  427                          rw_exit(&rtable4[index].r_lock);
 428  428                  }
 429  429          } else {
 430  430                  if (newnode) {
 431  431                          PURGE_ATTRCACHE4(vp);
 432  432                  }
 433  433                  rw_exit(&rtable4[index].r_lock);
 434  434          }
 435  435  }
 436  436  
 437  437  /*
 438  438   * Find or create an rnode based primarily on filehandle.  To be
 439  439   * used when dvp (vnode for parent directory) is not available;
 440  440   * otherwise, makenfs4node() should be used.
 441  441   *
 442  442   * The nfs4_fname_t argument *npp is consumed and nulled out.
            *
            *      sfh     filehandle of the object; selects the hash bucket
            *      psfh    parent filehandle, may be NULL; for a new node a hold is
            *              taken on it (when non-NULL) and it is stored in sv_dfh
            *      npp     in/out: *npp must be non-NULL on entry, NULL on return
            *      garp    getattr results or NULL, passed to r4_do_attrcache()
            *      mi      mount info; supplies the vfs
            *      cr, t   passed through to make_rnode4()/r4_do_attrcache()
            *
            * For a new node the name becomes sv_name.  For an existing node the
            * name replaces sv_name (and the old one is released) unless it is the
            * very same fname object, in which case the passed reference is
            * released.  The hash bucket lock taken here is dropped by
            * r4_do_attrcache().
 443  443   */
 444  444  
 445  445  vnode_t *
 446  446  makenfs4node_by_fh(nfs4_sharedfh_t *sfh, nfs4_sharedfh_t *psfh,
 447  447      nfs4_fname_t **npp, nfs4_ga_res_t *garp,
 448  448      mntinfo4_t *mi, cred_t *cr, hrtime_t t)
 449  449  {
 450  450          vfs_t *vfsp = mi->mi_vfsp;
 451  451          int newnode = 0;
 452  452          vnode_t *vp;
 453  453          rnode4_t *rp;
 454  454          svnode_t *svp;
 455  455          nfs4_fname_t *name, *svpname;
 456  456          int index;
 457  457  
 458  458          ASSERT(npp && *npp);
 459  459          name = *npp;
 460  460          *npp = NULL;
 461  461  
 462  462          index = rtable4hash(sfh);
 463  463          rw_enter(&rtable4[index].r_lock, RW_READER);
 464  464  
 465  465          vp = make_rnode4(sfh, &rtable4[index], vfsp,
 466  466              nfs4_vnodeops, nfs4_putapage, &newnode, cr);
 467  467  
 468  468          svp = VTOSV(vp);
 469  469          rp = VTOR4(vp);
 470  470          if (newnode) {
                           /* a fresh svnode is a one-element circular list */
 471  471                  svp->sv_forw = svp->sv_back = svp;
 472  472                  svp->sv_name = name;
 473  473                  if (psfh != NULL)
 474  474                          sfh4_hold(psfh);
 475  475                  svp->sv_dfh = psfh;
 476  476          } else {
 477  477                  /*
 478  478                   * It is possible that due to a server
 479  479                   * side rename fnames have changed.
 480  480                   * update the fname here.
 481  481                   */
 482  482                  mutex_enter(&rp->r_svlock);
 483  483                  svpname = svp->sv_name;
 484  484                  if (svp->sv_name != name) {
 485  485                          svp->sv_name = name;
 486  486                          mutex_exit(&rp->r_svlock);
                                   /* release the replaced name outside r_svlock */
 487  487                          fn_rele(&svpname);
 488  488                  } else {
 489  489                          mutex_exit(&rp->r_svlock);
 490  490                          fn_rele(&name);
 491  491                  }
 492  492          }
 493  493  
 494  494          ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
 495  495          r4_do_attrcache(vp, garp, newnode, t, cr, index);
 496  496          ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);
 497  497  
 498  498          return (vp);
 499  499  }
 500  500  
 501  501  /*
 502  502   * Find or create a vnode for the given filehandle, filesystem, parent, and
 503  503   * name.  The reference to nm is consumed, so the caller must first do an
 504  504   * fn_hold() if it wants to continue using nm after this call.
            *
            *      fh      filehandle of the object; replaced by the mount's root
            *              filehandle if badrootfh_check() rejects it
            *      garp    getattr results or NULL, passed to r4_do_attrcache()
            *      vfsp    filesystem the vnode belongs to
            *      t, cr   passed through to make_rnode4()/r4_do_attrcache()
            *      dvp     parent directory vnode; must not be NULL (asserted)
            *      nm      name of the object within dvp
            *
            * If dvp has V_XATTRDIR set, R4ISXATTR is set on the rnode.  When the
            * filehandle was rejected, the hash bucket lock is dropped here and the
            * attributes are not cached; otherwise r4_do_attrcache() drops it.
 505  505   */
 506  506  vnode_t *
 507  507  makenfs4node(nfs4_sharedfh_t *fh, nfs4_ga_res_t *garp, struct vfs *vfsp,
 508  508      hrtime_t t, cred_t *cr, vnode_t *dvp, nfs4_fname_t *nm)
 509  509  {
 510  510          vnode_t *vp;
                   /* newnode is always written by make_rnode4() before it returns */
 511  511          int newnode;
 512  512          int index;
 513  513          mntinfo4_t *mi = VFTOMI4(vfsp);
 514  514          int had_badfh = 0;
 515  515          rnode4_t *rp;
 516  516  
 517  517          ASSERT(dvp != NULL);
 518  518  
 519  519          fh = badrootfh_check(fh, nm, mi, &had_badfh);
 520  520  
 521  521          index = rtable4hash(fh);
 522  522          rw_enter(&rtable4[index].r_lock, RW_READER);
 523  523  
 524  524          /*
 525  525           * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
 526  526           */
 527  527          vp = make_rnode4(fh, &rtable4[index], vfsp, nfs4_vnodeops,
 528  528              nfs4_putapage, &newnode, cr);
 529  529  
 530  530          rp = VTOR4(vp);
 531  531          sv_activate(&vp, dvp, &nm, newnode);
 532  532          if (dvp->v_flag & V_XATTRDIR) {
 533  533                  mutex_enter(&rp->r_statelock);
 534  534                  rp->r_flags |= R4ISXATTR;
 535  535                  mutex_exit(&rp->r_statelock);
 536  536          }
 537  537  
 538  538          /* if getting a bad file handle, do not cache the attributes. */
 539  539          if (had_badfh) {
 540  540                  rw_exit(&rtable4[index].r_lock);
 541  541                  return (vp);
 542  542          }
 543  543  
 544  544          ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
 545  545          r4_do_attrcache(vp, garp, newnode, t, cr, index);
 546  546          ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);
 547  547  
 548  548          return (vp);
 549  549  }
 550  550  
 551  551  /*
 552  552   * Hash on address of filehandle object.
 553  553   * XXX totally untuned.
            *
            * The pointer value is divided by sizeof (*fh) and the result is masked
            * with rtable4mask, giving the index into rtable4 that callers use.
 554  554   */
 555  555  
 556  556  int
 557  557  rtable4hash(nfs4_sharedfh_t *fh)
 558  558  {
 559  559          return (((uintptr_t)fh / sizeof (*fh)) & rtable4mask);
 560  560  }
 561  561  
 562  562  /*
 563  563   * Find or create the vnode for the given filehandle and filesystem.
 564  564   * *newnode is set to zero if the vnode already existed; non-zero if it had
 565  565   * to be created.
 566  566   *
 567  567   * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
            *
            * The caller must hold rhtp->r_lock as RW_READER (asserted).  The lock
            * is dropped while an rnode is recycled or allocated and is held again
            * on return: as RW_WRITER when *newnode is set (the new rnode has just
            * been added to the hash), as RW_READER otherwise.
            *
            * The rnode comes from the head of the freelist when the freelist is
            * not empty and rnode4_new has reached nrnode; otherwise a new rnode
            * and vnode are allocated.
            *
            *      fh        filehandle; a hold is taken for the new rnode
            *      rhtp      hash bucket fh hashes to
            *      vfsp      filesystem; a VFS_HOLD is taken for the new vnode
            *      vops      vnode operations installed on the new vnode
            *      putapage  stored in r_putapage
            *      newnode   out: see above
            *      cr        credentials used by r4inactive()/rp4_addfree()
 568  568   */
 569  569  
 570  570  static vnode_t *
 571  571  make_rnode4(nfs4_sharedfh_t *fh, r4hashq_t *rhtp, struct vfs *vfsp,
 572  572      struct vnodeops *vops,
 573  573      int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
 574  574      int *newnode, cred_t *cr)
 575  575  {
 576  576          rnode4_t *rp;
 577  577          rnode4_t *trp;
 578  578          vnode_t *vp;
 579  579          mntinfo4_t *mi;
 580  580  
 581  581          ASSERT(RW_READ_HELD(&rhtp->r_lock));
 582  582  
 583  583          mi = VFTOMI4(vfsp);
 584  584  
           /* Every jump back to start re-takes rhtp->r_lock as RW_READER first. */
 585  585  start:
 586  586          if ((rp = r4find(rhtp, fh, vfsp)) != NULL) {
 587  587                  vp = RTOV4(rp);
 588  588                  *newnode = 0;
 589  589                  return (vp);
 590  590          }
 591  591          rw_exit(&rhtp->r_lock);
 592  592  
 593  593          mutex_enter(&rp4freelist_lock);
 594  594  
 595  595          if (rp4freelist != NULL && rnode4_new >= nrnode) {
                           /* Recycle the rnode at the head of the freelist. */
 596  596                  rp = rp4freelist;
 597  597                  rp4_rmfree(rp);
 598  598                  mutex_exit(&rp4freelist_lock);
 599  599  
 600  600                  vp = RTOV4(rp);
 601  601  
 602  602                  if (rp->r_flags & R4HASHED) {
 603  603                          rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
 604  604                          mutex_enter(&vp->v_lock);
                                   /*
                                    * The vnode has more than one reference: drop
                                    * one, leave the rnode hashed and start over.
                                    */
 605  605                          if (vp->v_count > 1) {
 606  606                                  vp->v_count--;
 607  607                                  mutex_exit(&vp->v_lock);
 608  608                                  rw_exit(&rp->r_hashq->r_lock);
 609  609                                  rw_enter(&rhtp->r_lock, RW_READER);
 610  610                                  goto start;
 611  611                          }
 612  612                          mutex_exit(&vp->v_lock);
 613  613                          rp4_rmhash_locked(rp);
 614  614                          rw_exit(&rp->r_hashq->r_lock);
 615  615                  }
 616  616  
 617  617                  r4inactive(rp, cr);
 618  618  
                           /* Re-check the count now that r4inactive() has run. */
 619  619                  mutex_enter(&vp->v_lock);
 620  620                  if (vp->v_count > 1) {
 621  621                          vp->v_count--;
 622  622                          mutex_exit(&vp->v_lock);
 623  623                          rw_enter(&rhtp->r_lock, RW_READER);
 624  624                          goto start;
 625  625                  }
 626  626                  mutex_exit(&vp->v_lock);
 627  627                  vn_invalid(vp);
 628  628  
 629  629                  /*
 630  630                   * destroy old locks before bzero'ing and
 631  631                   * recreating the locks below.
 632  632                   */
 633  633                  uninit_rnode4(rp);
 634  634  
 635  635                  /*
 636  636                   * Make sure that if rnode is recycled then
 637  637                   * VFS count is decremented properly before
 638  638                   * reuse.
 639  639                   */
 640  640                  VFS_RELE(vp->v_vfsp);
 641  641                  vn_reinit(vp);
 642  642          } else {
 643  643                  vnode_t *new_vp;
 644  644  
 645  645                  mutex_exit(&rp4freelist_lock);
 646  646  
 647  647                  rp = kmem_cache_alloc(rnode4_cache, KM_SLEEP);
 648  648                  new_vp = vn_alloc(KM_SLEEP);
 649  649  
 650  650                  atomic_add_long((ulong_t *)&rnode4_new, 1);
 651  651  #ifdef DEBUG
 652  652                  clstat4_debug.nrnode.value.ui64++;
 653  653  #endif
 654  654                  vp = new_vp;
 655  655          }
 656  656  
                   /* Common initialization for recycled and newly allocated rnodes. */
 657  657          bzero(rp, sizeof (*rp));
 658  658          rp->r_vnode = vp;
 659  659          nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
 660  660          nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
 661  661          mutex_init(&rp->r_svlock, NULL, MUTEX_DEFAULT, NULL);
 662  662          mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
 663  663          mutex_init(&rp->r_statev4_lock, NULL, MUTEX_DEFAULT, NULL);
 664  664          mutex_init(&rp->r_os_lock, NULL, MUTEX_DEFAULT, NULL);
 665  665          rp->created_v4 = 0;
 666  666          list_create(&rp->r_open_streams, sizeof (nfs4_open_stream_t),
 667  667              offsetof(nfs4_open_stream_t, os_node));
                   /* empty lock-owner list: the head points at itself */
 668  668          rp->r_lo_head.lo_prev_rnode = &rp->r_lo_head;
 669  669          rp->r_lo_head.lo_next_rnode = &rp->r_lo_head;
 670  670          cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
 671  671          cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
 672  672          rp->r_flags = R4READDIRWATTR;
 673  673          rp->r_fh = fh;
 674  674          rp->r_hashq = rhtp;
 675  675          sfh4_hold(rp->r_fh);
 676  676          rp->r_server = mi->mi_curr_serv;
 677  677          rp->r_deleg_type = OPEN_DELEGATE_NONE;
 678  678          rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
 679  679          nfs_rw_init(&rp->r_deleg_recall_lock, NULL, RW_DEFAULT, NULL);
 680  680  
 681  681          rddir4_cache_create(rp);
 682  682          rp->r_putapage = putapage;
 683  683          vn_setops(vp, vops);
 684  684          vp->v_data = (caddr_t)rp;
 685  685          vp->v_vfsp = vfsp;
 686  686          VFS_HOLD(vfsp);
 687  687          vp->v_type = VNON;
 688  688          vp->v_flag |= VMODSORT;
                   /*
                    * OR the root flag in; a plain assignment would discard the
                    * VMODSORT bit set on the line above.
                    */
 689  689          if (isrootfh(fh, rp))
 690  690                  vp->v_flag |= VROOT;
 691  691          vn_exists(vp);
 692  692  
 693  693          /*
 694  694           * There is a race condition if someone else
 695  695           * alloc's the rnode while no locks are held, so we
 696  696           * check again and recover if found.
 697  697           */
 698  698          rw_enter(&rhtp->r_lock, RW_WRITER);
 699  699          if ((trp = r4find(rhtp, fh, vfsp)) != NULL) {
                           /*
                            * Lost the race: return the rnode that is already
                            * hashed and hand ours to rp4_addfree().  The bucket
                            * lock is not held across rp4_addfree() and is taken
                            * again as RW_READER for the caller.
                            */
 700  700                  vp = RTOV4(trp);
 701  701                  *newnode = 0;
 702  702                  rw_exit(&rhtp->r_lock);
 703  703                  rp4_addfree(rp, cr);
 704  704                  rw_enter(&rhtp->r_lock, RW_READER);
 705  705                  return (vp);
 706  706          }
                   /* Still holding rhtp->r_lock as RW_WRITER on this return. */
 707  707          rp4_addhash(rp);
 708  708          *newnode = 1;
 709  709          return (vp);
 710  710  }
 711  711  
           /*
            * Tear down the per-rnode state of rp before the rnode is reused or
            * freed: flush dangling lock owners (only if R4LODANGLERS is set),
            * clear and destroy the open stream list, destroy the readdir cache,
            * release the svnode and the filehandle, and destroy the locks and
            * condition variables.
            *
            * The rnode must be idle, which the ASSERTs check: v_count of 1,
            * r_count and r_mapcnt of 0, not hashed, not on the freelist, and an
            * empty lock-owner list once the danglers have been flushed.
            *
            * NOTE(review): vp is initialized from RTOV4(rp) before
            * ASSERT(rp != NULL) is evaluated.
            * NOTE(review): make_rnode4() calls mutex_init() on r_svlock but no
            * matching mutex_destroy() is visible here -- confirm whether
            * sv_uninit() or another routine destroys it.
            */
 712  712  static void
 713  713  uninit_rnode4(rnode4_t *rp)
 714  714  {
 715  715          vnode_t *vp = RTOV4(rp);
 716  716  
 717  717          ASSERT(rp != NULL);
 718  718          ASSERT(vp != NULL);
 719  719          ASSERT(vp->v_count == 1);
 720  720          ASSERT(rp->r_count == 0);
 721  721          ASSERT(rp->r_mapcnt == 0);
 722  722          if (rp->r_flags & R4LODANGLERS) {
 723  723                  nfs4_flush_lock_owners(rp);
 724  724          }
 725  725          ASSERT(rp->r_lo_head.lo_next_rnode == &rp->r_lo_head);
 726  726          ASSERT(rp->r_lo_head.lo_prev_rnode == &rp->r_lo_head);
 727  727          ASSERT(!(rp->r_flags & R4HASHED));
 728  728          ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
 729  729          nfs4_clear_open_streams(rp);
 730  730          list_destroy(&rp->r_open_streams);
 731  731  
 732  732          /*
 733  733           * Destroy the rddir cache first since we need to grab the r_statelock.
 734  734           */
 735  735          mutex_enter(&rp->r_statelock);
 736  736          rddir4_cache_destroy(rp);
 737  737          mutex_exit(&rp->r_statelock);
 738  738          sv_uninit(&rp->r_svnode);
 739  739          sfh4_rele(&rp->r_fh);
 740  740          nfs_rw_destroy(&rp->r_rwlock);
 741  741          nfs_rw_destroy(&rp->r_lkserlock);
 742  742          mutex_destroy(&rp->r_statelock);
 743  743          mutex_destroy(&rp->r_statev4_lock);
 744  744          mutex_destroy(&rp->r_os_lock);
 745  745          cv_destroy(&rp->r_cv);
 746  746          cv_destroy(&rp->r_commit.c_cv);
 747  747          nfs_rw_destroy(&rp->r_deleg_recall_lock);
                   /* r_indelmap is destroyed only when R4DELMAPLIST is set */
 748  748          if (rp->r_flags & R4DELMAPLIST)
 749  749                  list_destroy(&rp->r_indelmap);
 750  750  }
 751  751  
 752  752  /*
 753  753   * Put an rnode on the free list.
 754  754   *
 755  755   * Rnodes which were allocated above and beyond the normal limit
 756  756   * are immediately freed.
            *
            * rp: the rnode being released.  The caller's vnode reference is
            *     consumed: it is dropped if other references exist, it goes
            *     away with the rnode when the rnode is destroyed, or it is
            *     left in place for the freelist (r4find() reuses it).
            * cr: credentials handed to r4inactive(), nfs4close_all() and
            *     r4flushpages().
            *
            * The hash queue lock must not be held on entry; this function
            * takes rp->r_hashq->r_lock as writer itself.
 757  757   */
 758  758  void
 759  759  rp4_addfree(rnode4_t *rp, cred_t *cr)
 760  760  {
 761  761          vnode_t *vp;
 762  762          vnode_t *xattr;
 763  763          struct vfs *vfsp;
 764  764
 765  765          vp = RTOV4(rp);
 766  766          ASSERT(vp->v_count >= 1);
 767  767          ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
 768  768
 769  769          /*
 770  770           * If we have too many rnodes allocated and there are no
 771  771           * references to this rnode, or if the rnode is no longer
 772  772           * accessible because it does not reside in the hash queues,
 773  773           * or if an i/o error occurred while writing to the file,
 774  774           * then just free it instead of putting it on the rnode
 775  775           * freelist.
                    *
                    * As written, the r_count == 0 test applies to every one of
                    * the alternatives below, not only to the first.
 776  776           */
 777  777          vfsp = vp->v_vfsp;
 778  778          if (((rnode4_new > nrnode || !(rp->r_flags & R4HASHED) ||
 779  779  #ifdef DEBUG
 780  780              (nfs4_rnode_nofreelist != 0) ||
 781  781  #endif
 782  782              rp->r_error || (rp->r_flags & R4RECOVERR) ||
 783  783              (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
                           /*
                            * Still hashed: unhash it under the writer lock, unless
                            * someone else picked up a reference in the meantime, in
                            * which case just drop ours and leave.
                            */
 784  784                  if (rp->r_flags & R4HASHED) {
 785  785                          rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
 786  786                          mutex_enter(&vp->v_lock);
 787  787                          if (vp->v_count > 1) {
 788  788                                  vp->v_count--;
 789  789                                  mutex_exit(&vp->v_lock);
 790  790                                  rw_exit(&rp->r_hashq->r_lock);
 791  791                                  return;
 792  792                          }
 793  793                          mutex_exit(&vp->v_lock);
 794  794                          rp4_rmhash_locked(rp);
 795  795                          rw_exit(&rp->r_hashq->r_lock);
 796  796                  }
 797  797
 798  798                  /*
 799  799                   * Make sure we don't have a delegation on this rnode
 800  800                   * before destroying it.
 801  801                   */
 802  802                  if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
 803  803                          (void) nfs4delegreturn(rp,
 804  804                              NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
 805  805                  }
 806  806
 807  807                  r4inactive(rp, cr);
 808  808
 809  809                  /*
 810  810                   * Recheck the vnode reference count.  We need to
 811  811                   * make sure that another reference has not been
 812  812                   * acquired while we were not holding v_lock.  The
 813  813                   * rnode is not in the rnode hash queues; one
 814  814                   * way for a reference to have been acquired
 815  815                   * is for a VOP_PUTPAGE because the rnode was marked
 816  816                   * with R4DIRTY or for a modified page.  This
 817  817                   * reference may have been acquired before our call
 818  818                   * to r4inactive.  The i/o may have been completed,
 819  819                   * thus allowing r4inactive to complete, but the
 820  820                   * reference to the vnode may not have been released
 821  821                   * yet.  In any case, the rnode can not be destroyed
 822  822                   * until the other references to this vnode have been
 823  823                   * released.  The other references will take care of
 824  824                   * either destroying the rnode or placing it on the
 825  825                   * rnode freelist.  If there are no other references,
 826  826                   * then the rnode may be safely destroyed.
 827  827                   */
 828  828                  mutex_enter(&vp->v_lock);
 829  829                  if (vp->v_count > 1) {
 830  830                          vp->v_count--;
 831  831                          mutex_exit(&vp->v_lock);
 832  832                          return;
 833  833                  }
 834  834                  mutex_exit(&vp->v_lock);
 835  835
 836  836                  destroy_rnode4(rp);
 837  837                  return;
 838  838          }
 839  839
 840  840          /*
 841  841           * Lock the hash queue and then recheck the reference count
 842  842           * to ensure that no other threads have acquired a reference
 843  843           * to indicate that the rnode should not be placed on the
 844  844           * freelist.  If another reference has been acquired, then
 845  845           * just release this one and let the other thread complete
 846  846           * the processing of adding this rnode to the freelist.
                    *
                    * Each step below that has to drop the hash queue lock jumps
                    * back to "again" so that all of the checks are redone from
                    * the top.
 847  847           */
 848  848  again:
 849  849          rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
 850  850
 851  851          mutex_enter(&vp->v_lock);
 852  852          if (vp->v_count > 1) {
 853  853                  vp->v_count--;
 854  854                  mutex_exit(&vp->v_lock);
 855  855                  rw_exit(&rp->r_hashq->r_lock);
 856  856                  return;
 857  857          }
 858  858          mutex_exit(&vp->v_lock);
 859  859
 860  860          /*
 861  861           * Make sure we don't put an rnode with a delegation
 862  862           * on the free list.
 863  863           */
 864  864          if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
 865  865                  rw_exit(&rp->r_hashq->r_lock);
 866  866                  (void) nfs4delegreturn(rp,
 867  867                      NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
 868  868                  goto again;
 869  869          }
 870  870
 871  871          /*
 872  872           * Now that we have the hash queue lock, and we know there
 873  873           * are not anymore references on the vnode, check to make
 874  874           * sure there aren't any open streams still on the rnode.
 875  875           * If so, drop the hash queue lock, remove the open streams,
 876  876           * and recheck the v_count.
                    *
                    * When running in a zone other than the mount's zone the
                    * open streams are only cleared locally; otherwise they are
                    * closed via nfs4close_all().
 877  877           */
 878  878          mutex_enter(&rp->r_os_lock);
 879  879          if (list_head(&rp->r_open_streams) != NULL) {
 880  880                  mutex_exit(&rp->r_os_lock);
 881  881                  rw_exit(&rp->r_hashq->r_lock);
 882  882                  if (nfs_zone() != VTOMI4(vp)->mi_zone)
 883  883                          nfs4_clear_open_streams(rp);
 884  884                  else
 885  885                          (void) nfs4close_all(vp, cr);
 886  886                  goto again;
 887  887          }
 888  888          mutex_exit(&rp->r_os_lock);
 889  889
 890  890          /*
 891  891           * Before we put it on the freelist, make sure there are no pages.
 892  892           * If there are, flush and commit of all of the dirty and
 893  893           * uncommitted pages, assuming the file system isn't read only.
 894  894           */
 895  895          if (!(vp->v_vfsp->vfs_flag & VFS_RDONLY) && nfs4_dross_pages(vp)) {
 896  896                  rw_exit(&rp->r_hashq->r_lock);
 897  897                  r4flushpages(rp, cr);
 898  898                  goto again;
 899  899          }
 900  900
 901  901          /*
 902  902           * Before we put it on the freelist, make sure there is no
 903  903           * active xattr directory cached, the freelist will not
 904  904           * have its entries r4inactive'd if there is still an active
 905  905           * rnode, thus nothing in the freelist can hold another
 906  906           * rnode active.
                    *
                    * The pointer is only detached here; the hold is released at
                    * the end of the function, after the locks have been dropped.
 907  907           */
 908  908          xattr = rp->r_xattr_dir;
 909  909          rp->r_xattr_dir = NULL;
 910  910
 911  911          /*
 912  912           * If there is no cached data or metadata for this file, then
 913  913           * put the rnode on the front of the freelist so that it will
 914  914           * be reused before other rnodes which may have cached data or
 915  915           * metadata associated with them.
                    *
                    * The freelist is circular: rp is always linked in just
                    * behind the current head (i.e. at the tail), and the head
                    * pointer is moved onto rp only when rp has nothing cached.
 916  916           */
 917  917          mutex_enter(&rp4freelist_lock);
 918  918          if (rp4freelist == NULL) {
 919  919                  rp->r_freef = rp;
 920  920                  rp->r_freeb = rp;
 921  921                  rp4freelist = rp;
 922  922          } else {
 923  923                  rp->r_freef = rp4freelist;
 924  924                  rp->r_freeb = rp4freelist->r_freeb;
 925  925                  rp4freelist->r_freeb->r_freef = rp;
 926  926                  rp4freelist->r_freeb = rp;
 927  927                  if (!nfs4_has_pages(vp) && rp->r_dir == NULL &&
 928  928                      rp->r_symlink.contents == NULL && rp->r_secattr == NULL)
 929  929                          rp4freelist = rp;
 930  930          }
 931  931          mutex_exit(&rp4freelist_lock);
 932  932
 933  933          rw_exit(&rp->r_hashq->r_lock);
 934  934
                   /* Drop the xattr directory hold detached above, if any. */
 935  935          if (xattr)
 936  936                  VN_RELE(xattr);
 937  937  }
 938  938  
 939  939  /*
 940  940   * Remove an rnode from the free list.
 941  941   *
 942  942   * The caller must be holding rp4freelist_lock and the rnode
 943  943   * must be on the freelist.
 944  944   */
 945  945  static void
 946  946  rp4_rmfree(rnode4_t *rp)
 947  947  {
 948  948  
 949  949          ASSERT(MUTEX_HELD(&rp4freelist_lock));
 950  950          ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
 951  951  
 952  952          if (rp == rp4freelist) {
 953  953                  rp4freelist = rp->r_freef;
 954  954                  if (rp == rp4freelist)
 955  955                          rp4freelist = NULL;
 956  956          }
 957  957          rp->r_freeb->r_freef = rp->r_freef;
 958  958          rp->r_freef->r_freeb = rp->r_freeb;
 959  959  
 960  960          rp->r_freef = rp->r_freeb = NULL;
 961  961  }
 962  962  
 963  963  /*
 964  964   * Put a rnode in the hash table.
 965  965   *
 966  966   * The caller must be holding the exclusive hash queue lock
 967  967   */
 968  968  void
 969  969  rp4_addhash(rnode4_t *rp)
 970  970  {
 971  971          ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
 972  972          ASSERT(!(rp->r_flags & R4HASHED));
 973  973  
 974  974  #ifdef DEBUG
 975  975          r4_dup_check(rp, RTOV4(rp)->v_vfsp);
 976  976  #endif
 977  977  
 978  978          rp->r_hashf = rp->r_hashq->r_hashf;
 979  979          rp->r_hashq->r_hashf = rp;
 980  980          rp->r_hashb = (rnode4_t *)rp->r_hashq;
 981  981          rp->r_hashf->r_hashb = rp;
 982  982  
 983  983          mutex_enter(&rp->r_statelock);
 984  984          rp->r_flags |= R4HASHED;
 985  985          mutex_exit(&rp->r_statelock);
 986  986  }
 987  987  
 988  988  /*
 989  989   * Remove a rnode from the hash table.
 990  990   *
 991  991   * The caller must be holding the hash queue lock.
 992  992   */
 993  993  void
 994  994  rp4_rmhash_locked(rnode4_t *rp)
 995  995  {
 996  996          ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
 997  997          ASSERT(rp->r_flags & R4HASHED);
 998  998  
 999  999          rp->r_hashb->r_hashf = rp->r_hashf;
1000 1000          rp->r_hashf->r_hashb = rp->r_hashb;
1001 1001  
1002 1002          mutex_enter(&rp->r_statelock);
1003 1003          rp->r_flags &= ~R4HASHED;
1004 1004          mutex_exit(&rp->r_statelock);
1005 1005  }
1006 1006  
1007 1007  /*
1008 1008   * Remove a rnode from the hash table.
1009 1009   *
1010 1010   * The caller must not be holding the hash queue lock.
1011 1011   */
1012 1012  void
1013 1013  rp4_rmhash(rnode4_t *rp)
1014 1014  {
1015 1015          rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
1016 1016          rp4_rmhash_locked(rp);
1017 1017          rw_exit(&rp->r_hashq->r_lock);
1018 1018  }
1019 1019  
1020 1020  /*
1021 1021   * Lookup a rnode by fhandle.  Ignores rnodes that had failed recovery.
1022 1022   * Returns NULL if no match.  If an rnode is returned, the reference count
1023 1023   * on the master vnode is incremented.
            *
            * Exception to the above: when the matching rnode is found on the
            * freelist it is removed from the freelist and the reference it
            * already carries is handed to the caller, so v_count is not bumped.
            *
            * rhtp: hash queue to search.
            * fh:   shared filehandle to match (compared with SFH4_SAME()).
            * vfsp: only rnodes whose vnode belongs to this vfs are considered.
1024 1024   *
1025 1025   * The caller must be holding the hash queue lock, either shared or exclusive.
1026 1026   */
1027 1027  rnode4_t *
1028 1028  r4find(r4hashq_t *rhtp, nfs4_sharedfh_t *fh, struct vfs *vfsp)
1029 1029  {
1030 1030          rnode4_t *rp;
1031 1031          vnode_t *vp;
1032 1032
1033 1033          ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
1034 1034
                   /* The queue header is the sentinel that ends the walk. */
1035 1035          for (rp = rhtp->r_hashf; rp != (rnode4_t *)rhtp; rp = rp->r_hashf) {
1036 1036                  vp = RTOV4(rp);
1037 1037                  if (vp->v_vfsp == vfsp && SFH4_SAME(rp->r_fh, fh)) {
1038 1038
                                   /* Skip rnodes flagged as having failed recovery. */
1039 1039                          mutex_enter(&rp->r_statelock);
1040 1040                          if (rp->r_flags & R4RECOVERR) {
1041 1041                                  mutex_exit(&rp->r_statelock);
1042 1042                                  continue;
1043 1043                          }
1044 1044                          mutex_exit(&rp->r_statelock);
1045 1045  #ifdef DEBUG
1046 1046                          r4_dup_check(rp, vfsp);
1047 1047  #endif
                                   /*
                                    * r_freef is first tested without the freelist
                                    * lock and then tested again once
                                    * rp4freelist_lock is held, since the rnode may
                                    * have left the freelist in between.
                                    */
1048 1048                          if (rp->r_freef != NULL) {
1049 1049                                  mutex_enter(&rp4freelist_lock);
1050 1050                                  /*
1051 1051                                   * If the rnode is on the freelist,
1052 1052                                   * then remove it and use that reference
1053 1053                                   * as the new reference.  Otherwise,
1054 1054                                   * need to increment the reference count.
1055 1055                                   */
1056 1056                                  if (rp->r_freef != NULL) {
1057 1057                                          rp4_rmfree(rp);
1058 1058                                          mutex_exit(&rp4freelist_lock);
1059 1059                                  } else {
1060 1060                                          mutex_exit(&rp4freelist_lock);
1061 1061                                          VN_HOLD(vp);
1062 1062                                  }
1063 1063                          } else
1064 1064                                  VN_HOLD(vp);
1065 1065
1066 1066                          /*
1067 1067                           * if root vnode, set v_flag to indicate that
                                    *
                                    * VROOT is tested without v_lock first; v_lock is
                                    * taken only when the flag actually has to be set.
1068 1068                           */
1069 1069                          if (isrootfh(fh, rp)) {
1070 1070                                  if (!(vp->v_flag & VROOT)) {
1071 1071                                          mutex_enter(&vp->v_lock);
1072 1072                                          vp->v_flag |= VROOT;
1073 1073                                          mutex_exit(&vp->v_lock);
1074 1074                                  }
1075 1075                          }
1076 1076                          return (rp);
1077 1077                  }
1078 1078          }
1079 1079          return (NULL);
1080 1080  }
1081 1081  
1082 1082  /*
1083 1083   * Lookup an rnode by fhandle. Just a wrapper for r4find()
1084 1084   * that assumes the caller hasn't already got the lock
1085 1085   * on the hash bucket.
1086 1086   */
1087 1087  rnode4_t *
1088 1088  r4find_unlocked(nfs4_sharedfh_t *fh, struct vfs *vfsp)
1089 1089  {
1090 1090          rnode4_t *rp;
1091 1091          int index;
1092 1092  
1093 1093          index = rtable4hash(fh);
1094 1094          rw_enter(&rtable4[index].r_lock, RW_READER);
1095 1095          rp = r4find(&rtable4[index], fh, vfsp);
1096 1096          rw_exit(&rtable4[index].r_lock);
1097 1097  
1098 1098          return (rp);
1099 1099  }
1100 1100  
1101 1101  /*
1102 1102   * Return >0 if there is an active vnode belonging to this vfs in the
1103 1103   * rtable4 cache.
            *
            * More precisely, the return value is NFSV4_RTABLE4_OK when no rnode
            * of 'vfsp' looks busy; otherwise it is the reason code for the first
            * busy rnode encountered: NFSV4_RTABLE4_NOT_FREE_LIST (not on the
            * freelist), NFSV4_RTABLE4_DIRTY_PAGES (has pages and is R4DIRTY) or
            * NFSV4_RTABLE4_POS_R_COUNT (r_count > 0).
1104 1104   *
1105 1105   * Several of these checks are done without holding the usual
1106 1106   * locks.  This is safe because destroy_rtable4(), rp4_addfree(),
1107 1107   * etc. will redo the necessary checks before actually destroying
1108 1108   * any rnodes.
1109 1109   */
1110 1110  int
1111 1111  check_rtable4(struct vfs *vfsp)
1112 1112  {
1113 1113          rnode4_t *rp;
1114 1114          vnode_t *vp;
1115 1115          int busy = NFSV4_RTABLE4_OK;
1116 1116          int index;
1117 1117
                   /* Visit every hash bucket, holding its lock as reader. */
1118 1118          for (index = 0; index < rtable4size; index++) {
1119 1119                  rw_enter(&rtable4[index].r_lock, RW_READER);
1120 1120
1121 1121                  for (rp = rtable4[index].r_hashf;
1122 1122                      rp != (rnode4_t *)(&rtable4[index]);
1123 1123                      rp = rp->r_hashf) {
1124 1124
1125 1125                          vp = RTOV4(rp);
1126 1126                          if (vp->v_vfsp == vfsp) {
1127 1127                                  if (rp->r_freef == NULL) {
1128 1128                                          busy = NFSV4_RTABLE4_NOT_FREE_LIST;
1129 1129                                  } else if (nfs4_has_pages(vp) &&
1130 1130                                      (rp->r_flags & R4DIRTY)) {
1131 1131                                          busy = NFSV4_RTABLE4_DIRTY_PAGES;
1132 1132                                  } else if (rp->r_count > 0) {
1133 1133                                          busy = NFSV4_RTABLE4_POS_R_COUNT;
1134 1134                                  }
1135 1135
                                           /* Stop at the first busy rnode found. */
1136 1136                                  if (busy != NFSV4_RTABLE4_OK) {
1137 1137  #ifdef DEBUG
1138 1138                                          char *path;
1139 1139
                                                   /*
                                                    * Debug builds report the reason,
                                                    * path and rnode through a DTrace
                                                    * probe; the path string is freed
                                                    * here using its strlen()+1 size.
                                                    */
1140 1140                                          path = fn_path(rp->r_svnode.sv_name);
1141 1141                                          DTRACE_NFSV4_3(rnode__e__debug,
1142 1142                                              int, busy, char *, path,
1143 1143                                              rnode4_t *, rp);
1144 1144                                          kmem_free(path, strlen(path)+1);
1145 1145  #endif
1146 1146                                          rw_exit(&rtable4[index].r_lock);
1147 1147                                          return (busy);
1148 1148                                  }
1149 1149                          }
1150 1150                  }
1151 1151                  rw_exit(&rtable4[index].r_lock);
1152 1152          }
                   /* Reaching here means busy is still NFSV4_RTABLE4_OK. */
1153 1153          return (busy);
1154 1154  }
1155 1155  
1156 1156  /*
1157 1157   * Destroy inactive vnodes from the hash queues which
1158 1158   * belong to this vfs. All of the vnodes should be inactive.
1159 1159   * It is essential that we destroy all rnodes in case of
1160 1160   * forced unmount as well as in normal unmount case.
            *
            * Works in two passes.  First, every rnode of 'vfsp' that is on the
            * freelist is taken off both the freelist and its hash queue and
            * chained onto a private list.  Second, each collected rnode is
            * passed to rp4_addfree() with 'cr'.  Rnodes of 'vfsp' that are not
            * on the freelist are left alone.
1161 1161   */
1162 1162
1163 1163  void
1164 1164  destroy_rtable4(struct vfs *vfsp, cred_t *cr)
1165 1165  {
1166 1166          int index;
1167 1167          vnode_t *vp;
1168 1168          rnode4_t *rp, *r_hashf, *rlist;
1169 1169
1170 1170          rlist = NULL;
1171 1171
1172 1172          for (index = 0; index < rtable4size; index++) {
1173 1173                  rw_enter(&rtable4[index].r_lock, RW_WRITER);
1174 1174                  for (rp = rtable4[index].r_hashf;
1175 1175                      rp != (rnode4_t *)(&rtable4[index]);
1176 1176                      rp = r_hashf) {
1177 1177                          /* save the hash pointer before destroying */
1178 1178                          r_hashf = rp->r_hashf;
1179 1179
1180 1180                          vp = RTOV4(rp);
1181 1181                          if (vp->v_vfsp == vfsp) {
1182 1182                                  mutex_enter(&rp4freelist_lock);
1183 1183                                  if (rp->r_freef != NULL) {
1184 1184                                          rp4_rmfree(rp);
1185 1185                                          mutex_exit(&rp4freelist_lock);
1186 1186                                          rp4_rmhash_locked(rp);
                                                   /*
                                                    * Once unhashed, r_hashf is reused
                                                    * as the link of the private,
                                                    * NULL-terminated rlist.
                                                    */
1187 1187                                          rp->r_hashf = rlist;
1188 1188                                          rlist = rp;
1189 1189                                  } else
1190 1190                                          mutex_exit(&rp4freelist_lock);
1191 1191                          }
1192 1192                  }
1193 1193                  rw_exit(&rtable4[index].r_lock);
1194 1194          }
1195 1195
                   /*
                    * No hash queue lock is held from here on.  The next link is
                    * read before each rp4_addfree() call because that call is
                    * expected to destroy the rnode.
                    */
1196 1196          for (rp = rlist; rp != NULL; rp = r_hashf) {
1197 1197                  r_hashf = rp->r_hashf;
1198 1198                  /*
1199 1199                   * This call to rp4_addfree will end up destroying the
1200 1200                   * rnode, but in a safe way with the appropriate set
1201 1201                   * of checks done.
1202 1202                   */
1203 1203                  rp4_addfree(rp, cr);
1204 1204          }
1205 1205  }
1206 1206  
1207 1207  /*
1208 1208   * This routine destroys all the resources of an rnode
1209 1209   * and finally the rnode itself.
            *
            * The rnode must not hold a delegation (asserted).  Besides freeing
            * the rnode back to rnode4_cache, this decrements the rnode4_new
            * count, invalidates and frees the associated vnode, and releases
            * the hold on the vnode's vfs.
1210 1210   */
1211 1211  static void
1212 1212  destroy_rnode4(rnode4_t *rp)
1213 1213  {
1214 1214          vnode_t *vp;
1215 1215          vfs_t *vfsp;
1216 1216
1217 1217          ASSERT(rp->r_deleg_type == OPEN_DELEGATE_NONE);
1218 1218
                   /*
                    * Capture the vnode and vfs pointers up front: vp is still
                    * needed after rp has been freed, and vfsp after vp has been
                    * freed.
                    */
1219 1219          vp = RTOV4(rp);
1220 1220          vfsp = vp->v_vfsp;
1221 1221
1222 1222          uninit_rnode4(rp);
1223 1223          atomic_add_long((ulong_t *)&rnode4_new, -1);
1224 1224  #ifdef DEBUG
                   /*
                    * NOTE(review): this debug counter is decremented with a plain
                    * "--" while rnode4_new above uses an atomic op - confirm that
                    * is intentional.
                    */
1225 1225          clstat4_debug.nrnode.value.ui64--;
1226 1226  #endif
1227 1227          kmem_cache_free(rnode4_cache, rp);
1228 1228          vn_invalid(vp);
1229 1229          vn_free(vp);
1230 1230          VFS_RELE(vfsp);
1231 1231  }
1232 1232  
1233 1233  /*
1234 1234   * Invalidate the attributes on all rnodes forcing the next getattr
1235 1235   * to go over the wire.  Used to flush stale uid and gid mappings.
1236 1236   * Maybe done on a per vfsp, or all rnodes (vfsp == NULL)
1237 1237   */
1238 1238  void
1239 1239  nfs4_rnode_invalidate(struct vfs *vfsp)
1240 1240  {
1241 1241          int index;
1242 1242          rnode4_t *rp;
1243 1243          vnode_t *vp;
1244 1244  
1245 1245          /*
1246 1246           * Walk the hash queues looking for rnodes.
1247 1247           */
1248 1248          for (index = 0; index < rtable4size; index++) {
1249 1249                  rw_enter(&rtable4[index].r_lock, RW_READER);
1250 1250                  for (rp = rtable4[index].r_hashf;
1251 1251                      rp != (rnode4_t *)(&rtable4[index]);
1252 1252                      rp = rp->r_hashf) {
1253 1253                          vp = RTOV4(rp);
1254 1254                          if (vfsp != NULL && vp->v_vfsp != vfsp)
1255 1255                                  continue;
1256 1256  
1257 1257                          if (!mutex_tryenter(&rp->r_statelock))
1258 1258                                  continue;
1259 1259  
1260 1260                          /*
1261 1261                           * Expire the attributes by resetting the change
1262 1262                           * and attr timeout.
1263 1263                           */
1264 1264                          rp->r_change = 0;
1265 1265                          PURGE_ATTRCACHE4_LOCKED(rp);
1266 1266                          mutex_exit(&rp->r_statelock);
1267 1267                  }
1268 1268                  rw_exit(&rtable4[index].r_lock);
1269 1269          }
1270 1270  }
1271 1271  
1272 1272  /*
1273 1273   * Flush all vnodes in this (or every) vfs.
1274 1274   * Used by nfs_sync and by nfs_unmount.
            *
            * vfsp: file system whose vnodes should be flushed, or NULL for all.
            * cr:   credentials passed to VOP_PUTPAGE().
            *
            * Candidate vnodes are first collected (and held) while the hash
            * bucket locks are held; the page flushes are then started
            * asynchronously (B_ASYNC) with no bucket lock held.
1275 1275   */
1276 1276  void
1277 1277  r4flush(struct vfs *vfsp, cred_t *cr)
1278 1278  {
1279 1279          int index;
1280 1280          rnode4_t *rp;
1281 1281          vnode_t *vp, **vplist;
1282 1282          long num, cnt;
1283 1283
1284 1284          /*
1285 1285           * Check to see whether there is anything to do.
                    *
                    * num is a snapshot of rnode4_new; the same value sizes both
                    * the kmem_alloc() below and the matching kmem_free().
1286 1286           */
1287 1287          num = rnode4_new;
1288 1288          if (num == 0)
1289 1289                  return;
1290 1290
1291 1291          /*
1292 1292           * Allocate a slot for all currently active rnodes on the
1293 1293           * supposition that they all may need flushing.
1294 1294           */
1295 1295          vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
1296 1296          cnt = 0;
1297 1297
1298 1298          /*
1299 1299           * Walk the hash queues looking for rnodes with page
1300 1300           * lists associated with them.  Make a list of these
1301 1301           * files.
1302 1302           */
1303 1303          for (index = 0; index < rtable4size; index++) {
1304 1304                  rw_enter(&rtable4[index].r_lock, RW_READER);
1305 1305                  for (rp = rtable4[index].r_hashf;
1306 1306                      rp != (rnode4_t *)(&rtable4[index]);
1307 1307                      rp = rp->r_hashf) {
1308 1308                          vp = RTOV4(rp);
1309 1309                          /*
1310 1310                           * Don't bother sync'ing a vp if it
1311 1311                           * is part of virtual swap device or
1312 1312                           * if VFS is read-only
1313 1313                           */
1314 1314                          if (IS_SWAPVP(vp) || vn_is_readonly(vp))
1315 1315                                  continue;
1316 1316                          /*
1317 1317                           * If flushing all mounted file systems or
1318 1318                           * the vnode belongs to this vfs, has pages
1319 1319                           * and is marked as either dirty or mmap'd,
1320 1320                           * hold and add this vnode to the list of
1321 1321                           * vnodes to flush.
1322 1322                           */
1323 1323                          if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
1324 1324                              nfs4_has_pages(vp) &&
1325 1325                              ((rp->r_flags & R4DIRTY) || rp->r_mapcnt > 0)) {
1326 1326                                  VN_HOLD(vp);
1327 1327                                  vplist[cnt++] = vp;
                                           /*
                                            * The list is full: stop scanning and
                                            * flush what has been collected.  Any
                                            * remaining rnodes are not visited by
                                            * this call.
                                            */
1328 1328                                  if (cnt == num) {
1329 1329                                          rw_exit(&rtable4[index].r_lock);
1330 1330                                          goto toomany;
1331 1331                                  }
1332 1332                          }
1333 1333                  }
1334 1334                  rw_exit(&rtable4[index].r_lock);
1335 1335          }
1336 1336  toomany:
1337 1337
1338 1338          /*
1339 1339           * Flush and release all of the files on the list.
                    *
                    * The list is processed from its last entry to its first, and
                    * the VOP_PUTPAGE() result is deliberately ignored.
1340 1340           */
1341 1341          while (cnt-- > 0) {
1342 1342                  vp = vplist[cnt];
1343 1343                  (void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
1344 1344                  VN_RELE(vp);
1345 1345          }
1346 1346
1347 1347          /*
1348 1348           * Free the space allocated to hold the list.
1349 1349           */
1350 1350          kmem_free(vplist, num * sizeof (*vplist));
1351 1351  }
1352 1352  
           /*
            * Release the cached data hanging off an rnode: the access cache,
            * the readdir cache, the cached symlink contents, the cached ACL and
            * the cached xattr directory vnode.
            *
            * The cache pointers are detached from the rnode while r_statelock
            * is held (blocking mutex_enter()) and the objects themselves are
            * freed after the lock has been dropped.
            *
            * Returns 1 if any of the readdir/symlink/ACL/xattr caches was
            * present; otherwise returns whatever nfs4_access_purge_rp()
            * returned.
            */
1353 1353  int
1354 1354  nfs4_free_data_reclaim(rnode4_t *rp)
1355 1355  {
1356 1356          char *contents;
1357 1357          vnode_t *xattr;
1358 1358          int size;
1359 1359          vsecattr_t *vsp;
1360 1360          int freed;
1361 1361          bool_t rdc = FALSE;
1362 1362
1363 1363          /*
1364 1364           * Free any held caches which may
1365 1365           * be associated with this rnode.
1366 1366           */
1367 1367          mutex_enter(&rp->r_statelock);
                   /* Only note that a readdir cache exists; it is purged below. */
1368 1368          if (rp->r_dir != NULL)
1369 1369                  rdc = TRUE;
1370 1370          contents = rp->r_symlink.contents;
1371 1371          size = rp->r_symlink.size;
1372 1372          rp->r_symlink.contents = NULL;
1373 1373          vsp = rp->r_secattr;
1374 1374          rp->r_secattr = NULL;
1375 1375          xattr = rp->r_xattr_dir;
1376 1376          rp->r_xattr_dir = NULL;
1377 1377          mutex_exit(&rp->r_statelock);
1378 1378
1379 1379          /*
1380 1380           * Free the access cache entries.
1381 1381           */
1382 1382          freed = nfs4_access_purge_rp(rp);
1383 1383
                   /* Nothing else was cached: report the access-cache result. */
1384 1384          if (rdc == FALSE && contents == NULL && vsp == NULL && xattr == NULL)
1385 1385                  return (freed);
1386 1386
1387 1387          /*
1388 1388           * Free the readdir cache entries, incompletely if we can't block.
1389 1389           */
1390 1390          nfs4_purge_rddir_cache(RTOV4(rp));
1391 1391
1392 1392          /*
1393 1393           * Free the symbolic link cache.
1394 1394           */
1395 1395          if (contents != NULL) {
1396 1396
1397 1397                  kmem_free((void *)contents, size);
1398 1398          }
1399 1399
1400 1400          /*
1401 1401           * Free any cached ACL.
1402 1402           */
1403 1403          if (vsp != NULL)
1404 1404                  nfs4_acl_free_cache(vsp);
1405 1405
1406 1406          /*
1407 1407           * Release the xattr directory vnode
1408 1408           */
1409 1409          if (xattr != NULL)
1410 1410                  VN_RELE(xattr);
1411 1411
1412 1412          return (1);
1413 1413  }
1414 1414  
           /*
            * Non-blocking counterpart of nfs4_free_data_reclaim(): release the
            * cached data of an rnode, but give up immediately (returning 0) if
            * r_statelock cannot be acquired with mutex_tryenter().
            *
            * The xattr directory is released only when it is a real vnode that
            * hashes to a different queue than 'rp' (see the comment in the
            * body).
            *
            * Returns 0 if the lock could not be taken, 1 if any of the
            * symlink/ACL/readdir/xattr caches was present, and otherwise
            * whatever nfs4_access_purge_rp() returned.
            */
1415 1415  static int
1416 1416  nfs4_active_data_reclaim(rnode4_t *rp)
1417 1417  {
1418 1418          char *contents;
1419 1419          vnode_t *xattr = NULL;
1420 1420          int size;
1421 1421          vsecattr_t *vsp;
1422 1422          int freed;
1423 1423          bool_t rdc = FALSE;
1424 1424
1425 1425          /*
1426 1426           * Detach any held caches which may be associated with this
1427 1427           * rnode; they are freed once r_statelock has been dropped.
1428 1428           */
1429 1429          if (!mutex_tryenter(&rp->r_statelock))
1430 1430                  return (0);
1431 1431          contents = rp->r_symlink.contents;
1432 1432          size = rp->r_symlink.size;
1433 1433          rp->r_symlink.contents = NULL;
1434 1434          vsp = rp->r_secattr;
1435 1435          rp->r_secattr = NULL;
1436 1436          if (rp->r_dir != NULL)
1437 1437                  rdc = TRUE;
1438 1438          /*
1439 1439           * To avoid a deadlock, do not free r_xattr_dir cache if it is hashed
1440 1440           * on the same r_hashq queue. We are not mandated to free all caches.
1441 1441           * VN_RELE(rp->r_xattr_dir) will be done sometime later - e.g. when the
1442 1442           * rnode 'rp' is freed or put on the free list.
1443 1443           *
1444 1444           * We will retain NFS4_XATTR_DIR_NOTSUPP because:
1445 1445           * - it has no associated rnode4_t (its v_data is NULL),
1446 1446           * - it is preallocated statically and will never go away,
1447 1447           * so we cannot save anything by releasing it.
1448 1448           */
1449 1449          if (rp->r_xattr_dir && rp->r_xattr_dir != NFS4_XATTR_DIR_NOTSUPP &&
1450 1450              VTOR4(rp->r_xattr_dir)->r_hashq != rp->r_hashq) {
1451 1451                  xattr = rp->r_xattr_dir;
1452 1452                  rp->r_xattr_dir = NULL;
1453 1453          }
1454 1454          mutex_exit(&rp->r_statelock);
1455 1455
1456 1456          /*
1457 1457           * Free the access cache entries.
1458 1458           */
1459 1459          freed = nfs4_access_purge_rp(rp);
1460 1460
                   /* Nothing else was cached: report the access-cache result. */
1461 1461          if (contents == NULL && vsp == NULL && rdc == FALSE && xattr == NULL)
1462 1462                  return (freed);
1463 1463
1464 1464          /*
1465 1465           * Free the symbolic link cache.
1466 1466           */
1467 1467          if (contents != NULL) {
1468 1468
1469 1469                  kmem_free((void *)contents, size);
1470 1470          }
1471 1471
1472 1472          /*
1473 1473           * Free any cached ACL.
1474 1474           */
1475 1475          if (vsp != NULL)
1476 1476                  nfs4_acl_free_cache(vsp);
1477 1477
                   /* Purge the readdir cache entries. */
1478 1478          nfs4_purge_rddir_cache(RTOV4(rp));
1479 1479
1480 1480          /*
1481 1481           * Release the xattr directory vnode
1482 1482           */
1483 1483          if (xattr != NULL)
1484 1484                  VN_RELE(xattr);
1485 1485
1486 1486          return (1);
1487 1487  }
1488 1488  
1489 1489  static int
1490 1490  nfs4_free_reclaim(void)
1491 1491  {
1492 1492          int freed;
1493 1493          rnode4_t *rp;
1494 1494  
1495 1495  #ifdef DEBUG
1496 1496          clstat4_debug.f_reclaim.value.ui64++;
1497 1497  #endif
1498 1498          freed = 0;
1499 1499          mutex_enter(&rp4freelist_lock);
1500 1500          rp = rp4freelist;
1501 1501          if (rp != NULL) {
1502 1502                  do {
1503 1503                          if (nfs4_free_data_reclaim(rp))
1504 1504                                  freed = 1;
1505 1505                  } while ((rp = rp->r_freef) != rp4freelist);
1506 1506          }
1507 1507          mutex_exit(&rp4freelist_lock);
1508 1508          return (freed);
1509 1509  }
1510 1510  
1511 1511  static int
1512 1512  nfs4_active_reclaim(void)
1513 1513  {
1514 1514          int freed;
1515 1515          int index;
1516 1516          rnode4_t *rp;
1517 1517  
1518 1518  #ifdef DEBUG
1519 1519          clstat4_debug.a_reclaim.value.ui64++;
1520 1520  #endif
1521 1521          freed = 0;
1522 1522          for (index = 0; index < rtable4size; index++) {
1523 1523                  rw_enter(&rtable4[index].r_lock, RW_READER);
1524 1524                  for (rp = rtable4[index].r_hashf;
1525 1525                      rp != (rnode4_t *)(&rtable4[index]);
1526 1526                      rp = rp->r_hashf) {
1527 1527                          if (nfs4_active_data_reclaim(rp))
1528 1528                                  freed = 1;
1529 1529                  }
1530 1530                  rw_exit(&rtable4[index].r_lock);
1531 1531          }
1532 1532          return (freed);
1533 1533  }
1534 1534  
1535 1535  static int
1536 1536  nfs4_rnode_reclaim(void)
1537 1537  {
1538 1538          int freed;
1539 1539          rnode4_t *rp;
1540 1540          vnode_t *vp;
1541 1541  
1542 1542  #ifdef DEBUG
1543 1543          clstat4_debug.r_reclaim.value.ui64++;
1544 1544  #endif
1545 1545          freed = 0;
1546 1546          mutex_enter(&rp4freelist_lock);
1547 1547          while ((rp = rp4freelist) != NULL) {
1548 1548                  rp4_rmfree(rp);
1549 1549                  mutex_exit(&rp4freelist_lock);
1550 1550                  if (rp->r_flags & R4HASHED) {
1551 1551                          vp = RTOV4(rp);
1552 1552                          rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
1553 1553                          mutex_enter(&vp->v_lock);
1554 1554                          if (vp->v_count > 1) {
1555 1555                                  vp->v_count--;
1556 1556                                  mutex_exit(&vp->v_lock);
1557 1557                                  rw_exit(&rp->r_hashq->r_lock);
1558 1558                                  mutex_enter(&rp4freelist_lock);
1559 1559                                  continue;
1560 1560                          }
1561 1561                          mutex_exit(&vp->v_lock);
1562 1562                          rp4_rmhash_locked(rp);
1563 1563                          rw_exit(&rp->r_hashq->r_lock);
1564 1564                  }
1565 1565                  /*
1566 1566                   * This call to rp_addfree will end up destroying the
1567 1567                   * rnode, but in a safe way with the appropriate set
1568 1568                   * of checks done.
1569 1569                   */
1570 1570                  rp4_addfree(rp, CRED());
1571 1571                  mutex_enter(&rp4freelist_lock);
1572 1572          }
1573 1573          mutex_exit(&rp4freelist_lock);
1574 1574          return (freed);
1575 1575  }
1576 1576  
/*
 * Reclaim callback registered for rnode4_cache.  Tries progressively more
 * aggressive steps and stops at the first one that reports success:
 * cached data of freelist rnodes, then cached data of hashed rnodes, and
 * finally the freelist rnodes themselves.  'cdrarg' is unused.
 */
/*ARGSUSED*/
static void
nfs4_reclaim(void *cdrarg)
{
#ifdef DEBUG
        clstat4_debug.reclaim.value.ui64++;
#endif
        /* Short-circuit: the second pass runs only if the first freed nothing. */
        if (nfs4_free_reclaim() != 0 || nfs4_active_reclaim() != 0)
                return;

        (void) nfs4_rnode_reclaim();
}
1592 1592  
1593 1593  /*
1594 1594   * Returns the clientid4 to use for the given mntinfo4.  Note that the
1595 1595   * clientid can change if the caller drops mi_recovlock.
1596 1596   */
1597 1597  
1598 1598  clientid4
1599 1599  mi2clientid(mntinfo4_t *mi)
1600 1600  {
1601 1601          nfs4_server_t   *sp;
1602 1602          clientid4       clientid = 0;
1603 1603  
1604 1604          /* this locks down sp if it is found */

↓ open down ↓

1604 lines elided

↑ open up ↑

1605 1605          sp = find_nfs4_server(mi);
1606 1606          if (sp != NULL) {
1607 1607                  clientid = sp->clientid;
1608 1608                  mutex_exit(&sp->s_lock);
1609 1609                  nfs4_server_rele(sp);
1610 1610          }
1611 1611          return (clientid);
1612 1612  }
1613 1613  
1614 1614  /*
1615      - * Return the current lease time for the server associated with the given
1616      - * file.  Note that the lease time could change immediately after this
1617      - * call.
1618      - */
1619      -
1620      -time_t
1621      -r2lease_time(rnode4_t *rp)
1622      -{
1623      -        nfs4_server_t   *sp;
1624      -        time_t          lease_time;
1625      -        mntinfo4_t      *mi = VTOMI4(RTOV4(rp));
1626      -
1627      -        (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
1628      -
1629      -        /* this locks down sp if it is found */
1630      -        sp = find_nfs4_server(VTOMI4(RTOV4(rp)));
1631      -
1632      -        if (VTOMI4(RTOV4(rp))->mi_vfsp->vfs_flag & VFS_UNMOUNTED) {
1633      -                if (sp != NULL) {
1634      -                        mutex_exit(&sp->s_lock);
1635      -                        nfs4_server_rele(sp);
1636      -                }
1637      -                nfs_rw_exit(&mi->mi_recovlock);
1638      -                return (1);             /* 1 second */
1639      -        }
1640      -
1641      -        ASSERT(sp != NULL);
1642      -
1643      -        lease_time = sp->s_lease_time;
1644      -
1645      -        mutex_exit(&sp->s_lock);
1646      -        nfs4_server_rele(sp);
1647      -        nfs_rw_exit(&mi->mi_recovlock);
1648      -
1649      -        return (lease_time);
1650      -}
1651      -
1652      -/*
1653 1615   * Return a list with information about all the known open instances for
1654 1616   * a filesystem. The caller must call r4releopenlist() when done with the
1655 1617   * list.
1656 1618   *
1657 1619   * We are safe at looking at os_valid and os_pending_close across dropping
1658 1620   * the 'os_sync_lock' to count up the number of open streams and then
1659 1621   * allocate memory for the osp list due to:
1660 1622   *      -Looking at os_pending_close is safe since this routine is
1661 1623   *      only called via recovery, and os_pending_close can only be set via
1662 1624   *      a non-recovery operation (which are all blocked when recovery

1663 1625   *      is active).
1664 1626   *
1665 1627   *      -Examining os_valid is safe since non-recovery operations, which
1666 1628   *      could potentially switch os_valid to 0, are blocked (via
1667 1629   *      nfs4_start_fop) and recovery is single-threaded per mntinfo4_t
1668 1630   *      (which means we are the only recovery thread potentially acting
1669 1631   *      on this open stream).
1670 1632   */
1671 1633  
1672 1634  nfs4_opinst_t *
1673 1635  r4mkopenlist(mntinfo4_t *mi)
1674 1636  {
1675 1637          nfs4_opinst_t *reopenlist, *rep;
1676 1638          rnode4_t *rp;
1677 1639          vnode_t *vp;
1678 1640          vfs_t *vfsp = mi->mi_vfsp;
1679 1641          int numosp;
1680 1642          nfs4_open_stream_t *osp;
1681 1643          int index;
1682 1644          open_delegation_type4 dtype;
1683 1645          int hold_vnode;
1684 1646  
1685 1647          reopenlist = NULL;
1686 1648  
1687 1649          for (index = 0; index < rtable4size; index++) {
1688 1650                  rw_enter(&rtable4[index].r_lock, RW_READER);
1689 1651                  for (rp = rtable4[index].r_hashf;
1690 1652                      rp != (rnode4_t *)(&rtable4[index]);
1691 1653                      rp = rp->r_hashf) {
1692 1654  
1693 1655                          vp = RTOV4(rp);
1694 1656                          if (vp->v_vfsp != vfsp)
1695 1657                                  continue;
1696 1658                          hold_vnode = 0;
1697 1659  
1698 1660                          mutex_enter(&rp->r_os_lock);
1699 1661  
1700 1662                          /* Count the number of valid open_streams of the file */
1701 1663                          numosp = 0;
1702 1664                          for (osp = list_head(&rp->r_open_streams); osp != NULL;
1703 1665                              osp = list_next(&rp->r_open_streams, osp)) {
1704 1666                                  mutex_enter(&osp->os_sync_lock);
1705 1667                                  if (osp->os_valid && !osp->os_pending_close)
1706 1668                                          numosp++;
1707 1669                                  mutex_exit(&osp->os_sync_lock);
1708 1670                          }
1709 1671  
1710 1672                          /* Fill in the valid open streams per vp */
1711 1673                          if (numosp > 0) {
1712 1674                                  int j;
1713 1675  
1714 1676                                  hold_vnode = 1;
1715 1677  
1716 1678                                  /*
1717 1679                                   * Add a new open instance to the list
1718 1680                                   */
1719 1681                                  rep = kmem_zalloc(sizeof (*reopenlist),
1720 1682                                      KM_SLEEP);
1721 1683                                  rep->re_next = reopenlist;
1722 1684                                  reopenlist = rep;
1723 1685  
1724 1686                                  rep->re_vp = vp;
1725 1687                                  rep->re_osp = kmem_zalloc(
1726 1688                                      numosp * sizeof (*(rep->re_osp)),
1727 1689                                      KM_SLEEP);
1728 1690                                  rep->re_numosp = numosp;
1729 1691  
1730 1692                                  j = 0;
1731 1693                                  for (osp = list_head(&rp->r_open_streams);
1732 1694                                      osp != NULL;
1733 1695                                      osp = list_next(&rp->r_open_streams, osp)) {
1734 1696  
1735 1697                                          mutex_enter(&osp->os_sync_lock);
1736 1698                                          if (osp->os_valid &&
1737 1699                                              !osp->os_pending_close) {
1738 1700                                                  osp->os_ref_count++;
1739 1701                                                  rep->re_osp[j] = osp;
1740 1702                                                  j++;
1741 1703                                          }
1742 1704                                          mutex_exit(&osp->os_sync_lock);
1743 1705                                  }
1744 1706                                  /*
1745 1707                                   * Assuming valid osp(s) stays valid between
1746 1708                                   * the time obtaining j and numosp.
1747 1709                                   */
1748 1710                                  ASSERT(j == numosp);
1749 1711                          }
1750 1712  
1751 1713                          mutex_exit(&rp->r_os_lock);
1752 1714                          /* do this here to keep v_lock > r_os_lock */
1753 1715                          if (hold_vnode)
1754 1716                                  VN_HOLD(vp);
1755 1717                          mutex_enter(&rp->r_statev4_lock);
1756 1718                          if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
1757 1719                                  /*
1758 1720                                   * If this rnode holds a delegation,
1759 1721                                   * but if there are no valid open streams,
1760 1722                                   * then just discard the delegation
1761 1723                                   * without doing delegreturn.
1762 1724                                   */
1763 1725                                  if (numosp > 0)
1764 1726                                          rp->r_deleg_needs_recovery =
1765 1727                                              rp->r_deleg_type;
1766 1728                          }
1767 1729                          /* Save the delegation type for use outside the lock */
1768 1730                          dtype = rp->r_deleg_type;
1769 1731                          mutex_exit(&rp->r_statev4_lock);
1770 1732  
1771 1733                          /*
1772 1734                           * If we have a delegation then get rid of it.
1773 1735                           * We've set rp->r_deleg_needs_recovery so we have
1774 1736                           * enough information to recover.
1775 1737                           */
1776 1738                          if (dtype != OPEN_DELEGATE_NONE) {
1777 1739                                  (void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
1778 1740                          }
1779 1741                  }
1780 1742                  rw_exit(&rtable4[index].r_lock);
1781 1743          }
1782 1744          return (reopenlist);
1783 1745  }
1784 1746  
1785 1747  /*
1786 1748   * Given a filesystem id, check to see if any rnodes
1787 1749   * within this fsid reside in the rnode cache, other
1788 1750   * than one we know about.
1789 1751   *
1790 1752   * Return 1 if an rnode is found, 0 otherwise
1791 1753   */
1792 1754  int
1793 1755  r4find_by_fsid(mntinfo4_t *mi, fattr4_fsid *moved_fsid)
1794 1756  {
1795 1757          rnode4_t *rp;
1796 1758          vnode_t *vp;
1797 1759          vfs_t *vfsp = mi->mi_vfsp;
1798 1760          fattr4_fsid *fsid;
1799 1761          int index, found = 0;
1800 1762  
1801 1763          for (index = 0; index < rtable4size; index++) {
1802 1764                  rw_enter(&rtable4[index].r_lock, RW_READER);
1803 1765                  for (rp = rtable4[index].r_hashf;
1804 1766                      rp != (rnode4_t *)(&rtable4[index]);
1805 1767                      rp = rp->r_hashf) {
1806 1768  
1807 1769                          vp = RTOV4(rp);
1808 1770                          if (vp->v_vfsp != vfsp)
1809 1771                                  continue;
1810 1772  
1811 1773                          /*
1812 1774                           * XXX there might be a case where a
1813 1775                           * replicated fs may have the same fsid
1814 1776                           * across two different servers. This
1815 1777                           * check isn't good enough in that case
1816 1778                           */
1817 1779                          fsid = &rp->r_srv_fsid;
1818 1780                          if (FATTR4_FSID_EQ(moved_fsid, fsid)) {
1819 1781                                  found = 1;
1820 1782                                  break;
1821 1783                          }
1822 1784                  }
1823 1785                  rw_exit(&rtable4[index].r_lock);
1824 1786  
1825 1787                  if (found)
1826 1788                          break;
1827 1789          }
1828 1790          return (found);
1829 1791  }
1830 1792  
1831 1793  /*
1832 1794   * Release the list of open instance references.
1833 1795   */
1834 1796  
1835 1797  void
1836 1798  r4releopenlist(nfs4_opinst_t *reopenp)
1837 1799  {
1838 1800          nfs4_opinst_t *rep, *next;
1839 1801          int i;
1840 1802  
1841 1803          for (rep = reopenp; rep; rep = next) {
1842 1804                  next = rep->re_next;
1843 1805  
1844 1806                  for (i = 0; i < rep->re_numosp; i++)
1845 1807                          open_stream_rele(rep->re_osp[i], VTOR4(rep->re_vp));
1846 1808  
1847 1809                  VN_RELE(rep->re_vp);
1848 1810                  kmem_free(rep->re_osp,
1849 1811                      rep->re_numosp * sizeof (*(rep->re_osp)));
1850 1812  
1851 1813                  kmem_free(rep, sizeof (*rep));
1852 1814          }
1853 1815  }
1854 1816  
1855 1817  int
1856 1818  nfs4_rnode_init(void)
1857 1819  {
1858 1820          ulong_t nrnode4_max;
1859 1821          int i;
1860 1822  
1861 1823          /*
1862 1824           * Compute the size of the rnode4 hash table
1863 1825           */
1864 1826          if (nrnode <= 0)
1865 1827                  nrnode = ncsize;
1866 1828          nrnode4_max =
1867 1829              (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode4));
1868 1830          if (nrnode > nrnode4_max || (nrnode == 0 && ncsize == 0)) {
1869 1831                  zcmn_err(GLOBAL_ZONEID, CE_NOTE,
1870 1832                      "!setting nrnode to max value of %ld", nrnode4_max);
1871 1833                  nrnode = nrnode4_max;
1872 1834          }
1873 1835          rtable4size = 1 << highbit(nrnode / rnode4_hashlen);
1874 1836          rtable4mask = rtable4size - 1;
1875 1837  
1876 1838          /*
1877 1839           * Allocate and initialize the hash buckets
1878 1840           */
1879 1841          rtable4 = kmem_alloc(rtable4size * sizeof (*rtable4), KM_SLEEP);
1880 1842          for (i = 0; i < rtable4size; i++) {
1881 1843                  rtable4[i].r_hashf = (rnode4_t *)(&rtable4[i]);
1882 1844                  rtable4[i].r_hashb = (rnode4_t *)(&rtable4[i]);
1883 1845                  rw_init(&rtable4[i].r_lock, NULL, RW_DEFAULT, NULL);
1884 1846          }
1885 1847  
1886 1848          rnode4_cache = kmem_cache_create("rnode4_cache", sizeof (rnode4_t),
1887 1849              0, NULL, NULL, nfs4_reclaim, NULL, NULL, 0);
1888 1850  
1889 1851          return (0);
1890 1852  }
1891 1853  
1892 1854  int
1893 1855  nfs4_rnode_fini(void)
1894 1856  {
1895 1857          int i;
1896 1858  
1897 1859          /*
1898 1860           * Deallocate the rnode hash queues
1899 1861           */
1900 1862          kmem_cache_destroy(rnode4_cache);
1901 1863  
1902 1864          for (i = 0; i < rtable4size; i++)
1903 1865                  rw_destroy(&rtable4[i].r_lock);
1904 1866  
1905 1867          kmem_free(rtable4, rtable4size * sizeof (*rtable4));
1906 1868  
1907 1869          return (0);
1908 1870  }
1909 1871  
1910 1872  /*
1911 1873   * Return non-zero if the given filehandle refers to the root filehandle
1912 1874   * for the given rnode.
1913 1875   */
1914 1876  
1915 1877  static int
1916 1878  isrootfh(nfs4_sharedfh_t *fh, rnode4_t *rp)
1917 1879  {
1918 1880          int isroot;
1919 1881  
1920 1882          isroot = 0;
1921 1883          if (SFH4_SAME(VTOMI4(RTOV4(rp))->mi_rootfh, fh))
1922 1884                  isroot = 1;
1923 1885  
1924 1886          return (isroot);
1925 1887  }
1926 1888  
1927 1889  /*
1928 1890   * The r4_stub_* routines assume that the rnode is newly activated, and
1929 1891   * that the caller either holds the hash bucket r_lock for this rnode as
1930 1892   * RW_WRITER, or holds r_statelock.
1931 1893   */
1932 1894  static void
1933 1895  r4_stub_set(rnode4_t *rp, nfs4_stub_type_t type)
1934 1896  {
1935 1897          vnode_t *vp = RTOV4(rp);
1936 1898          krwlock_t *hash_lock = &rp->r_hashq->r_lock;
1937 1899  
1938 1900          ASSERT(RW_WRITE_HELD(hash_lock) || MUTEX_HELD(&rp->r_statelock));
1939 1901  
1940 1902          rp->r_stub_type = type;
1941 1903  
1942 1904          /*
1943 1905           * Safely switch this vnode to the trigger vnodeops.
1944 1906           *
1945 1907           * Currently, we don't ever switch a trigger vnode back to using
1946 1908           * "regular" v4 vnodeops. NFS4_STUB_NONE is only used to note that
1947 1909           * a new v4 object is not a trigger, and it will already have the
1948 1910           * correct v4 vnodeops by default. So, no "else" case required here.
1949 1911           */
1950 1912          if (type != NFS4_STUB_NONE)
1951 1913                  vn_setops(vp, nfs4_trigger_vnodeops);
1952 1914  }
1953 1915  
1954 1916  void
1955 1917  r4_stub_mirrormount(rnode4_t *rp)
1956 1918  {
1957 1919          r4_stub_set(rp, NFS4_STUB_MIRRORMOUNT);
1958 1920  }
1959 1921  
1960 1922  void
1961 1923  r4_stub_referral(rnode4_t *rp)
1962 1924  {
1963 1925          DTRACE_PROBE1(nfs4clnt__func__referral__moved,
1964 1926              vnode_t *, RTOV4(rp));
1965 1927          r4_stub_set(rp, NFS4_STUB_REFERRAL);
1966 1928  }
1967 1929  
1968 1930  void
1969 1931  r4_stub_none(rnode4_t *rp)
1970 1932  {
1971 1933          r4_stub_set(rp, NFS4_STUB_NONE);
1972 1934  }
1973 1935  
1974 1936  #ifdef DEBUG
1975 1937  
1976 1938  /*
1977 1939   * Look in the rnode table for other rnodes that have the same filehandle.
1978 1940   * Assume the lock is held for the hash chain of checkrp
1979 1941   */
1980 1942  
1981 1943  static void
1982 1944  r4_dup_check(rnode4_t *checkrp, vfs_t *vfsp)
1983 1945  {
1984 1946          rnode4_t *rp;
1985 1947          vnode_t *tvp;
1986 1948          nfs4_fhandle_t fh, fh2;
1987 1949          int index;
1988 1950  
1989 1951          if (!r4_check_for_dups)
1990 1952                  return;
1991 1953  
1992 1954          ASSERT(RW_LOCK_HELD(&checkrp->r_hashq->r_lock));
1993 1955  
1994 1956          sfh4_copyval(checkrp->r_fh, &fh);
1995 1957  
1996 1958          for (index = 0; index < rtable4size; index++) {
1997 1959  
1998 1960                  if (&rtable4[index] != checkrp->r_hashq)
1999 1961                          rw_enter(&rtable4[index].r_lock, RW_READER);
2000 1962  
2001 1963                  for (rp = rtable4[index].r_hashf;
2002 1964                      rp != (rnode4_t *)(&rtable4[index]);
2003 1965                      rp = rp->r_hashf) {
2004 1966  
2005 1967                          if (rp == checkrp)
2006 1968                                  continue;
2007 1969  
2008 1970                          tvp = RTOV4(rp);
2009 1971                          if (tvp->v_vfsp != vfsp)
2010 1972                                  continue;
2011 1973  
2012 1974                          sfh4_copyval(rp->r_fh, &fh2);
2013 1975                          if (nfs4cmpfhandle(&fh, &fh2) == 0) {
2014 1976                                  cmn_err(CE_PANIC, "rnodes with same fs, fh "
2015 1977                                      "(%p, %p)", (void *)checkrp, (void *)rp);
2016 1978                          }
2017 1979                  }
2018 1980  
2019 1981                  if (&rtable4[index] != checkrp->r_hashq)
2020 1982                          rw_exit(&rtable4[index].r_lock);
2021 1983          }
2022 1984  }
2023 1985  
2024 1986  #endif /* DEBUG */

↓ open down ↓

362 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX